| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | |
|---|
| 3 | # 2011 Steffen Hoffmann |
|---|
| 4 | |
|---|
| 5 | import htmlentitydefs |
|---|
| 6 | import re |
|---|
| 7 | import unittest |
|---|
| 8 | |
|---|
| 9 | from codecs import getencoder |
|---|
| 10 | |
|---|
| 11 | from trac.resource import ResourceSystem |
|---|
| 12 | from trac.util.text import to_unicode |
|---|
| 13 | |
|---|
| 14 | from compat import json |
|---|
| 15 | |
|---|
| 16 | __all__ = ['resource_from_page', 'xml_escape', 'xml_unescape'] |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | # code from an article published by Uche Ogbuji on 15-Jun-2005 at |
|---|
| 20 | # http://www.xml.com/pub/a/2005/06/15/py-xml.html |
|---|
def xml_escape(text):
    """Return `text` as pure ASCII, using XML character references.

    The input is first coerced with `to_unicode`.  Every character the
    `us-ascii` codec cannot represent is then emitted as a numeric character
    reference by the `xmlcharrefreplace` error handler.  Characters that are
    already ASCII -- including markup characters such as `&` and `<` -- are
    passed through untouched.
    """
    ascii_encode = getencoder('us-ascii')
    # The encoder returns (encoded_data, consumed_length); only the data
    # is of interest here.
    encoded, _consumed = ascii_encode(to_unicode(text), 'xmlcharrefreplace')
    return encoded
|---|
| 24 | |
|---|
| 25 | |
|---|
| 26 | # adapted from code published by John J. Lee on 06-Jun-2007 at |
|---|
| 27 | # http://www.velocityreviews.com/forums |
|---|
| 28 | # /t511850-how-do-you-htmlentities-in-python.html |
|---|
# Matches one candidate reference: '&', an optional '#', one or more ASCII
# letters or digits, and the terminating ';'.  The pattern does not check
# that the name is a known entity or that the number is well-formed.
unichresc_RE = re.compile(r'&#?[A-Za-z0-9]+?;')
|---|
| 30 | |
|---|
def xml_unescape(text):
    """Replace entity and character references in `text` by their characters.

    Every substring matched by `unichresc_RE` is handed to
    `_replace_entities`, and its return value is substituted for the match.
    All other text is copied through unchanged.
    """
    unescaped = unichresc_RE.sub(_replace_entities, text)
    return unescaped
|---|
| 33 | |
|---|
# Resolve the "code point -> character" constructor once at import time.
try:
    _to_char = unichr  # Python 2: returns a one-character unicode string
except NameError:
    _to_char = chr  # runtimes without `unichr`: chr() already returns text


def _unescape_charref(ref):
    """Decode a single numeric character reference.

    :param ref: the complete reference, including the leading `&#` and the
                trailing `;`, e.g. `&#8212;` or `&#x2014;`
    :return: the one-character string for the referenced code point, or
             `ref` itself, unchanged, if the reference is malformed
             (`&#;`, `&#foo;`) or names a code point the runtime cannot
             represent

    Decimal and hexadecimal notation are both understood; the hex marker is
    accepted in lower and upper case (`&#x...;` / `&#X...;`).
    """
    digits = ref[2:-1]
    base = 10
    # DEVEL: gain 20 % performance by omitting hex references
    if digits[:1] in ('x', 'X'):
        digits = digits[1:]
        base = 16
    try:
        return _to_char(int(digits, base))
    except (ValueError, OverflowError):
        # Not a decodable number or not a valid code point.  Leave the text
        # as it was instead of aborting the whole substitution, the same way
        # unknown named entities are left in place.
        return ref
|---|
| 42 | |
|---|
def _replace_entities(match):
    """Return the replacement text for one matched reference.

    :param match: a regex match object whose matched text is a complete
                  reference such as `&amp;` or `&#38;`
    :return: the decoded character, or the matched text itself when the
             entity name is not listed in `htmlentitydefs.name2codepoint`

    Numeric references (second character `#`) are delegated to
    `_unescape_charref`.
    """
    entity = match.group()
    if entity[1] == "#":
        return _unescape_charref(entity)
    # Named entity: look the name up without the surrounding '&' and ';'.
    codepoint = htmlentitydefs.name2codepoint.get(entity[1:-1])
    if codepoint is None:
        return entity
    return unichr(codepoint)
|---|
| 53 | |
|---|
def resource_from_page(env, page):
    """Split a request path into a resource realm and a resource id.

    :param env: environment object handed to `ResourceSystem` to obtain the
                known realms
    :param page: path string, expected to start with `/<realm>` when it
                 refers to a resource
    :return: `(realm, resource_id)` for the first known realm whose name,
             preceded by `/`, is a prefix of `page`; `resource_id` is the
             remainder of the path with leading slashes removed.  When no
             realm matches, `(page, None)` is returned.
    """
    for realm in ResourceSystem(env).get_known_realms():
        prefix = '/' + realm
        # NOTE(review): a realm also matches when it is only a prefix of the
        # first path segment (realm `tag` for `/tags/x`) -- confirm whether
        # that is intended.
        if page.startswith(prefix):
            # Cut off the leading prefix only.  A regex substitution here
            # would also remove later occurrences of '/<realm>' inside the
            # resource id and would interpret the realm name as a pattern.
            return realm, page[len(prefix):].lstrip('/')
    return page, None
|---|
| 66 | |
|---|
| 67 | |
|---|
class UnescapeTests(unittest.TestCase):
    """Unit tests for `xml_unescape` and its helper `_unescape_charref`."""

    def test_unescape_charref(self):
        # Decimal and hexadecimal numeric character references.
        self.assertEqual(_unescape_charref(u"&#38;"), u"&")
        self.assertEqual(_unescape_charref(u"&#x2014;"), u"\N{EM DASH}")
        self.assertEqual(_unescape_charref(u"&#8212;"), u"\N{EM DASH}")

    def test_unescape(self):
        # Named, decimal and hexadecimal references within one string.
        self.assertEqual(
            xml_unescape(u"&amp; &lt; &mdash; &#8212; &#x2014;"),
            u"& < %s %s %s" % tuple(u"\N{EM DASH}" * 3))
        # '&a' has no terminating ';' and therefore is not a reference.
        self.assertEqual(xml_unescape(u"&a&amp;"), u"&a&")
        self.assertEqual(xml_unescape(u"a&amp;"), u"a&")
        # Unknown entity names are passed through unchanged.
        self.assertEqual(xml_unescape(u"&nonexistent;"), u"&nonexistent;")
|---|
| 82 | |
|---|
| 83 | # unittest.main() |
|---|
| 84 | |
|---|