littleparser.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Converting entities to unicode
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. from HTMLParser import HTMLParser
  9. import unicodedata
  10. import htmlentitydefs
  11. import re
  12. import codecs
  13. import sys
  14. entities = re.compile(r'&amp;([a-zA-Z]{2,8});', re.IGNORECASE)
  15. lessthan = re.compile(r'<')
  16. class LittleParser(HTMLParser):
  17. """Translate text
  18. handles all of these:
  19. &eacute;
  20. &#1234;
  21. &amp;mu;
  22. """
  23. def __init__ (self):
  24. HTMLParser.__init__(self)
  25. self.buffer = u''
  26. def handle_charref(self, name):
  27. self.buffer += unichr(int(name))
  28. def handle_entityref(self, name):
  29. self.buffer += unichr(htmlentitydefs.name2codepoint[name])
  30. def handle_data(self, data):
  31. if type(data) == unicode:
  32. self.buffer += data
  33. else:
  34. self.buffer += unicode(data, 'utf-8')
  35. def translate(self, text):
  36. global entities
  37. if type(text) != unicode:
  38. text = unicode(text, 'utf-8')
  39. self.reset()
  40. self.buffer = u''
  41. unq = entities.sub(r'&\1;', text)
  42. unq = lessthan.sub(r'&lt;', unq)
  43. try:
  44. self.feed(unq)
  45. self.close()
  46. except KeyError:
  47. #print('failed on: "{0!r:s}" using-> "{1:!r:s}"'.format(text, unq))
  48. return unq
  49. if type(self.buffer) == unicode:
  50. return self.buffer.strip()
  51. return unicode(self.buffer, 'utf-8').strip()
  52. # tests
  53. def main():
  54. p = LittleParser().translate
  55. text = '''
  56. start test:
  57. [&egrave;] [&#1234;] [&eacute;] [%20] [%ff] [&nbsp;]
  58. [&mu;] [&amp;mu;] [&lt;/br/&gt;] [&egrave;] [</br/>]
  59. [&lt;noinclude&gt;]
  60. end:test
  61. '''
  62. correct = u'''
  63. start test:
  64. [\xe8] [\u04d2] [\xe9] [%20] [%ff] [\xa0]
  65. [\u03bc] [\u03bc] [</br/>] [\xe8] [</br/>]
  66. [<noinclude>]
  67. end:test
  68. '''
  69. result = p(text)
  70. print('Text: {0:s}'.format(text))
  71. print('Result: {0:s}'.format(result.encode('utf-8')))
  72. print('Repr: {0!r:s}'.format(result))
  73. if correct == result:
  74. print('PASS:')
  75. else:
  76. print('FAIL: mismatch')
  77. # run the program
  78. if __name__ == "__main__":
  79. main()