TidyUp.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Tidy up and remove unwanted wiki markup
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import os, sys
  9. import re
  10. # Regular expressions for parsing the XML
  11. subs = [
  12. # surely the code has one level of escapes so can just globally substitute
  13. (re.compile(r'&lt;', re.IGNORECASE), r'<'),
  14. (re.compile(r'&gt;', re.IGNORECASE), r'>'),
  15. (re.compile(r'&quot;', re.IGNORECASE), r'"'),
  16. # &amp; must be last
  17. (re.compile(r'&amp;', re.IGNORECASE), r'&'),
  18. # remove external links
  19. (re.compile(r'\s*(==\s*External\s+links\s*==.*)' + '\n\n', re.IGNORECASE + re.DOTALL), ''),
  20. # remove pictures
  21. (re.compile(r'\s*<gallery>.*?</gallery>', re.IGNORECASE + re.DOTALL), ''),
  22. # remove references
  23. (re.compile(r'<ref\s+name.*?/>', re.IGNORECASE), ''),
  24. # remove comments and multi-line references
  25. (re.compile(r'(<!--.*?-->)|(<ref.*?</ref>)', re.IGNORECASE + re.DOTALL), ''),
  26. # change br to newline
  27. (re.compile(r'<br[\s"a-zA-Z0-9=]*/?>', re.IGNORECASE), '\n'),
  28. # Wikipedia's installed Parser extension tags
  29. # <categorytree>, <charinsert>, <hiero>, <imagemap>, <inputbox>, <poem>,
  30. # <pre>, <ref>, <references>, <source>, <syntaxhighlight> and <timeline>
  31. # All referenced using special characters
  32. # Remove some of these
  33. (re.compile(r'\s*<timeline>.*?</timeline>', re.IGNORECASE + re.DOTALL), ''),
  34. (re.compile(r'\s*<imagemap>.*?</imagemap>', re.IGNORECASE + re.DOTALL), ''),
  35. (re.compile(r'\s*<noinclude>.*?</noinclude>', re.IGNORECASE + re.DOTALL), ''),
  36. (re.compile(r'<references[\s"a-zA-Z0-9=]*/?>', re.IGNORECASE), ''),
  37. # remove div
  38. (re.compile(r'<div\s+style="clear:\s+both;">\s*</div>', re.IGNORECASE), ''),
  39. # remove unwanted tags
  40. (re.compile(r'</?\s*(poem|source|pre)\s*>', re.IGNORECASE), ''),
  41. # fix broken lists <li/> -> </li>
  42. (re.compile(r'<li\s*/>', re.IGNORECASE), r'</li>'),
  43. # change % so php: wr_parser_sa does not convert them
  44. (re.compile(r'%', re.IGNORECASE), r'%25'),
  45. ]
  46. def tidy(text):
  47. """Private: generic tidy up routine"""
  48. global subs
  49. # convert to unicode errors substituting bad sequences to '�'
  50. if unicode != type(text):
  51. while True:
  52. try:
  53. text = unicode(text, 'utf-8')
  54. break
  55. except UnicodeDecodeError, error:
  56. (_, _, start, stop, _) = error
  57. text = text[:start] + '\xef\xbf\xbd' + text[stop:]
  58. text = text.strip().strip(u'\u200e\u200f')
  59. for e,r in subs:
  60. text = e.sub(r, text)
  61. return text
  62. def article(text):
  63. """Tidy up article text"""
  64. return tidy(text)
  65. def template(text):
  66. """Tidy up template text"""
  67. return tidy(text)
  68. def main():
  69. """reserved for tests"""
  70. pass
  71. # run the program
  72. if __name__ == "__main__":
  73. main()