FileScanner.py 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
# DESCRIPTION: Read and process the XML file
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import sys, os
  9. import re
  10. import getopt
  11. import FilterWords
  12. import RedirectedTo
  13. class FileScanner(object):
  14. def __init__(self, *args, **kw):
  15. #super(FileScanner, self).__init__(*args, **kw)
  16. super(FileScanner, self).__init__()
  17. self.file_list = []
  18. self.category_to_key = {}
  19. self.key_to_category = {}
  20. self.current_file_id = -1 # no file yet
  21. KEY_ARTICLE = 0
  22. KEY_TEMPLATE = 10
  23. def file_id(self):
  24. return self.current_file_id
  25. def current_filename(self):
  26. return self.file_list[self.current_file_id]
  27. def all_file_names(self):
  28. return self.file_list
  29. def namespace(self, key, text):
  30. pass
  31. def title(self, category, key, title, seek):
  32. return True
  33. def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
  34. pass
  35. def body(self, category, key, title, text, seek):
  36. pass
  37. namespaces_start = '<namespaces>'
  38. namespaces_end = '</namespaces>'
  39. namespace_start = '<namespace key="'
  40. namespace_stop = '/>'
  41. namespace_cont = '">'
  42. namespace_end = '</namespace>'
  43. title_start = '<title>'
  44. title_end = '</title>'
  45. title_end_len = len(title_end)
  46. text_start = '<text '
  47. text_cont = 'xml:space="preserve">'
  48. text_stop = '/>'
  49. text_end = '</text>'
  50. # en: redirect: <text.....#redirect.....[[title#relative link]].....
  51. # es: redirección ""
  52. #redirected_to = re.compile(r'#\s*(redirect|redirecci..n)[^\[]*\[\[(.*?)([#|].*?)?\]\]', re.IGNORECASE)
  53. StateMachine = {
  54. 'start': [
  55. (namespaces_start, len(namespaces_start), 'category_start', 'spaces'),
  56. (title_start, len(title_start), 'drop', 'title'),
  57. ],
  58. 'spaces': [
  59. (namespaces_end, len(namespaces_end), 'drop', 'start'),
  60. (namespace_start, len(namespace_start), 'drop', 'key'),
  61. ],
  62. 'key': [
  63. (namespace_stop, len(namespace_stop), 'drop', 'spaces'),
  64. (namespace_cont, len(namespace_cont), 'key', 'ns'),
  65. ],
  66. 'ns': [
  67. (namespace_end, len(namespace_end), 'namespace', 'spaces'),
  68. ],
  69. 'title': [
  70. (title_end, len(title_end), 'title', 'text'),
  71. ],
  72. 'text': [
  73. (text_start, len(text_start), 'drop', 'prebody'),
  74. ],
  75. 'prebody': [
  76. (text_cont, len(text_cont), 'drop', 'body'),
  77. (text_stop, len(text_stop), 'zero', 'start'),
  78. ],
  79. 'body': [
  80. (text_end, len(text_end), 'body', 'start'),
  81. ]
  82. }
  83. def process(self, filename, limit):
  84. self.file_list += [filename]
  85. self.current_file_id = len(self.file_list) - 1
  86. block = ''
  87. seek = 0
  88. key = None
  89. category = None
  90. title = None
  91. file = open(filename, 'rb')
  92. end = False
  93. more = True
  94. wanted = True
  95. CurrentState = self.StateMachine['start']
  96. run = True
  97. while run:
  98. if more or (not end and len(block) < 1024):
  99. more = False
  100. block2 = file.read(65536)
  101. if len(block2) == 0:
  102. end = True
  103. else:
  104. block += block2
  105. pos = -1
  106. state = None
  107. for s in CurrentState:
  108. p = block.find(s[0])
  109. if p >= 0:
  110. if pos < 0 or p < pos:
  111. pos = p
  112. state = s
  113. if None == state:
  114. if end:
  115. return limit
  116. else:
  117. more = True
  118. else:
  119. (tag, length, proc, next) = state
  120. CurrentState = self.StateMachine[next]
  121. flag = False
  122. if 'key' == proc:
  123. key = block[:pos].split('"')[0].strip()
  124. elif 'namespace' == proc:
  125. category = block[:pos].strip().lower()
  126. key = int(key)
  127. self.category_to_key[category] = key
  128. self.key_to_category[key] = category
  129. self.namespace(key, category)
  130. key = None
  131. elif 'title' == proc:
  132. (category, key, title) = self.get_category(block[:pos].strip())
  133. wanted = self.title(category, key, title, seek)
  134. elif 'body' == proc:
  135. body = block[:pos].strip()
  136. body_leading_blanks = pos - len(block[:pos].lstrip())
  137. flag = True
  138. if '#' in body[0:10] or '#' in body[0:10]:
  139. match = RedirectedTo.regex.match(body)
  140. if wanted and match:
  141. (rcategory, rkey, rtitle) = self.get_category(match.group(2).strip())
  142. self.redirect(category, key, title, rcategory, rkey, rtitle, seek + body_leading_blanks)
  143. flag = False
  144. elif 'zero' == proc:
  145. flag = True
  146. body = ''
  147. elif 'category_start' == proc:
  148. self.category_to_key = {}
  149. self.key_to_category = {}
  150. if wanted and flag:
  151. self.body(category, key, title, body, seek + body_leading_blanks)
  152. if limit != 'all':
  153. limit -= 1
  154. if limit <= 0:
  155. run = False
  156. break
  157. title = None
  158. block = block[pos + length:]
  159. seek += pos + length
  160. return limit
  161. def get_category(self, title):
  162. """split title into category, key, title"""
  163. if ':' in title:
  164. (category, t) = title.split(':', 1)
  165. category = category.strip().lower()
  166. t = t.strip()
  167. if category in self.category_to_key:
  168. key = self.category_to_key[category]
  169. return (category, key, t)
  170. return ('', 0, title)
  171. class MyTestScanner(FileScanner):
  172. def __init__(self, *args, **kw):
  173. super(MyTestScanner, self).__init__(*args, **kw)
  174. self.count = 0
  175. self.articles = 0
  176. self.article_index = {}
  177. def namespace(self, key, text):
  178. print('namespace "{0:d}"->"{1:s}"'.format(key, text))
  179. def title(self, category, key, title, seek):
  180. self.count += 1
  181. print('T:{0:d} {1:d} : {2:s}[{3:d}]:{4:s}'.format(self.count, seek, category, key, title))
  182. if self.KEY_ARTICLE != key:
  183. return False
  184. return True
  185. def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
  186. #pass
  187. print('R:{0:d} {1:d} : {2:s}[{3:d}]:{4:s} -> {5:s}[{6:d}]:{7:s}'
  188. .format(self.count, seek, category, key, title, rcategory, rkey, rtitle))
  189. def body(self, category, key, title, text, seek):
  190. if not filter(title, text):
  191. self.articles += 1
  192. self.article_index[title] = [self.articles, seek, len(text)]
  193. print('B:{0:d} {1:d} [{2:s}[{3:d}]{4:s}] : {5:s}'
  194. .format(self.count, seek, category, key, title, text[:100]))
  195. def filter(title, text):
  196. (restricted, contains) = FilterWords.find_restricted(title + text)
  197. if restricted:
  198. print('TITLE: "{0:s}" restricted: [{1:s}]'.format(title, contains))
  199. return restricted
  200. def usage(message):
  201. if None != message:
  202. print('error: {0:s}'.format(message))
  203. print('usage: {0:s} <options> xml-file...'.format(os.path.basename(__file__)))
  204. print(' --help This message')
  205. print(' --count=n Number of article to process [all]')
  206. print(' --limit=number Limit the number of articles processed')
  207. print(' --prefix=name Device file name portion for .fnd/.pfx [pedia]')
  208. print(' --templates=file Database for templates [templates.db]')
  209. exit(1)
  210. def main():
  211. global verbose
  212. global debug
  213. try:
  214. opts, args = getopt.getopt(sys.argv[1:], 'hvc:d:',
  215. ['help', 'verbose',
  216. 'count=',
  217. 'debug='])
  218. except getopt.GetoptError, err:
  219. usage(err)
  220. verbose = False
  221. debug = 0
  222. count = 'all'
  223. for opt, arg in opts:
  224. if opt in ('-v', '--verbose'):
  225. verbose = True
  226. elif opt in ('-h', '--help'):
  227. usage(None)
  228. elif opt in ('-d', '--debug'):
  229. try:
  230. debug = int(arg)
  231. except ValueError:
  232. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  233. elif opt in ('-c', '--count'):
  234. if arg[-1] == 'k':
  235. arg = arg[:-1] + '000'
  236. if arg != 'all':
  237. try:
  238. count = int(arg)
  239. except ValueError:
  240. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  241. if count <= 0:
  242. usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
  243. else:
  244. usage('unhandled option: ' + opt)
  245. if len(args) == 0:
  246. usage('no files to process')
  247. scanner = MyTestScanner()
  248. for f in args:
  249. print('Processing file: {0:s}'.format(f))
  250. count = scanner.process(f, count)
  251. if 0 == count:
  252. break
# Run the program: scan the files named on the command line, but only when
# this file is executed as a script rather than imported as a module.
if __name__ == "__main__":
    main()