ArticleParser.py 8.9 KB


  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Article Parser
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import os, sys, traceback
  9. import re
  10. import subprocess
  11. import time
  12. import getopt
  13. import os.path
  14. import sqlite3
  15. import TidyUp
  16. import PrintLog
  17. from types import *
# Module-wide verbosity flag; main() sets it to True for -v/--verbose.
# When set, each opened file and each processed article is logged and the
# periodic progress line is suppressed.
verbose = False
# Shell command of the background parser; it receives the article stream on
# its standard input.  main() replaces it with 'cat' for -j/--just-cat.
PARSER_COMMAND = '(cd mediawiki-offline && php wr_parser_sa.php -)'
  20. def usage(message):
  21. if None != message:
  22. print('error: {0:s}'.format(message))
  23. print('usage: {0:s} <options> xml-files...'.format(os.path.basename(__file__)))
  24. print(' --help This message')
  25. print(' --verbose Enable verbose output')
  26. print(' --xhtml=file XHTML output [all_articles.html]')
  27. print(' --language=lang Set language for PHP parser [en]')
  28. print(' --start=n First artcle to process [1] (1k => 1000)')
  29. print(' --count=n Number of artcles to process [all] (1k => 1000)')
  30. print(' --article-offsets=file Article file offsets database input [offsets.db]')
  31. print(' --templates=file Database for templates [templates.db]')
  32. print(' --parse-workdir=dir Work directory for the PHP parser [/tmp]')
  33. print(' --just-cat Replace php parser be "cat" for debugging')
  34. print(' --no-output Do not run any parsing')
  35. exit(1)
  36. def main():
  37. global verbose
  38. global PARSER_COMMAND
  39. global total_articles
  40. try:
  41. opts, args = getopt.getopt(sys.argv[1:], 'hvx:s:c:o:t:l:jnw:T:',
  42. ['help', 'verbose', 'xhtml=',
  43. 'start=', 'count=',
  44. 'article-offsets=',
  45. 'templates=',
  46. 'language=',
  47. 'just-cat',
  48. 'no-output',
  49. 'parser-workdir=',
  50. 'parser-tempdir=',
  51. ])
  52. except getopt.GetoptError, err:
  53. usage(err)
  54. verbose = False
  55. out_name = 'all_articles.html'
  56. off_name = 'offsets.db'
  57. parser_workdir = '/tmp'
  58. parser_tempdir = os.path.join(parser_workdir, 'tmp')
  59. start_article = 1
  60. article_count = 'all'
  61. failed_articles = 0
  62. do_output = True
  63. template_name = 'templates.db'
  64. language = 'en'
  65. for opt, arg in opts:
  66. if opt in ('-v', '--verbose'):
  67. verbose = True
  68. elif opt in ('-h', '--help'):
  69. usage(None)
  70. elif opt in ('-x', '--xhtml'):
  71. out_name = arg
  72. elif opt in ('-o', '--article-offsets'):
  73. off_name = arg
  74. elif opt in ('-t', '--templates'):
  75. template_name = arg
  76. elif opt in ('-l', '--language'):
  77. language = arg
  78. elif opt in ('-w', '--parser-workdir'):
  79. parser_workdir = arg
  80. elif opt in ('-T', '--parser-tempdir'):
  81. parser_tempdir = arg
  82. elif opt in ('-j', '--just-cat'):
  83. PARSER_COMMAND = 'cat'
  84. elif opt in ('-n', '--no-output'):
  85. do_output = False
  86. elif opt in ('-s', '--start'):
  87. if arg[-1] == 'k':
  88. arg = arg[:-1] + '000'
  89. try:
  90. start_article = int(arg)
  91. except ValueError:
  92. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  93. if start_article < 1:
  94. usage('"{0:s}={1:s}" must be >= 1'.format(opt, arg))
  95. elif opt in ('-c', '--count'):
  96. if arg[-1] == 'k':
  97. arg = arg[:-1] + '000'
  98. if arg != 'all':
  99. try:
  100. article_count = int(arg)
  101. except ValueError:
  102. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  103. if article_count <= 0:
  104. usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
  105. else:
  106. usage('unhandled option: ' + opt)
  107. if not os.path.isdir(parser_workdir):
  108. usage('workdir: {0:s} does not exist'.format(parser_workdir))
  109. if not os.path.isdir(parser_tempdir):
  110. usage('tempdir: {0:s} does not exist'.format(parser_tempdir))
  111. # pass parameters to the PHP parser
  112. os.environ['WORKDIR'] = parser_workdir
  113. os.environ['TEMPDIR'] = parser_tempdir
  114. os.environ['LANGUAGE'] = language
  115. os.environ['TEMPLATE_DB'] = template_name
  116. offset_db = sqlite3.connect(off_name)
  117. offset_db.execute('pragma synchronous = 0')
  118. offset_db.execute('pragma temp_store = 2')
  119. offset_db.execute('pragma read_uncommitted = true')
  120. offset_db.execute('pragma cache_size = 20000000')
  121. offset_db.execute('pragma default_cache_size = 20000000')
  122. offset_db.execute('pragma journal_mode = off')
  123. offset_cursor = offset_db.cursor()
  124. if do_output:
  125. background_process = PARSER_COMMAND + ' > ' + out_name
  126. else:
  127. background_process = None
  128. # process all required articles
  129. out_base_name = os.path.basename(out_name) # for logging messages
  130. current_file_id = None
  131. input_file = None
  132. process_id = None
  133. total_articles = 0
  134. start_time = time.time()
  135. while article_count == 'all' or article_count != 0:
  136. offset_cursor.execute('select file_id, title, seek, length from offsets where article_number = ? limit 1',
  137. (start_article,))
  138. row = offset_cursor.fetchone()
  139. if None == row:
  140. break
  141. (file_id, title, seek, length) = row
  142. if file_id != current_file_id:
  143. current_file_id = file_id
  144. if input_file:
  145. input_file.close()
  146. offset_cursor.execute('select filename from files where file_id = ? limit 1', (file_id,))
  147. filename = offset_cursor.fetchone()[0]
  148. input_file = open(filename, 'rb')
  149. if not input_file:
  150. PrintlogLog.message('Failed to open: {0:s}'.format(filename))
  151. current_file_id = None
  152. continue
  153. if verbose:
  154. PrintLog.message(u'Opened: {0:s}'.format(filename))
  155. try:
  156. input_file.seek(seek)
  157. except Exception, e:
  158. PrintLog.message(u'seek failed: e={0:!s:s} seek={1:d} f={2:s}'.format(e, seek, filename))
  159. sys.exit(1)
  160. # restart the background process if it fails to try to record all failing articles
  161. if None != background_process and None == process_id:
  162. process_id = subprocess.Popen(background_process, shell=True, stdin=subprocess.PIPE)
  163. try:
  164. process_article_text(current_file_id, total_articles + 1, title,
  165. input_file.read(length), process_id.stdin)
  166. except Exception, e:
  167. failed_articles += 1
  168. # extract from log by: grep '^!' log-file
  169. PrintLog.message(u'!Process failed, file: {0:s} article({1:d}): {2:s} because: {3!s:s}'
  170. .format(filename, total_articles, title, e))
  171. trace = sys.exc_info()
  172. if None != trace:
  173. traceback.print_tb(trace[2])
  174. process_id.stdin.close()
  175. process_id.wait()
  176. process_id = None
  177. if article_count != 'all':
  178. article_count -= 1
  179. total_articles += 1
  180. start_article += 1
  181. if not verbose and total_articles % 1000 == 0:
  182. if 0 != failed_articles:
  183. failed_message = 'Failed: {0:d}'.format(failed_articles)
  184. else:
  185. failed_message = ''
  186. now_time = time.time()
  187. PrintLog.message(u'Parse[{0:s}]: {1:7.2f}s {2:10d} {3:s}'
  188. .format(out_base_name, now_time - start_time,
  189. total_articles, failed_message))
  190. start_time = now_time
  191. # close files
  192. if input_file:
  193. input_file.close()
  194. # wait for background process to finish
  195. if process_id:
  196. process_id.stdin.close()
  197. process_id.wait()
  198. # output some statistics
  199. PrintLog.message(u'Parse[{0:s}]: Total: {1:d}'.format(out_base_name, total_articles))
  200. # indicate failures
  201. if 0 != failed_articles:
  202. PrintLog.message(u'Parse[{0:s}]: Failed: {1:d}'.format(out_base_name, failed_articles))
  203. sys.exit(1)
  204. def process_article_text(id, count, title, text, newf):
  205. global verbose
  206. if verbose:
  207. PrintLog.message(u'[PA {0:d}] {1:s}'.format(count, title))
  208. text = TidyUp.article(text)
  209. if newf:
  210. newf.write('{0:d}:'.format(id))
  211. newf.write(title[1:].encode('utf-8')) # We pad the title to force the database to import strings
  212. newf.write('\n__NOTOC__\n')
  213. newf.write(text.encode('utf-8') + '\n')
  214. newf.write('***EOF***\n')
# run the program, but only when this file is executed as a script and not
# when it is imported as a module
if __name__ == "__main__":
    main()