#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Create Article Indices
# AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
#          Christopher Hall <hsw@openmoko.com>
  8. from __future__ import with_statement
  9. import os, sys, re
  10. import struct
  11. import littleparser
  12. import getopt
  13. import os.path
  14. import time
  15. import subprocess
  16. import sqlite3
  17. import FilterWords
  18. import FileScanner
  19. import TidyUp
  20. import PrintLog
  21. # this _must_ be in ascending ASCII sequence
  22. KEYPAD_KEYS = """ !#$%&'()*+,-.0123456789=?@abcdefghijklmnopqrstuvwxyz"""
  23. # to check if in order: uncomment and look at result
  24. #for c in KEYPAD_KEYS:
  25. # print('{0:d}'.format(ord(c)))
  26. #sys.exit(0)
  27. # underscore and space
  28. whitespaces = re.compile(r'([\s_]+)', re.IGNORECASE)
  29. # to catch loop in redirections
  30. class CycleError(Exception):
  31. pass
  32. verbose = False
  33. enable_templates = True # $$$ When this is false, templates are included as articles :/
  34. error_flag = False # indicates error in indexing, but processing will still continue
  35. # to find more errors
  36. bigram = {}
  37. def usage(message):
  38. if None != message:
  39. print('error: {0:s}'.format(message))
  40. print('usage: {0:s} <options> xml-file...'.format(os.path.basename(__file__)))
  41. print(' --help This message')
  42. print(' --verbose Enable verbose output')
  43. print(' --article-index=file Article index database output [articles.db]')
  44. print(' --article-offsets=file Article file offsets database output [offsets.db]')
  45. print(' --article-counts=file File to store the counts [counts.text]')
  46. print(' --limit=number Limit the number of articles processed')
  47. print(' --prefix=name Device file name portion for .fnd/.pfx [pedia]')
  48. print(' --templates=file Database for templates [templates.db]')
  49. exit(1)
  50. def main():
  51. global verbose
  52. global error_flag
  53. try:
  54. opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:c:t:l:p:',
  55. ['help', 'verbose',
  56. 'article-index=',
  57. 'article-offsets=',
  58. 'article-counts=',
  59. 'templates=',
  60. 'limit=',
  61. 'prefix='])
  62. except getopt.GetoptError, err:
  63. usage(err)
  64. verbose = False
  65. art_name = "articles.db"
  66. off_name = "offsets.db"
  67. cnt_name = "counts.text"
  68. fnd_name = 'pedia.fnd'
  69. pfx_name = 'pedia.pfx'
  70. template_name = 'templates.db'
  71. limit = 'all'
  72. for opt, arg in opts:
  73. if opt in ('-v', '--verbose'):
  74. verbose = True
  75. elif opt in ('-h', '--help'):
  76. usage(None)
  77. elif opt in ('-i', '--article-index'):
  78. art_name = arg
  79. elif opt in ('-o', '--article-offsets'):
  80. off_name = arg
  81. elif opt in ('-c', '--article-counts'):
  82. cnt_name = arg
  83. elif opt in ('-t', '--templates'):
  84. template_name = arg
  85. elif opt in ('-l', '--limit'):
  86. if arg[-1] == 'k':
  87. arg = arg[:-1] + '000'
  88. if arg != 'all':
  89. try:
  90. limit = int(arg)
  91. except ValueError:
  92. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  93. if limit <= 0:
  94. usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
  95. elif opt in ('-p', '--prefix'):
  96. fnd_name = arg + '.fnd'
  97. pfx_name = arg + '.pfx'
  98. else:
  99. usage('unhandled option: ' + opt)
  100. if [] == args:
  101. usage('Missing argument(s)')
  102. processor = FileProcessing(articles = art_name, offsets = off_name, templates = template_name)
  103. for f in args:
  104. limit = processor.process(f, limit)
  105. if limit != 'all' and limit <= 0:
  106. break
  107. # record initial counts
  108. a = processor.article_count
  109. r = processor.redirect_count
  110. # fix up redirects
  111. m = a + processor.resolve_redirects()
  112. # record combined count and display statistics
  113. s = a + r
  114. cf = open(cnt_name, 'w')
  115. for f in (sys.stdout, cf):
  116. f.write('Articles: {0:10d}\n'.format(a))
  117. f.write('Redirects: {0:10d}\n'.format(r))
  118. f.write('Sum: {0:10d}\n'.format(s))
  119. f.write('Merged: {0:10d}\n'.format(m))
  120. f.write('Difference: {0:10d}\n'.format(m - s))
  121. f.write('Restricted: {0:10d}\n'.format(processor.restricted_count))
  122. f.write('Templates: {0:10d}\n'.format(processor.template_count))
  123. f.write('rTemplates: {0:10d}\n'.format(processor.template_redirect_count))
  124. f.write('Characters: {0:10d}\n'.format(processor.total_character_count))
  125. cf.close()
  126. output_fnd(fnd_name, processor)
  127. output_pfx(pfx_name)
  128. del processor
  129. # return non-zero status if there have been any errors
  130. if error_flag:
  131. PrintLog.message('*** ERROR in Index build')
  132. PrintLog.message('*** Currently "Duplicate Title" is the only condition that causes this error')
  133. PrintLog.message('*** Likely "license.xml" or "terms.xml" file duplicates a title in main wiki file')
  134. PrintLog.message('*** Manually edit "license.xml" or "terms.xml" file to change the title')
  135. sys.exit(1)
  136. def generate_bigram(text):
  137. global bigram
  138. if len(text) > 2:
  139. try:
  140. if text[0].lower() in KEYPAD_KEYS and text[1].lower() in KEYPAD_KEYS:
  141. bigram[text[0:2]] += 1
  142. except KeyError:
  143. bigram[text[0:2]] = 1
  144. if len(text) > 4:
  145. try:
  146. if text[2].lower() in KEYPAD_KEYS and text[3].lower() in KEYPAD_KEYS:
  147. bigram[text[2:4]] += 1
  148. except KeyError:
  149. bigram[text[2:4]] = 1
  150. class FileProcessing(FileScanner.FileScanner):
  151. def __init__(self, *args, **kw):
  152. super(FileProcessing, self).__init__(*args, **kw)
  153. self.article_db_name = kw['articles']
  154. self.article_import = self.article_db_name + '.import'
  155. self.offset_db_name = kw['offsets']
  156. self.offset_import = self.offset_db_name + '.import'
  157. self.file_import = self.offset_db_name + '.files'
  158. self.template_db_name = kw['templates']
  159. for filename in [self.article_db_name,
  160. self.article_import,
  161. self.offset_db_name,
  162. self.offset_import,
  163. self.template_db_name,
  164. self.file_import]:
  165. if os.path.exists(filename):
  166. os.remove(filename)
  167. self.restricted_count = 0
  168. self.redirect_count = 0
  169. self.article_count = 0
  170. self.template_count = 0
  171. self.template_redirect_count = 0
  172. self.all_titles = []
  173. self.translate = littleparser.LittleParser().translate
  174. self.redirects = {}
  175. self.articles = {}
  176. self.offsets = {}
  177. self.total_character_count = 0
  178. self.time = time.time()
  179. self.template_db = sqlite3.connect(self.template_db_name)
  180. self.template_db.execute('pragma synchronous = 0')
  181. self.template_db.execute('pragma temp_store = 2')
  182. self.template_db.execute('pragma read_uncommitted = true')
  183. self.template_db.execute('pragma cache_size = 20000000')
  184. self.template_db.execute('pragma default_cache_size = 20000000')
  185. self.template_db.execute('pragma journal_mode = off')
  186. self.template_db.execute('''
  187. create table templates (
  188. title varchar primary key,
  189. body varchar
  190. )
  191. ''')
  192. self.template_db.execute('''
  193. create table redirects (
  194. title varchar primary key,
  195. redirect varchar
  196. )
  197. ''')
  198. self.template_db.commit()
  199. self.template_cursor = self.template_db.cursor()
  200. def __del__(self):
  201. PrintLog.message(u'Flushing databases')
  202. self.template_db.commit()
  203. self.template_cursor.close()
  204. self.template_db.close()
  205. PrintLog.message(u'Writing: files')
  206. start_time = time.time()
  207. i = 0
  208. with open(self.file_import, 'w') as f:
  209. for filename in self.file_list:
  210. f.write('{0:d}\t{1:s}\n'.format(i, filename))
  211. i += 1
  212. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  213. PrintLog.message(u'Writing: articles')
  214. start_time = time.time()
  215. with open(self.article_import, 'w') as f:
  216. for title in self.articles:
  217. (article_number, fnd_offset, restricted) = self.articles[title]
  218. f.write('~' + title.encode('utf-8')) # force string
  219. f.write('\t{0:d}\t{1:d}\t{2:d}\n'.format(article_number, fnd_offset, restricted))
  220. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  221. PrintLog.message(u'Writing: offsets')
  222. start_time = time.time()
  223. with open(self.offset_import, 'w') as f:
  224. for article_number in self.offsets:
  225. (file_id, title, seek, length, accumulated) = self.offsets[article_number]
  226. f.write('{0:d}\t{1:d}\t'.format(article_number, file_id))
  227. f.write('~' + title.encode('utf-8')) # force string
  228. f.write('\t{0:d}\t{1:d}\t{2:d}\n'.format(seek, length, accumulated))
  229. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  230. PrintLog.message(u'Loading: articles')
  231. start_time = time.time()
  232. p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.article_db_name, shell=True, stdin=subprocess.PIPE)
  233. p.stdin.write("""
  234. create table articles (
  235. title varchar primary key,
  236. article_number integer,
  237. fnd_offset integer,
  238. restricted varchar
  239. );
  240. pragma synchronous = 0;
  241. pragma temp_store = 2;
  242. pragma locking_mode = exclusive;
  243. pragma cache_size = 20000000;
  244. pragma default_cache_size = 20000000;
  245. pragma journal_mode = memory;
  246. .mode tabs
  247. .import {0:s} articles
  248. .exit
  249. """.format(self.article_import))
  250. p.stdin.close()
  251. p.wait()
  252. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  253. PrintLog.message(u'Loading: offsets and files')
  254. start_time = time.time()
  255. p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.offset_db_name, shell=True, stdin=subprocess.PIPE)
  256. p.stdin.write("""
  257. create table offsets (
  258. article_number integer primary key,
  259. file_id integer,
  260. title varchar,
  261. seek integer,
  262. length integer,
  263. accumulated integer
  264. );
  265. create table files (
  266. file_id integer primary key,
  267. filename varchar
  268. );
  269. pragma synchronous = 0;
  270. pragma temp_store = 2;
  271. pragma locking_mode = exclusive;
  272. pragma cache_size = 20000000;
  273. pragma default_cache_size = 20000000;
  274. pragma journal_mode = memory;
  275. .mode tabs
  276. .import {0:s} offsets
  277. .import {1:s} files
  278. .exit
  279. """.format(self.offset_import, self.file_import))
  280. p.stdin.close()
  281. p.wait()
  282. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  283. def title(self, category, key, title, seek):
  284. global verbose
  285. global enable_templates
  286. if self.KEY_ARTICLE == key:
  287. return True
  288. if enable_templates and self.KEY_TEMPLATE == key:
  289. if verbose:
  290. PrintLog.message(u'Template Title: {0:s}'.format(unicode(title, 'utf-8')))
  291. return True
  292. return False
  293. def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
  294. global whitespaces
  295. global verbose
  296. title = self.translate(title).strip(u'\u200e\u200f')
  297. rtitle = self.translate(rtitle).strip().strip(u'\u200e\u200f')
  298. rtitle = whitespaces.sub(' ', rtitle).strip().lstrip(':')
  299. if self.KEY_TEMPLATE == key:
  300. if title != rtitle:
  301. title = unicode(category, 'utf-8') + ':' + title.lower()
  302. rtitle = unicode(rcategory, 'utf-8') + ':' + rtitle.lower()
  303. self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
  304. [u'~{0:d}~{1:s}'.format(self.file_id(), title),
  305. u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
  306. self.template_redirect_count += 1
  307. return
  308. if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
  309. if verbose:
  310. PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
  311. .format(category, key, title, rcategory, rkey, rtitle))
  312. return
  313. if '' == rtitle:
  314. PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(category, key, title))
  315. else:
  316. self.redirects[title] = rtitle
  317. self.redirect_count += 1
  318. if verbose:
  319. PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
  320. .format(category, key, title, rcategory, rkey, rtitle))
  321. def body(self, category, key, title, text, seek):
  322. global verbose
  323. global error_flag
  324. title = self.translate(title).strip(u'\u200e\u200f')
  325. if self.KEY_TEMPLATE == key:
  326. t1 = unicode(category, 'utf-8') + ':' + title.lower()
  327. t_body = TidyUp.template(text)
  328. self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
  329. [u'~{0:d}~{1:s}'.format(self.file_id(), t1), u'~' + t_body])
  330. self.template_count += 1
  331. return
  332. restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)
  333. self.article_count += 1
  334. # do closer inspection to see if realy restricted
  335. if restricted:
  336. (restricted, bad_words) = FilterWords.find_restricted(text)
  337. if restricted:
  338. self.restricted_count += 1
  339. if not verbose and self.article_count % 10000 == 0:
  340. start_time = time.time()
  341. PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
  342. self.time = start_time
  343. generate_bigram(title)
  344. if verbose:
  345. if restricted:
  346. PrintLog.message(u'Restricted Title: {0:s}'.format(title))
  347. PrintLog.message(u' --> {0:s}'.format(bad_words))
  348. else:
  349. PrintLog.message(u'Title: {0:s}'.format(title))
  350. character_count = len(text)
  351. self.total_character_count += character_count
  352. self.offsets[self.article_count] = (self.file_id(), title, seek, character_count, self.total_character_count)
  353. if self.set_index(title, (self.article_count, -1, restricted)): # -1 == pfx place holder
  354. PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
  355. error_flag = True
  356. def resolve_redirects(self):
  357. """add redirect to article_index"""
  358. count = 0
  359. for item in self.redirects:
  360. try:
  361. self.set_index(item, self.find(item))
  362. count += 1
  363. except KeyError:
  364. PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
  365. except CycleError:
  366. PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
  367. return count
  368. def set_index(self, title, data):
  369. """returns false if the key did not already exist"""
  370. if type(title) == str:
  371. title = unicode(title, 'utf-8')
  372. result = title in self.articles
  373. self.articles[title] = data
  374. return result
  375. def get_index(self, title):
  376. if type(title) == str:
  377. title = unicode(title, 'utf-8')
  378. return self.articles[title]
  379. def all_indices(self):
  380. return self.articles.keys()
  381. def find(self, title, level = 0):
  382. """get index from article title
  383. also handles redirects
  384. returns: [index, fnd]
  385. """
  386. if '' == title:
  387. raise CycleError('Empty title detected')
  388. if level > 10:
  389. raise CycleError('Redirect cycle: ' + title)
  390. try:
  391. title = self.redirects[title]
  392. except KeyError:
  393. title = self.redirects[title[0].swapcase() + title[1:]]
  394. try:
  395. result = self.get_index(title)
  396. except KeyError:
  397. try:
  398. result = self.get_index(title[0].swapcase() + title[1:])
  399. except:
  400. result = self.find(title, level + 1)
  401. return result
  402. import unicodedata
  403. def strip_accents(s):
  404. if type(s) == str:
  405. s = unicode(s, 'utf-8')
  406. return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
  407. def bigram_encode(title):
  408. global bigram
  409. result = ''
  410. title = strip_accents(title)
  411. while len(title) >= 2:
  412. if title[0].lower() in KEYPAD_KEYS:
  413. b = title[0:2]
  414. if b in bigram:
  415. result += bigram[b]
  416. title = title[2:]
  417. else:
  418. result += chr(ord(title[0:1]))
  419. title = title[1:]
  420. else:
  421. #result += '?'
  422. title = title[1:]
  423. if len(title) == 1:
  424. if title[0].lower() in KEYPAD_KEYS:
  425. result += chr(ord(title[0]))
  426. #else:
  427. # result += '?'
  428. return result
  429. def output_fnd(filename, article_index):
  430. """create bigram table"""
  431. global bigram
  432. global index_matrix
  433. PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
  434. start_time = time.time()
  435. out_f = open(filename, 'w')
  436. sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
  437. sortedgram.sort()
  438. sortedgram.reverse()
  439. bigram = {}
  440. i = 0
  441. for k, v in sortedgram:
  442. out_f.write(v)
  443. bigram[v] = chr(i + 128)
  444. i += 1
  445. if i >= 128:
  446. break
  447. while i < 128:
  448. out_f.write('zz')
  449. bigram['zz'] = chr(i + 128)
  450. i += 1
  451. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  452. # create pfx matrix and write encoded titles
  453. #article_list = [strip_accents(k) for k in article_index.keys()]
  454. #article_list.sort(key = lambda x: strip_accents(x).lower())
  455. def sort_key(key):
  456. global KEYPAD_KEYS
  457. return ''.join(c for c in strip_accents(key).lower() if c in KEYPAD_KEYS)
  458. PrintLog.message(u'Sorting titles')
  459. start_time = time.time()
  460. article_list = [ (sort_key(title), title) for title in article_index.all_indices() ]
  461. article_list.sort()
  462. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  463. PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
  464. start_time = time.time()
  465. index_matrix = {}
  466. index_matrix['\0\0\0'] = out_f.tell()
  467. for stripped_title, title in article_list:
  468. offset = out_f.tell()
  469. key3 = (title[0:3] + '\0\0\0')[0:3].lower()
  470. key2 = key3[0:2] + '\0'
  471. key1 = key3[0:1] + '\0\0'
  472. if key1 not in index_matrix:
  473. index_matrix[key1] = offset
  474. if key2 not in index_matrix:
  475. index_matrix[key2] = offset
  476. if key3 not in index_matrix:
  477. index_matrix[key3] = offset
  478. (article_number, dummy, restricted) = article_index.get_index(title)
  479. article_index.set_index(title, (article_number, offset, restricted))
  480. out_f.write(struct.pack('Ib', article_number, 0) + bigram_encode(title) + '\0')
  481. out_f.close()
  482. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  483. def output_pfx(filename):
  484. """output the pfx matrix"""
  485. global index_matrix
  486. PrintLog.message(u'Writing: {0:s}'.format(filename))
  487. start_time = time.time()
  488. out_f = open(filename, 'w')
  489. list = '\0' + KEYPAD_KEYS
  490. for k1 in list:
  491. for k2 in list:
  492. for k3 in list:
  493. key = k1+k2+k3
  494. if key in index_matrix:
  495. offset = index_matrix[key]
  496. else:
  497. offset = 0
  498. out_f.write(struct.pack('I', offset))
  499. out_f.close()
  500. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  501. # run the program
  502. if __name__ == "__main__":
  503. main()