ArticleIndex.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Create Article Indices
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. from __future__ import with_statement
  9. import os, sys, re
  10. import struct
  11. import littleparser
  12. import urllib
  13. import getopt
  14. import os.path
  15. import time
  16. import subprocess
  17. import sqlite3
  18. import FilterWords
  19. import FileScanner
  20. import TidyUp
  21. import PrintLog
  22. import LanguageTranslation
  23. import SearchKey
  24. # maximum string lengths for FND file
  25. # when not truncating the actual title can be twice this length (+1 for the '\0')
  26. MAXIMUM_TITLE_LENGTH = 63 # c-code is 64 including '\0'
  27. MAXIMUM_TITLE_ACTUAL = 255 # c-code is 256 including '\0'
  28. # to catch loop in redirections
  29. class CycleError(Exception):
  30. pass
  31. verbose = False
  32. enable_templates = True # $$$ When this is false, templates are included as articles :/
  33. error_flag = False # indicates error in indexing, but processing will still continue
  34. # to find more errors
  35. bigram = {}
  36. def usage(message):
  37. if None != message:
  38. print('error: {0:s}'.format(message))
  39. print('usage: {0:s} <options> xml-file...'.format(os.path.basename(__file__)))
  40. print(' --help This message')
  41. print(' --verbose Enable verbose output')
  42. print(' --article-index=file Article index database output [articles.db]')
  43. print(' --article-offsets=file Article file offsets database output [offsets.db]')
  44. print(' --article-counts=file File to store the counts [counts.text]')
  45. print(' --language=<xx> Set language for index conversions [en]')
  46. print(' --limit=number Limit the number of articles processed')
  47. print(' --prefix=name Device file name portion for .fnd/.pfx [pedia]')
  48. print(' --templates=file Database for templates [templates.db]')
  49. print(' --truncate-title Set when not using language links to save space')
  50. exit(1)
  51. def main():
  52. global verbose
  53. global error_flag
  54. try:
  55. opts, args = getopt.getopt(sys.argv[1:], 'hvi:o:c:t:l:p:L:T',
  56. ['help', 'verbose',
  57. 'article-index=',
  58. 'article-offsets=',
  59. 'article-counts=',
  60. 'templates=',
  61. 'limit=',
  62. 'prefix=',
  63. 'language=',
  64. 'truncate-title',
  65. ])
  66. except getopt.GetoptError, err:
  67. usage(err)
  68. verbose = False
  69. art_name = "articles.db"
  70. off_name = "offsets.db"
  71. cnt_name = "counts.text"
  72. fnd_name = 'pedia.fnd'
  73. pfx_name = 'pedia.pfx'
  74. template_name = 'templates.db'
  75. limit = 'all'
  76. language = 'en' # some languages may require special processing
  77. truncate_title = False # set tru when not using language links
  78. for opt, arg in opts:
  79. if opt in ('-v', '--verbose'):
  80. verbose = True
  81. elif opt in ('-h', '--help'):
  82. usage(None)
  83. elif opt in ('-i', '--article-index'):
  84. art_name = arg
  85. elif opt in ('-o', '--article-offsets'):
  86. off_name = arg
  87. elif opt in ('-c', '--article-counts'):
  88. cnt_name = arg
  89. elif opt in ('-t', '--templates'):
  90. template_name = arg
  91. elif opt in ('-T', '--truncate-title'):
  92. truncate_title = True
  93. elif opt in ('-l', '--limit'):
  94. if arg[-1] == 'k':
  95. arg = arg[:-1] + '000'
  96. if arg != 'all':
  97. try:
  98. limit = int(arg)
  99. except ValueError:
  100. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  101. if limit <= 0:
  102. usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
  103. elif opt in ('-p', '--prefix'):
  104. fnd_name = arg + '.fnd'
  105. pfx_name = arg + '.pfx'
  106. elif opt in ('-L', '--language'):
  107. language = arg
  108. else:
  109. usage('unhandled option: ' + opt)
  110. if [] == args:
  111. usage('Missing argument(s)')
  112. language_convert = LanguageTranslation.LanguageNormal()
  113. if 'ja' == language:
  114. language_convert = LanguageTranslation.LanguageJapanese()
  115. processor = FileProcessing(articles = art_name, offsets = off_name,
  116. templates = template_name,
  117. language = language_convert)
  118. for f in args:
  119. limit = processor.process(f, limit)
  120. if limit != 'all' and limit <= 0:
  121. break
  122. # record initial counts
  123. a = processor.article_count
  124. r = processor.redirect_count
  125. # fix up redirects
  126. m = a + processor.resolve_redirects()
  127. # record combined count and display statistics
  128. s = a + r
  129. cf = open(cnt_name, 'w')
  130. for f in (sys.stdout, cf):
  131. f.write('Articles: {0:10d}\n'.format(a))
  132. f.write('Redirects: {0:10d}\n'.format(r))
  133. f.write('Sum: {0:10d}\n'.format(s))
  134. f.write('Merged: {0:10d}\n'.format(m))
  135. f.write('Difference: {0:10d}\n'.format(m - s))
  136. f.write('Restricted: {0:10d}\n'.format(processor.restricted_count))
  137. f.write('Templates: {0:10d}\n'.format(processor.template_count))
  138. f.write('rTemplates: {0:10d}\n'.format(processor.template_redirect_count))
  139. f.write('Characters: {0:10d}\n'.format(processor.total_character_count))
  140. cf.close()
  141. output_fnd(fnd_name, processor, language_convert, truncate_title)
  142. output_pfx(pfx_name)
  143. del processor
  144. # return non-zero status if there have been any errors
  145. if error_flag:
  146. PrintLog.message('*** ERROR in Index build')
  147. PrintLog.message('*** Currently "Duplicate Title" is the only condition that causes this error')
  148. PrintLog.message('*** Likely "license.xml" or "terms.xml" file duplicates a title in main wiki file')
  149. PrintLog.message('*** Manually edit "license.xml" or "terms.xml" file to change the title')
  150. sys.exit(1)
  151. def generate_bigram(text):
  152. """create bigram from pairs of characters"""
  153. global bigram
  154. if len(text) > 2:
  155. try:
  156. if SearchKey.is_valid_character(text[0]) and SearchKey.is_valid_character(text[1]):
  157. bigram[text[0:2]] += 1
  158. except KeyError:
  159. bigram[text[0:2]] = 1
  160. if len(text) > 4:
  161. try:
  162. if SearchKey.is_valid_character(text[2]) and SearchKey.is_valid_character(text[3]):
  163. bigram[text[2:4]] += 1
  164. except KeyError:
  165. bigram[text[2:4]] = 1
  166. class FileProcessing(FileScanner.FileScanner):
  167. def __init__(self, *args, **kw):
  168. super(FileProcessing, self).__init__(*args, **kw)
  169. self.language_processor = kw['language']
  170. self.article_db_name = kw['articles']
  171. self.article_import = self.article_db_name + '.import'
  172. self.offset_db_name = kw['offsets']
  173. self.offset_import = self.offset_db_name + '.import'
  174. self.file_import = self.offset_db_name + '.files'
  175. self.template_db_name = kw['templates']
  176. for filename in [self.article_db_name,
  177. self.article_import,
  178. self.offset_db_name,
  179. self.offset_import,
  180. self.template_db_name,
  181. self.file_import]:
  182. if os.path.exists(filename):
  183. os.remove(filename)
  184. self.restricted_count = 0
  185. self.redirect_count = 0
  186. self.article_count = 0
  187. self.template_count = 0
  188. self.template_redirect_count = 0
  189. self.all_titles = []
  190. self.translate = littleparser.LittleParser().translate
  191. self.redirects = {}
  192. self.articles = {}
  193. self.offsets = {}
  194. self.total_character_count = 0
  195. self.time = time.time()
  196. self.template_db = sqlite3.connect(self.template_db_name)
  197. self.template_db.execute('pragma synchronous = 0')
  198. self.template_db.execute('pragma temp_store = 2')
  199. self.template_db.execute('pragma read_uncommitted = true')
  200. self.template_db.execute('pragma cache_size = 20000000')
  201. self.template_db.execute('pragma default_cache_size = 20000000')
  202. self.template_db.execute('pragma journal_mode = off')
  203. self.template_db.execute('''
  204. create table templates (
  205. title varchar primary key,
  206. body varchar
  207. )
  208. ''')
  209. self.template_db.execute('''
  210. create table redirects (
  211. title varchar primary key,
  212. redirect varchar
  213. )
  214. ''')
  215. self.template_db.commit()
  216. self.template_cursor = self.template_db.cursor()
  217. def __del__(self):
  218. PrintLog.message(u'Flushing databases')
  219. self.template_db.commit()
  220. self.template_cursor.close()
  221. self.template_db.close()
  222. PrintLog.message(u'Writing: files')
  223. start_time = time.time()
  224. i = 0
  225. with open(self.file_import, 'w') as f:
  226. for filename in self.file_list:
  227. f.write('{0:d}\t{1:s}\n'.format(i, filename))
  228. i += 1
  229. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  230. PrintLog.message(u'Writing: articles')
  231. start_time = time.time()
  232. with open(self.article_import, 'w') as f:
  233. for title in self.articles:
  234. (article_number, fnd_offset, restricted, is_redirect) = self.articles[title]
  235. f.write('~' + title.encode('utf-8')) # force string
  236. f.write('\t{0:d}\t{1:d}\t{2:d}\t{3:d}\n'.format(article_number, fnd_offset, restricted, is_redirect))
  237. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  238. PrintLog.message(u'Writing: offsets')
  239. start_time = time.time()
  240. with open(self.offset_import, 'w') as f:
  241. for article_number in self.offsets:
  242. (file_id, title, seek, length, accumulated) = self.offsets[article_number]
  243. f.write('{0:d}\t{1:d}\t'.format(article_number, file_id))
  244. f.write('~' + title.encode('utf-8')) # force string
  245. f.write('\t{0:d}\t{1:d}\t{2:d}\n'.format(seek, length, accumulated))
  246. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  247. PrintLog.message(u'Loading: articles')
  248. start_time = time.time()
  249. p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.article_db_name, shell=True, stdin=subprocess.PIPE)
  250. p.stdin.write("""
  251. create table articles (
  252. title varchar primary key,
  253. article_number integer,
  254. fnd_offset integer,
  255. restricted integer,
  256. is_redirect integer
  257. );
  258. pragma synchronous = 0;
  259. pragma temp_store = 2;
  260. pragma locking_mode = exclusive;
  261. pragma cache_size = 20000000;
  262. pragma default_cache_size = 20000000;
  263. pragma journal_mode = memory;
  264. .mode tabs
  265. .import {0:s} articles
  266. .exit
  267. """.format(self.article_import))
  268. p.stdin.close()
  269. p.wait()
  270. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  271. PrintLog.message(u'Loading: offsets and files')
  272. start_time = time.time()
  273. p = subprocess.Popen('sqlite3 > /dev/null 2>&1 ' + self.offset_db_name, shell=True, stdin=subprocess.PIPE)
  274. p.stdin.write("""
  275. create table offsets (
  276. article_number integer primary key,
  277. file_id integer,
  278. title varchar,
  279. seek integer,
  280. length integer,
  281. accumulated integer
  282. );
  283. create table files (
  284. file_id integer primary key,
  285. filename varchar
  286. );
  287. pragma synchronous = 0;
  288. pragma temp_store = 2;
  289. pragma locking_mode = exclusive;
  290. pragma cache_size = 20000000;
  291. pragma default_cache_size = 20000000;
  292. pragma journal_mode = memory;
  293. .mode tabs
  294. .import {0:s} offsets
  295. .import {1:s} files
  296. .exit
  297. """.format(self.offset_import, self.file_import))
  298. p.stdin.close()
  299. p.wait()
  300. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  301. def title(self, category, key, title, seek):
  302. global verbose
  303. global enable_templates
  304. if self.KEY_ARTICLE == key:
  305. return True
  306. if enable_templates and self.KEY_TEMPLATE == key:
  307. if verbose:
  308. PrintLog.message(u'Template Title: {0:s}'.format(unicode(title, 'utf-8')))
  309. return True
  310. return False
  311. def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
  312. global verbose
  313. title = self.translate(title).strip(u'\u200e\u200f')
  314. rtitle = self.translate(rtitle).strip().strip(u'\u200e\u200f')
  315. # redirected title may contain '%xx' items - treat as unicode sequence
  316. # if it fails just keep the %xx sequences intact since it must represent
  317. # either real %xx or some unknowable coding scheme
  318. try:
  319. rtitle = unicode(urllib.unquote(rtitle.encode('utf-8')), 'utf-8').strip().strip(u'\u200e\u200f')
  320. except UnicodeDecodeError:
  321. pass
  322. rtitle = SearchKey.compact_spaces(rtitle).lstrip(':').strip()
  323. if self.KEY_TEMPLATE == key:
  324. if title != rtitle:
  325. title = unicode(category, 'utf-8') + ':' + title.lower()
  326. rtitle = unicode(rcategory, 'utf-8') + ':' + rtitle.lower()
  327. self.template_cursor.execute(u'insert or replace into redirects (title, redirect) values(?, ?)',
  328. [u'~{0:d}~{1:s}'.format(self.file_id(), title),
  329. u'~{0:d}~{1:s}'.format(self.file_id(), rtitle)])
  330. self.template_redirect_count += 1
  331. return
  332. if self.KEY_ARTICLE != key or self.KEY_ARTICLE != rkey:
  333. if verbose:
  334. PrintLog.message(u'Non-article Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
  335. .format(category, key, title, rcategory, rkey, rtitle))
  336. return
  337. if '' == rtitle:
  338. PrintLog.message(u'Empty Redirect for: {0:s}[{1:d}]:{2:s}'.format(category, key, title))
  339. else:
  340. self.redirects[title] = rtitle
  341. self.redirect_count += 1
  342. generate_bigram(self.language_processor.translate(title))
  343. if verbose:
  344. PrintLog.message(u'Redirect: {0:s}[{1:d}]:{2:s} -> {3:s}[{4:d}]:{5:s}'
  345. .format(category, key, title, rcategory, rkey, rtitle))
  346. def body(self, category, key, title, text, seek):
  347. global verbose
  348. global error_flag
  349. title = self.translate(title).strip(u'\u200e\u200f')
  350. if self.KEY_TEMPLATE == key:
  351. t1 = unicode(category, 'utf-8') + ':' + title.lower()
  352. t_body = TidyUp.template(text)
  353. self.template_cursor.execute(u'insert or replace into templates (title, body) values(?, ?)',
  354. [u'~{0:d}~{1:s}'.format(self.file_id(), t1), u'~' + t_body])
  355. self.template_count += 1
  356. return
  357. restricted = FilterWords.is_restricted(title) or FilterWords.is_restricted(text)
  358. self.article_count += 1
  359. # do closer inspection to see if realy restricted
  360. if restricted:
  361. (restricted, bad_words) = FilterWords.find_restricted(text)
  362. if restricted:
  363. self.restricted_count += 1
  364. if not verbose and self.article_count % 10000 == 0:
  365. start_time = time.time()
  366. PrintLog.message(u'Index: {0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
  367. self.time = start_time
  368. generate_bigram(self.language_processor.translate(title))
  369. if verbose:
  370. if restricted:
  371. PrintLog.message(u'Restricted Title: {0:s}'.format(title))
  372. PrintLog.message(u' --> {0:s}'.format(bad_words))
  373. else:
  374. PrintLog.message(u'Title: {0:s}'.format(title))
  375. character_count = len(text)
  376. self.total_character_count += character_count
  377. self.offsets[self.article_count] = (self.file_id(), title, seek, character_count, self.total_character_count)
  378. if self.set_index(title, (self.article_count, -1, restricted, False)): # -1 == place holder
  379. PrintLog.message(u'ERROR: Duplicate Title: {0:s}'.format(title))
  380. error_flag = True
  381. def resolve_redirects(self):
  382. """add redirect to article_index"""
  383. count = 0
  384. for item in self.redirects:
  385. try:
  386. self.set_index(item, self.find(item)[:3] + (True,))
  387. count += 1
  388. except KeyError:
  389. PrintLog.message(u'Unresolved redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
  390. except CycleError:
  391. PrintLog.message(u'Cyclic redirect: {0:s} -> {1:s}'.format(item, self.redirects[item]))
  392. return count
  393. def set_index(self, title, data):
  394. """returns false if the key did not already exist"""
  395. if type(title) == str:
  396. title = unicode(title, 'utf-8')
  397. result = title in self.articles
  398. self.articles[title] = data
  399. return result
  400. def get_index(self, title):
  401. if type(title) == str:
  402. title = unicode(title, 'utf-8')
  403. return self.articles[title]
  404. def all_indices(self):
  405. return self.articles.keys()
  406. def find(self, title, level = 0):
  407. """get index from article title
  408. also handles redirects
  409. returns: [index, fnd]
  410. """
  411. if '' == title:
  412. raise CycleError('Empty title detected')
  413. if level > 10:
  414. raise CycleError('Redirect cycle: ' + title)
  415. try:
  416. title = self.redirects[title]
  417. except KeyError:
  418. title = self.redirects[title[0].swapcase() + title[1:]]
  419. try:
  420. result = self.get_index(title)
  421. except KeyError:
  422. try:
  423. result = self.get_index(title[0].swapcase() + title[1:])
  424. except:
  425. result = self.find(title, level + 1)
  426. return result
  427. def bigram_encode(title):
  428. """encode a title in bigram form"""
  429. global bigram
  430. result = ''
  431. title = SearchKey.strip_accents(title)
  432. while len(title) >= 2:
  433. if SearchKey.is_valid_character(title[0]):
  434. b = title[0:2]
  435. if b in bigram:
  436. result += bigram[b]
  437. title = title[2:]
  438. else:
  439. result += chr(ord(title[0:1]))
  440. title = title[1:]
  441. else:
  442. #result += '?'
  443. title = title[1:]
  444. if len(title) == 1:
  445. if SearchKey.is_valid_character(title[0]):
  446. result += chr(ord(title[0]))
  447. #else:
  448. # result += '?'
  449. return SearchKey.compact_spaces(result)
  450. def output_fnd(filename, article_index, language_processor, truncate_title):
  451. """create bigram table"""
  452. global bigram
  453. global index_matrix
  454. global MAXIMUM_TITLE_LENGTH
  455. global MAXIMUM_TITLE_ACTUAL
  456. PrintLog.message(u'Writing bigrams: {0:s}'.format(filename))
  457. start_time = time.time()
  458. out_f = open(filename, 'wb')
  459. sortedgram = [ (value, key) for key, value in bigram.iteritems() ]
  460. sortedgram.sort()
  461. sortedgram.reverse()
  462. bigram = {}
  463. i = 0
  464. for k, v in sortedgram:
  465. out_f.write(v)
  466. bigram[v] = chr(i + 128)
  467. i += 1
  468. if i >= 128:
  469. break
  470. while i < 128:
  471. out_f.write('zz')
  472. bigram['zz'] = chr(i + 128)
  473. i += 1
  474. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  475. # create pfx matrix and write encoded titles
  476. #article_list = [strip_accents(k) for k in article_index.keys()]
  477. #article_list.sort(key = lambda x: strip_accents(x).lower())
  478. PrintLog.message(u'Sorting titles')
  479. start_time = time.time()
  480. article_list = [ (SearchKey.make_key(language_processor.translate(title)), title)
  481. for title in article_index.all_indices() ]
  482. article_list.sort()
  483. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  484. PrintLog.message(u'Writing matrix: {0:s}'.format(filename))
  485. start_time = time.time()
  486. index_matrix = {}
  487. index_matrix['\0\0\0'] = out_f.tell()
  488. previous_bigram_title = ''
  489. previous_utf8_title = ''
  490. mod_counter = 0
  491. for stripped_title, title in article_list:
  492. bigram_title = bigram_encode(stripped_title)[:MAXIMUM_TITLE_LENGTH]
  493. (article_number, dummy, restricted, is_redirect) = article_index.get_index(title)
  494. if '' == bigram_title and is_redirect:
  495. continue
  496. utf8_title = title.encode('utf-8')
  497. if truncate_title:
  498. utf8_title = utf8_title[:MAXIMUM_TITLE_LENGTH]
  499. else:
  500. utf8_title = utf8_title[:MAXIMUM_TITLE_ACTUAL]
  501. offset = out_f.tell()
  502. article_index.set_index(title, (article_number, offset, restricted, is_redirect))
  503. key3 = (stripped_title[0:3] + '\0\0\0')[0:3].lower()
  504. key2 = key3[0:2] + '\0'
  505. key1 = key3[0:1] + '\0\0'
  506. if key1 not in index_matrix:
  507. index_matrix[key1] = offset
  508. if key2 not in index_matrix:
  509. index_matrix[key2] = offset
  510. if key3 not in index_matrix:
  511. index_matrix[key3] = offset
  512. if 0 == mod_counter & 0x0f:
  513. bigram_common_length = 0
  514. utf8_common_length = 0
  515. else:
  516. bigram_common_length = common_prefix_length(previous_bigram_title, bigram_title)
  517. utf8_common_length = common_prefix_length(previous_utf8_title, utf8_title)
  518. mod_counter += 1
  519. previous_bigram_title = bigram_title
  520. previous_utf8_title = utf8_title
  521. if bigram_common_length > 1:
  522. bigram_title = chr(bigram_common_length - 1) + bigram_title[bigram_common_length:]
  523. if utf8_common_length > 1:
  524. utf8_title = chr(utf8_common_length - 1) + utf8_title[utf8_common_length:]
  525. out_f.write(struct.pack('<I', article_number) + '\0' + bigram_title + '\0' + utf8_title + '\0')
  526. out_f.close()
  527. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  528. def common_prefix_length(s1, s2, max = 32):
  529. l1 = len(s1)
  530. l2 = len(s2)
  531. if 0 == l1 or 0 == l2 or s1[0] != s2[0]:
  532. return 0
  533. size = min(l1, l2, max)
  534. for i in range(1, size):
  535. if s1[i] != s2[i]:
  536. return i
  537. return size
  538. def output_pfx(filename):
  539. """output the pfx matrix"""
  540. global index_matrix
  541. PrintLog.message(u'Writing: {0:s}'.format(filename))
  542. start_time = time.time()
  543. out_f = open(filename, 'wb')
  544. list = '\0' + SearchKey.all_characters()
  545. for k1 in list:
  546. for k2 in list:
  547. for k3 in list:
  548. key = k1+k2+k3
  549. if key in index_matrix:
  550. offset = index_matrix[key]
  551. else:
  552. offset = 0
  553. out_f.write(struct.pack('<I', offset))
  554. out_f.close()
  555. PrintLog.message(u'Time: {0:7.1f}s'.format(time.time() - start_time))
  556. # run the program
  557. if __name__ == "__main__":
  558. main()