ArticleRenderer.py 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Article Rendering
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import sys, os, struct, os.path, re
  9. import io
  10. import time
  11. import HTMLParser
  12. import unicodedata
  13. import htmlentitydefs
  14. import codecs
  15. import getopt
  16. import os.path
  17. import sqlite3
  18. import WordWrap
  19. import bucket
  20. import PrintLog
  21. import LanguageTranslation
  22. try:
  23. import gd
  24. except:
  25. print 'error: Missing python module: python-gd'
  26. print ' sudo apt-get install python-gd'
  27. exit(1)
  28. # try to find a lzma library interface
  29. no_compression = True
  30. # python-lzma
  31. if no_compression:
  32. try:
  33. import lzma
  34. def CompressData(data):
  35. c = lzma.compress(data, options={'format': 'alone'})
  36. # header: options(1) dictionary-size(4) uncompressed-length(8) = 13 bytes
  37. return c[:5] + c[13:] # drop the uncompressed length (always 0xffff_ffff_ffff_ffff)
  38. no_compression = False
  39. except:
  40. pass
  41. # PyLZMA
  42. if no_compression:
  43. try:
  44. import pylzma
  45. def CompressData(data):
  46. return pylzma.compress(data,
  47. dictionary = 24, fastBytes = 32,
  48. literalContextBits = 3,
  49. literalPosBits = 0, posBits = 2,
  50. algorithm = 1, eos = 1)
  51. no_compression = False
  52. except:
  53. pass
  54. # none detected
  55. if no_compression:
  56. print 'error: Missing python LZMA compression module'
  57. print 'alternative 1: (preferred)'
  58. print ' sudo apt-get install python-lzma'
  59. print 'alternative 2:'
  60. print ' sudo apt-get install python-pylzma'
  61. print 'alternative 3: compile/install local PyLZMA'
  62. print ' make local-pylzma-install'
  63. exit(1)
# run-time switches; main() sets these from the command line
verbose = False
warnings = False

# value reported in main()'s final "Total" message and used (+1) in warnings
article_count = 0

# NASTY HACK: allow this </div class="something">
# (replaces the parser's end-tag pattern so attributes in an end tag are accepted)
HTMLParser.endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*[^>]*>')

# from: wiki-app/bmf.h
FONT_BMF_HEADER = '<4bI'   # struct font_bmf_header (header)
CHARMETRIC_BMF = '<8b48s'  # struct charmetric_bmf (font)

FONT_BMF_HEADER_SIZE = struct.calcsize(FONT_BMF_HEADER)
CHARMETRIC_BMF_SIZE = struct.calcsize(CHARMETRIC_BMF)

# font face defines - match the #defines of the same name in: wiki-app/lcd_buf_draw.h
ITALIC_FONT_IDX = 1
DEFAULT_FONT_IDX = 2
TITLE_FONT_IDX = 3
SUBTITLE_FONT_IDX = 4
DEFAULT_ALL_FONT_IDX = 5
TITLE_ALL_FONT_IDX = 6
SUBTITLE_ALL_FONT_IDX = 7

# printable name for each font face index
FONT_FACE_NAME = {
    ITALIC_FONT_IDX: 'Italic',
    DEFAULT_FONT_IDX: 'Default',
    TITLE_FONT_IDX: 'Title',
    SUBTITLE_FONT_IDX: 'Subtitle',
    DEFAULT_ALL_FONT_IDX: 'Default All',
    TITLE_ALL_FONT_IDX: 'Title All',
    SUBTITLE_ALL_FONT_IDX: 'Subtitle All',
}

# Screen dimensions
LCD_WIDTH = 240
LCD_LEFT_MARGIN = 6  # def. in lcd_buf_draw.h
LCD_IMG_MARGIN = 8

# Line Spaces (read directly from the font using gdbfed)
H1_LSPACE = 19
H2_LSPACE = 17
H3_LSPACE = H2_LSPACE
H4_LSPACE = H2_LSPACE
H5_LSPACE = H2_LSPACE
H6_LSPACE = H2_LSPACE
P_LSPACE = 15

# Margins & Spacing
LIST_INDENT = 16
DIV_MARGIN_TOP = 10
P_MARGIN_TOP = DIV_MARGIN_TOP
BLOCKQUOTE_MARGIN_TOP = DIV_MARGIN_TOP
BLOCKQUOTE_MARGIN_LEFT = LIST_INDENT
BLOCKQUOTE_MARGIN_RIGHT = LIST_INDENT
LIST_MARGIN_TOP = DIV_MARGIN_TOP
BR_MARGIN_TOP = DIV_MARGIN_TOP

# usable line width: the screen width less the left margin
DEFAULT_LWIDTH = (LCD_WIDTH-LCD_LEFT_MARGIN)

H1_MARGIN_TOP = 8
H1_MARGIN_BOTTOM = P_MARGIN_TOP
H2_MARGIN_TOP = 14
H3_MARGIN_TOP = H2_MARGIN_TOP
H4_MARGIN_TOP = H2_MARGIN_TOP
H5_MARGIN_TOP = H2_MARGIN_TOP
H6_MARGIN_TOP = H2_MARGIN_TOP

# list levels deeper than this get no further indentation
LIMAX_INDENT_LEVELS = 3
MAX_QUOTE_LEVEL = 1

# bullet[0] character is not used (the '!'); indexed by list nesting level
bullet_c = u"!\u25aa\u2022\u25ab"
LIMAX_BULLETS = len(bullet_c) - 1

# font face index -> open .bmf font file; filled in by main()
font_id_values = {}

# per-article rendering state, reset by WrProcess.local_init()
g_starty = 0                        # running y position, advanced by the esc_code*() writers
g_curr_face = DEFAULT_FONT_IDX      # font face selected by the last font-changing escape
g_halign = 0                        # last value written by esc_code9()
g_this_article_title = 'NO TITLE'
g_links = {}                        # link number -> (x0, y0, x1, y1, url), see make_link()
g_link_cnt = 0                      # next free key in g_links

# output state; main() assigns these
i_out = None
f_out = None
file_number = 0
article_db = None
output = None           # buffer that the esc_code*() writers append to
compress = True
article_writer = None
  139. def usage(message):
  140. if None != message:
  141. print('error: {0:s}'.format(message))
  142. print('usage: {0:s} <options> html-files...'.format(os.path.basename(__file__)))
  143. print(' --help This message')
  144. print(' --verbose Enable verbose output')
  145. print(' --warnings Enable warnings output')
  146. print(' --number=n Number for the .dat/.idx-tmp files [0]')
  147. print(' --test=file Output the uncompressed file for testing')
  148. print(' --font-path=dir Path to font files (*.bmf) [fonts]')
  149. print(' --article-index=file Article index dictionary input [articles.db]')
  150. print(' --prefix=name Device file name portion for .dat/.idx-tmp [pedia]')
  151. print(' --languages-links=<YN> Turn on/off inter-wiki links [YES]')
  152. print(' --images=<YN> Turn on/off in-line math images [YES]')
  153. print(' --articles=<N> Articles per block [32]')
  154. print(' --block-size=<bytes> Max size for artical block [262144]')
  155. exit(1)
  156. def main():
  157. global verbose, warnings, compress
  158. global f_out, output, i_out
  159. global font_id_values
  160. global file_number
  161. global article_count
  162. global article_db
  163. global start_time
  164. global article_writer
  165. try:
  166. opts, args = getopt.getopt(sys.argv[1:],
  167. 'hvwn:p:i:t:f:l:a:b:',
  168. ['help',
  169. 'verbose',
  170. 'warnings',
  171. 'number=',
  172. 'prefix=',
  173. 'article-index=',
  174. 'test=',
  175. 'font-path=',
  176. 'language-links=',
  177. 'images=',
  178. 'articles=',
  179. 'block-size=',
  180. ])
  181. except getopt.GetoptError, err:
  182. usage(err)
  183. verbose = False
  184. warnings = False
  185. data_file = 'pedia{0:d}.dat'
  186. index_file = 'pedia{0:d}.idx-tmp'
  187. art_file = 'articles.db'
  188. file_number = 0
  189. test_file = ''
  190. font_path = "../fonts"
  191. article_db = None
  192. inter_links = True
  193. enable_images = True
  194. articles_per_block = 32
  195. block_size = 262144
  196. for opt, arg in opts:
  197. if opt in ('-v', '--verbose'):
  198. verbose = True
  199. elif opt in ('-w', '--warnings'):
  200. warnings = True
  201. elif opt in ('-h', '--help'):
  202. usage(None)
  203. elif opt in ('-t', '--test'):
  204. test_file = arg
  205. elif opt in ('-i', '--article-index'):
  206. art_file = arg
  207. elif opt in ('-n', '--number'):
  208. try:
  209. file_number = int(arg)
  210. except ValueError:
  211. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  212. elif opt in ('-p', '--prefix'):
  213. data_file = arg + '{0:d}.dat'
  214. index_file = arg + '{0:d}.idx-tmp'
  215. elif opt in ('-f', '--font-path'):
  216. font_path = arg
  217. elif opt in ('-l', '--language-links'):
  218. arg = arg.lower()
  219. inter_links = ('yes' == arg)
  220. elif opt in ('-l', '--images'):
  221. arg = arg.lower()
  222. enable_images = ('yes' == arg)
  223. elif opt in ('-a', '--articles'):
  224. try:
  225. articles_per_block = int(arg)
  226. except ValueError:
  227. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  228. if articles_per_block < 1 or articles_per_block > 64:
  229. usage('"{0:s}={1:s}" is out of range [1..64]'.format(opt, arg))
  230. elif opt in ('-b', '--block-size'):
  231. try:
  232. block_size = int(arg)
  233. except ValueError:
  234. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  235. if block_size < 65536 or block_size > 524288:
  236. usage('"{0:s}={1:s}" is out of range [65536..524288]'.format(opt, arg))
  237. else:
  238. usage('unhandled option: ' + opt)
  239. start_time = time.time()
  240. f_fontr = open(os.path.join(font_path, "text.bmf"), "rb")
  241. f_fonti = open(os.path.join(font_path, "texti.bmf"), "rb")
  242. f_fontt = open(os.path.join(font_path, "title.bmf"), "rb")
  243. f_fontst = open(os.path.join(font_path, "subtitle.bmf"), "rb")
  244. f_font_all = open(os.path.join(font_path, "textall.bmf"), "rb")
  245. f_fontt_all = open(os.path.join(font_path, "titleall.bmf"), "rb")
  246. f_fontst_all = open(os.path.join(font_path, "subtlall.bmf"), "rb")
  247. font_id_values = {
  248. ITALIC_FONT_IDX: f_fonti,
  249. DEFAULT_FONT_IDX: f_fontr,
  250. TITLE_FONT_IDX: f_fontt,
  251. TITLE_ALL_FONT_IDX: f_fontt_all,
  252. SUBTITLE_FONT_IDX: f_fontst,
  253. SUBTITLE_ALL_FONT_IDX: f_fontst_all,
  254. DEFAULT_ALL_FONT_IDX: f_font_all
  255. }
  256. article_db = sqlite3.connect(art_file)
  257. article_db.execute('pragma auto_vacuum = none')
  258. article_db.execute('pragma synchronous = off')
  259. article_db.execute('pragma temp_store = memory')
  260. article_db.execute('pragma locking_mode = normal')
  261. article_db.execute('pragma read_uncommitted = true')
  262. article_db.execute('pragma cache_size = 20000000')
  263. article_db.execute('pragma default_cache_size = 20000000')
  264. article_db.execute('pragma journal_mode = off')
  265. output = io.BytesIO('')
  266. if test_file == '':
  267. compress = True
  268. i_out = open(index_file.format(file_number), 'wb')
  269. f_out = open(data_file.format(file_number), 'wb')
  270. article_writer = ArticleWriter(file_number, f_out, i_out,
  271. max_buckets = 50,
  272. bucket_size = block_size,
  273. max_items_per_bucket = articles_per_block)
  274. else:
  275. compress = False
  276. f_out = open(test_file, 'wb')
  277. for name in args:
  278. f = codecs.open(name, 'r', 'utf-8', 'replace')
  279. WrProcess(f, inter_links, enable_images)
  280. f.close()
  281. for item in font_id_values:
  282. font_id_values[item].close()
  283. if output != None:
  284. output.close()
  285. if article_writer != None:
  286. del article_writer
  287. if f_out != None:
  288. f_out.close()
  289. if i_out != None:
  290. i_out.close()
  291. if article_db != None:
  292. article_db.close()
  293. for i in font_id_values:
  294. font_id_values[i].close()
  295. # final message
  296. PrintLog.message("Render[{0:d}]: Total: {1:d}".format(file_number, article_count))
  297. #
  298. # cached font information
  299. #
  300. font_width_cache = {}
  301. font_default_cache = {}
  302. def get_utf8_cwidth(c, face):
  303. global font_width_cache
  304. global font_default_cache
  305. global font_id_values
  306. global FONT_BMF_HEADER
  307. global CHARMETRIC_BMF
  308. global FONT_BMF_HEADER_SIZE
  309. global CHARMETRIC_BMF_SIZE
  310. global FONT_FACE_NAME
  311. if type(c) != unicode:
  312. c = unicode(c, 'utf-8')
  313. if (c, face) in font_width_cache:
  314. return font_width_cache[(c, face)]
  315. font_file = font_id_values[face]
  316. if face not in font_default_cache:
  317. font_file.seek(0)
  318. buffer = font_file.read(FONT_BMF_HEADER_SIZE)
  319. if len(buffer) != 0:
  320. linespace, ascent, descent, bmp_buffer_len, default_char = struct.unpack(FONT_BMF_HEADER, buffer)
  321. else:
  322. linespace, ascent, descent, bmp_buffer_len, default_char = (0, 0, 0, 0, ord(u' '))
  323. font_default_cache[face] = unichr(default_char)
  324. font_file.seek(ord(c) * CHARMETRIC_BMF_SIZE + FONT_BMF_HEADER_SIZE)
  325. buffer = font_file.read(CHARMETRIC_BMF_SIZE)
  326. if len(buffer) != 0:
  327. width, height, widthBytes, widthBits, ascent, descent, LSBearing, widthDevice, bitmap = struct.unpack(CHARMETRIC_BMF, buffer)
  328. else:
  329. width, height, widthBytes, widthBits, ascent, descent, LSBearing, widthDevice, bitmap = (0,0,0,0,0,0,0,0,
  330. r'\x55' * 48)
  331. character_width = widthDevice
  332. if 0 == character_width:
  333. if TITLE_FONT_IDX == face:
  334. character_width = get_utf8_cwidth(c, TITLE_ALL_FONT_IDX)
  335. elif SUBTITLE_FONT_IDX == face:
  336. character_width = get_utf8_cwidth(c, SUBTITLE_ALL_FONT_IDX)
  337. elif face in [TITLE_ALL_FONT_IDX, SUBTITLE_ALL_FONT_IDX, DEFAULT_ALL_FONT_IDX]:
  338. character_width = get_utf8_cwidth(font_default_cache[face], face)
  339. else:
  340. character_width = get_utf8_cwidth(c, DEFAULT_ALL_FONT_IDX)
  341. font_width_cache[(c, face)] = character_width
  342. return character_width
  343. def get_lineheight(face):
  344. values = {
  345. ITALIC_FONT_IDX: P_LSPACE,
  346. DEFAULT_FONT_IDX: P_LSPACE,
  347. TITLE_FONT_IDX: H1_LSPACE,
  348. TITLE_ALL_FONT_IDX: H1_LSPACE,
  349. SUBTITLE_FONT_IDX: H2_LSPACE,
  350. SUBTITLE_ALL_FONT_IDX: H2_LSPACE,
  351. DEFAULT_ALL_FONT_IDX: P_LSPACE
  352. }
  353. return values[face]
  354. def make_link(url, x0, x1, text):
  355. global g_starty, g_curr_face, g_link_cnt, g_links
  356. if article_index(url):
  357. esc_code10(x1 - x0)
  358. g_links[g_link_cnt] = (x0, g_starty - get_lineheight(g_curr_face), x1, g_starty, url)
  359. g_link_cnt = g_link_cnt + 1
  360. def get_imgdata(imgfile, indent):
  361. try:
  362. img = gd.image(imgfile)
  363. except IOError, e:
  364. PrintLog.message(u'unable to open image file: {0:s}'.format(imgfile))
  365. return (0, 0, r'')
  366. (width, height) = img.size()
  367. if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
  368. is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((x, y)))
  369. h_range = range(0, width)
  370. v_range = range(0, height)
  371. elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
  372. is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((y, x)))
  373. v_range = range(0, width)
  374. h_range = range(height - 1, -1, -1)
  375. (width, height) = (height, width)
  376. else:
  377. PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
  378. return (0, 0, r'')
  379. data = ''
  380. for v in v_range:
  381. byte = 0
  382. bit_count = 8
  383. for h in h_range:
  384. if is_black(h, v):
  385. pixel = 1
  386. else:
  387. pixel = 0
  388. bit_count -= 1
  389. byte |= pixel << bit_count
  390. if 0 == bit_count:
  391. data += struct.pack('<B', byte)
  392. byte = 0
  393. bit_count = 8
  394. if 8 != bit_count:
  395. data += struct.pack('<B', byte)
  396. return (width, height, data)
  397. def esc_code0(num_pixels):
  398. """blank line height in pixels"""
  399. global g_starty
  400. global output
  401. output.write(struct.pack('<BB', 1, num_pixels))
  402. g_starty += num_pixels
  403. def esc_code1():
  404. """new line with default font and default line space"""
  405. global g_starty, g_curr_face
  406. global output
  407. output.write(struct.pack('<B', 2))
  408. g_starty += get_lineheight(DEFAULT_FONT_IDX)
  409. g_curr_face = DEFAULT_FONT_IDX
  410. def esc_code2():
  411. """new line with current font and current line space"""
  412. global g_starty, g_curr_face
  413. global output
  414. output.write(struct.pack('<B', 3))
  415. g_starty += get_lineheight(g_curr_face)
  416. def esc_code3(face):
  417. """new line using new font face."""
  418. global g_starty, g_curr_face
  419. global output
  420. num_pixels = get_lineheight(face)
  421. output.write(struct.pack('<BB', 4, face|(num_pixels<<3)))
  422. g_starty += num_pixels
  423. g_curr_face = face
  424. def esc_code4(face, halign=0):
  425. """change font with current horizontal alignment (in pixels)"""
  426. global g_curr_face
  427. global output
  428. output.write(struct.pack('<BB', 5, face|(halign<<3)))
  429. g_curr_face = face
  430. def esc_code5():
  431. """set font as default"""
  432. global g_curr_face
  433. global output
  434. output.write(struct.pack('<B', 6))
  435. g_curr_face = DEFAULT_FONT_IDX
  436. def esc_code6():
  437. """set default alignment"""
  438. global output
  439. output.write(struct.pack('<B', 7))
  440. def esc_code7(num_pixels):
  441. """move right num_pixels"""
  442. global output
  443. output.write(struct.pack('<BB', 8, num_pixels))
  444. def esc_code8(num_pixels):
  445. """move left num_pixels"""
  446. global output
  447. output.write(struct.pack('<BB', 9, num_pixels))
  448. def esc_code9(num_pixels):
  449. """alignment adjustment"""
  450. global g_halign
  451. global output
  452. output.write(struct.pack('<Bb', 10, num_pixels))
  453. g_halign = num_pixels
  454. def esc_code10(num_pixels):
  455. """draw line from right to left"""
  456. global output
  457. output.write(struct.pack('<BB', 11, num_pixels))
  458. def esc_code14(width, height, data):
  459. """output bitmap"""
  460. global g_starty, g_curr_face
  461. global output
  462. if 0 == width or 0 == height:
  463. return
  464. output.write(struct.pack('<BBH', 15, width, height))
  465. output.write(data)
  466. lineh = get_lineheight(g_curr_face)
  467. if (height) > lineh:
  468. g_starty += (height)-lineh + 3 # since Eric draws images 3px lower for alignment
  469. #
  470. # Parse the HTML into the WikiReader's format
  471. #
  472. class WrProcess(HTMLParser.HTMLParser):
  473. READ_BLOCK_SIZE = 64 * (1024 * 1024)
  474. def __init__ (self, f, inter_links = True, enable_images = True):
  475. global g_this_article_title, article_count
  476. HTMLParser.HTMLParser.__init__(self)
  477. self.wordwrap = WordWrap.WordWrap(get_utf8_cwidth)
  478. self.local_init()
  479. self.tag_stack = []
  480. self.inter_links = inter_links
  481. self.enable_images = enable_images
  482. self.bucket = bucket.Bucket()
  483. block = f.read(self.READ_BLOCK_SIZE)
  484. while block:
  485. self.feed(block)
  486. block = f.read(self.READ_BLOCK_SIZE)
  487. def local_init(self):
  488. global g_starty, g_curr_face, g_halign
  489. global g_this_article_title, g_links, g_link_cnt
  490. self.in_html = False
  491. self.in_title = False
  492. self.in_body = False
  493. self.in_h1 = False
  494. self.in_h2 = False
  495. self.in_h3 = False
  496. self.in_h4 = False
  497. self.in_h5 = False
  498. self.in_h6 = False
  499. self.in_table = 0
  500. self.in_p = False
  501. self.in_b = False
  502. self.in_big = False
  503. self.in_strong = False
  504. self.in_del = False
  505. self.in_ins = False
  506. self.in_i = False
  507. self.in_a = False
  508. self.in_br = False
  509. self.in_img = False
  510. self.quote = 0
  511. self.level = 0
  512. self.lwidth = DEFAULT_LWIDTH
  513. self.indent = 0
  514. self.li_cnt = {}
  515. self.li_inside = {}
  516. self.li_type = {}
  517. self.link_x = 0
  518. self.link_y = 0
  519. self.url = None
  520. self.language_links = []
  521. self.printing = True
  522. g_starty = 0
  523. g_curr_face = DEFAULT_FONT_IDX
  524. g_halign = 0
  525. g_this_article_title = 'NO TITLE'
  526. g_links = {}
  527. g_link_cnt = 0
  528. def handle_starttag(self, tag, attrs):
  529. global g_starty, g_curr_face, g_halign
  530. global g_this_article_title, g_links, g_link_cnt
  531. global warnings
  532. attrs = dict(attrs)
  533. # must always do the <html> tag
  534. if tag == 'html':
  535. self.local_init()
  536. self.in_html = True
  537. self.tag_stack = [(tag, True)]
  538. return
  539. self.tag_stack.append((tag, self.printing))
  540. # we want to skip content that isn't for printing
  541. if 'class' in attrs:
  542. if 'noprint' in attrs['class']:
  543. self.printing = False
  544. # create a list of language links
  545. if self.inter_links and tag == 'a' and 'lang-link' in attrs['class']:
  546. self.language_links.append(attrs['href'])
  547. # handle the tags
  548. if not self.printing:
  549. return;
  550. elif tag == 'script':
  551. self.printing = False
  552. elif tag == 'title':
  553. self.in_title = True
  554. g_this_article_title = ''
  555. elif tag == 'body':
  556. self.in_body = True
  557. elif tag == 'table':
  558. self.in_table += 1
  559. # if in a table suppress everything after this point
  560. if self.in_table > 0:
  561. return
  562. elif tag == 'h1':
  563. self.flush_buffer()
  564. self.in_h1 = True
  565. esc_code0(H1_MARGIN_TOP)
  566. elif tag == 'h2':
  567. self.flush_buffer()
  568. self.in_h2 = True
  569. esc_code0(H2_MARGIN_TOP)
  570. elif tag == 'h3':
  571. self.flush_buffer()
  572. self.in_h3 = True
  573. esc_code0(H3_MARGIN_TOP)
  574. elif tag == 'h4':
  575. self.flush_buffer()
  576. self.in_h4 = True
  577. esc_code0(H4_MARGIN_TOP)
  578. elif tag == 'h5':
  579. self.flush_buffer()
  580. self.in_h5 = True
  581. esc_code0(H5_MARGIN_TOP)
  582. elif tag == 'h6':
  583. self.flush_buffer()
  584. self.in_h6 = True
  585. esc_code0(H6_MARGIN_TOP)
  586. elif tag == 'div':
  587. self.flush_buffer()
  588. # suppress thumb info boxes
  589. if 'class' in attrs:
  590. c = attrs['class']
  591. if 'thumb' in c or 'left' in c or 'right' in c \
  592. or 'dablink' in c or 'magnify' in c:
  593. self.printing = False
  594. return
  595. esc_code0(DIV_MARGIN_TOP)
  596. elif tag == 'p':
  597. self.flush_buffer()
  598. self.in_p = True
  599. esc_code0(P_MARGIN_TOP)
  600. elif tag == 'blockquote':
  601. self.flush_buffer()
  602. self.quote += 1
  603. if self.quote < MAX_QUOTE_LEVEL:
  604. esc_code0(BLOCKQUOTE_MARGIN_TOP)
  605. self.indent += BLOCKQUOTE_MARGIN_LEFT
  606. self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
  607. esc_code9(BLOCKQUOTE_MARGIN_LEFT)
  608. elif tag == 'b':
  609. self.in_b = True
  610. elif tag == 'i':
  611. self.in_i = True
  612. elif tag == 'big': # Not sure what to do with this one
  613. self.in_b = True
  614. elif tag == 'strong':
  615. self.in_b = True
  616. elif tag == 'del':
  617. self.in_del = True
  618. elif tag == 'ins':
  619. self.in_ins = True
  620. elif tag == 'a' and 'href' in attrs:
  621. self.in_a = True
  622. self.url = attrs['href']
  623. elif tag in ['ul', 'ol', 'dl']:
  624. if 'start' in attrs:
  625. list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
  626. try:
  627. list_start = int(list_start)
  628. except ValueError:
  629. list_start = 1
  630. self.enter_list(tag, list_start)
  631. else:
  632. self.enter_list(tag)
  633. elif tag == 'li':
  634. if 0 == self.level:
  635. if warnings:
  636. (line, column) = self.getpos()
  637. PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  638. .format(tag, line, column, article_count + 1, g_this_article_title))
  639. (t, p) = self.tag_stack.pop()
  640. return # just ignore it
  641. # force ul since this is a li without a parent
  642. #(t, p) = self.tag_stack.pop()
  643. #self.tag_stack.append(('ul', p))
  644. #self.tag_stack.append((t,p))
  645. #self.enter_list('ul')
  646. # handle missing </li> at the same level
  647. # simulate </li> and continue
  648. if self.li_inside[self.level]:
  649. if warnings:
  650. (line, column) = self.getpos()
  651. PrintLog.message(u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  652. .format(tag, line, column, article_count + 1, g_this_article_title))
  653. (t, p) = self.tag_stack.pop()
  654. self.flush_buffer(False)
  655. self.list_decrease_indent()
  656. self.li_inside[self.level] = True
  657. if 'value' in attrs:
  658. list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
  659. try:
  660. self.li_cnt[self.level] = int(list_index)
  661. except ValueError:
  662. pass
  663. else:
  664. self.li_cnt[self.level] += 1
  665. if self.li_type[self.level] == 'ol':
  666. self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".", DEFAULT_FONT_IDX, None)
  667. else:
  668. if self.level > LIMAX_BULLETS:
  669. bullet_num = LIMAX_BULLETS
  670. else:
  671. bullet_num = self.level
  672. self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)
  673. self.flush_buffer()
  674. self.list_increase_indent()
  675. elif tag == 'dd':
  676. if 0 == self.level:
  677. if warnings:
  678. (line, column) = self.getpos()
  679. PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  680. .format(tag, line, column, article_count + 1, g_this_article_title))
  681. (t, p) = self.tag_stack.pop()
  682. return # just ignore it
  683. self.li_cnt[self.level] += 1
  684. self.list_increase_indent()
  685. elif tag == 'br':
  686. self.in_br = True
  687. elif tag == 'img' and 'src' in attrs:
  688. # include either image or the 'alt' text
  689. if self.enable_images:
  690. (width, height, data) = get_imgdata(attrs['src'], self.indent)
  691. self.wordwrap.AppendImage(width, height, data, None)
  692. elif 'alt' in attrs:
  693. self.handle_data(attrs['alt'])
  694. self.in_img = True
    def handle_endtag(self, tag):
        """Process a closing tag.

        An end tag with no matching entry on the tag stack is ignored
        (with a warning when warnings are enabled).  Otherwise entries
        are popped until the matching start tag is reached; every
        unclosed tag met on the way is closed first by a recursive call.
        Popping an entry also restores self.printing to the value saved
        when that tag was opened.  </html> finishes the article and
        passes the collected language links to write_article().
        """
        global g_this_article_title
        global article_count
        global warnings

        # ignore end tag without start tag
        if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column, article_count + 1, g_this_article_title))
            return

        # backtrack up the stack closing each open tag until there is a match
        (start_tag, self.printing) = self.tag_stack.pop()
        while start_tag != tag:
            # put the entry back so the recursive call finds and pops it itself
            self.tag_stack.append((start_tag, self.printing))
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(start_tag, line, column, article_count + 1, g_this_article_title))
            self.handle_endtag(start_tag)
            (start_tag, self.printing) = self.tag_stack.pop()

        # must always do </html> tag
        if tag == 'html':
            self.printing = True
            self.tag_stack = []
            self.in_html = False
            esc_code1()
            write_article(self.language_links)
            return

        # self.printing now holds the state from before the tag was opened
        if not self.printing:
            return

        elif tag == 'script':
            pass

        elif tag == 'title':
            self.in_title = False
            g_this_article_title = g_this_article_title.strip()

        elif tag == 'body':
            self.in_body = False
            self.flush_buffer()

        elif tag == 'table':
            if self.in_table > 0:
                self.in_table -= 1

        # if in a table suppress everything after this point
        # (this 'if' starts a second, separate chain of tests)
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = False
            esc_code0(H1_MARGIN_BOTTOM)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = False

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = False

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = False

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = False

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = False

        elif tag == 'div':
            self.flush_buffer()

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = False

        elif tag == 'blockquote':
            self.flush_buffer()
            if self.quote > 0:
                # mirrors the test made when the quote was opened
                if self.quote < MAX_QUOTE_LEVEL:
                    self.indent -= BLOCKQUOTE_MARGIN_LEFT
                    self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                    esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
                self.quote -= 1

        elif tag == 'b':
            self.in_b = False

        elif tag == 'big':
            self.in_b = False

        elif tag == 'strong':
            self.in_b = False

        elif tag == 'i':
            self.in_i = False

        elif tag == 'del':
            self.in_del = False

        elif tag == 'ins':
            self.in_ins = False

        elif tag == 'a':
            self.in_a = False
            self.url = ""

        elif tag in ['ul', 'ol', 'dl']:
            self.leave_list()

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
            else:
                self.flush_buffer(False)
                self.list_decrease_indent()
                self.li_inside[self.level] = False

        elif tag == 'dd':
            self.flush_buffer()
            self.list_decrease_indent()

        elif tag == 'dt':
            self.flush_buffer()

        elif tag == 'br':
            self.flush_buffer()
            self.in_br = False

        elif tag == 'img':
            self.in_img = False
  def enter_list(self, list_type, start = 1):
      """Open one more nesting level of list.

      Pending text is flushed with new_line=False, the list top margin is
      emitted, and the per-level bookkeeping is initialised:

      list_type -- stored in self.li_type for the new level
      start     -- first item number; the counter is primed with start - 1
      """
      self.flush_buffer(False)
      esc_code0(LIST_MARGIN_TOP)

      depth = self.level + 1
      self.level = depth

      # per-level state, keyed by nesting depth
      self.li_cnt[depth] = start - 1
      self.li_inside[depth] = False
      self.li_type[depth] = list_type
  def list_increase_indent(self):
      """Indent by one LIST_INDENT step.

      Nothing happens once self.level exceeds LIMAX_INDENT_LEVELS.
      Otherwise the indent escape codes are emitted, self.indent grows and
      the usable line width self.lwidth shrinks by the same amount.
      """
      if self.level > LIMAX_INDENT_LEVELS:
          return

      esc_code9(LIST_INDENT)
      # original author's note on this call: "Bug in lcd_buf_draw ASK ERIC"
      esc_code8(LIST_INDENT)

      self.indent += LIST_INDENT
      self.lwidth -= LIST_INDENT
  def leave_list(self):
      """Close the innermost list, if any.

      Pending text is always flushed.  When a list is open (self.level > 0)
      the list margin is emitted, the counter and "inside item" flag of the
      current level are discarded and the nesting level drops by one.
      """
      self.flush_buffer()

      depth = self.level
      if depth <= 0:
          # no list is open: nothing to unwind
          return

      esc_code0(LIST_MARGIN_TOP)
      del self.li_cnt[depth]
      del self.li_inside[depth]
      self.level = depth - 1
  def list_decrease_indent(self):
      """Undo one LIST_INDENT step.

      Nothing happens once self.level exceeds LIMAX_INDENT_LEVELS.
      Otherwise a negative indent escape code is emitted, self.indent
      shrinks and the usable line width self.lwidth grows by LIST_INDENT.
      """
      if self.level > LIMAX_INDENT_LEVELS:
          return

      esc_code9(- LIST_INDENT)
      self.indent -= LIST_INDENT
      self.lwidth += LIST_INDENT
  def handle_charref(self, name):
      """handle &#DDDD; &#xXXXX;

      name is the reference text without the leading '&#' and trailing ';'.
      A leading 'x'/'X' selects hexadecimal, otherwise the text must be all
      digits.  The resulting character is passed to handle_data(); a code
      point that cannot be converted is replaced by '?'.  References that
      cannot be parsed at all are logged and dropped.
      """
      if 0 == len(name):
          return

      if name[0] in ('x', 'X'):
          try:
              value = int(name[1:], 16)
          except ValueError:
              PrintLog.message(u'charref: "{0:s}" is not hexadecimal'.format(name))
              return
      elif name.isdigit():
          try:
              value = int(name)
          except ValueError:
              PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
              return
      else:
          # Neither hex-prefixed nor all digits (e.g. "12a"): without this
          # branch 'value' would be unbound below and raise UnboundLocalError.
          PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
          return

      try:
          c = unichr(value)
      except (ValueError, OverflowError):
          # ValueError: outside the unicode range; OverflowError: value too
          # large to be passed to unichr() at all
          PrintLog.message(u'charref: "{0:d}" is not convertible to unicode'.format(value))
          c = '?'
      self.handle_data(c)
  def handle_entityref(self, name):
      """handle &amp; &gt; ...

      The entity name is looked up in htmlentitydefs.name2codepoint and the
      resulting character is passed to handle_data().  A KeyError (unknown
      entity name) is logged together with the current article title.
      """
      try:
          codepoint = htmlentitydefs.name2codepoint[name]
          character = unichr(codepoint)
          self.handle_data(character)
      except KeyError:
          message = u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(name, g_this_article_title)
          PrintLog.message(message)
  def handle_data(self, data):
      """Receive a run of character data from the parser.

      While inside <title> the text is appended to the global
      g_this_article_title.  Text is queued for output only when the parser
      is inside <body>, outside every table and self.printing is set; it is
      then whitespace-normalised and appended to self.wordwrap together
      with the font face and (inside <a>) the link URL.
      """
      global g_this_article_title

      if self.in_title:
          g_this_article_title += data

      # only parse valid tags in <body>
      # skip tables for now
      if not self.in_body or self.in_table > 0 or not self.printing:
          return

      # collapse each run of whitespace into a single space
      # (raw string: "\s" in a plain literal is an invalid escape sequence)
      data = re.sub(r"\s+", " ", data)

      # only use italic fonts now (don't care about bold)
      face = DEFAULT_FONT_IDX
      if self.in_i:
          face = ITALIC_FONT_IDX

      # heading fonts take precedence over italic
      if self.in_h1:
          face = TITLE_FONT_IDX
      elif self.in_h2 or self.in_h3 or self.in_h4 or self.in_h5 or self.in_h6:
          face = SUBTITLE_FONT_IDX

      # figure out if we need a url
      url = None
      if self.in_a:
          url = self.url

      self.wordwrap.append(data, face, url)
  885. def flush_buffer(self, new_line = True):
  # Drain self.wordwrap and write its contents to the global 'output'
  # stream, one wrapped line per pass of the while loop.
  #
  # new_line -- for the first emitted line: when true the line is
  #             introduced with esc_code3(font) if the font changes, else
  #             esc_code2(); when false only a font change is signalled,
  #             via esc_code4(font).  It is set to True afterwards.
  #
  # NOTE(review): the indentation of this copy of the file was lost, so the
  # exact nesting of the statements below must be confirmed against the
  # original source before restructuring anything here.
  886. global output
  # -1 means "no font selected yet"
  887. font = -1
  888. while self.wordwrap.have():
  # per-line state: no active link; x0 starts at the current indent and is
  # advanced by each entry's i[3] (presumably the running x position and
  # the entry width -- confirm against the word wrapper)
  889. url = None
  890. x0 = self.indent
  891. url_x0 = x0
  # next line from the word wrapper for the current width; an empty list
  # ends the loop
  892. line = self.wordwrap.wrap(self.lwidth)
  893. if line == []:
  894. break
  # Field 1 of an entry is either a font index or a (width, height, data)
  # tuple.  If the first entry carries a tuple, keep the current font, or
  # fall back to DEFAULT_FONT_IDX when none has been selected.
  895. if tuple == type(line[0][1]):
  896. if font < 0:
  897. new_font = DEFAULT_FONT_IDX
  898. else:
  899. new_font = font
  900. else:
  901. new_font = line[0][1]
  # line prefix: which escape code is emitted depends on new_line and on
  # whether the font differs from the one currently selected
  902. if new_line:
  903. if font != new_font:
  904. font = new_font
  905. esc_code3(font)
  906. else:
  907. esc_code2()
  908. else:
  909. if font != new_font:
  910. font = new_font
  911. esc_code4(font)
  912. new_line = True
  # emit every entry of the line; entry layout as used below:
  #   i[0] text, i[1] font index or (width, height, data), i[2] url,
  #   i[3] amount added to x0
  913. for i in line:
  914. if tuple == type(i[1]):
  # tuple payload is handed to esc_code14 (presumably an inline image)
  915. (width, height, data) = i[1]
  916. esc_code14(width, height, data)
  917. else:
  # switch font in mid-line when the entry uses a different one
  918. if font != i[1]:
  919. font = i[1]
  920. esc_code4(font)
  # url changed: close the link that was active (if any) and remember where
  # a new one (if any) starts
  921. if url != i[2]:
  922. if url != None:
  923. make_link(url, url_x0, x0, i[0])
  924. url = i[2]
  925. if url != None:
  926. url_x0 = x0
  # text is written UTF-8 encoded
  927. output.write(i[0].encode('utf-8'))
  928. x0 += i[3]
  # a link still active after the last entry is closed here, using the text
  # of the last entry
  929. if url != None:
  930. make_link(url, url_x0, x0, line[-1][0])
  def link_number(url):
      """Return the article number a link target resolves to.

      url -- article title the link points at (passed to article_index)

      Returns the first field of the row found by article_index(), or -1
      when the title is unknown.
      """
      try:
          row = article_index(url)
      except KeyError:
          return -1

      if row is None:
          # fetchone() yields None when no row matched; indexing it would
          # raise TypeError, which the KeyError handler above cannot catch
          return -1

      return row[0]
  # Add the '~' padding back here
  def article_index(title):
      """Look up an article by title in the global article_db.

      title -- article title without the leading '~' used in the database

      Returns the row (article_number, fnd_offset, restricted).
      Raises KeyError when no article has that title, which is the
      exception the callers in this file handle.
      """
      global article_db

      c = article_db.cursor()
      try:
          c.execute('select article_number, fnd_offset, restricted from articles where title = ? limit 1', ["~" + title])
          result = c.fetchone()
      finally:
          # release the cursor even if the query itself fails
          c.close()

      if result is None:
          # fetchone() returns None when the query matched no row
          raise KeyError(title)

      return result # this returns a tuple of text strings, so beware!
  def write_article(language_links):
      """Assemble the article rendered so far and hand it to the output stage.

      language_links -- iterable of 'lang:title' strings

      The article consists of an 8 byte header (offset of the body, link
      count, zero), the packed link rectangles from g_links, the
      nul-terminated language links and the rendered body taken from the
      global 'output' buffer.  With 'compress' set the article is passed to
      article_writer, otherwise it is written to f_out.  The 'output'
      buffer is emptied afterwards.
      """
      global compress
      global verbose
      global output, f_out, i_out
      global article_count
      global g_this_article_title
      global file_number
      global start_time
      global article_writer

      article_count += 1
      if verbose:
          PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))
      elif article_count % 1000 == 0:
          now_time = time.time()
          PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
          start_time = now_time

      output.flush()

      # create links
      links_stream = io.BytesIO()
      for i in g_links:
          (x0, y0, x1, y1, url) = g_links[i]
          # link_number() reports an unknown target as -1, which the unsigned
          # 'I' format cannot pack; mask it to the 32 bit pattern 0xffffffff
          target = link_number(url) & 0xffffffff
          links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1, target))
      links_stream.flush()
      links = links_stream.getvalue()
      links_stream.close()

      # create language links
      links_stream = io.BytesIO()
      japanese_convert = LanguageTranslation.LanguageJapanese().translate
      normal_convert = LanguageTranslation.LanguageNormal().translate
      for entry in language_links:
          language, link = entry.split(':', 1)
          if 'ja' == language:
              stripped = japanese_convert(link)
          else:
              stripped = normal_convert(link)

          if link == stripped:
              # translation left the title unchanged: store it as is
              links_stream.write(entry.encode('utf-8') + '\0')
          else:
              # store the translated form and, after a \1 separator, the
              # original title
              links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')
      links_stream.flush()
      langs = links_stream.getvalue()
      links_stream.close()

      # create the header (header size = 8)
      header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
      body = output.getvalue()

      # combine the data
      whole_article = header + links + langs + body

      if compress:
          try:
              row = article_index(g_this_article_title)
              if row is None:
                  # no matching row: report it through the handler below
                  # instead of failing with TypeError on the unpacking
                  raise KeyError(g_this_article_title)
              (article_number, fnd_offset, restricted) = row
              restricted = bool(int(restricted)) # '0' is True so turn it into False
              article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
          except KeyError:
              PrintLog.message(u'Error in: write_article, Title not found')
              PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
              # NOTE(review): file_offset is not assigned anywhere in this
              # function; it is read from module scope if it exists there,
              # so that logging the error cannot itself raise NameError
              PrintLog.message(u'Offset: {0!s}'.format(globals().get('file_offset', u'unknown')))
              # article_count is an int, which the 's' format code rejects
              PrintLog.message(u'Count: {0:d}'.format(article_count))
      else:
          f_out.write(whole_article)
          f_out.flush()

      # Note: some versions of Python do not move file position on truncate
      # so an explicit seek is needed to avoid nul padding bytes.
      output.seek(0)
      output.truncate(0)
  class ArticleWriter(bucket.Bucket):
      """to combine sets of articles and compress them together

      Articles are queued with add_article(); write() receives a batch of
      (size, item) pairs, stores them as one compressed group in data_file
      and records an index entry per article.  The index entries are
      written to index_file, ordered by article number, when the object is
      finalised.
      NOTE(review): write() is presumably invoked by bucket.Bucket when a
      bucket is flushed -- confirm in the bucket module.
      """

      def __init__(self, file_number, data_file, index_file,
                   max_buckets = 50, bucket_size = 524288, max_items_per_bucket = 64):
          """Remember the output files and configure the underlying bucket.

          file_number -- number stored in every index entry
          data_file   -- file object receiving the compressed groups
          index_file  -- file object receiving the index entries
          The remaining arguments are passed on to bucket.Bucket.
          """
          super(ArticleWriter, self).__init__(max_buckets = max_buckets,
                                              bucket_size = bucket_size,
                                              max_items_per_bucket = max_items_per_bucket)
          self.file_number = file_number
          self.index_file = index_file
          self.data_file = data_file
          # article number -> packed index entry
          self.index = {}

      def add_article(self, article_index, article_data, fnd_offset, restricted):
          """Queue one article; its size is the length of article_data."""
          self.add((article_index, article_data, fnd_offset, restricted), len(article_data))

      def write(self, data):
          """output the article data

          data -- sequence of (size, (article_index, article_data,
                  fnd_offset, restricted)); it is iterated twice
          """
          # collect the pieces in lists and join once, rather than growing
          # strings with += inside the loop
          blocks = []
          payload = []
          offset = 0
          for size, item in data:
              article_index, article_data, fnd_offset, restricted = item
              # one 12 byte record per article; the top bit of the offset
              # field marks a restricted article
              blocks.append(struct.pack('<3I', article_index,
                                        offset + (0x80000000 if restricted else 0),
                                        size))
              offset += size
              payload.append(article_data)

          # the group header starts with the number of records in one byte
          ah = chr(len(blocks)) + ''.join(blocks)
          ac = CompressData(''.join(payload))

          file_offset = self.data_file.tell()
          data_length = struct.pack('<I', len(ac))
          self.data_file.write(ah + data_length + ac)

          # every article of the group points at the start of the group
          for size, item in data:
              article_index, article_data, fnd_offset, restricted = item
              self.index[article_index] = struct.pack('<2IB', file_offset, fnd_offset, self.file_number)

      def __del__(self):
          """Flush pending articles and write the index in key order."""
          self.flush()
          for k in sorted(self.index):
              self.index_file.write(self.index[k])
  # script entry point: call main() only when executed directly
  if "__main__" == __name__:
      main()