ArticleRenderer.py 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Article Rendering
# AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
#          Christopher Hall <hsw@openmoko.com>
  8. import sys, os, struct, os.path, re
  9. import io
  10. import time
  11. import HTMLParser
  12. import pylzma
  13. import unicodedata
  14. import htmlentitydefs
  15. import codecs
  16. import getopt
  17. import os.path
  18. import sqlite3
  19. import WordWrap
  20. import PrintLog
  21. import gd
  22. verbose = False
  23. warnings = False
  24. article_count = 0
  25. # NASTY HACK: allow this </div class="something">
  26. HTMLParser.endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*[^>]*>')
  27. fh = '4b' # struct font_bmf_header (header)
  28. cmr = '8b48s' # struct charmetric_bmf (font)
  29. fh_size = struct.calcsize(fh)
  30. cmr_size = struct.calcsize(cmr)
  31. # font face defines
  32. ITALIC_FONT_IDX = 1
  33. DEFAULT_FONT_IDX = 2
  34. TITLE_FONT_IDX = 3
  35. SUBTITLE_FONT_IDX = 4
  36. DEFAULT_ALL_FONT_IDX = 5
  37. # Screen dimensions
  38. LCD_WIDTH = 240
  39. LCD_LEFT_MARGIN = 6 # def. in lcd_buf_draw.h
  40. LCD_IMG_MARGIN = 8
  41. # Line Spaces (read directly from the font using gdbfed)
  42. LINE_SPACE_ADDON = 1 # added in lcd_buf_draw.h
  43. H1_LSPACE = 19
  44. H2_LSPACE = 17
  45. H3_LSPACE = H2_LSPACE
  46. H4_LSPACE = H2_LSPACE
  47. H5_LSPACE = H2_LSPACE
  48. H6_LSPACE = H2_LSPACE
  49. P_LSPACE = 15 + LINE_SPACE_ADDON
  50. # Margins & Spacing
  51. LIST_INDENT = 16
  52. DIV_MARGIN_TOP = 10
  53. P_MARGIN_TOP = DIV_MARGIN_TOP
  54. BLOCKQUOTE_MARGIN_TOP = DIV_MARGIN_TOP
  55. BLOCKQUOTE_MARGIN_LEFT = LIST_INDENT
  56. BLOCKQUOTE_MARGIN_RIGHT = LIST_INDENT
  57. LIST_MARGIN_TOP = DIV_MARGIN_TOP
  58. BR_MARGIN_TOP = DIV_MARGIN_TOP
  59. DEFAULT_LWIDTH = (LCD_WIDTH-LCD_LEFT_MARGIN)
  60. H1_MARGIN_TOP = 8
  61. H1_MARGIN_BOTTOM = P_MARGIN_TOP
  62. H2_MARGIN_TOP = 14
  63. H3_MARGIN_TOP = H2_MARGIN_TOP
  64. H4_MARGIN_TOP = H2_MARGIN_TOP
  65. H5_MARGIN_TOP = H2_MARGIN_TOP
  66. H6_MARGIN_TOP = H2_MARGIN_TOP
  67. LIMAX_INDENT_LEVELS = 3
  68. MAX_QUOTE_LEVEL = 1
  69. # bullet[0] charater is not used (the '!')
  70. bullet_c = u"!\u25aa\u2022\u25ab"
  71. LIMAX_BULLETS = len(bullet_c) - 1
  72. font_id_values = {}
  73. g_starty = 0
  74. g_curr_face = DEFAULT_FONT_IDX
  75. g_halign = 0
  76. g_this_article_title = 'NO TITLE'
  77. g_links = {}
  78. g_link_cnt = 0
  79. i_out = None
  80. f_out = None
  81. file_number = 0
  82. article_db = None
  83. output = None
  84. compress = True
  85. def usage(message):
  86. if None != message:
  87. print('error: {0:s}'.format(message))
  88. print('usage: {0:s} <options> html-files...'.format(os.path.basename(__file__)))
  89. print(' --help This message')
  90. print(' --verbose Enable verbose output')
  91. print(' --warnings Enable warnings output')
  92. print(' --number=n Number for the .dat/.idx-tmp files [0]')
  93. print(' --test=file Output the uncompressed file for testing')
  94. print(' --font-path=dir Path to font files (*.bmf) [fonts]')
  95. print(' --article-index=file Article index dictionary input [articles.db]')
  96. print(' --prefix=name Device file name portion for .dat/.idx-tmp [pedia]')
  97. exit(1)
  98. def main():
  99. global verbose, warnings, compress
  100. global f_out, output, i_out
  101. global font_id_values
  102. global file_number
  103. global article_count
  104. global article_db
  105. global start_time
  106. try:
  107. opts, args = getopt.getopt(sys.argv[1:],
  108. 'hvwn:p:i:t:f:',
  109. ['help',
  110. 'verbose',
  111. 'warnings',
  112. 'number=',
  113. 'prefix=',
  114. 'article-index=',
  115. 'test=',
  116. 'font-path='])
  117. except getopt.GetoptError, err:
  118. usage(err)
  119. verbose = False
  120. warnings = False
  121. data_file = 'pedia{0:d}.dat'
  122. index_file = 'pedia{0:d}.idx-tmp'
  123. art_file = 'articles.db'
  124. file_number = 0
  125. test_file = ''
  126. font_path = "../fonts"
  127. article_db = None
  128. for opt, arg in opts:
  129. if opt in ('-v', '--verbose'):
  130. verbose = True
  131. elif opt in ('-w', '--warnings'):
  132. warnings = True
  133. elif opt in ('-h', '--help'):
  134. usage(None)
  135. elif opt in ('-t', '--test'):
  136. test_file = arg
  137. elif opt in ('-i', '--article-index'):
  138. art_file = arg
  139. elif opt in ('-n', '--number'):
  140. try:
  141. file_number = int(arg)
  142. except ValueError:
  143. usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
  144. elif opt in ('-p', '--prefix'):
  145. data_file = arg + '{0:d}.dat'
  146. index_file = arg + '{0:d}.idx-tmp'
  147. elif opt in ('-f', '--font-path'):
  148. font_path = arg
  149. else:
  150. usage('unhandled option: ' + opt)
  151. start_time = time.time()
  152. f_fontr = open(os.path.join(font_path, "text.bmf"), "r")
  153. f_fonti = open(os.path.join(font_path, "texti.bmf"), "r")
  154. f_fontt = open(os.path.join(font_path, "title.bmf"), "r")
  155. f_fontst = open(os.path.join(font_path, "subtitle.bmf"), "r")
  156. f_fontall = open(os.path.join(font_path, "textall.bmf"), "r")
  157. font_id_values = {
  158. ITALIC_FONT_IDX: f_fonti,
  159. DEFAULT_FONT_IDX: f_fontr,
  160. TITLE_FONT_IDX: f_fontt,
  161. SUBTITLE_FONT_IDX: f_fontst,
  162. DEFAULT_ALL_FONT_IDX: f_fontall
  163. }
  164. article_db = sqlite3.connect(art_file)
  165. article_db.execute('pragma auto_vacuum = none')
  166. article_db.execute('pragma synchronous = off')
  167. article_db.execute('pragma temp_store = memory')
  168. article_db.execute('pragma locking_mode = normal')
  169. article_db.execute('pragma read_uncommitted = true')
  170. article_db.execute('pragma cache_size = 20000000')
  171. article_db.execute('pragma default_cache_size = 20000000')
  172. article_db.execute('pragma journal_mode = off')
  173. output = io.BytesIO('')
  174. if test_file == '':
  175. compress = True
  176. i_out = open(index_file.format(file_number), 'w')
  177. f_out = open(data_file.format(file_number), 'w')
  178. else:
  179. compress = False
  180. f_out = open(test_file, 'w')
  181. for name in args:
  182. f = codecs.open(name, 'r', 'utf-8', 'replace')
  183. WrProcess(f)
  184. f.close()
  185. for item in font_id_values:
  186. font_id_values[item].close()
  187. if output != None:
  188. output.close()
  189. if f_out != None:
  190. f_out.close()
  191. if i_out != None:
  192. i_out.close()
  193. if article_db != None:
  194. article_db.close()
  195. for i in font_id_values:
  196. font_id_values[i].close()
  197. # final message
  198. PrintLog.message("Render[{0:d}]: Total: {1:d}".format(file_number, article_count))
  199. #
  200. # Get the width of a character in a given font face
  201. #
  202. width_cache = {}
  203. def get_utf8_cwidth(c, face):
  204. global width_cache, font_id_values
  205. global cmr, fh, cmr_size, fh_size
  206. if type(c) != unicode:
  207. c = unicode(c, 'utf-8')
  208. if (c, face) in width_cache:
  209. return width_cache[(c, face)]
  210. f = font_id_values[face]
  211. f.seek(ord(c) * cmr_size + fh_size)
  212. buffer = f.read(cmr_size)
  213. if len(buffer) != 0:
  214. width, height, widthBytes, widthBits, ascent, descent, LSBearing, RSBearing, bitmap = struct.unpack(cmr, buffer)
  215. else:
  216. width, height, widthBytes, widthBits, ascent, descent, LSBearing, RSBearing, bitmap = (0,0,0,0,0,0,0,0,
  217. r'\x55' * 48)
  218. if 0 == width and face != DEFAULT_ALL_FONT_IDX:
  219. return get_utf8_cwidth(c, DEFAULT_ALL_FONT_IDX)
  220. width += LSBearing + LINE_SPACE_ADDON
  221. width_cache[(c, face)] = width
  222. return width
  223. def get_lineheight(face):
  224. values = {
  225. ITALIC_FONT_IDX: P_LSPACE,
  226. DEFAULT_FONT_IDX: P_LSPACE,
  227. TITLE_FONT_IDX: H1_LSPACE,
  228. SUBTITLE_FONT_IDX: H2_LSPACE,
  229. DEFAULT_ALL_FONT_IDX: P_LSPACE
  230. }
  231. return values[face]
  232. def make_link(url, x0, x1, text):
  233. global g_starty, g_curr_face, g_link_cnt, g_links
  234. if article_index(url):
  235. esc_code10(x1 - x0)
  236. g_links[g_link_cnt] = (x0, g_starty - get_lineheight(g_curr_face), x1, g_starty, url)
  237. g_link_cnt = g_link_cnt + 1
  238. def get_imgdata(imgfile, indent):
  239. try:
  240. img = gd.image(imgfile)
  241. except IOError, e:
  242. PrintLog.message(u'unable to open image file: {0:s}'.format(imgfile))
  243. return (0, 0, r'')
  244. (width, height) = img.size()
  245. if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
  246. is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((x, y)))
  247. h_range = range(0, width)
  248. v_range = range(0, height)
  249. elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
  250. is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((y, x)))
  251. v_range = range(0, width)
  252. h_range = range(height - 1, -1, -1)
  253. (width, height) = (height, width)
  254. else:
  255. PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
  256. return (0, 0, r'')
  257. data = ''
  258. for v in v_range:
  259. byte = 0
  260. bit_count = 8
  261. for h in h_range:
  262. if is_black(h, v):
  263. pixel = 1
  264. else:
  265. pixel = 0
  266. bit_count -= 1
  267. byte |= pixel << bit_count
  268. if 0 == bit_count:
  269. data += struct.pack('B', byte)
  270. byte = 0
  271. bit_count = 8
  272. if 8 != bit_count:
  273. data += struct.pack('B', byte)
  274. return (width, height, data)
  275. def esc_code0(num_pixels):
  276. """blank line height in pixels"""
  277. global g_starty
  278. global output
  279. output.write(struct.pack('BB', 1, num_pixels))
  280. g_starty += num_pixels
  281. def esc_code1():
  282. """new line with default font and default line space"""
  283. global g_starty, g_curr_face
  284. global output
  285. output.write(struct.pack('B', 2))
  286. g_starty += get_lineheight(DEFAULT_FONT_IDX)
  287. g_curr_face = DEFAULT_FONT_IDX
  288. def esc_code2():
  289. """new line with current font and current line space"""
  290. global g_starty, g_curr_face
  291. global output
  292. output.write(struct.pack('B', 3))
  293. g_starty += get_lineheight(g_curr_face)
  294. def esc_code3(face):
  295. """new line using new font face."""
  296. global g_starty, g_curr_face
  297. global output
  298. num_pixels = get_lineheight(face)
  299. output.write(struct.pack('BB', 4, face|(num_pixels<<3)))
  300. g_starty += num_pixels
  301. g_curr_face = face
  302. def esc_code4(face, halign=0):
  303. """change font with current horizontal alignment (in pixels)"""
  304. global g_curr_face
  305. global output
  306. output.write(struct.pack('BB', 5, face|(halign<<3)))
  307. g_curr_face = face
  308. def esc_code5():
  309. """set font as default"""
  310. global g_curr_face
  311. global output
  312. output.write(struct.pack('B', 6))
  313. g_curr_face = DEFAULT_FONT_IDX
  314. def esc_code6():
  315. """set default alignment"""
  316. global output
  317. output.write(struct.pack('B', 7))
  318. def esc_code7(num_pixels):
  319. """move right num_pixels"""
  320. global output
  321. output.write(struct.pack('BB', 8, num_pixels))
  322. def esc_code8(num_pixels):
  323. """move left num_pixels"""
  324. global output
  325. output.write(struct.pack('BB', 9, num_pixels))
  326. def esc_code9(num_pixels):
  327. """alignment adjustment"""
  328. global g_halign
  329. global output
  330. output.write(struct.pack('Bb', 10, num_pixels))
  331. g_halign = num_pixels
  332. def esc_code10(num_pixels):
  333. """draw line from right to left"""
  334. global output
  335. output.write(struct.pack('BB', 11, num_pixels))
  336. def esc_code14(width, height, data):
  337. """output bitmap"""
  338. global g_starty, g_curr_face
  339. global output
  340. if 0 == width or 0 == height:
  341. return
  342. output.write(struct.pack('<BBH', 15, width, height))
  343. output.write(data)
  344. lineh = get_lineheight(g_curr_face)
  345. if (height) > lineh:
  346. g_starty += (height)-lineh + 3 # since Eric draws images 3px lower for alignment
  347. #
  348. # Parse the HTML into the WikiReader's format
  349. #
  350. class WrProcess(HTMLParser.HTMLParser):
  351. READ_BLOCK_SIZE = 64 * (1024 * 1024)
  352. def __init__ (self, f):
  353. global g_this_article_title, article_count
  354. HTMLParser.HTMLParser.__init__(self)
  355. self.wordwrap = WordWrap.WordWrap(get_utf8_cwidth)
  356. self.local_init()
  357. self.tag_stack = []
  358. block = f.read(self.READ_BLOCK_SIZE)
  359. while block:
  360. self.feed(block)
  361. block = f.read(self.READ_BLOCK_SIZE)
  362. def local_init(self):
  363. global g_starty, g_curr_face, g_halign
  364. global g_this_article_title, g_links, g_link_cnt
  365. self.in_html = False
  366. self.in_title = False
  367. self.in_body = False
  368. self.in_h1 = False
  369. self.in_h2 = False
  370. self.in_h3 = False
  371. self.in_h4 = False
  372. self.in_h5 = False
  373. self.in_h6 = False
  374. self.in_table = 0
  375. self.in_p = False
  376. self.in_b = False
  377. self.in_big = False
  378. self.in_strong = False
  379. self.in_del = False
  380. self.in_ins = False
  381. self.in_i = False
  382. self.in_a = False
  383. self.in_br = False
  384. self.in_img = False
  385. self.quote = 0
  386. self.level = 0
  387. self.lwidth = DEFAULT_LWIDTH
  388. self.indent = 0
  389. self.li_cnt = {}
  390. self.li_inside = {}
  391. self.li_type = {}
  392. self.link_x = 0
  393. self.link_y = 0
  394. self.url = None
  395. self.language_links = []
  396. self.printing = True
  397. g_starty = 0
  398. g_curr_face = DEFAULT_FONT_IDX
  399. g_halign = 0
  400. g_this_article_title = 'NO TITLE'
  401. g_links = {}
  402. g_link_cnt = 0
  403. def handle_starttag(self, tag, attrs):
  404. global g_starty, g_curr_face, g_halign
  405. global g_this_article_title, g_links, g_link_cnt
  406. global warnings
  407. attrs = dict(attrs)
  408. # must always do the <html> tag
  409. if tag == 'html':
  410. self.local_init()
  411. self.in_html = True
  412. self.tag_stack = [(tag, True)]
  413. return
  414. self.tag_stack.append((tag, self.printing))
  415. # we want to skip content that isn't for printing
  416. if 'class' in attrs:
  417. if 'noprint' in attrs['class']:
  418. self.printing = False
  419. # create a list of language links
  420. if tag == 'a' and 'lang-link' in attrs['class']:
  421. self.language_links.append(attrs['href'])
  422. # handle the tags
  423. if not self.printing:
  424. return;
  425. elif tag == 'script':
  426. self.printing = False
  427. elif tag == 'title':
  428. self.in_title = True
  429. g_this_article_title = ''
  430. elif tag == 'body':
  431. self.in_body = True
  432. elif tag == 'table':
  433. self.in_table += 1
  434. # if in a table suppress everything after this point
  435. if self.in_table > 0:
  436. return
  437. elif tag == 'h1':
  438. self.flush_buffer()
  439. self.in_h1 = True
  440. esc_code0(H1_MARGIN_TOP)
  441. elif tag == 'h2':
  442. self.flush_buffer()
  443. self.in_h2 = True
  444. esc_code0(H2_MARGIN_TOP)
  445. elif tag == 'h3':
  446. self.flush_buffer()
  447. self.in_h3 = True
  448. esc_code0(H3_MARGIN_TOP)
  449. elif tag == 'h4':
  450. self.flush_buffer()
  451. self.in_h4 = True
  452. esc_code0(H4_MARGIN_TOP)
  453. elif tag == 'h5':
  454. self.flush_buffer()
  455. self.in_h5 = True
  456. esc_code0(H5_MARGIN_TOP)
  457. elif tag == 'h6':
  458. self.flush_buffer()
  459. self.in_h6 = True
  460. esc_code0(H6_MARGIN_TOP)
  461. elif tag == 'div':
  462. self.flush_buffer()
  463. # suppress thumb info boxes
  464. if 'class' in attrs:
  465. c = attrs['class']
  466. if 'thumb' in c or 'left' in c or 'right' in c \
  467. or 'dablink' in c or 'magnify' in c:
  468. self.printing = False
  469. return
  470. esc_code0(DIV_MARGIN_TOP)
  471. elif tag == 'p':
  472. self.flush_buffer()
  473. self.in_p = True
  474. esc_code0(P_MARGIN_TOP)
  475. elif tag == 'blockquote':
  476. self.flush_buffer()
  477. self.quote += 1
  478. if self.quote < MAX_QUOTE_LEVEL:
  479. esc_code0(BLOCKQUOTE_MARGIN_TOP)
  480. self.indent += BLOCKQUOTE_MARGIN_LEFT
  481. self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
  482. esc_code9(BLOCKQUOTE_MARGIN_LEFT)
  483. elif tag == 'b':
  484. self.in_b = True
  485. elif tag == 'i':
  486. self.in_i = True
  487. elif tag == 'big': # Not sure what to do with this one
  488. self.in_b = True
  489. elif tag == 'strong':
  490. self.in_b = True
  491. elif tag == 'del':
  492. self.in_del = True
  493. elif tag == 'ins':
  494. self.in_ins = True
  495. elif tag == 'a' and 'href' in attrs:
  496. self.in_a = True
  497. self.url = attrs['href']
  498. elif tag in ['ul', 'ol', 'dl']:
  499. if 'start' in attrs:
  500. list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
  501. try:
  502. list_start = int(list_start)
  503. except ValueError:
  504. list_start = 1
  505. self.enter_list(tag, list_start)
  506. else:
  507. self.enter_list(tag)
  508. elif tag == 'li':
  509. if 0 == self.level:
  510. if warnings:
  511. (line, column) = self.getpos()
  512. PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  513. .format(tag, line, column, article_count + 1, g_this_article_title))
  514. (t, p) = self.tag_stack.pop()
  515. return # just ignore it
  516. # force ul since this is a li without a parent
  517. #(t, p) = self.tag_stack.pop()
  518. #self.tag_stack.append(('ul', p))
  519. #self.tag_stack.append((t,p))
  520. #self.enter_list('ul')
  521. # handle missing </li> at the same level
  522. # simulate </li> and continue
  523. if self.li_inside[self.level]:
  524. if warnings:
  525. (line, column) = self.getpos()
  526. PrintLog.message(u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  527. .format(tag, line, column, article_count + 1, g_this_article_title))
  528. (t, p) = self.tag_stack.pop()
  529. self.flush_buffer(False)
  530. self.list_decrease_indent()
  531. self.li_inside[self.level] = True
  532. if 'value' in attrs:
  533. list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
  534. try:
  535. self.li_cnt[self.level] = int(list_index)
  536. except ValueError:
  537. pass
  538. else:
  539. self.li_cnt[self.level] += 1
  540. if self.li_type[self.level] == 'ol':
  541. self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".", DEFAULT_FONT_IDX, None)
  542. else:
  543. if self.level > LIMAX_BULLETS:
  544. bullet_num = LIMAX_BULLETS
  545. else:
  546. bullet_num = self.level
  547. self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)
  548. self.flush_buffer()
  549. self.list_increase_indent()
  550. elif tag == 'dd':
  551. if 0 == self.level:
  552. if warnings:
  553. (line, column) = self.getpos()
  554. PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  555. .format(tag, line, column, article_count + 1, g_this_article_title))
  556. (t, p) = self.tag_stack.pop()
  557. return # just ignore it
  558. self.li_cnt[self.level] += 1
  559. self.list_increase_indent()
  560. elif tag == 'br':
  561. self.in_br = True
  562. elif tag == 'img' and 'src' in attrs:
  563. (width, height, data) = get_imgdata(attrs['src'], self.indent)
  564. self.wordwrap.AppendImage(width, height, data, None)
  565. self.in_img = True
  566. def handle_endtag(self, tag):
  567. global g_this_article_title
  568. global article_count
  569. global warnings
  570. # ignore end tag without start tag
  571. if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
  572. if warnings:
  573. (line, column) = self.getpos()
  574. PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  575. .format(tag, line, column, article_count + 1, g_this_article_title))
  576. return
  577. # backtrack up the stack closing each open tag until there is a match
  578. (start_tag, self.printing) = self.tag_stack.pop()
  579. while start_tag != tag:
  580. self.tag_stack.append((start_tag, self.printing))
  581. if warnings:
  582. (line, column) = self.getpos()
  583. PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  584. .format(start_tag, line, column, article_count + 1, g_this_article_title))
  585. self.handle_endtag(start_tag)
  586. (start_tag, self.printing) = self.tag_stack.pop()
  587. # must always do </html> tag
  588. if tag == 'html':
  589. self.printing = True
  590. self.tag_stack = []
  591. self.in_html = False
  592. esc_code1()
  593. write_article(self.language_links)
  594. return
  595. if not self.printing:
  596. return
  597. elif tag == 'script':
  598. pass
  599. elif tag == 'title':
  600. self.in_title = False
  601. g_this_article_title = g_this_article_title.strip()
  602. elif tag == 'body':
  603. self.in_body = False
  604. self.flush_buffer()
  605. elif tag == 'table':
  606. if self.in_table > 0:
  607. self.in_table -= 1
  608. # if in a table suppress everything after this point
  609. if self.in_table > 0:
  610. return
  611. elif tag == 'h1':
  612. self.flush_buffer()
  613. self.in_h1 = False
  614. esc_code0(H1_MARGIN_BOTTOM)
  615. elif tag == 'h2':
  616. self.flush_buffer()
  617. self.in_h2 = False
  618. elif tag == 'h3':
  619. self.flush_buffer()
  620. self.in_h3 = False
  621. elif tag == 'h4':
  622. self.flush_buffer()
  623. self.in_h4 = False
  624. elif tag == 'h5':
  625. self.flush_buffer()
  626. self.in_h5 = False
  627. elif tag == 'h6':
  628. self.flush_buffer()
  629. self.in_h6 = False
  630. elif tag == 'div':
  631. self.flush_buffer()
  632. elif tag == 'p':
  633. self.flush_buffer()
  634. self.in_p = False
  635. elif tag == 'blockquote':
  636. self.flush_buffer()
  637. if self.quote > 0:
  638. if self.quote < MAX_QUOTE_LEVEL:
  639. self.indent -= BLOCKQUOTE_MARGIN_LEFT
  640. self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
  641. esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
  642. self.quote -= 1
  643. elif tag == 'b':
  644. self.in_b = False
  645. elif tag == 'big':
  646. self.in_b = False
  647. elif tag == 'strong':
  648. self.in_b = False
  649. elif tag == 'i':
  650. self.in_i = False
  651. elif tag == 'del':
  652. self.in_del = False
  653. elif tag == 'ins':
  654. self.in_ins = False
  655. elif tag == 'a':
  656. self.in_a = False
  657. self.url = ""
  658. elif tag in ['ul', 'ol', 'dl']:
  659. self.leave_list()
  660. elif tag == 'li':
  661. if 0 == self.level:
  662. if warnings:
  663. (line, column) = self.getpos()
  664. PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
  665. .format(tag, line, column, article_count + 1, g_this_article_title))
  666. else:
  667. self.flush_buffer(False)
  668. self.list_decrease_indent()
  669. self.li_inside[self.level] = False
  670. elif tag == 'dd':
  671. self.flush_buffer()
  672. self.list_decrease_indent()
  673. elif tag == 'dt':
  674. self.flush_buffer()
  675. elif tag == 'br':
  676. self.flush_buffer()
  677. self.in_br = False
  678. elif tag == 'img':
  679. self.in_img = False
  680. def enter_list(self, list_type, start = 1):
  681. self.flush_buffer(False)
  682. esc_code0(LIST_MARGIN_TOP)
  683. self.level += 1
  684. self.li_cnt[self.level] = start - 1
  685. self.li_inside[self.level] = False
  686. self.li_type[self.level] = list_type
  687. def list_increase_indent(self):
  688. if self.level <= LIMAX_INDENT_LEVELS:
  689. esc_code9(LIST_INDENT)
  690. esc_code8(LIST_INDENT) ### Bug in lcd_buf_draw ASK ERIC
  691. self.lwidth -= LIST_INDENT
  692. self.indent += LIST_INDENT
  693. def leave_list(self):
  694. self.flush_buffer()
  695. if self.level > 0:
  696. esc_code0(LIST_MARGIN_TOP)
  697. del self.li_cnt[self.level]
  698. del self.li_inside[self.level]
  699. self.level -= 1
  700. def list_decrease_indent(self):
  701. if self.level <= LIMAX_INDENT_LEVELS:
  702. esc_code9(- LIST_INDENT)
  703. self.lwidth += LIST_INDENT
  704. self.indent -= LIST_INDENT
  705. def handle_charref(self, name):
  706. """handle &#DDDD; &#xXXXX;"""
  707. if 0 == len(name):
  708. return
  709. if 'x' == name[0] or 'X' == name[0]:
  710. try:
  711. value = int(name[1:], 16)
  712. except ValueError:
  713. PrintLog.message(u'charref: "{0:s}" is not hexadecimal'.format(name))
  714. return
  715. elif name.isdigit():
  716. try:
  717. value = int(name)
  718. except ValueError:
  719. PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
  720. return
  721. try:
  722. c = unichr(value)
  723. except ValueError:
  724. PrintLog.message(u'charref: "{0:d}" is not convertible to unicode'.format(value))
  725. c = '?'
  726. self.handle_data(c)
  727. def handle_entityref(self, name):
  728. """handle &amp; &gt; ..."""
  729. try:
  730. self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))
  731. except KeyError:
  732. PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(name, g_this_article_title))
  733. def handle_data(self, data):
  734. global g_this_article_title
  735. if self.in_title:
  736. g_this_article_title += data
  737. # only parse valid tags in <body>
  738. # skip tables for now
  739. if not self.in_body or self.in_table > 0 or not self.printing:
  740. return
  741. # defaults
  742. data = re.sub("\s+" , " ", data)
  743. face = DEFAULT_FONT_IDX
  744. url = None
  745. # only use italic fonts now (don't care about bold)
  746. if self.in_i:
  747. face = ITALIC_FONT_IDX
  748. if self.in_h1:
  749. face = TITLE_FONT_IDX
  750. elif self.in_h2 or self.in_h3 or self.in_h4 or self.in_h5 or self.in_h6:
  751. face = SUBTITLE_FONT_IDX
  752. # figure out if we need a url
  753. if self.in_a:
  754. url = self.url
  755. self.wordwrap.append(data, face, url)
  756. def flush_buffer(self, new_line = True):
  757. global output
  758. font = -1
  759. while self.wordwrap.have():
  760. url = None
  761. x0 = self.indent
  762. url_x0 = x0
  763. line = self.wordwrap.wrap(self.lwidth)
  764. if line == []:
  765. break
  766. if tuple == type(line[0][1]):
  767. if font < 0:
  768. new_font = DEFAULT_FONT_IDX
  769. else:
  770. new_font = font
  771. else:
  772. new_font = line[0][1]
  773. if new_line:
  774. if font != new_font:
  775. font = new_font
  776. esc_code3(font)
  777. else:
  778. esc_code2()
  779. else:
  780. if font != new_font:
  781. font = new_font
  782. esc_code4(font)
  783. new_line = True
  784. for i in line:
  785. if tuple == type(i[1]):
  786. (width, height, data) = i[1]
  787. esc_code14(width, height, data)
  788. else:
  789. if font != i[1]:
  790. font = i[1]
  791. esc_code4(font)
  792. if url != i[2]:
  793. if url != None:
  794. make_link(url, url_x0, x0, i[0])
  795. url = i[2]
  796. if url != None:
  797. url_x0 = x0
  798. output.write(i[0].encode('utf-8'))
  799. x0 += i[3]
  800. if url != None:
  801. make_link(url, url_x0, x0, line[-1][0])
  802. def link_number(url):
  803. global article_index
  804. try:
  805. n = article_index(url)[0]
  806. except KeyError:
  807. n = -1
  808. return n
  809. # Add the '~' padding back here
  810. def article_index(title):
  811. global article_db
  812. c = article_db.cursor()
  813. c.execute('select article_number, fnd_offset, restricted from articles where title = ? limit 1', ["~" + title])
  814. result = c.fetchone()
  815. c.close()
  816. return result # this returns a tuple of text strings, so beware!
  817. def write_article(language_links):
  818. global compress
  819. global verbose
  820. global output, f_out, i_out
  821. global article_count
  822. global g_this_article_title
  823. global file_number
  824. global start_time
  825. article_count += 1
  826. if verbose:
  827. PrintLog.message("[MWR {0:d}] {1:s}".format(article_count, g_this_article_title))
  828. elif article_count % 1000 == 0:
  829. now_time = time.time()
  830. PrintLog.message("Render[{0:d}]: {1:7.2f}s {2:10d}".format(file_number, now_time - start_time, article_count))
  831. start_time = now_time
  832. output.flush()
  833. # create links
  834. links_stream = io.BytesIO('')
  835. for i in g_links:
  836. (x0, y0, x1, y1, url) = g_links[i]
  837. links_stream.write(struct.pack('III', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))
  838. links_stream.flush()
  839. links = links_stream.getvalue()
  840. links_stream.close()
  841. # create language links
  842. links_stream = io.BytesIO('')
  843. for l in language_links:
  844. links_stream.write(l.encode('utf-8') + '\0')
  845. links_stream.flush()
  846. langs = links_stream.getvalue()
  847. links_stream.close()
  848. # create the header (header size = 8)
  849. header = struct.pack('I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
  850. body = output.getvalue()
  851. # output the article data
  852. file_offset = f_out.tell()
  853. whole_article = header + links + langs + body
  854. if compress:
  855. body = chr(5) + pylzma.compress(whole_article,
  856. dictionary = 24, fastBytes = 32,
  857. literalContextBits = 3,
  858. literalPosBits = 0, posBits = 2,
  859. algorithm = 1, eos = 1)
  860. f_out.write(body)
  861. write_article_index(file_offset, len(body))
  862. else:
  863. f_out.write(whole_article)
  864. f_out.flush()
  865. output.truncate(0)
  866. def write_article_index(file_offset, length):
  867. global verbose
  868. global output, f_out, i_out
  869. global g_this_article_title
  870. global file_number
  871. try:
  872. (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
  873. data_offset = (file_offset & 0x7fffffff)
  874. if bool(int(restricted)): # '0' is True so turn it into False
  875. data_offset |= 0x80000000
  876. data_length = (0x80 << 24) | (file_number << 24) | length # 0x80 => lzma encoding
  877. i_out.write(struct.pack('III', data_offset, fnd_offset, data_length))
  878. i_out.flush()
  879. except KeyError:
  880. PrintLog.message(u'Error in: write_article, Title not found')
  881. PrintLog.message(u'Title: {0:s}'.format(g_this_article_title))
  882. PrintLog.message(u'Offset: {0:s}'.format(file_offset))
  883. PrintLog.message(u'Count: {0:s}'.format(article_count))
  884. # run the program
  885. if __name__ == "__main__":
  886. main()