jadedctrl
/
wrdk
mirror of https://github.com/nzmichaelh/wrdk


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137
							#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Article Rendering
# AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
#          Christopher Hall <hsw@openmoko.com>

import sys, os, struct, os.path, re
import io
import time
import HTMLParser
import pylzma
import unicodedata
import htmlentitydefs
import codecs
import getopt
import os.path
import sqlite3
import WordWrap
import PrintLog
import gd

verbose = False
warnings = False
article_count = 0

# NASTY HACK: allow this </div class="something">
HTMLParser.endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*[^>]*>')


fh       = '4b' # struct font_bmf_header (header)
cmr      = '8b48s'  # struct charmetric_bmf (font)

fh_size  = struct.calcsize(fh)
cmr_size = struct.calcsize(cmr)

# font face defines
ITALIC_FONT_IDX         = 1
DEFAULT_FONT_IDX        = 2
TITLE_FONT_IDX          = 3
SUBTITLE_FONT_IDX       = 4
DEFAULT_ALL_FONT_IDX    = 5

# Screen dimensions
LCD_WIDTH               = 240
LCD_LEFT_MARGIN         = 6     # def. in lcd_buf_draw.h
LCD_IMG_MARGIN          = 8

# Line Spaces (read directly from the font using gdbfed)
LINE_SPACE_ADDON        = 1 # added in lcd_buf_draw.h
H1_LSPACE               = 19
H2_LSPACE               = 17
H3_LSPACE               = H2_LSPACE
H4_LSPACE               = H2_LSPACE
H5_LSPACE               = H2_LSPACE
H6_LSPACE               = H2_LSPACE
P_LSPACE                = 15 + LINE_SPACE_ADDON

# Margins & Spacing
LIST_INDENT             = 16
DIV_MARGIN_TOP          = 10
P_MARGIN_TOP            = DIV_MARGIN_TOP
BLOCKQUOTE_MARGIN_TOP   = DIV_MARGIN_TOP
BLOCKQUOTE_MARGIN_LEFT  = LIST_INDENT
BLOCKQUOTE_MARGIN_RIGHT = LIST_INDENT
LIST_MARGIN_TOP         = DIV_MARGIN_TOP
BR_MARGIN_TOP           = DIV_MARGIN_TOP
DEFAULT_LWIDTH          = (LCD_WIDTH-LCD_LEFT_MARGIN)
H1_MARGIN_TOP           = 8
H1_MARGIN_BOTTOM        = P_MARGIN_TOP
H2_MARGIN_TOP           = 14
H3_MARGIN_TOP           = H2_MARGIN_TOP
H4_MARGIN_TOP           = H2_MARGIN_TOP
H5_MARGIN_TOP           = H2_MARGIN_TOP
H6_MARGIN_TOP           = H2_MARGIN_TOP


LIMAX_INDENT_LEVELS     = 3
MAX_QUOTE_LEVEL         = 1

# bullet[0] charater is not used (the '!')
bullet_c                = u"!\u25aa\u2022\u25ab"
LIMAX_BULLETS           = len(bullet_c) - 1

font_id_values = {}


g_starty = 0
g_curr_face = DEFAULT_FONT_IDX
g_halign = 0
g_this_article_title = 'NO TITLE'
g_links = {}
g_link_cnt = 0
i_out = None
f_out = None
file_number = 0

article_db = None

output = None
compress = True


def usage(message):
    if None != message:
        print('error: {0:s}'.format(message))
    print('usage: {0:s} <options> html-files...'.format(os.path.basename(__file__)))
    print('       --help                  This message')
    print('       --verbose               Enable verbose output')
    print('       --warnings              Enable warnings output')
    print('       --number=n              Number for the .dat/.idx-tmp files [0]')
    print('       --test=file             Output the uncompressed file for testing')
    print('       --font-path=dir         Path to font files (*.bmf) [fonts]')
    print('       --article-index=file    Article index dictionary input [articles.db]')
    print('       --prefix=name           Device file name portion for .dat/.idx-tmp [pedia]')
    exit(1)


def main():
    global verbose, warnings, compress
    global f_out, output, i_out
    global font_id_values
    global file_number
    global article_count
    global article_db
    global start_time

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hvwn:p:i:t:f:',
                                   ['help',
                                    'verbose',
                                    'warnings',
                                    'number=',
                                    'prefix=',
                                    'article-index=',
                                    'test=',
                                    'font-path='])
    except getopt.GetoptError, err:
        usage(err)

    verbose = False
    warnings = False
    data_file = 'pedia{0:d}.dat'
    index_file = 'pedia{0:d}.idx-tmp'
    art_file = 'articles.db'
    file_number = 0
    test_file = ''
    font_path = "../fonts"
    article_db = None

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-w', '--warnings'):
            warnings = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-t', '--test'):
            test_file = arg
        elif opt in ('-i', '--article-index'):
            art_file = arg
        elif opt in ('-n', '--number'):
            try:
                file_number = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
        elif opt in ('-p', '--prefix'):
            data_file = arg + '{0:d}.dat'
            index_file = arg + '{0:d}.idx-tmp'
        elif opt in ('-f', '--font-path'):
            font_path = arg
        else:
            usage('unhandled option: ' + opt)

    start_time = time.time()

    f_fontr  = open(os.path.join(font_path, "text.bmf"), "r")
    f_fonti  = open(os.path.join(font_path, "texti.bmf"), "r")
    f_fontt  = open(os.path.join(font_path, "title.bmf"), "r")
    f_fontst = open(os.path.join(font_path, "subtitle.bmf"), "r")
    f_fontall = open(os.path.join(font_path, "textall.bmf"), "r")

    font_id_values = {
        ITALIC_FONT_IDX: f_fonti,
        DEFAULT_FONT_IDX: f_fontr,
        TITLE_FONT_IDX: f_fontt,
        SUBTITLE_FONT_IDX: f_fontst,
        DEFAULT_ALL_FONT_IDX: f_fontall
    }

    article_db = sqlite3.connect(art_file)

    article_db.execute('pragma auto_vacuum = none')
    article_db.execute('pragma synchronous = off')
    article_db.execute('pragma temp_store = memory')
    article_db.execute('pragma locking_mode = normal')
    article_db.execute('pragma read_uncommitted = true')
    article_db.execute('pragma cache_size = 20000000')
    article_db.execute('pragma default_cache_size = 20000000')
    article_db.execute('pragma journal_mode = off')

    output = io.BytesIO('')

    if test_file == '':
        compress = True
        i_out = open(index_file.format(file_number), 'w')
        f_out = open(data_file.format(file_number), 'w')
    else:
        compress = False
        f_out = open(test_file, 'w')

    for name in args:
        f = codecs.open(name, 'r', 'utf-8', 'replace')
        WrProcess(f)
        f.close()

    for item in font_id_values:
        font_id_values[item].close()

    if output != None:
        output.close()

    if f_out != None:
        f_out.close()
    if i_out != None:
        i_out.close()

    if article_db != None:
        article_db.close()

    for i in font_id_values:
        font_id_values[i].close()

    # final message
    PrintLog.message("Render[{0:d}]: Total: {1:d}".format(file_number, article_count))


#
# Get the width of a character in a given font face
#
width_cache = {}

def get_utf8_cwidth(c, face):
    global width_cache, font_id_values
    global cmr, fh, cmr_size, fh_size

    if type(c) != unicode:
        c = unicode(c, 'utf-8')

    if (c, face) in width_cache:
        return width_cache[(c, face)]

    f = font_id_values[face]

    f.seek(ord(c) * cmr_size + fh_size)
    buffer = f.read(cmr_size)

    if len(buffer) != 0:
        width, height, widthBytes, widthBits, ascent, descent, LSBearing, RSBearing, bitmap = struct.unpack(cmr, buffer)
    else:
        width, height, widthBytes, widthBits, ascent, descent, LSBearing, RSBearing, bitmap = (0,0,0,0,0,0,0,0,
                                                                                               r'\x55' * 48)

    if 0 == width and face != DEFAULT_ALL_FONT_IDX:
        return get_utf8_cwidth(c, DEFAULT_ALL_FONT_IDX)

    width += LSBearing + LINE_SPACE_ADDON
    width_cache[(c, face)] = width

    return width


def get_lineheight(face):

    values = {
            ITALIC_FONT_IDX:      P_LSPACE,
            DEFAULT_FONT_IDX:     P_LSPACE,
            TITLE_FONT_IDX:       H1_LSPACE,
            SUBTITLE_FONT_IDX:    H2_LSPACE,
            DEFAULT_ALL_FONT_IDX: P_LSPACE
        }

    return values[face]


def make_link(url, x0, x1, text):
    global g_starty, g_curr_face, g_link_cnt, g_links

    if article_index(url):
        esc_code10(x1 - x0)
        g_links[g_link_cnt] = (x0, g_starty - get_lineheight(g_curr_face), x1, g_starty, url)
        g_link_cnt =  g_link_cnt + 1


def get_imgdata(imgfile, indent):
    try:
        img = gd.image(imgfile)
    except IOError, e:
        PrintLog.message(u'unable to open image file: {0:s}'.format(imgfile))
        return (0, 0, r'')

    (width, height) = img.size()
    if width <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((x, y)))
        h_range = range(0, width)
        v_range = range(0, height)
    elif height <= (LCD_WIDTH - LCD_IMG_MARGIN - indent):
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((y, x)))
        v_range = range(0, width)
        h_range = range(height - 1, -1, -1)
        (width, height) = (height, width)
    else:
        PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
        return (0, 0, r'')

    data = ''
    for v in v_range:
        byte = 0
        bit_count = 8

        for h in h_range:
            if is_black(h, v):
                pixel = 1
            else:
                pixel = 0
            bit_count -= 1
            byte |= pixel << bit_count
            if 0 == bit_count:
                data += struct.pack('B', byte)
                byte = 0
                bit_count = 8
        if 8 != bit_count:
            data += struct.pack('B', byte)

    return (width, height, data)

def esc_code0(num_pixels):
    """blank line height in pixels"""
    global g_starty
    global output

    output.write(struct.pack('BB', 1, num_pixels))
    g_starty += num_pixels


def esc_code1():
    """new line with default font and default line space"""
    global g_starty, g_curr_face
    global output

    output.write(struct.pack('B', 2))
    g_starty += get_lineheight(DEFAULT_FONT_IDX)
    g_curr_face = DEFAULT_FONT_IDX


def esc_code2():
    """new line with current font and current line space"""
    global g_starty, g_curr_face
    global output

    output.write(struct.pack('B', 3))
    g_starty += get_lineheight(g_curr_face)


def esc_code3(face):
    """new line using new font face."""
    global g_starty, g_curr_face
    global output

    num_pixels = get_lineheight(face)
    output.write(struct.pack('BB', 4, face|(num_pixels<<3)))
    g_starty += num_pixels
    g_curr_face = face

def esc_code4(face, halign=0):
    """change font with current horizontal alignment (in pixels)"""
    global g_curr_face
    global output

    output.write(struct.pack('BB', 5, face|(halign<<3)))
    g_curr_face = face


def esc_code5():
    """set font as default"""
    global g_curr_face
    global output

    output.write(struct.pack('B', 6))
    g_curr_face = DEFAULT_FONT_IDX


def esc_code6():
    """set default alignment"""
    global output

    output.write(struct.pack('B', 7))


def esc_code7(num_pixels):
    """move right num_pixels"""
    global output

    output.write(struct.pack('BB', 8, num_pixels))


def esc_code8(num_pixels):
    """move left num_pixels"""
    global output

    output.write(struct.pack('BB', 9, num_pixels))


def esc_code9(num_pixels):
    """alignment adjustment"""
    global g_halign
    global output

    output.write(struct.pack('Bb', 10, num_pixels))
    g_halign = num_pixels


def esc_code10(num_pixels):
    """draw line from right to left"""
    global output

    output.write(struct.pack('BB', 11, num_pixels))


def esc_code14(width, height, data):
    """output bitmap"""
    global g_starty, g_curr_face
    global output

    if 0 == width or 0 == height:
        return

    output.write(struct.pack('<BBH', 15, width, height))
    output.write(data)

    lineh = get_lineheight(g_curr_face)

    if (height) > lineh:
        g_starty += (height)-lineh + 3   # since Eric draws images 3px lower for alignment


#
# Parse the HTML into the WikiReader's format
#
class WrProcess(HTMLParser.HTMLParser):

    READ_BLOCK_SIZE = 64 * (1024 * 1024)

    def __init__ (self, f):
        global g_this_article_title, article_count

        HTMLParser.HTMLParser.__init__(self)
        self.wordwrap = WordWrap.WordWrap(get_utf8_cwidth)
        self.local_init()
        self.tag_stack = []
        block = f.read(self.READ_BLOCK_SIZE)
        while block:
            self.feed(block)
            block = f.read(self.READ_BLOCK_SIZE)


    def local_init(self):

        global g_starty, g_curr_face, g_halign
        global g_this_article_title, g_links, g_link_cnt

        self.in_html = False
        self.in_title = False
        self.in_body = False

        self.in_h1 = False
        self.in_h2 = False
        self.in_h3 = False
        self.in_h4 = False
        self.in_h5 = False
        self.in_h6 = False

        self.in_table = 0
        self.in_p  = False
        self.in_b  = False
        self.in_big = False
        self.in_strong = False
        self.in_del = False
        self.in_ins = False
        self.in_i  = False
        self.in_a  = False
        self.in_br = False
        self.in_img = False

        self.quote = 0
        self.level = 0
        self.lwidth = DEFAULT_LWIDTH
        self.indent = 0
        self.li_cnt = {}
        self.li_inside = {}
        self.li_type = {}
        self.link_x = 0
        self.link_y = 0
        self.url = None

        self.language_links = []

        self.printing = True

        g_starty = 0
        g_curr_face = DEFAULT_FONT_IDX
        g_halign = 0
        g_this_article_title = 'NO TITLE'
        g_links = {}
        g_link_cnt = 0


    def handle_starttag(self, tag, attrs):

        global g_starty, g_curr_face, g_halign
        global g_this_article_title, g_links, g_link_cnt
        global warnings

        attrs = dict(attrs)

        # must always do the <html> tag
        if tag == 'html':
            self.local_init()
            self.in_html = True
            self.tag_stack = [(tag, True)]
            return

        self.tag_stack.append((tag, self.printing))

        # we want to skip content that isn't for printing
        if 'class' in attrs:
            if 'noprint' in attrs['class']:
                self.printing = False

            # create a list of language links
            if tag == 'a' and 'lang-link' in attrs['class']:
                self.language_links.append(attrs['href'])

        # handle the tags
        if not self.printing:
            return;

        elif tag == 'script':
            self.printing = False

        elif tag == 'title':
            self.in_title = True
            g_this_article_title = ''

        elif tag == 'body':
            self.in_body = True

        elif tag == 'table':
            self.in_table += 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = True
            esc_code0(H1_MARGIN_TOP)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = True
            esc_code0(H2_MARGIN_TOP)

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = True
            esc_code0(H3_MARGIN_TOP)

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = True
            esc_code0(H4_MARGIN_TOP)

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = True
            esc_code0(H5_MARGIN_TOP)

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = True
            esc_code0(H6_MARGIN_TOP)

        elif tag == 'div':
            self.flush_buffer()
            # suppress thumb info boxes
            if 'class' in attrs:
                c = attrs['class']
                if 'thumb' in c or 'left' in c or 'right' in c \
                    or 'dablink' in c or 'magnify' in c:
                    self.printing = False
                    return
            esc_code0(DIV_MARGIN_TOP)

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = True
            esc_code0(P_MARGIN_TOP)

        elif tag == 'blockquote':
            self.flush_buffer()
            self.quote += 1
            if self.quote < MAX_QUOTE_LEVEL:
                esc_code0(BLOCKQUOTE_MARGIN_TOP)
                self.indent += BLOCKQUOTE_MARGIN_LEFT
                self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                esc_code9(BLOCKQUOTE_MARGIN_LEFT)

        elif tag == 'b':
            self.in_b = True

        elif tag == 'i':
            self.in_i = True

        elif tag == 'big':            # Not sure what to do with this one
            self.in_b = True

        elif tag == 'strong':
            self.in_b = True

        elif tag == 'del':
            self.in_del = True

        elif tag == 'ins':
            self.in_ins = True

        elif tag == 'a' and 'href' in attrs:
            self.in_a = True
            self.url  = attrs['href']

        elif tag in ['ul', 'ol', 'dl']:
            if 'start' in attrs:
                list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'])
                try:
                    list_start = int(list_start)
                except ValueError:
                    list_start = 1

                self.enter_list(tag, list_start)
            else:
                self.enter_list(tag)

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
                # force ul since this is a li without a parent
                #(t, p) = self.tag_stack.pop()
                #self.tag_stack.append(('ul', p))
                #self.tag_stack.append((t,p))
                #self.enter_list('ul')

            # handle missing </li> at the same level
            # simulate </li> and continue
            if self.li_inside[self.level]:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                (t, p) = self.tag_stack.pop()
                self.flush_buffer(False)
                self.list_decrease_indent()

            self.li_inside[self.level] = True

            if 'value' in attrs:
                list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'])
                try:
                    self.li_cnt[self.level] = int(list_index)
                except ValueError:
                    pass
            else:
                self.li_cnt[self.level] += 1

            if self.li_type[self.level] == 'ol':
                self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".", DEFAULT_FONT_IDX, None)
            else:
                if self.level > LIMAX_BULLETS:
                    bullet_num = LIMAX_BULLETS
                else:
                    bullet_num = self.level

                self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)

            self.flush_buffer()
            self.list_increase_indent()

        elif tag == 'dd':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                (t, p) = self.tag_stack.pop()
                return  # just ignore it
            self.li_cnt[self.level] += 1
            self.list_increase_indent()

        elif tag == 'br':
            self.in_br = True

        elif tag == 'img' and 'src' in attrs:
            (width, height, data) = get_imgdata(attrs['src'], self.indent)
            self.wordwrap.AppendImage(width, height, data, None)
            self.in_img = True


    def handle_endtag(self, tag):
        global g_this_article_title
        global article_count
        global warnings

        # ignore end tag without start tag
        if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column, article_count + 1, g_this_article_title))
            return

        # backtrack up the stack closing each open tag until there is a match
        (start_tag, self.printing) = self.tag_stack.pop()
        while start_tag != tag:
            self.tag_stack.append((start_tag, self.printing))
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(start_tag, line, column, article_count + 1, g_this_article_title))
            self.handle_endtag(start_tag)
            (start_tag, self.printing) = self.tag_stack.pop()

        # must always do </html> tag
        if tag == 'html':
            self.printing = True
            self.tag_stack = []
            self.in_html = False
            esc_code1()
            write_article(self.language_links)
            return

        if not self.printing:
            return

        elif tag == 'script':
            pass

        elif tag == 'title':
            self.in_title = False
            g_this_article_title = g_this_article_title.strip()

        elif tag == 'body':
            self.in_body = False
            self.flush_buffer()

        elif tag == 'table':
            if self.in_table > 0:
                self.in_table -= 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = False
            esc_code0(H1_MARGIN_BOTTOM)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = False

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = False

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = False

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = False

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = False

        elif tag == 'div':
            self.flush_buffer()

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = False

        elif tag == 'blockquote':
            self.flush_buffer()
            if self.quote > 0:
                if self.quote < MAX_QUOTE_LEVEL:
                    self.indent -= BLOCKQUOTE_MARGIN_LEFT
                    self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                    esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
                self.quote -= 1

        elif tag == 'b':
            self.in_b = False

        elif tag == 'big':
            self.in_b = False

        elif tag == 'strong':
            self.in_b = False

        elif tag == 'i':
            self.in_i = False

        elif tag == 'del':
            self.in_del = False

        elif tag == 'ins':
            self.in_ins = False

        elif tag == 'a':
            self.in_a = False
            self.url  = ""

        elif tag in ['ul', 'ol', 'dl']:
            self.leave_list()

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
            else:
                self.flush_buffer(False)
                self.list_decrease_indent()
                self.li_inside[self.level] = False

        elif tag == 'dd':
            self.flush_buffer()
            self.list_decrease_indent()

        elif tag == 'dt':
            self.flush_buffer()

        elif tag == 'br':
            self.flush_buffer()
            self.in_br = False

        elif tag == 'img':
            self.in_img = False


    def enter_list(self, list_type, start = 1):
        self.flush_buffer(False)
        esc_code0(LIST_MARGIN_TOP)
        self.level += 1
        self.li_cnt[self.level] = start - 1
        self.li_inside[self.level] = False
        self.li_type[self.level] = list_type


    def list_increase_indent(self):
        if self.level <= LIMAX_INDENT_LEVELS:
            esc_code9(LIST_INDENT)
            esc_code8(LIST_INDENT)  ### Bug in lcd_buf_draw ASK ERIC
            self.lwidth -= LIST_INDENT
            self.indent += LIST_INDENT


    def leave_list(self):
        self.flush_buffer()
        if self.level > 0:
            esc_code0(LIST_MARGIN_TOP)
            del self.li_cnt[self.level]
            del self.li_inside[self.level]
            self.level -= 1

    def list_decrease_indent(self):
        if self.level <= LIMAX_INDENT_LEVELS:
            esc_code9(- LIST_INDENT)
            self.lwidth += LIST_INDENT
            self.indent -= LIST_INDENT


    def handle_charref(self, name):
        """handle &#DDDD; &#xXXXX;"""

        if 0 == len(name):
            return

        if 'x' == name[0] or 'X' == name[0]:
            try:
                value = int(name[1:], 16)
            except ValueError:
                PrintLog.message(u'charref: "{0:s}" is not hexadecimal'.format(name))
                return

        elif name.isdigit():
            try:
                value = int(name)
            except ValueError:
                PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
                return

        try:
            c = unichr(value)
        except ValueError:
            PrintLog.message(u'charref: "{0:d}" is not convertible to unicode'.format(value))
            c = '?'
        self.handle_data(c)


    def handle_entityref(self, name):
        """handle &amp; &gt; ..."""
        try:
            self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))
        except KeyError:
            PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(name, g_this_article_title))


    def handle_data(self, data):
        global g_this_article_title

        if self.in_title:
            g_this_article_title += data

        # only parse valid tags in <body>
        # skip tables for now
        if not self.in_body or self.in_table > 0 or not self.printing:
            return

        # defaults
        data = re.sub("\s+" , " ", data)
        face = DEFAULT_FONT_IDX
        url  = None

        # only use italic fonts now (don't care about bold)
        if self.in_i:
            face = ITALIC_FONT_IDX

        if self.in_h1:
            face = TITLE_FONT_IDX
        elif self.in_h2 or self.in_h3 or self.in_h4 or self.in_h5 or self.in_h6:
            face = SUBTITLE_FONT_IDX

        # figure out if we need a url
        if self.in_a:
            url = self.url

        self.wordwrap.append(data, face, url)


    def flush_buffer(self, new_line = True):
        global output
        font = -1

        while self.wordwrap.have():
            url = None
            x0 = self.indent
            url_x0 = x0
            line = self.wordwrap.wrap(self.lwidth)
            if line == []:
                break

            if tuple == type(line[0][1]):
                if font < 0:
                    new_font = DEFAULT_FONT_IDX
                else:
                    new_font = font
            else:
                new_font = line[0][1]

            if new_line:
                if font != new_font:
                    font = new_font
                    esc_code3(font)
                else:
                    esc_code2()
            else:
                if font != new_font:
                    font = new_font
                    esc_code4(font)
                new_line = True


            for i in line:
                if tuple == type(i[1]):
                    (width, height, data) = i[1]
                    esc_code14(width, height, data)
                else:
                    if font != i[1]:
                        font = i[1]
                        esc_code4(font)

                    if url != i[2]:
                        if url != None:
                            make_link(url, url_x0, x0, i[0])
                        url = i[2]
                        if url != None:
                            url_x0 = x0
                    output.write(i[0].encode('utf-8'))
                x0 += i[3]

            if url != None:
                make_link(url, url_x0, x0, line[-1][0])


def link_number(url):
    global article_index

    try:
        n = article_index(url)[0]
    except KeyError:
        n = -1
    return n


# Add the '~' padding back here
def article_index(title):
    global article_db

    c = article_db.cursor()
    c.execute('select article_number, fnd_offset, restricted from articles where title = ? limit 1', ["~" + title])
    result = c.fetchone()
    c.close()
    return result  # this returns a tuple of text strings, so beware!


def write_article(language_links):
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time

    article_count += 1
    if verbose:
        PrintLog.message("[MWR {0:d}] {1:s}".format(article_count, g_this_article_title))

    elif article_count % 1000 == 0:
        now_time = time.time()
        PrintLog.message("Render[{0:d}]: {1:7.2f}s {2:10d}".format(file_number, now_time - start_time, article_count))
        start_time = now_time

    output.flush()

    # create links
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        links_stream.write(struct.pack('III', (y0 << 8) | x0, (y1 << 8) | x1, link_number(url)))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')

    for l in language_links:
        links_stream.write(l.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.getvalue()

    # output the article data
    file_offset = f_out.tell()
    whole_article = header + links + langs + body
    if compress:
        body = chr(5) + pylzma.compress(whole_article,
                                        dictionary = 24, fastBytes = 32,
                                        literalContextBits = 3,
                                        literalPosBits = 0, posBits = 2,
                                        algorithm = 1, eos = 1)
        f_out.write(body)
        write_article_index(file_offset, len(body))
    else:
        f_out.write(whole_article)

    f_out.flush()
    output.truncate(0)


def write_article_index(file_offset, length):
    global verbose
    global output, f_out, i_out
    global g_this_article_title
    global file_number

    try:
        (article_number, fnd_offset, restricted) = article_index(g_this_article_title)
        data_offset = (file_offset & 0x7fffffff)

        if bool(int(restricted)):  # '0' is True so turn it into False
            data_offset |= 0x80000000
        data_length =  (0x80 << 24) | (file_number << 24) | length  # 0x80 => lzma encoding
        i_out.write(struct.pack('III', data_offset, fnd_offset, data_length))
        i_out.flush()
    except KeyError:
        PrintLog.message(u'Error in: write_article, Title not found')
        PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
        PrintLog.message(u'Offset: {0:s}'.format(file_offset))
        PrintLog.message(u'Count:  {0:s}'.format(article_count))


# run the program
if __name__ == "__main__":
    main()