jadedctrl
/
wrdk
tükrözi: https://github.com/nzmichaelh/wrdk


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275
127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347
							#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Article Rendering
# AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
#          Christopher Hall <hsw@openmoko.com>

import sys, os, struct, os.path, re
import io
import time
import HTMLParser
import unicodedata
import htmlentitydefs
import codecs
import getopt
import os.path
import sqlite3
import WordWrap
import bucket
import PrintLog
import LanguageTranslation

try:
    import gd
except ImportError:
    # python-gd is a hard requirement (get_imgdata uses it to read in-line
    # images), so explain how to install it and stop.  Only ImportError is
    # handled: any other failure is a real fault and should show its traceback.
    print('error: Missing python module: python-gd')
    print('       sudo apt-get install python-gd')
    sys.exit(1)

# try to find a lzma library interface
#
# CompressData(data) is bound to the first back-end that can be imported;
# no_compression remains True only when none of them is available, in
# which case the script explains what to install and exits.

no_compression = True

# python-lzma
if no_compression:
    try:
        import lzma

        def CompressData(data):
            """Return data as an LZMA 'alone' stream without its 8 byte length field."""
            c = lzma.compress(data, options={'format': 'alone'})
            # header: options(1) dictionary-size(4) uncompressed-length(8) = 13 bytes
            return c[:5] + c[13:] # drop the uncompressed length (always 0xffff_ffff_ffff_ffff)

        no_compression = False

    except ImportError:
        # module not installed: fall through and try the next candidate
        pass


# PyLZMA
if no_compression:
    try:
        import pylzma

        def CompressData(data):
            """Return data compressed by PyLZMA using the fixed encoder settings below."""
            return pylzma.compress(data,
                                   dictionary = 24, fastBytes = 32,
                                   literalContextBits = 3,
                                   literalPosBits = 0, posBits = 2,
                                   algorithm = 1, eos = 1)

        no_compression = False

    except ImportError:
        # module not installed: reported by the "none detected" section below
        pass


# none detected
if no_compression:
    print('error: Missing python LZMA compression module')
    print('alternative 1: (preferred)')
    print('       sudo apt-get install python-lzma')
    print('alternative 2:')
    print('       sudo apt-get install python-pylzma')
    print('alternative 3: compile/install local PyLZMA')
    print('       make local-pylzma-install')
    sys.exit(1)


# run-time flags; main() sets them from --verbose / --warnings
verbose = False
warnings = False
# reported as "Total" in the final log message printed by main()
article_count = 0

# NASTY HACK: allow this </div class="something">
# (replaces the module-level end tag pattern of HTMLParser itself)
HTMLParser.endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*[^>]*>')


# from: wiki-app/bmf.h
# get_utf8_cwidth() unpacks the header as
#   (linespace, ascent, descent, bmp_buffer_len, default_char)
# and each character record as
#   (width, height, widthBytes, widthBits, ascent, descent, LSBearing, widthDevice, bitmap)
FONT_BMF_HEADER = '<4bI'    # struct font_bmf_header (header)
CHARMETRIC_BMF  = '<8b48s'  # struct charmetric_bmf (font)

FONT_BMF_HEADER_SIZE = struct.calcsize(FONT_BMF_HEADER)
CHARMETRIC_BMF_SIZE  = struct.calcsize(CHARMETRIC_BMF)

# font face defines - match the #defines of the same name in: wiki-app/lcd_buf_draw.h
ITALIC_FONT_IDX         = 1
DEFAULT_FONT_IDX        = 2
TITLE_FONT_IDX          = 3
SUBTITLE_FONT_IDX       = 4
DEFAULT_ALL_FONT_IDX    = 5
TITLE_ALL_FONT_IDX      = 6
SUBTITLE_ALL_FONT_IDX   = 7

# human readable name for each font face index
FONT_FACE_NAME = {
    ITALIC_FONT_IDX: 'Italic',
    DEFAULT_FONT_IDX: 'Default',
    TITLE_FONT_IDX: 'Title',
    SUBTITLE_FONT_IDX: 'Subtitle',
    DEFAULT_ALL_FONT_IDX: 'Default All',
    TITLE_ALL_FONT_IDX: 'Title All',
    SUBTITLE_ALL_FONT_IDX: 'Subtitle All',
}

# Screen dimensions
LCD_WIDTH               = 240
LCD_LEFT_MARGIN         = 6     # def. in lcd_buf_draw.h
LCD_IMG_MARGIN          = 8

# Line Spaces (read directly from the font using gdbfed)
H1_LSPACE               = 19
H2_LSPACE               = 17
H3_LSPACE               = H2_LSPACE
H4_LSPACE               = H2_LSPACE
H5_LSPACE               = H2_LSPACE
H6_LSPACE               = H2_LSPACE
P_LSPACE                = 15

# Margins & Spacing
LIST_INDENT             = 16
DIV_MARGIN_TOP          = 10
P_MARGIN_TOP            = DIV_MARGIN_TOP
BLOCKQUOTE_MARGIN_TOP   = DIV_MARGIN_TOP
BLOCKQUOTE_MARGIN_LEFT  = LIST_INDENT
BLOCKQUOTE_MARGIN_RIGHT = LIST_INDENT
LIST_MARGIN_TOP         = DIV_MARGIN_TOP
BR_MARGIN_TOP           = DIV_MARGIN_TOP
DEFAULT_LWIDTH          = (LCD_WIDTH-LCD_LEFT_MARGIN)
H1_MARGIN_TOP           = 8
H1_MARGIN_BOTTOM        = P_MARGIN_TOP
H2_MARGIN_TOP           = 14
H3_MARGIN_TOP           = H2_MARGIN_TOP
H4_MARGIN_TOP           = H2_MARGIN_TOP
H5_MARGIN_TOP           = H2_MARGIN_TOP
H6_MARGIN_TOP           = H2_MARGIN_TOP


# nesting limits for lists and block quotes
LIMAX_INDENT_LEVELS     = 3
MAX_QUOTE_LEVEL         = 1

# bullet[0] character is not used (the '!')
# the remaining characters are indexed by list nesting level (1, 2, 3)
bullet_c                = u"!\u25aa\u2022\u25ab"
LIMAX_BULLETS           = len(bullet_c) - 1

# font face index -> open font file; filled in by main()
font_id_values = {}


# rendering state of the article being processed; reset by WrProcess.local_init()
g_starty = 0
g_curr_face = DEFAULT_FONT_IDX
g_halign = 0
g_this_article_title = 'NO TITLE'
g_links = {}
g_link_cnt = 0
# index and data output files; opened by main()
i_out = None
f_out = None
file_number = 0

# sqlite3 connection to the article index; opened by main()
article_db = None

# buffer receiving the escape codes written by the esc_code*() functions
output = None
# main() clears this when --test is given
compress = True

# ArticleWriter instance; created by main() unless --test is given
article_writer = None


def usage(message):
    """Print an optional error line and the command-line help, then exit.

    message -- description of the problem (a string or an exception
               object), or None to print the help text only.

    The process always terminates with exit status 1; this function
    never returns.
    """
    if message is not None:
        # '!s' converts first, so exception objects (as passed by main for
        # getopt errors) are formatted just like plain strings
        print('error: {0!s}'.format(message))
    print('usage: {0:s} <options> html-files...'.format(os.path.basename(__file__)))
    print('       --help                  This message')
    print('       --verbose               Enable verbose output')
    print('       --warnings              Enable warnings output')
    print('       --number=n              Number for the .dat/.idx-tmp files [0]')
    print('       --test=file             Output the uncompressed file for testing')
    # the default shown here is the value main() actually starts with
    print('       --font-path=dir         Path to font files (*.bmf) [../fonts]')
    print('       --article-index=file    Article index dictionary input [articles.db]')
    print('       --prefix=name           Device file name portion for .dat/.idx-tmp [pedia]')
    # option name as accepted by getopt in main(): "language-links"
    print('       --language-links=<YN>   Turn on/off inter-wiki links [YES]')
    print('       --images=<YN>           Turn on/off in-line math images [YES]')
    print('       --articles=<N>          Articles per block [32]')
    print('       --block-size=<bytes>    Max size for artical block [262144]')
    sys.exit(1)


def main():
    """Command-line entry point.

    Parses the options, opens the font files, the article index database
    and the output files, feeds every HTML file named on the command line
    through WrProcess, then closes everything and logs the article total.

    Invalid options are reported through usage(), which exits the process.
    """
    global verbose, warnings, compress
    global f_out, output, i_out
    global font_id_values
    global file_number
    global article_count
    global article_db
    global start_time
    global article_writer

    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   'hvwn:p:i:t:f:l:a:b:',
                                   ['help',
                                    'verbose',
                                    'warnings',
                                    'number=',
                                    'prefix=',
                                    'article-index=',
                                    'test=',
                                    'font-path=',
                                    'language-links=',
                                    'images=',
                                    'articles=',
                                    'block-size=',
                                    ])
    except getopt.GetoptError as err:
        usage(str(err))

    verbose = False
    warnings = False
    data_file = 'pedia{0:d}.dat'
    index_file = 'pedia{0:d}.idx-tmp'
    art_file = 'articles.db'
    file_number = 0
    test_file = ''
    font_path = "../fonts"
    article_db = None
    inter_links = True
    enable_images = True
    articles_per_block = 32
    block_size = 262144

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-w', '--warnings'):
            warnings = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-t', '--test'):
            test_file = arg
        elif opt in ('-i', '--article-index'):
            art_file = arg
        elif opt in ('-n', '--number'):
            try:
                file_number = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
        elif opt in ('-p', '--prefix'):
            data_file = arg + '{0:d}.dat'
            index_file = arg + '{0:d}.idx-tmp'
        elif opt in ('-f', '--font-path'):
            font_path = arg
        elif opt in ('-l', '--language-links'):
            inter_links = ('yes' == arg.lower())
        elif opt == '--images':
            # long form only: '-l' belongs to --language-links above
            enable_images = ('yes' == arg.lower())
        elif opt in ('-a', '--articles'):
            try:
                articles_per_block = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if articles_per_block < 1 or articles_per_block > 64:
                usage('"{0:s}={1:s}" is out of range [1..64]'.format(opt, arg))
        elif opt in ('-b', '--block-size'):
            try:
                block_size = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
            if block_size < 65536 or block_size > 524288:
                usage('"{0:s}={1:s}" is out of range [65536..524288]'.format(opt, arg))
        else:
            usage('unhandled option: ' + opt)

    start_time = time.time()

    # one font file per face; they stay open for the whole run because
    # get_utf8_cwidth() seeks in them on demand
    font_files = {
        ITALIC_FONT_IDX: 'texti.bmf',
        DEFAULT_FONT_IDX: 'text.bmf',
        TITLE_FONT_IDX: 'title.bmf',
        TITLE_ALL_FONT_IDX: 'titleall.bmf',
        SUBTITLE_FONT_IDX: 'subtitle.bmf',
        SUBTITLE_ALL_FONT_IDX: 'subtlall.bmf',
        DEFAULT_ALL_FONT_IDX: 'textall.bmf',
    }
    font_id_values = {}
    for face, font_name in font_files.items():
        font_id_values[face] = open(os.path.join(font_path, font_name), 'rb')

    article_db = sqlite3.connect(art_file)

    for pragma in ('pragma auto_vacuum = none',
                   'pragma synchronous = off',
                   'pragma temp_store = memory',
                   'pragma locking_mode = normal',
                   'pragma read_uncommitted = true',
                   'pragma cache_size = 20000000',
                   'pragma default_cache_size = 20000000',
                   'pragma journal_mode = off'):
        article_db.execute(pragma)

    output = io.BytesIO('')

    if test_file == '':
        compress = True
        i_out = open(index_file.format(file_number), 'wb')
        f_out = open(data_file.format(file_number), 'wb')
        article_writer = ArticleWriter(file_number, f_out, i_out,
                                       max_buckets = 50,
                                       bucket_size = block_size,
                                       max_items_per_bucket = articles_per_block)
    else:
        compress = False
        f_out = open(test_file, 'wb')

    for name in args:
        f = codecs.open(name, 'r', 'utf-8', 'replace')
        WrProcess(f, inter_links, enable_images)
        f.close()

    for face in font_id_values:
        font_id_values[face].close()

    if output is not None:
        output.close()

    # NOTE(review): the writer is dropped before its files are closed,
    # presumably so that its destructor can still write to them - confirm
    # against ArticleWriter before reordering
    if article_writer is not None:
        del article_writer

    if f_out is not None:
        f_out.close()
    if i_out is not None:
        i_out.close()

    if article_db is not None:
        article_db.close()

    # final message
    PrintLog.message("Render[{0:d}]: Total: {1:d}".format(file_number, article_count))


#
# cached font information
#
font_width_cache = {}    # (character, face) -> width
font_default_cache = {}  # face -> default character read from the font header

def get_utf8_cwidth(c, face):
    """Return the device width of one character in the given font face.

    c    -- a single character; a byte string is decoded as UTF-8 first
    face -- one of the *_FONT_IDX values, used as key into font_id_values

    The width is read from the character's metric record in the font file
    and cached per (character, face).  A width of zero means the face has
    no glyph, in which case the lookup falls back:
      title    -> title "all" font
      subtitle -> subtitle "all" font
      any "all" font -> that font's default character
      anything else  -> default "all" font
    Zero is returned when the fallbacks have no glyph either.
    """
    if not isinstance(c, unicode):
        c = unicode(c, 'utf-8')

    cache_key = (c, face)
    if cache_key in font_width_cache:
        return font_width_cache[cache_key]

    font_file = font_id_values[face]

    if face not in font_default_cache:
        font_file.seek(0)
        header = font_file.read(FONT_BMF_HEADER_SIZE)

        # a missing or truncated header cannot be unpacked: use a space
        if len(header) == FONT_BMF_HEADER_SIZE:
            default_char = struct.unpack(FONT_BMF_HEADER, header)[4]
        else:
            default_char = ord(u' ')

        font_default_cache[face] = unichr(default_char)

    font_file.seek(ord(c) * CHARMETRIC_BMF_SIZE + FONT_BMF_HEADER_SIZE)
    record = font_file.read(CHARMETRIC_BMF_SIZE)

    # reading past the end of the file (or a truncated final record) means
    # the character is not in this font
    if len(record) == CHARMETRIC_BMF_SIZE:
        character_width = struct.unpack(CHARMETRIC_BMF, record)[7]  # widthDevice
    else:
        character_width = 0

    if 0 == character_width:

        if TITLE_FONT_IDX == face:
            character_width = get_utf8_cwidth(c, TITLE_ALL_FONT_IDX)

        elif SUBTITLE_FONT_IDX == face:
            character_width = get_utf8_cwidth(c, SUBTITLE_ALL_FONT_IDX)

        elif face in [TITLE_ALL_FONT_IDX, SUBTITLE_ALL_FONT_IDX, DEFAULT_ALL_FONT_IDX]:
            default = font_default_cache[face]
            # when the default character itself has no glyph, asking for it
            # again would recurse without end; keep the zero width instead
            if c != default:
                character_width = get_utf8_cwidth(default, face)

        else:
            character_width = get_utf8_cwidth(c, DEFAULT_ALL_FONT_IDX)

    font_width_cache[cache_key] = character_width

    return character_width


def get_lineheight(face):
    """Return the line spacing in pixels for a font face index.

    Raises KeyError for a face that is not one of the *_FONT_IDX values.
    """
    # body text faces
    spacing = dict.fromkeys((ITALIC_FONT_IDX, DEFAULT_FONT_IDX, DEFAULT_ALL_FONT_IDX),
                            P_LSPACE)
    # title faces
    spacing.update(dict.fromkeys((TITLE_FONT_IDX, TITLE_ALL_FONT_IDX), H1_LSPACE))
    # subtitle faces
    spacing.update(dict.fromkeys((SUBTITLE_FONT_IDX, SUBTITLE_ALL_FONT_IDX), H2_LSPACE))

    return spacing[face]


def make_link(url, x0, x1, text):
    """Record a link rectangle for url if article_index() accepts it.

    The rectangle runs from x0 to x1 horizontally and covers the current
    line (one line height of the current face above g_starty).  An
    underline of width x1 - x0 is emitted with esc_code10.  The text
    argument is not used.
    """
    global g_link_cnt

    if not article_index(url):
        return

    esc_code10(x1 - x0)
    line_top = g_starty - get_lineheight(g_curr_face)
    g_links[g_link_cnt] = (x0, line_top, x1, g_starty, url)
    g_link_cnt += 1


def get_imgdata(imgfile, indent):
    """Load an image file and convert it to packed 1 bit-per-pixel rows.

    imgfile -- path of the image, opened with gd
    indent  -- current left indent in pixels; reduces the usable width

    Returns (width, height, data).  A pixel is set when it is pure black
    (0, 0, 0); each row is padded to a whole number of bytes with the
    first pixel in the most significant bit.  An image that is too wide
    but whose height fits is emitted rotated, with width and height
    swapped.  (0, 0, '') is returned, after logging a message, when the
    file cannot be opened or the image fits neither way.
    """
    try:
        img = gd.image(imgfile)
    except IOError:
        PrintLog.message(u'unable to open image file: {0:s}'.format(imgfile))
        return (0, 0, '')

    (width, height) = img.size()
    max_width = LCD_WIDTH - LCD_IMG_MARGIN - indent

    if width <= max_width:
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((x, y)))
        h_range = range(0, width)
        v_range = range(0, height)
    elif height <= max_width:
        # rotate: output rows walk the source columns
        is_black = lambda x, y: (0, 0, 0) == img.colorComponents(img.getPixel((y, x)))
        v_range = range(0, width)
        h_range = range(height - 1, -1, -1)
        (width, height) = (height, width)
    else:
        PrintLog.message(u'image file: {0:s} is too big'.format(imgfile))
        return (0, 0, '')

    # collect the byte values and pack them once at the end; appending to a
    # string for every byte is quadratic in the image size
    packed = []
    for v in v_range:
        byte = 0
        bit_count = 8

        for h in h_range:
            bit_count -= 1
            if is_black(h, v):
                byte |= 1 << bit_count
            if 0 == bit_count:
                packed.append(byte)
                byte = 0
                bit_count = 8

        # flush the partially filled last byte of the row
        if 8 != bit_count:
            packed.append(byte)

    data = struct.pack('<{0:d}B'.format(len(packed)), *packed)

    return (width, height, data)

def esc_code0(num_pixels):
    """Write code byte 1 followed by num_pixels (blank line height) and move g_starty down by it."""
    global g_starty

    record = struct.pack('<BB', 1, num_pixels)
    output.write(record)
    g_starty = g_starty + num_pixels


def esc_code1():
    """Write code byte 2: new line in the default font with its line spacing.

    Advances g_starty by the default font's line height and makes the
    default font the current face.
    """
    global g_starty, g_curr_face

    default_face = DEFAULT_FONT_IDX
    output.write(struct.pack('<B', 2))
    g_starty += get_lineheight(default_face)
    g_curr_face = default_face


def esc_code2():
    """Write code byte 3: new line keeping the current font; g_starty advances by its line height."""
    global g_starty

    output.write(struct.pack('<B', 3))
    g_starty = g_starty + get_lineheight(g_curr_face)


def esc_code3(face):
    """Write code byte 4: new line that also switches to font face.

    The second byte carries the face in its low 3 bits and the line
    height of that face shifted left by 3.  g_starty advances by the
    line height and face becomes the current face.
    """
    global g_starty, g_curr_face

    line_height = get_lineheight(face)
    face_and_height = face | (line_height << 3)
    output.write(struct.pack('<BB', 4, face_and_height))
    g_starty += line_height
    g_curr_face = face

def esc_code4(face, halign=0):
    """Write code byte 5: change font on the current line.

    The second byte carries face in its low 3 bits and halign (a
    horizontal alignment in pixels) shifted left by 3.  face becomes the
    current face.
    """
    global g_curr_face

    face_and_align = face | (halign << 3)
    output.write(struct.pack('<BB', 5, face_and_align))
    g_curr_face = face


def esc_code5():
    """Write code byte 6: switch back to the default font, which becomes the current face."""
    global g_curr_face

    record = struct.pack('<B', 6)
    output.write(record)
    g_curr_face = DEFAULT_FONT_IDX


def esc_code6():
    """Write code byte 7: restore the default alignment."""
    record = struct.pack('<B', 7)
    output.write(record)


def esc_code7(num_pixels):
    """Write code byte 8 followed by num_pixels: move right by that many pixels."""
    record = struct.pack('<BB', 8, num_pixels)
    output.write(record)


def esc_code8(num_pixels):
    """Write code byte 9 followed by num_pixels: move left by that many pixels."""
    record = struct.pack('<BB', 9, num_pixels)
    output.write(record)


def esc_code9(num_pixels):
    """Write code byte 10 followed by a signed alignment adjustment.

    num_pixels is packed as a signed byte and is also remembered in
    g_halign.
    """
    global g_halign

    record = struct.pack('<Bb', 10, num_pixels)
    output.write(record)
    g_halign = num_pixels


def esc_code10(num_pixels):
    """Write code byte 11 followed by num_pixels: draw a line from right to left."""
    record = struct.pack('<BB', 11, num_pixels)
    output.write(record)


def esc_code14(width, height, data):
    """Write code byte 15 with a bitmap: width (1 byte), height (2 bytes), then data.

    Nothing is written when either dimension is zero.  When the bitmap
    is taller than the current line height, g_starty is moved down by
    the excess plus 3 pixels.
    """
    global g_starty

    if width == 0 or height == 0:
        return

    bitmap_header = struct.pack('<BBH', 15, width, height)
    output.write(bitmap_header)
    output.write(data)

    excess = height - get_lineheight(g_curr_face)
    if excess > 0:
        # since Eric draws images 3px lower for alignment
        g_starty += excess + 3


#
# Parse the HTML into the WikiReader's format
#
class WrProcess(HTMLParser.HTMLParser):

    READ_BLOCK_SIZE = 64 * (1024 * 1024)

    def __init__ (self, f, inter_links = True, enable_images = True):
        """Create the parser and immediately parse everything readable from f.

        f             -- open file-like object yielding the HTML text
        inter_links   -- collect language links when true
        enable_images -- render <img> as a bitmap when true, otherwise
                         use its alt text
        """
        HTMLParser.HTMLParser.__init__(self)

        self.wordwrap = WordWrap.WordWrap(get_utf8_cwidth)
        self.local_init()
        self.tag_stack = []
        self.inter_links = inter_links
        self.enable_images = enable_images
        self.bucket = bucket.Bucket()

        # feed the input in READ_BLOCK_SIZE pieces until read() returns
        # nothing more
        while True:
            chunk = f.read(self.READ_BLOCK_SIZE)
            if not chunk:
                break
            self.feed(chunk)


    def local_init(self):
        """Reset the per-article parser state and the module-level rendering globals."""
        global g_starty, g_curr_face, g_halign
        global g_this_article_title, g_links, g_link_cnt

        # every "inside this element" flag starts cleared
        for flag_name in ('in_html', 'in_title', 'in_body',
                          'in_h1', 'in_h2', 'in_h3', 'in_h4', 'in_h5', 'in_h6',
                          'in_p', 'in_b', 'in_big', 'in_strong',
                          'in_del', 'in_ins', 'in_i', 'in_a', 'in_br', 'in_img'):
            setattr(self, flag_name, False)

        # tables nest, so this is a depth counter rather than a flag
        self.in_table = 0

        # block quote / list nesting and the resulting layout
        self.quote = 0
        self.level = 0
        self.lwidth = DEFAULT_LWIDTH
        self.indent = 0

        # per list level bookkeeping, keyed by level
        self.li_cnt = {}
        self.li_inside = {}
        self.li_type = {}

        # link currently being collected
        self.link_x = 0
        self.link_y = 0
        self.url = None

        self.language_links = []

        self.printing = True

        # rendering state shared with the esc_code*() functions
        g_starty = 0
        g_curr_face = DEFAULT_FONT_IDX
        g_halign = 0
        g_this_article_title = 'NO TITLE'
        g_links = {}
        g_link_cnt = 0


    def handle_starttag(self, tag, attrs):
        """Handle an opening tag (HTMLParser callback).

        tag   -- tag name
        attrs -- list of (name, value) pairs; value is None for an
                 attribute written without a value

        <html> resets all state.  Every other tag is pushed on tag_stack
        together with the printing state in force before it.  Elements
        whose class contains 'noprint', <script> and thumbnail style
        <div>s switch printing off.  While printing is off, or inside a
        table, nothing further is done.  Otherwise the tag updates the
        parser flags and emits the matching margin escape codes.
        """
        global g_this_article_title

        attrs = dict(attrs)

        # a value-less attribute (e.g. <div class>) has the value None;
        # treat it as empty so the substring tests below cannot raise
        css_class = attrs.get('class') or ''

        # must always do the <html> tag
        if tag == 'html':
            self.local_init()
            self.in_html = True
            self.tag_stack = [(tag, True)]
            return

        self.tag_stack.append((tag, self.printing))

        # we want to skip content that isn't for printing
        if 'noprint' in css_class:
            self.printing = False

        # create a list of language links
        # (an anchor without a usable href has nothing to record)
        if self.inter_links and tag == 'a' and 'lang-link' in css_class:
            href = attrs.get('href')
            if href is not None:
                self.language_links.append(href)

        # handle the tags
        if not self.printing:
            return

        elif tag == 'script':
            self.printing = False

        elif tag == 'title':
            self.in_title = True
            g_this_article_title = ''

        elif tag == 'body':
            self.in_body = True

        elif tag == 'table':
            self.in_table += 1

        # if in a table suppress everything after this point
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = True
            esc_code0(H1_MARGIN_TOP)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = True
            esc_code0(H2_MARGIN_TOP)

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = True
            esc_code0(H3_MARGIN_TOP)

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = True
            esc_code0(H4_MARGIN_TOP)

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = True
            esc_code0(H5_MARGIN_TOP)

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = True
            esc_code0(H6_MARGIN_TOP)

        elif tag == 'div':
            self.flush_buffer()
            # suppress thumb info boxes
            if 'thumb' in css_class or 'left' in css_class or 'right' in css_class \
                    or 'dablink' in css_class or 'magnify' in css_class:
                self.printing = False
                return
            esc_code0(DIV_MARGIN_TOP)

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = True
            esc_code0(P_MARGIN_TOP)

        elif tag == 'blockquote':
            self.flush_buffer()
            self.quote += 1
            # NOTE(review): quote is incremented before this test, so with
            # MAX_QUOTE_LEVEL = 1 the first level is never indented;
            # handle_endtag uses the same test, so the two stay balanced
            if self.quote < MAX_QUOTE_LEVEL:
                esc_code0(BLOCKQUOTE_MARGIN_TOP)
                self.indent += BLOCKQUOTE_MARGIN_LEFT
                self.lwidth -= BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                esc_code9(BLOCKQUOTE_MARGIN_LEFT)

        elif tag == 'b':
            self.in_b = True

        elif tag == 'i':
            self.in_i = True

        elif tag == 'big':            # Not sure what to do with this one
            self.in_b = True

        elif tag == 'strong':
            self.in_b = True

        elif tag == 'del':
            self.in_del = True

        elif tag == 'ins':
            self.in_ins = True

        elif tag == 'a' and 'href' in attrs:
            self.in_a = True
            self.url  = attrs['href']

        elif tag in ['ul', 'ol', 'dl']:
            if 'start' in attrs:
                # pull the first run of digits out of the attribute; an
                # empty or non-numeric value falls back to 1
                list_start = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['start'] or '')
                try:
                    list_start = int(list_start)
                except ValueError:
                    list_start = 1

                self.enter_list(tag, list_start)
            else:
                self.enter_list(tag)

        elif tag == 'li':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                # a <li> outside any list: take it off the stack again
                self.tag_stack.pop()
                return  # just ignore it

            # handle missing </li> at the same level
            # simulate </li> and continue
            if self.li_inside[self.level]:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: missing </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                self.tag_stack.pop()
                self.flush_buffer(False)
                self.list_decrease_indent()

            self.li_inside[self.level] = True

            if 'value' in attrs:
                # explicit item number; an unusable value leaves the
                # counter unchanged
                list_index = re.sub(r'^\D*(\d+)\D?.*$', r'\1', attrs['value'] or '')
                try:
                    self.li_cnt[self.level] = int(list_index)
                except ValueError:
                    pass
            else:
                self.li_cnt[self.level] += 1

            if self.li_type[self.level] == 'ol':
                self.wordwrap.append(("{0:d}".format(self.li_cnt[self.level])) + u".", DEFAULT_FONT_IDX, None)
            else:
                # levels deeper than the bullet table reuse the last bullet
                bullet_num = min(self.level, LIMAX_BULLETS)
                self.wordwrap.append(bullet_c[bullet_num], DEFAULT_FONT_IDX, None)

            self.flush_buffer()
            self.list_increase_indent()

        elif tag == 'dd':
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray <{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
                self.tag_stack.pop()
                return  # just ignore it
            self.li_cnt[self.level] += 1
            self.list_increase_indent()

        elif tag == 'br':
            self.in_br = True

        elif tag == 'img' and 'src' in attrs:
            # include either image or the 'alt' text
            if self.enable_images:
                (width, height, data) = get_imgdata(attrs['src'], self.indent)
                self.wordwrap.AppendImage(width, height, data, None)
            elif 'alt' in attrs:
                self.handle_data(attrs['alt'])

            self.in_img = True


    def handle_endtag(self, tag):
        """Process a closing tag: unwind the tag stack and undo start-tag state.

        self.tag_stack holds (tag, printing) pairs.  A closing tag with no
        matching entry on the stack is ignored (with an optional warning).
        Otherwise entries are popped until the matching one is found; every
        unclosed tag met on the way is closed by a recursive call.  The
        'printing' flag stored with the matching entry becomes the current
        self.printing.

        </html> is always processed and writes the article out via
        write_article(); every other tag is skipped while self.printing is
        false, and most tags are also skipped while inside a table.
        """
        global g_this_article_title
        global article_count
        global warnings

        # ignore end tag without start tag
        if (tag, True) not in self.tag_stack and (tag, False) not in self.tag_stack:
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: superfluous </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(tag, line, column, article_count + 1, g_this_article_title))
            return

        # backtrack up the stack closing each open tag until there is a match
        (start_tag, self.printing) = self.tag_stack.pop()
        while start_tag != tag:
            # put the entry back so that the recursive call below finds it
            # on the stack and pops it itself
            self.tag_stack.append((start_tag, self.printing))
            if warnings:
                (line, column) = self.getpos()
                PrintLog.message(u'Warning: force </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                 .format(start_tag, line, column, article_count + 1, g_this_article_title))
            self.handle_endtag(start_tag)
            (start_tag, self.printing) = self.tag_stack.pop()

        # must always do </html> tag
        if tag == 'html':
            # end of article: reset the parser state and write the article out
            self.printing = True
            self.tag_stack = []
            self.in_html = False
            esc_code1()
            write_article(self.language_links)
            return

        # self.printing was restored from the stack entry of the matching
        # start tag; nothing below runs for a non-printing element
        if not self.printing:
            return

        elif tag == 'script':
            pass

        elif tag == 'title':
            self.in_title = False
            g_this_article_title = g_this_article_title.strip()

        elif tag == 'body':
            self.in_body = False
            self.flush_buffer()

        elif tag == 'table':
            # in_table is a nesting counter; never let it go below zero
            if self.in_table > 0:
                self.in_table -= 1

        # if in a table suppress everything after this point
        # NOTE: this 'if' starts a second if/elif chain, so it is evaluated
        # for every tag that got this far, including those handled above
        if self.in_table > 0:
            return

        elif tag == 'h1':
            self.flush_buffer()
            self.in_h1 = False
            esc_code0(H1_MARGIN_BOTTOM)

        elif tag == 'h2':
            self.flush_buffer()
            self.in_h2 = False

        elif tag == 'h3':
            self.flush_buffer()
            self.in_h3 = False

        elif tag == 'h4':
            self.flush_buffer()
            self.in_h4 = False

        elif tag == 'h5':
            self.flush_buffer()
            self.in_h5 = False

        elif tag == 'h6':
            self.flush_buffer()
            self.in_h6 = False

        elif tag == 'div':
            self.flush_buffer()

        elif tag == 'p':
            self.flush_buffer()
            self.in_p = False

        elif tag == 'blockquote':
            self.flush_buffer()
            if self.quote > 0:
                # the margins are only restored below MAX_QUOTE_LEVEL
                # (presumably mirrors the start-tag handling, which is not
                # visible here -- confirm)
                if self.quote < MAX_QUOTE_LEVEL:
                    self.indent -= BLOCKQUOTE_MARGIN_LEFT
                    self.lwidth += BLOCKQUOTE_MARGIN_LEFT + BLOCKQUOTE_MARGIN_RIGHT
                    esc_code9(-BLOCKQUOTE_MARGIN_LEFT)
                self.quote -= 1

        # </b>, </big> and </strong> all clear the same in_b flag
        elif tag == 'b':
            self.in_b = False

        elif tag == 'big':
            self.in_b = False

        elif tag == 'strong':
            self.in_b = False

        elif tag == 'i':
            self.in_i = False

        elif tag == 'del':
            self.in_del = False

        elif tag == 'ins':
            self.in_ins = False

        elif tag == 'a':
            self.in_a = False
            self.url  = ""

        elif tag in ['ul', 'ol', 'dl']:
            self.leave_list()

        elif tag == 'li':
            # level 0 means no list is open, so this </li> has no list to
            # belong to
            if 0 == self.level:
                if warnings:
                    (line, column) = self.getpos()
                    PrintLog.message(u'Warning: stray </{0:s}> @[L{1:d}/C{2:d}] in article[{3:d}]: {4:s}'
                                     .format(tag, line, column, article_count + 1, g_this_article_title))
            else:
                self.flush_buffer(False)
                self.list_decrease_indent()
                self.li_inside[self.level] = False

        elif tag == 'dd':
            self.flush_buffer()
            self.list_decrease_indent()

        elif tag == 'dt':
            self.flush_buffer()

        elif tag == 'br':
            self.flush_buffer()
            self.in_br = False

        elif tag == 'img':
            self.in_img = False


    def enter_list(self, list_type, start = 1):
        """Open a new (possibly nested) list.

        Flushes pending text without starting a new line, emits the list
        top margin, moves one nesting level deeper and initialises the
        per-level bookkeeping.  The item counter starts at 'start - 1'
        and 'list_type' is recorded for the new level.
        """
        self.flush_buffer(False)
        esc_code0(LIST_MARGIN_TOP)

        depth = self.level + 1
        self.level = depth
        self.li_cnt[depth] = start - 1
        self.li_inside[depth] = False
        self.li_type[depth] = list_type


    def list_increase_indent(self):
        """Indent by LIST_INDENT for a list item.

        Does nothing once the nesting level exceeds LIMAX_INDENT_LEVELS;
        otherwise emits escape codes 9 and 8 with LIST_INDENT, narrows the
        usable line width and moves the indent to the right.
        """
        if self.level > LIMAX_INDENT_LEVELS:
            return

        esc_code9(LIST_INDENT)
        esc_code8(LIST_INDENT)  ### Bug in lcd_buf_draw ASK ERIC
        self.indent += LIST_INDENT
        self.lwidth -= LIST_INDENT


    def leave_list(self):
        """Close the innermost list.

        Pending text is always flushed.  If a list is open, the list margin
        is emitted, the item counter and 'inside item' flag of the current
        level are discarded and the nesting level drops by one.  The
        li_type entry of the level is left in place.
        """
        self.flush_buffer()
        if self.level <= 0:
            return

        depth = self.level
        esc_code0(LIST_MARGIN_TOP)
        del self.li_cnt[depth]
        del self.li_inside[depth]
        self.level = depth - 1

    def list_decrease_indent(self):
        """Undo one list_increase_indent() step.

        Does nothing once the nesting level exceeds LIMAX_INDENT_LEVELS;
        otherwise emits escape code 9 with -LIST_INDENT, widens the usable
        line width and moves the indent back to the left.
        """
        if self.level > LIMAX_INDENT_LEVELS:
            return

        esc_code9(- LIST_INDENT)
        self.indent -= LIST_INDENT
        self.lwidth += LIST_INDENT


    def handle_charref(self, name):
        """handle &#DDDD; &#xXXXX;

        'name' is the text of a numeric character reference: decimal
        digits, or 'x'/'X' followed by hexadecimal digits.  The referenced
        character is passed on to handle_data().

        A reference that cannot be parsed as a number is logged and
        dropped.  A number that cannot be converted to a character is
        logged and replaced by '?'.
        """

        if 0 == len(name):
            return

        if 'x' == name[0] or 'X' == name[0]:
            try:
                value = int(name[1:], 16)
            except ValueError:
                PrintLog.message(u'charref: "{0:s}" is not hexadecimal'.format(name))
                return

        elif name.isdigit():
            # isdigit() can be true for characters int() rejects, so the
            # conversion is still guarded
            try:
                value = int(name)
            except ValueError:
                PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
                return

        else:
            # neither hexadecimal nor all digits (e.g. "&#abc;"): without
            # this branch 'value' would be unbound below
            PrintLog.message(u'charref: "{0:s}" is not decimal'.format(name))
            return

        try:
            c = unichr(value)
        except (ValueError, OverflowError):
            # ValueError: outside the supported code point range;
            # OverflowError: too large to be passed to unichr() at all
            PrintLog.message(u'charref: "{0:d}" is not convertible to unicode'.format(value))
            c = '?'
        self.handle_data(c)


    def handle_entityref(self, name):
        """handle &amp; &gt; ...

        Looks the entity name up in htmlentitydefs.name2codepoint and
        passes the resulting character to handle_data().  A KeyError
        (unknown entity name) is logged together with the current article
        title and the entity is dropped.
        """
        try:
            codepoint = htmlentitydefs.name2codepoint[name]
            character = unichr(codepoint)
            self.handle_data(character)
        except KeyError:
            PrintLog.message(u'ENTITYREF ERROR: {0:s} article: {1:s}'.format(name, g_this_article_title))


    def handle_data(self, data):
        """Queue a run of text for rendering.

        While inside <title> the text is appended to the global article
        title.  Text is only queued when inside <body>, outside any table
        and while printing is enabled.  Runs of whitespace are collapsed
        to a single space, then the text is appended to self.wordwrap
        together with the font selected by the current heading/italic
        state and, inside <a>, the current link url.
        """
        global g_this_article_title

        if self.in_title:
            g_this_article_title += data

        # only parse valid tags in <body>
        # skip tables for now
        if not self.in_body or self.in_table > 0 or not self.printing:
            return

        # raw string: "\s" in a plain literal is an invalid escape sequence
        data = re.sub(r"\s+", " ", data)

        # headings take precedence over italic; bold is not rendered
        if self.in_h1:
            face = TITLE_FONT_IDX
        elif self.in_h2 or self.in_h3 or self.in_h4 or self.in_h5 or self.in_h6:
            face = SUBTITLE_FONT_IDX
        elif self.in_i:
            face = ITALIC_FONT_IDX
        else:
            face = DEFAULT_FONT_IDX

        # text only carries a url while inside <a>
        url = self.url if self.in_a else None

        self.wordwrap.append(data, face, url)


    def flush_buffer(self, new_line = True):
        """Write everything queued in self.wordwrap to the output stream.

        Lines are taken from self.wordwrap.wrap(self.lwidth) until the
        buffer is empty or an empty line comes back.  Each line is a
        sequence of segments indexed as [0] text, [1] font index (or a
        (width, height, data) tuple for an image), [2] link url or None and
        [3] the amount by which the x position advances.

        new_line -- when true every line is started with escape code 3
                    (font changed) or 2 (same font); when false the first
                    line is continued in place and only later lines are
                    started that way.
        """
        global output
        current_font = -1

        while self.wordwrap.have():
            link_url = None
            pen_x = self.indent
            link_x0 = pen_x
            segments = self.wordwrap.wrap(self.lwidth)
            if segments == []:
                break

            # font for the start of the line; a leading image keeps the
            # current font, or the default one if none was selected yet
            leading = segments[0][1]
            if type(leading) is tuple:
                start_font = DEFAULT_FONT_IDX if current_font < 0 else current_font
            else:
                start_font = leading

            font_changed = current_font != start_font
            current_font = start_font
            if new_line:
                if font_changed:
                    esc_code3(current_font)
                else:
                    esc_code2()
            elif font_changed:
                esc_code4(current_font)
            # only the very first line may be a continuation
            new_line = True

            for segment in segments:
                face = segment[1]
                if type(face) is tuple:
                    # image segment
                    (width, height, data) = face
                    esc_code14(width, height, data)
                else:
                    if current_font != face:
                        current_font = face
                        esc_code4(current_font)

                    target = segment[2]
                    if link_url != target:
                        # the url changed: close the running link, if any,
                        # and remember where the next one starts
                        if link_url is not None:
                            make_link(link_url, link_x0, pen_x, segment[0])
                        link_url = target
                        if link_url is not None:
                            link_x0 = pen_x
                    output.write(segment[0].encode('utf-8'))
                pen_x += segment[3]

            # a link still open at the end of the line is closed there
            if link_url is not None:
                make_link(link_url, link_x0, pen_x, segments[-1][0])


def link_number(url):
    """Return the article number a link points to, or -1 if it is unknown.

    The url is looked up with article_index().  That function returns the
    result of a cursor fetchone(), which is None when no row matches, so
    the missing-title case has to be tested explicitly: indexing None
    raises TypeError, which the KeyError handler never caught.
    """
    try:
        result = article_index(url)
    except KeyError:
        # kept for lookups that signal a missing title by raising
        return -1

    if result is None:
        return -1
    return result[0]


# Add the '~' padding back here
def article_index(title):
    """Fetch the index record of the article with the given title.

    The title is prefixed with '~' before it is matched against the
    'title' column of the 'articles' table.  Returns whatever the
    cursor's fetchone() yields for the single selected row
    (article_number, fnd_offset, restricted); presumably None when no
    row matches, as with DB-API cursors -- confirm against article_db.
    """
    query = 'select article_number, fnd_offset, restricted from articles where title = ? limit 1'

    cursor = article_db.cursor()
    cursor.execute(query, ["~" + title])
    row = cursor.fetchone()
    cursor.close()
    return row  # this returns a tuple of text strings, so beware!


def write_article(language_links):
    """Assemble the article rendered into 'output' and write it out.

    The article consists of, in this order:
      header -- struct '<I2H': combined size of header, link table and
                language links (i.e. where the body starts), the link
                count g_link_cnt, and a zero
      links  -- one '<3I' record per entry of g_links: (y0 << 8) | x0,
                (y1 << 8) | x1 and the target article number
      langs  -- the language links, each terminated by '\\0'
      body   -- the current contents of 'output'

    language_links -- iterable of 'language:title' strings

    With 'compress' set the article is handed to article_writer, otherwise
    it is written directly to f_out.  An article whose title is not found
    in the index is logged and skipped.  'output' is emptied afterwards.
    """
    global compress
    global verbose
    global output, f_out, i_out
    global article_count
    global g_this_article_title
    global file_number
    global start_time
    global article_writer

    article_count += 1
    if verbose:
        PrintLog.message(u'[MWR {0:d}] {1:s}'.format(article_count, g_this_article_title))

    elif article_count % 1000 == 0:
        # progress report with the time taken for the last 1000 articles
        now_time = time.time()
        PrintLog.message(u'Render[{0:d}]: {1:7.2f}s {2:10d}'.format(file_number, now_time - start_time, article_count))
        start_time = now_time

    output.flush()

    # create links
    links_stream = io.BytesIO('')

    for i in g_links:
        (x0, y0, x1, y1, url) = g_links[i]
        # link_number() yields -1 for an unknown target; mask it to the
        # unsigned 32 bit value 0xffffffff because struct refuses to pack
        # a negative number as 'I' (older Pythons masked it silently)
        links_stream.write(struct.pack('<3I', (y0 << 8) | x0, (y1 << 8) | x1,
                                       link_number(url) & 0xffffffff))

    links_stream.flush()
    links = links_stream.getvalue()
    links_stream.close()

    # create language links
    links_stream = io.BytesIO('')
    japanese_convert = LanguageTranslation.LanguageJapanese().translate
    normal_convert = LanguageTranslation.LanguageNormal().translate

    for l in language_links:
        language, link = l.split(':', 1)

        if 'ja' == language:
            stripped = japanese_convert(link)
        else:
            stripped = normal_convert(link)

        if link == stripped:
            # translation left the title unchanged: store the link as is
            links_stream.write(l.encode('utf-8') + '\0')
        else:
            # store the translated form and the original title, separated by '\1'
            links_stream.write((language + '#' + stripped).encode('utf-8') + '\1' + link.encode('utf-8') + '\0')

    links_stream.flush()
    langs = links_stream.getvalue()
    links_stream.close()

    # create the header (header size = 8)
    header = struct.pack('<I2H', 8 + len(links) + len(langs), g_link_cnt, 0)
    body = output.getvalue()

    # combine the data
    whole_article = header + links + langs + body

    if compress:
        # article_index() returns None for an unknown title (unpacking
        # that raises TypeError, not KeyError), so test for it explicitly
        try:
            entry = article_index(g_this_article_title)
        except KeyError:
            entry = None

        if entry is None:
            # the old handler referenced 'file_offset', which is never bound
            # in this function, and formatted the integer count with '{0:s}';
            # report the file number and the count instead
            PrintLog.message(u'Error in: write_article, Title not found')
            PrintLog.message(u'Title:  {0:s}'.format(g_this_article_title))
            PrintLog.message(u'File:   {0:d}'.format(file_number))
            PrintLog.message(u'Count:  {0:d}'.format(article_count))
        else:
            (article_number, fnd_offset, restricted) = entry
            restricted =  bool(int(restricted))  # '0' is True so turn it into False
            article_writer.add_article(article_number, whole_article, fnd_offset, restricted)
    else:
        f_out.write(whole_article)
        f_out.flush()

    # Note: some versions of Python do not move file position on truncate
    #       so an explicit seek is needed to avoid nul padding bytes.
    output.seek(0)
    output.truncate(0)


class ArticleWriter(bucket.Bucket):
    """Collect articles into buckets and write each bucket compressed.

    Articles are queued with add_article().  write() receives one set of
    queued articles, writes it to data_file as a single compressed group
    and records an index entry per article.  The index entries are written
    to index_file, in article number order, when the object is deleted.
    """

    def __init__(self, file_number, data_file, index_file,
                 max_buckets = 50, bucket_size = 524288, max_items_per_bucket = 64):
        """Remember the output files and set up the underlying buckets."""
        super(ArticleWriter, self).__init__(max_buckets = max_buckets,
                                            bucket_size = bucket_size,
                                            max_items_per_bucket = max_items_per_bucket)
        self.index = {}
        self.data_file = data_file
        self.index_file = index_file
        self.file_number = file_number


    def add_article(self, article_index, article_data, fnd_offset, restricted):
        """Queue one article; the size of its data is the item size."""
        item = (article_index, article_data, fnd_offset, restricted)
        self.add(item, len(article_data))


    def write(self, data):
        """output the article data

        'data' is a sequence of (size, item) pairs where item is the tuple
        queued by add_article().  The group written to data_file is: one
        byte holding the number of articles, a 12 byte '<3I' block per
        article (article number, offset of the article within the
        uncompressed data with bit 31 set for restricted articles, size),
        the compressed length as '<I', and the compressed data.
        """
        sizeof_one_block = 12 # number of bytes generated by struct below

        block_list = []
        payload_list = []
        offset = 0
        for size, (article_index, article_data, fnd_offset, restricted) in data:
            restricted_flag = 0x80000000 if restricted else 0
            block_list.append(struct.pack('<3I', article_index,
                                          offset + restricted_flag,
                                          size))
            payload_list.append(article_data)
            offset += size

        blocks = ''.join(block_list)
        all_data = ''.join(payload_list)

        ah = chr(len(blocks) // sizeof_one_block) + blocks
        ac = CompressData(all_data)
        file_offset = self.data_file.tell()
        self.data_file.write(ah + struct.pack('<I', len(ac)) + ac)

        # every article of the group points at the start of the group
        for size, (article_index, article_data, fnd_offset, restricted) in data:
            self.index[article_index] = struct.pack('<2IB', file_offset, fnd_offset, self.file_number)


    def __del__(self):
        """Flush the remaining buckets, then write the index in key order."""
        self.flush()
        for key in sorted(self.index):
            self.index_file.write(self.index[key])


# run the program: main() is only called when this file is executed as a
# script, not when it is imported as a module
if __name__ == "__main__":
    main()