jadedctrl
/
wrdk
mirror of https://github.com/nzmichaelh/wrdk


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
							#! /usr/bin/env python
# -*- coding: utf-8 -*-
# COPYRIGHT: Openmoko Inc. 2010
# LICENSE: GPL Version 3 or later
# DESCRIPTION: Read and process the XML file
# AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
#          Christopher Hall <hsw@openmoko.com>

import sys, os
import re
import getopt
import FilterWords
import RedirectedTo


class FileScanner(object):
    """Scan XML dump files for namespaces, article titles and article text.

    The scanner does not use an XML parser.  It reads each file in large
    chunks and searches the buffered data for a small set of literal tags
    (see ``StateMachine``).  Everything it finds is reported through four
    hook methods that subclasses override: ``namespace``, ``title``,
    ``redirect`` and ``body``.  The hooks in this base class do nothing,
    except that ``title`` returns True so that every article is wanted.
    """

    # namespace keys that subclasses compare against
    KEY_ARTICLE  = 0
    KEY_TEMPLATE = 10

    def __init__(self, *args, **kw):
        """Create a scanner with no files processed and no namespaces known.

        Positional and keyword arguments are accepted but ignored.
        """
        super(FileScanner, self).__init__()
        self.file_list = []
        self.category_to_key = {}
        self.key_to_category = {}
        self.current_file_id = -1  # no file yet


    def file_id(self):
        """Return the index in the file list of the file being processed (-1 if none)."""
        return self.current_file_id


    def current_filename(self):
        """Return the name of the file identified by ``file_id()``."""
        return self.file_list[self.current_file_id]


    def all_file_names(self):
        """Return the list of every file name passed to ``process`` so far."""
        return self.file_list


    def namespace(self, key, text):
        """Hook: called for each namespace with its integer key and lower-cased name."""
        pass


    def title(self, category, key, title, seek):
        """Hook: called for each title; return True if the article body is wanted.

        ``seek`` is the offset in the file just after the opening title tag.
        """
        return True


    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        """Hook: called when a wanted article's text matches the redirect pattern.

        The ``r``-prefixed arguments describe the redirect target.
        """
        pass


    def body(self, category, key, title, text, seek):
        """Hook: called with the stripped text of each wanted, non-redirect article.

        ``seek`` is the file offset of the first non-blank character of the text.
        """
        pass


    # literal tags searched for in the raw file data
    namespaces_start = '<namespaces>'
    namespaces_end = '</namespaces>'

    namespace_start = '<namespace key="'
    namespace_stop = '/>'
    namespace_cont = '">'

    namespace_end = '</namespace>'

    title_start = '<title>'
    title_end = '</title>'
    title_end_len = len(title_end)

    text_start = '<text '
    text_cont = 'xml:space="preserve">'
    text_stop = '/>'

    text_end = '</text>'

    # en: redirect: <text.....#redirect.....[[title#relative link]].....
    # es: redirección ""
    # The pattern actually used to recognise redirects is RedirectedTo.regex.

    # Maps a state name to the tags that may end that state.  Each entry is
    # (tag, length of tag, action name, next state name); process() applies
    # whichever tag occurs first in the buffered data.
    StateMachine = {
        'start': [
            (namespaces_start, len(namespaces_start),  'category_start', 'spaces'),
            (title_start, len(title_start), 'drop', 'title'),
            ],

        'spaces': [
            (namespaces_end, len(namespaces_end),  'drop', 'start'),
            (namespace_start, len(namespace_start),  'drop', 'key'),
            ],

        'key': [
            (namespace_stop, len(namespace_stop), 'drop', 'spaces'),
            (namespace_cont, len(namespace_cont), 'key', 'ns'),
            ],

        'ns': [
            (namespace_end, len(namespace_end), 'namespace', 'spaces'),
            ],

        'title': [
            (title_end, len(title_end), 'title', 'text'),
            ],

        'text': [
            (text_start, len(text_start), 'drop', 'prebody'),
            ],

        'prebody': [
            (text_cont, len(text_cont), 'drop', 'body'),
            (text_stop, len(text_stop), 'zero', 'start'),
            ],

        'body': [
            (text_end, len(text_end), 'body', 'start'),
            ]
        }

    def process(self, filename, limit):
        """Scan one file, invoking the hook methods for everything found.

        filename -- path of the XML file; it is appended to the file list
                    and becomes the current file.
        limit    -- the string 'all', or the number of bodies still allowed
                    to be delivered to ``body``.

        Returns the remaining limit: 'all' is returned unchanged, a number
        is reduced by one for every ``body`` call and scanning stops as
        soon as it reaches zero.
        """
        self.file_list.append(filename)
        self.current_file_id = len(self.file_list) - 1

        block = ''           # buffered data not yet consumed
        seek = 0             # file offset of block[0]
        key = None
        category = None
        title = None
        body = ''
        body_leading_blanks = 0
        end = False          # True once the file has been read completely
        more = True          # True when the buffer must be extended
        wanted = True        # result of the most recent title() hook
        current_state = self.StateMachine['start']

        # the with-statement guarantees the file is closed on every exit
        # path, including exceptions raised by the hook methods
        with open(filename, 'rb') as xml_file:
            while True:
                # extend the buffer on request, or when it is running low
                if more or (not end and len(block) < 1024):
                    more = False
                    chunk = xml_file.read(65536)
                    if len(chunk) == 0:
                        end = True
                    else:
                        block += chunk

                # choose the candidate tag that occurs earliest in the buffer
                pos = -1
                state = None
                for candidate in current_state:
                    p = block.find(candidate[0])
                    if p >= 0 and (pos < 0 or p < pos):
                        pos = p
                        state = candidate

                if state is None:
                    if end:
                        break
                    more = True
                    continue

                (_tag, length, proc, next_state) = state
                current_state = self.StateMachine[next_state]

                flag = False  # True when a body is ready to be delivered
                if 'key' == proc:
                    key = block[:pos].strip()
                elif 'namespace' == proc:
                    category = block[:pos].strip().lower()
                    key = int(key)
                    self.category_to_key[category] = key
                    self.key_to_category[key] = category
                    self.namespace(key, category)
                    key = None
                elif 'title' == proc:
                    (category, key, title) = self.get_category(block[:pos].strip())
                    wanted = self.title(category, key, title, seek)
                elif 'body' == proc:
                    body = block[:pos].strip()
                    body_leading_blanks = pos - len(block[:pos].lstrip())
                    flag = True
                    # cheap pre-check before running the redirect regex
                    if '#' in body[0:10] or '＃' in body[0:10]:
                        match = RedirectedTo.regex.match(body)
                        if wanted and match:
                            (rcategory, rkey, rtitle) = self.get_category(match.group(2).strip())
                            self.redirect(category, key, title, rcategory, rkey, rtitle,
                                          seek + body_leading_blanks)
                            flag = False
                elif 'zero' == proc:
                    # empty text element: there is no text, hence no leading
                    # blanks; reset so a previous article's value is not reused
                    flag = True
                    body = ''
                    body_leading_blanks = 0
                elif 'category_start' == proc:
                    self.category_to_key = {}
                    self.key_to_category = {}

                if wanted and flag:
                    self.body(category, key, title, body, seek + body_leading_blanks)
                    if limit != 'all':
                        limit -= 1
                        if limit <= 0:
                            break
                    title = None

                # discard everything up to and including the matched tag
                block = block[pos + length:]
                seek += pos + length

        return limit


    def get_category(self, title):
        """Split a title into a (category, key, title) tuple.

        When the text before the first ':' (stripped and lower-cased) is a
        known namespace, return that namespace, its key and the stripped
        remainder.  Otherwise return ('', 0, title) with the title unchanged.
        """
        if ':' in title:
            (category, _colon, remainder) = title.partition(':')
            category = category.strip().lower()
            if category in self.category_to_key:
                return (category, self.category_to_key[category], remainder.strip())
        return ('', 0, title)


class MyTestScanner(FileScanner):
    """Test scanner that prints every namespace, title, redirect and body.

    Only titles whose key equals KEY_ARTICLE are marked as wanted.  Bodies
    that pass the module-level ``filter`` are counted and recorded in
    ``article_index``.
    """

    def __init__(self, *args, **kw):
        """Initialise the base scanner and zero the counters."""
        super(MyTestScanner, self).__init__(*args, **kw)
        self.article_index = {}  # title -> [article number, seek, text length]
        self.articles = 0        # bodies accepted so far
        self.count = 0           # titles seen so far


    def namespace(self, key, text):
        """Print the namespace key and name."""
        line = 'namespace "{0:d}"->"{1:s}"'.format(key, text)
        print(line)


    def title(self, category, key, title, seek):
        """Count and print the title; want the body only for plain articles."""
        self.count += 1
        line = 'T:{0:d} {1:d} : {2:s}[{3:d}]:{4:s}'.format(self.count, seek, category, key, title)
        print(line)
        return self.KEY_ARTICLE == key


    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        """Print the redirect source and its target."""
        line = 'R:{0:d} {1:d} : {2:s}[{3:d}]:{4:s} -> {5:s}[{6:d}]:{7:s}'.format(
            self.count, seek, category, key, title, rcategory, rkey, rtitle)
        print(line)


    def body(self, category, key, title, text, seek):
        """Record and print the article unless ``filter`` reports it as restricted."""
        if filter(title, text):
            return
        self.articles += 1
        self.article_index[title] = [self.articles, seek, len(text)]
        line = 'B:{0:d} {1:d} [{2:s}[{3:d}]{4:s}] : {5:s}'.format(
            self.count, seek, category, key, title, text[:100])
        print(line)


def filter(title, text):
    """Return the restricted flag that FilterWords reports for title + text.

    When the flag is true, a line naming the title and the value returned
    alongside the flag is printed.  Note that this function shadows the
    ``filter`` builtin within this module.
    """
    verdict = FilterWords.find_restricted(title + text)
    (is_restricted, matched) = verdict

    if is_restricted:
        line = 'TITLE: "{0:s}" restricted: [{1:s}]'.format(title, matched)
        print(line)

    return is_restricted


def usage(message):
    """Print an optional error line and the option summary, then exit with status 1.

    message -- text (or any object convertible with str(), such as an
               exception) describing the error; None prints the summary only.

    The summary lists the options that main() passes to getopt.
    Raises SystemExit(1) in every case.
    """
    if message is not None:
        # !s converts via str() so exception objects are accepted too
        print('error: {0!s}'.format(message))
    print('usage: {0:s} <options> xml-file...'.format(os.path.basename(__file__)))
    print('       --help                  This message')
    print('       --verbose               Enable verbose output')
    print('       --count=n               Number of articles to process [all]')
    print('       --debug=n               Set debug level [0]')
    sys.exit(1)


def main():
    """Parse the command line and run MyTestScanner over each named XML file.

    Options: -h/--help, -v/--verbose, -c/--count=<n|nk|all>, -d/--debug=<n>.
    A trailing 'k' on the count multiplies it by 1000.  Any option error
    is reported through usage(), which exits the program.  Processing
    stops early once the requested number of articles has been handled.
    """
    global verbose
    global debug

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvc:d:',
                                   ['help', 'verbose',
                                    'count=',
                                    'debug='])
    except getopt.GetoptError as err:
        usage(str(err))

    verbose = False
    debug = 0
    count = 'all'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--debug'):
            try:
                debug = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
        elif opt in ('-c', '--count'):
            # endswith() is safe for an empty argument, unlike arg[-1]
            if arg.endswith('k'):
                arg = arg[:-1] + '000'
            if arg != 'all':
                try:
                    count = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
                # range check only applies to a numeric count, never to 'all'
                if count <= 0:
                    usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
        else:
            usage('unhandled option: ' + opt)

    if len(args) == 0:
        usage('no files to process')

    scanner = MyTestScanner()

    for f in args:
        print('Processing file: {0:s}'.format(f))
        # process() returns the articles still allowed; carry it to the next file
        count = scanner.process(f, count)
        if 0 == count:
            break

# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()