123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331 |
- #! /usr/bin/env python
- # -*- coding: utf-8 -*-
- # COPYRIGHT: Openmoko Inc. 2010
- # LICENSE: GPL Version 3 or later
- # DESCRIPTION: Read and process the XML file
- # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
- # Christopher Hall <hsw@openmoko.com>
- import sys, os
- import re
- import getopt
- import FilterWords
- import RedirectedTo
class FileScanner(object):
    """Scan XML files for namespaces, titles and text bodies via a tag state machine.

    The scanner does not parse XML properly; it searches the raw data for a
    small set of literal tags (see ``StateMachine``) and reports what lies
    between them through four hook methods that subclasses override:

    * ``namespace(key, text)``  - one ``<namespace key="...">name</namespace>``
    * ``title(...)``            - one ``<title>``; return True to receive its body
    * ``redirect(...)``         - a body that ``RedirectedTo.regex`` matches
    * ``body(...)``             - any other body (or an empty ``<text ... />``)
    """

    # Key returned by get_category() for titles without a known prefix.
    KEY_ARTICLE = 0
    KEY_TEMPLATE = 10

    # Literal tags searched for in the raw data.
    namespaces_start = '<namespaces>'
    namespaces_end = '</namespaces>'
    namespace_start = '<namespace key="'
    namespace_stop = '/>'
    namespace_cont = '">'
    namespace_end = '</namespace>'

    title_start = '<title>'
    title_end = '</title>'
    title_end_len = len(title_end)

    text_start = '<text '
    text_cont = 'xml:space="preserve">'
    text_stop = '/>'
    text_end = '</text>'

    # en:  redirect: <text.....#redirect.....[[title#relative link]].....
    # es:  redirección ""
    # The pattern that recognises these lives in RedirectedTo.regex; an
    # earlier inline version was:
    #   r'#\s*(redirect|redirecci..n)[^\[]*\[\[(.*?)([#|].*?)?\]\]' (IGNORECASE)

    # state name -> list of (tag, len(tag), action, next state).
    # In each state the earliest occurrence of any listed tag wins; the text
    # in front of that tag is handed to the action, then the tag is consumed.
    # Actions: 'drop' does nothing, the others are handled in process().
    StateMachine = {
        'start': [
            (namespaces_start, len(namespaces_start), 'category_start', 'spaces'),
            (title_start, len(title_start), 'drop', 'title'),
        ],
        'spaces': [
            (namespaces_end, len(namespaces_end), 'drop', 'start'),
            (namespace_start, len(namespace_start), 'drop', 'key'),
        ],
        'key': [
            (namespace_stop, len(namespace_stop), 'drop', 'spaces'),
            (namespace_cont, len(namespace_cont), 'key', 'ns'),
        ],
        'ns': [
            (namespace_end, len(namespace_end), 'namespace', 'spaces'),
        ],
        'title': [
            (title_end, len(title_end), 'title', 'text'),
        ],
        'text': [
            (text_start, len(text_start), 'drop', 'prebody'),
        ],
        'prebody': [
            (text_cont, len(text_cont), 'drop', 'body'),
            (text_stop, len(text_stop), 'zero', 'start'),
        ],
        'body': [
            (text_end, len(text_end), 'body', 'start'),
        ]
    }

    def __init__(self, *args, **kw):
        """Create a scanner with no files seen and empty namespace maps.

        Extra positional/keyword arguments are accepted and ignored.
        """
        super(FileScanner, self).__init__()
        self.file_list = []
        self.category_to_key = {}
        self.key_to_category = {}
        self.current_file_id = -1  # no file yet

    def file_id(self):
        """Return the index in the file list of the file last given to process()."""
        return self.current_file_id

    def current_filename(self):
        """Return the name of the file last given to process()."""
        return self.file_list[self.current_file_id]

    def all_file_names(self):
        """Return the list of every filename given to process() so far."""
        return self.file_list

    def namespace(self, key, text):
        """Hook: a namespace was read; ``key`` is an int, ``text`` its lowercased name."""
        pass

    def title(self, category, key, title, seek):
        """Hook: a title was read; return True if its body/redirect is wanted."""
        return True

    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        """Hook: the wanted title's body is a redirect to ``rcategory``/``rkey``/``rtitle``."""
        pass

    def body(self, category, key, title, text, seek):
        """Hook: the wanted title's body text (stripped; '' for an empty ``<text ... />``)."""
        pass

    def process(self, filename, limit):
        """Scan one file, invoking the hook methods, and return the remaining limit.

        Arguments:
          filename -- path of the file to scan; appended to the file list
          limit    -- the string 'all', or a number that is decremented each
                      time body() is called; scanning stops when it reaches
                      zero (or below)

        Returns the updated ``limit`` ('all' is returned unchanged).

        The ``seek`` value handed to the hooks is the running offset of the
        unconsumed data at the moment the closing tag was found; for body()
        and redirect() the count of leading whitespace in the raw body is
        added to it.

        NOTE(review): the buffer starts as a native ``str`` while the file is
        opened in binary mode, which relies on Python 2 where read() returns
        ``str``; under Python 3 read() returns ``bytes`` and the first
        non-empty read would fail to concatenate.
        """
        self.file_list.append(filename)
        self.current_file_id = len(self.file_list) - 1

        block = ''
        seek = 0
        key = None
        category = None
        title = None
        body = ''
        # Defined up front so the 'zero' action (no body text at all) never
        # reads an unbound or stale value.
        body_leading_blanks = 0
        end = False
        more = True
        wanted = True
        current_state = self.StateMachine['start']

        # The context manager guarantees the file is closed on every exit
        # path, including the early return at end of data.
        with open(filename, 'rb') as xml_file:
            while True:
                # Refill when asked to, or when the buffer is running low.
                if more or (not end and len(block) < 1024):
                    more = False
                    chunk = xml_file.read(65536)
                    if chunk:
                        block += chunk
                    else:
                        end = True

                # Pick the transition whose tag occurs earliest in the buffer;
                # on a tie the one listed first in the state wins.
                pos = -1
                state = None
                for candidate in current_state:
                    found_at = block.find(candidate[0])
                    if found_at >= 0 and (pos < 0 or found_at < pos):
                        pos = found_at
                        state = candidate

                if state is None:
                    if end:
                        return limit
                    more = True  # tag not in the buffer yet: read further
                    continue

                (_tag, length, proc, next_state) = state
                current_state = self.StateMachine[next_state]
                deliver_body = False

                if proc == 'key':
                    # text is: <key>" other-attributes...  -> keep only the key
                    key = block[:pos].split('"')[0].strip()

                elif proc == 'namespace':
                    category = block[:pos].strip().lower()
                    key = int(key)
                    self.category_to_key[category] = key
                    self.key_to_category[key] = category
                    self.namespace(key, category)
                    key = None

                elif proc == 'title':
                    (category, key, title) = self.get_category(block[:pos].strip())
                    wanted = self.title(category, key, title, seek)

                elif proc == 'body':
                    raw_body = block[:pos]
                    body = raw_body.strip()
                    body_leading_blanks = pos - len(raw_body.lstrip())
                    deliver_body = True
                    # Only bodies with a '#' near the start can be redirects.
                    # NOTE(review): the original tested the same '#' twice;
                    # the second operand may once have been a different
                    # character lost in transit - confirm against upstream.
                    if '#' in body[0:10]:
                        match = RedirectedTo.regex.match(body)
                        if wanted and match:
                            (rcategory, rkey, rtitle) = self.get_category(match.group(2).strip())
                            self.redirect(category, key, title, rcategory, rkey, rtitle,
                                          seek + body_leading_blanks)
                            deliver_body = False

                elif proc == 'zero':
                    # self-closed <text ... />: report an empty body
                    deliver_body = True
                    body = ''
                    body_leading_blanks = 0

                elif proc == 'category_start':
                    # a new <namespaces> section replaces any earlier mapping
                    self.category_to_key = {}
                    self.key_to_category = {}

                if wanted and deliver_body:
                    self.body(category, key, title, body, seek + body_leading_blanks)
                    if limit != 'all':
                        limit -= 1
                        if limit <= 0:
                            break
                    title = None

                # Consume everything up to and including the matched tag.
                block = block[pos + length:]
                seek += pos + length

        return limit

    def get_category(self, title):
        """split title into category, key, title

        A ``prefix:rest`` title whose lowercased, stripped prefix is a known
        namespace yields ``(prefix, namespace key, rest)``; anything else
        yields ``('', 0, title)`` with the title untouched.
        """
        if ':' in title:
            (category, remainder) = title.split(':', 1)
            category = category.strip().lower()
            remainder = remainder.strip()
            if category in self.category_to_key:
                return (category, self.category_to_key[category], remainder)
        return ('', 0, title)
class MyTestScanner(FileScanner):
    """Test scanner that prints each event and indexes unrestricted article bodies.

    ``count`` is the number of titles seen, ``articles`` the number of bodies
    that passed the word filter, and ``article_index`` maps a title to
    ``[article number, seek, length of text]``.
    """

    def __init__(self, *args, **kw):
        super(MyTestScanner, self).__init__(*args, **kw)
        self.article_index = {}
        self.articles = 0
        self.count = 0

    def namespace(self, key, text):
        """Print the namespace key and name."""
        line = 'namespace "{0:d}"->"{1:s}"'.format(key, text)
        print(line)

    def title(self, category, key, title, seek):
        """Count and print the title; only titles with the article key are wanted."""
        self.count += 1
        line = 'T:{0:d} {1:d} : {2:s}[{3:d}]:{4:s}'.format(
            self.count, seek, category, key, title)
        print(line)
        return self.KEY_ARTICLE == key

    def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
        """Print the redirect source and target."""
        line = 'R:{0:d} {1:d} : {2:s}[{3:d}]:{4:s} -> {5:s}[{6:d}]:{7:s}'.format(
            self.count, seek, category, key, title, rcategory, rkey, rtitle)
        print(line)

    def body(self, category, key, title, text, seek):
        """Index and print the body unless the module-level filter() restricts it."""
        # NOTE(review): the source lost its indentation; the print is assumed
        # to belong to the unrestricted path together with the indexing.
        if filter(title, text):
            return
        self.articles += 1
        self.article_index[title] = [self.articles, seek, len(text)]
        line = 'B:{0:d} {1:d} [{2:s}[{3:d}]{4:s}] : {5:s}'.format(
            self.count, seek, category, key, title, text[:100])
        print(line)
def filter(title, text):
    """Report whether title plus text is restricted, printing the match if so.

    Returns the first element of the pair produced by
    FilterWords.find_restricted() for the concatenated title and text.
    """
    result = FilterWords.find_restricted(title + text)
    (is_restricted, matched) = result
    if is_restricted:
        notice = 'TITLE: "{0:s}" restricted: [{1:s}]'.format(title, matched)
        print(notice)
    return is_restricted
def usage(message):
    """Print an optional error line and the command-line help, then exit.

    Arguments:
      message -- text (or any object convertible with str(), such as an
                 exception) shown as ``error: ...``; None prints help only

    Always terminates the program by raising SystemExit with status 1.
    """
    if message is not None:
        # !s converts first, so exception objects can be passed directly
        print('error: {0!s}'.format(message))
    print('usage: {0:s} <options> xml-file...'.format(os.path.basename(__file__)))
    # Only the options that main() actually hands to getopt are listed.
    print('       --help          This message')
    print('       --verbose       Set the verbose flag')
    print('       --count=n       Number of articles to process, "k" suffix = thousands [all]')
    print('       --debug=n       Set the debug level (numeric) [0]')
    sys.exit(1)
def main():
    """Parse the command line and scan each named XML file with MyTestScanner.

    Options: -h/--help, -v/--verbose, -c/--count=<n|all> (a trailing 'k'
    multiplies by 1000), -d/--debug=<n>.  Sets the module globals
    ``verbose`` and ``debug``.  Any option error is reported through
    usage(), which exits the program.
    """
    global verbose
    global debug

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hvc:d:',
                                   ['help', 'verbose',
                                    'count=',
                                    'debug='])
    except getopt.GetoptError as err:
        # usage() formats text, so hand it the message rather than the object
        usage(str(err))

    verbose = False
    debug = 0
    count = 'all'

    for opt, arg in opts:
        if opt in ('-v', '--verbose'):
            verbose = True
        elif opt in ('-h', '--help'):
            usage(None)
        elif opt in ('-d', '--debug'):
            try:
                debug = int(arg)
            except ValueError:
                usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
        elif opt in ('-c', '--count'):
            # endswith() is safe for an empty value ("--count="), which then
            # fails the int() conversion below instead of raising IndexError
            if arg.endswith('k'):
                arg = arg[:-1] + '000'
            if arg != 'all':
                try:
                    count = int(arg)
                except ValueError:
                    usage('"{0:s}={1:s}" is not numeric'.format(opt, arg))
                if count <= 0:
                    usage('"{0:s}={1:s}" must be > zero'.format(opt, arg))
        else:
            usage('unhandled option: ' + opt)

    if not args:
        usage('no files to process')

    scanner = MyTestScanner()

    for f in args:
        print('Processing file: {0:s}'.format(f))
        # process() returns what is left of the article budget
        count = scanner.process(f, count)
        if 0 == count:
            break
# Script entry point: call main() only when executed directly, not on import.
if "__main__" == __name__:
    main()
|