CheckForBadWords.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2010
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Scan a file for bad words
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import os, sys, re
  9. import littleparser
  10. import getopt
  11. import os.path
  12. import time
  13. import FilterWords
  14. import FileScanner
  15. import PrintLog
  16. # underscore and space
  17. whitespaces = re.compile(r'([\s_]+)', re.IGNORECASE)
  18. verbose = False
  19. show_restricted = False
  20. def usage(message):
  21. if None != message:
  22. print('error: {0:s}'.format(message))
  23. print('usage: {0:s} <options> xml-file...'.format(os.path.basename(__file__)))
  24. print(' --help This message')
  25. print(' --restricted Enable restricted output')
  26. print(' --verbose Enable verbose output')
  27. exit(1)
  28. def main():
  29. global verbose, show_restricted
  30. try:
  31. opts, args = getopt.getopt(sys.argv[1:], 'hvr',
  32. ['help', 'verbose',
  33. 'restricted'
  34. ])
  35. except getopt.GetoptError, err:
  36. usage(err)
  37. verbose = False
  38. show_restricted = False
  39. for opt, arg in opts:
  40. if opt in ('-v', '--verbose'):
  41. verbose = True
  42. elif opt in ('-r', '--restricted'):
  43. show_restricted = True
  44. elif opt in ('-h', '--help'):
  45. usage(None)
  46. else:
  47. usage('unhandled option: ' + opt)
  48. if [] == args:
  49. usage('missing argument(s)')
  50. processor = FileProcessing()
  51. limit = 'all'
  52. for f in args:
  53. limit = processor.process(f, limit)
  54. # record initial counts
  55. a = processor.article_count
  56. r = processor.redirect_count
  57. # record combined count and display statistics
  58. s = a + r
  59. for f in (sys.stdout,):
  60. f.write('Articles: {0:10d}\n'.format(a))
  61. f.write('Redirects: {0:10d}\n'.format(r))
  62. f.write('Sum: {0:10d}\n'.format(s))
  63. f.write('Maybe Restricted: {0:10d}\n'.format(processor.restricted_count))
  64. f.write('UnRestricted: {0:10d}\n'.format(processor.unrestricted_count))
  65. f.write('Restricted: {0:10d}\n'.format(processor.restricted_count - processor.unrestricted_count))
  66. del processor
  67. class FileProcessing(FileScanner.FileScanner):
  68. def __init__(self, *args, **kw):
  69. super(FileProcessing, self).__init__(*args, **kw)
  70. self.restricted_count = 0
  71. self.unrestricted_count = 0
  72. self.redirect_count = 0
  73. self.article_count = 0
  74. self.translate = littleparser.LittleParser().translate
  75. self.time = time.time()
  76. def title(self, category, key, title, seek):
  77. if self.KEY_ARTICLE != key:
  78. if verbose:
  79. PrintLog.message('Non-article: {0:s}:{1:s}'.format(category,title))
  80. return False
  81. return True
  82. def redirect(self, category, key, title, rcategory, rkey, rtitle, seek):
  83. self.redirect_count += 1
  84. if verbose:
  85. PrintLog.message('Redirect: {0:s}:{1:s} -> {2:s}:{3:s}'.format(category, title, rcategory, rtitle))
  86. def body(self, category, key, title, text, seek):
  87. global verbose, show_restricted
  88. restricted_title = FilterWords.is_restricted(title)
  89. restricted_text = FilterWords.is_restricted(text)
  90. restricted = restricted_title or restricted_text
  91. self.article_count += 1
  92. if restricted:
  93. self.restricted_count += 1
  94. if not verbose and self.article_count % 10000 == 0:
  95. start_time = time.time()
  96. PrintLog.message('{0:7.2f}s {1:10d}'.format(start_time - self.time, self.article_count))
  97. self.time = start_time
  98. if verbose:
  99. PrintLog.message('Title: {0:s}'.format(title))
  100. if restricted:
  101. if restricted_title:
  102. t_state = ' Title'
  103. else:
  104. t_state = ''
  105. if restricted_text:
  106. b_state = ' Text'
  107. (flag, contains) = FilterWords.find_restricted(text)
  108. if not flag:
  109. self.unrestricted_count += 1
  110. else:
  111. b_state = ''
  112. contains = None
  113. if show_restricted:
  114. PrintLog.message('{0:10d} Restricted{1:s}{2:s}: {3:s}'
  115. .format(self.restricted_count, t_state, b_state, title))
  116. if None != contains:
  117. PrintLog.message(' -> {0!s:s} {1:s}'.format(flag, contains))
  118. # run the program
  119. if __name__ == "__main__":
  120. main()