FilterWords.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # COPYRIGHT: Openmoko Inc. 2009
  4. # LICENSE: GPL Version 3 or later
  5. # DESCRIPTION: Filter bad words
  6. # AUTHORS: Sean Moss-Pultz <sean@openmoko.com>
  7. # Christopher Hall <hsw@openmoko.com>
  8. import re
  9. # text must be lower case
  10. # the number is the maximum number of occurances
  11. FILTER_WEIGHTS = {
  12. "pornograph": 3,
  13. "x-rated": 2,
  14. "dildo": 2,
  15. "erotic": 3,
  16. "bdsm": 1,
  17. "felching": 1,
  18. "pegging": 2,
  19. "cum shot": 1,
  20. "anilingus": 1,
  21. "deep-throat": 2,
  22. "fellatio": 1,
  23. "adult-video": 2,
  24. "adult-entertainment": 2,
  25. "son-of-a-bitch": 2,
  26. "dickhead": 3,
  27. "fuck": 3,
  28. "cunt": 3,
  29. }
  30. BAD_WORDS = FILTER_WEIGHTS.keys()
  31. NON_LETTERS = re.compile('[-\d\W]+')
  32. def find_restricted(text):
  33. """check if text contains any restricted words"""
  34. global NON_LETTERS, BAD_WORDS
  35. score = 0
  36. contains = {}
  37. for word in NON_LETTERS.split(text.lower()):
  38. for bad in BAD_WORDS:
  39. if word.startswith(bad):
  40. if bad not in contains:
  41. contains[bad] = 1
  42. else:
  43. contains[bad] += 1
  44. restrict = False
  45. for word in contains:
  46. if contains[word] > FILTER_WEIGHTS[word]:
  47. restrict = True
  48. break
  49. return (restrict, contains)
  50. def is_restricted(text):
  51. """check if text contains any restricted words"""
  52. global BAD_WORDS
  53. text = text.lower()
  54. size = len(text)
  55. for word in BAD_WORDS:
  56. i = 0
  57. while i < size:
  58. p = text.find(word, i)
  59. if 0 == p:
  60. return True
  61. elif p > 0:
  62. if not text[p - 1].isalpha():
  63. return True
  64. i = p + size
  65. else:
  66. break
  67. return False