check-string.lua 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. local string_gsub = string.gsub
  2. local string_lower = string.lower
  3. local string_len = string.len
  4. local string_find = string.find
  5. local ipairs = ipairs
  6. local type = type
  7. local empty_whitelist = {} -- Must be empty table.
  -- Reduce a chat message to a canonical form for bad-word matching.
  --
  -- The string is lower-cased, common look-alike symbols are mapped to letters,
  -- punctuation and whitespace are stripped, and a "\0" byte is inserted
  -- wherever two pieces of text must NOT be allowed to run together (numeric
  -- expressions, "it's", "as s...", "but ", and so on), so that the letters on
  -- either side of it are no longer adjacent for later pattern searches.
  --
  -- @param str  The raw message (string).
  -- @return     The normalized string.
  local function normalize_string(str)
    local sub = string_gsub
    str = string_lower(str)

    -- Remove these first; "\0" is reused below as our own separator marker,
    -- so none may come in from the input.
    str = sub(str, "%z", "") -- Zero byte.
    str = sub(str, "%c", "") -- Control bytes.

    -- Ignore numbers and number-like sequences ("2 x 4", "3.5", "1-2").
    str = sub(str, "%d%s*[%*%-%/%+x%.]%s*%d", "\0")

    -- Normalize certain symbols to alphabetical.
    str = sub(str, "%$", "s")
    str = sub(str, "3", "e")
    str = sub(str, "5", "s")
    str = sub(str, "2", "s")
    str = sub(str, "0", "o")
    str = sub(str, "1", "i")
    str = sub(str, "!", "i")
    str = sub(str, "@", "a")
    str = sub(str, "z", "s")
    str = sub(str, "υ", "u")
    str = sub(str, "ph", "f")
    str = sub(str, "\\/", "v")
    str = sub(str, "%(%)", "o")

    -- Ignore contraction for 'he will'. Common token.
    str = sub(str, "he'll", "he\0ll")
    str = sub(str, "b/c", "because")

    -- Ignore "<word>t its " (e.g. "what its ..."), which would otherwise
    -- collapse into "...tits...". Done with gsub so that EVERY occurrence in
    -- the message is protected, not only the first one. The trailing space is
    -- kept; it is stripped with all other whitespace at the end.
    str = sub(str, "(%w)t its ", "%1t\0its\0 ")
    str = sub(str, "it's", "it\0s")
    str = sub(str, " it[ %p]", "\0it\0")

    -- Remove symbols that will interfere with our patterns.
    str = sub(str, "%p", "") -- Punctuation.

    -- Some badwords need special treatment. Preserve the space-break in front
    -- of some words using another character.
    str = sub(str, " ass", "\0ass") -- Fix false-negatives with strings like "you are an ass".

    -- Ignore false-positives like "same name as server": split "as s<x>" into
    -- "as\0s<x>". Applied to every (non-overlapping) occurrence.
    str = sub(str, "as s(%w)", "as\0s%1")

    str = sub(str, "but ", "but\0") -- Fix false-positives with strings like "but there arent".
    str = sub(str, "put ", "put\0") -- Fix 'puto' (spanish) conflicting with "put on armor/put torch".
    str = sub(str, "had ", "had\0") -- Ignore false-positives like "had 3 solars".

    -- Remove all spaces.
    str = sub(str, "%s", "")
    return str
  end
  -- Report whether the range [p3, p4] ends strictly before the range
  -- [p1, p2] begins. Only p1 and p4 take part in the comparison; p2 and p3
  -- are accepted so the signature mirrors range_overlaps.
  -- Returns true when it does, and nothing (nil) otherwise.
  local function range_before(p1, p2, p3, p4)
    if not (p4 < p1) then
      return
    end
    return true
  end
  -- Report whether the inclusive ranges [p1, p2] and [p3, p4] share at least
  -- one position. The test is done by checking whether any endpoint of one
  -- range lies inside the other range.
  -- Returns true or false.
  local function range_overlaps(p1, p2, p3, p4)
    local start_b_in_a = (p3 <= p2) and (p3 >= p1)
    local start_a_in_b = (p1 >= p3) and (p1 <= p4)
    local end_a_in_b = (p2 <= p4) and (p2 >= p3)
    local end_b_in_a = (p4 >= p1) and (p4 <= p2)
    return start_b_in_a or start_a_in_b or end_a_in_b or end_b_in_a
  end
  -- Decide whether the match norm[p1..p2] is excused by a whitelisted word.
  -- For each whitelist pattern, search from the earliest position at which an
  -- occurrence of that pattern's length could still reach p1, skip occurrences
  -- that end before the match, and accept the first one that overlaps it.
  local function match_is_whitelisted(norm, whitelist, p1, p2)
    for _, white in ipairs(whitelist) do
      local sf = p1 - string_len(white) + 1
      if sf < 1 then sf = 1 end
      while true do
        local p3, p4 = string_find(norm, white, sf)
        if not p3 then
          break -- This whitelisted word does not occur from here on.
        end
        if range_before(p1, p2, p3, p4) then
          -- Occurrence lies entirely before the match; look further right.
          -- (Restarting just past p3 finds the same next occurrence as
          -- stepping one byte at a time would.)
          sf = p3 + 1
        else
          if range_overlaps(p1, p2, p3, p4) then
            return true
          end
          break -- Occurrence is past the match; try the next whitelist entry.
        end
      end
    end
    return false
  end

  -- Check a message against a language word list.
  --
  -- Evaluates to 'true' if the string is bad, otherwise returns 'false'.
  --
  -- @param table  Array of entries. Each entry is either a string (the bad
  --               word) or a table with field `word` and an optional array
  --               `white` of whitelisted words that excuse a match they
  --               overlap. Words are handed to string.find without the
  --               'plain' flag, so they are treated as Lua patterns.
  -- @param str    The raw message; it is normalized before matching.
  -- @return       true if some bad word occurs without an overlapping
  --               whitelisted word, false otherwise.
  anticurse.check_string = function(table, str)
    local norm = normalize_string(str)
    local norm_len = string_len(norm)

    -- For all words listed in language table.
    for _, entry in ipairs(table) do
      local word, whitelist
      local kind = type(entry)
      if kind == "string" then
        word, whitelist = entry, empty_whitelist
      elseif kind == "table" then
        word = entry.word
        -- A table entry without a whitelist behaves like a plain string.
        whitelist = entry.white or empty_whitelist
      else
        error("anticurse: word list entries must be strings or tables")
      end

      -- Walk over every occurrence of this word. Running off the end of the
      -- string only ends the scan for THIS word; the remaining words in the
      -- list must still be checked.
      local idx = 1
      while idx <= norm_len do
        local p1, p2 = string_find(norm, word, idx)
        if not p1 then
          break
        end
        if not match_is_whitelisted(norm, whitelist, p1, p2) then
          return true
        end
        -- Advance just one byte past the match start, since bad words can be
        -- substrings of each other.
        idx = p1 + 1
      end
    end

    return false
  end