70_sare_html3.cf 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266
  1. # SARE HTML Ruleset for SpamAssassin - ruleset 3
  2. # Version: 01.03.10
  3. # Created: 2004-03-31
  4. # Modified: 2006-06-03
  5. # Usage instructions, documentation, and change history in 70_sare_html0.cf
  6. #@@# Revision History: Full Revision History stored in 70_sare_html.log
  7. #@@# 01.03.10: June 3 2006
  8. #@@# Minor score tweaks based on recent mass-checks
  9. #@@# Modified "rule has been moved" meta flags
  10. #@@# Archive: SARE_HTML_URI_OPTPHP
  11. #@@# Moved file 1 to 3: SARE_HTML_URI_DEFASP
  12. # License: Artistic - see http://www.rulesemporium.com/license.txt
  13. # Current Maintainer: Bob Menschel - RMSA@Menschel.net
  14. # Current Home: http://www.rulesemporium.com/rules/70_sare_html3.cf
  15. #
  16. ######## ###################### ##################################################
  17. ######## ###################### ##################################################
  18. # Rules renamed or moved
  19. ######## ###################### ##################################################
  # Placeholder for a retired rule. __SARE_HEAD_FALSE ANDs __FROM_AOL_COM with
  # its own negation, so the expression can never be true. SARE_HTML_URI_OPTPHP
  # is defined as that constant-false meta: the name stays defined but never
  # hits (the revision notes at the top of the file list it as archived).
  # __FROM_AOL_COM is not defined in this file as shown; presumably it comes
  # from the stock SpamAssassin rules - verify before relying on it.
  20. meta __SARE_HEAD_FALSE __FROM_AOL_COM && !__FROM_AOL_COM
  21. meta SARE_HTML_URI_OPTPHP __SARE_HEAD_FALSE
  22. ######## ###################### ##################################################
  # Helper sub-tests (double-underscore names). In this file as shown, only the
  # tag-existence tests for title/font/pre/div/p/a/br have a consumer
  # (SARE_HTML_EMPTY); the remaining helpers are not referenced by any visible rule.
  # Body contains at least one non-whitespace character.
  23. body __NONEMPTY_BODY /\S/
  # Header existence test on "ToCc" (presumably SpamAssassin's combined To/Cc
  # pseudo-header - confirm against the Conf documentation).
  24. header __TOCC_EXISTS exists:ToCc
  # One test per HTML tag name, each calling html_tag_exists() with that tag.
  25. rawbody __SARE_HTML_HAS_A eval:html_tag_exists('a')
  26. rawbody __SARE_HTML_HAS_BR eval:html_tag_exists('br')
  27. rawbody __SARE_HTML_HAS_DIV eval:html_tag_exists('div')
  28. rawbody __SARE_HTML_HAS_FONT eval:html_tag_exists('font')
  29. rawbody __SARE_HTML_HAS_IMG eval:html_tag_exists('img')
  30. rawbody __SARE_HTML_HAS_P eval:html_tag_exists('p')
  31. rawbody __SARE_HTML_HAS_PRE eval:html_tag_exists('pre')
  32. rawbody __SARE_HTML_HAS_TITLE eval:html_tag_exists('title')
  # Literal tag sequences, case-insensitive. Patterns that begin with ^ only
  # match at the start of the string the regex is applied to.
  # <html> immediately followed by <body>.
  33. rawbody __SARE_HTML_HBODY m'<html><body>'i
  # An opening <body> immediately followed by </html>.
  34. rawbody __SARE_HTML_BEHTML m'<body></html>'i
  # <body> or </body> (the slash is optional) followed by </html>, anchored.
  35. rawbody __SARE_HTML_BEHTML2 m'^</?body></html>'i
  # A closing </font> at the very start of the tested string.
  36. rawbody __SARE_HTML_EFONT m'^</font>'i
  # Closing tags in reversed order: </html> before </body>, anchored.
  37. rawbody __SARE_HTML_EHEB m'^</html></body>'i
  # <center> immediately followed by the start of an HTML comment. Unlike its
  # neighbours this pattern has no /i flag, so it is case-sensitive.
  38. rawbody __SARE_HTML_CMT_CNTR /<center><!--/
  39. ######## ###################### ##################################################
  40. # Is there a message?
  41. ######## ###################### ##################################################
  # Fires when __CTYPE_HTML is true and none of the listed tag tests matched
  # (title, html, font, body, pre, div, p, a, br). __SARE_HTML_HAS_IMG is not
  # part of the list. __CTYPE_HTML, __TAG_EXISTS_HTML and __TAG_EXISTS_BODY are
  # not defined in this file as shown.
  42. meta SARE_HTML_EMPTY __CTYPE_HTML && !( __SARE_HTML_HAS_TITLE || __TAG_EXISTS_HTML || __SARE_HTML_HAS_FONT || __TAG_EXISTS_BODY || __SARE_HTML_HAS_PRE || __SARE_HTML_HAS_DIV || __SARE_HTML_HAS_P || __SARE_HTML_HAS_A || __SARE_HTML_HAS_BR )
  43. describe SARE_HTML_EMPTY Email is HTML format, but common tags not found
  44. score SARE_HTML_EMPTY 0.681
  45. #ham SARE_HTML_EMPTY An "html" format email, 30 Oct 2002, Microsoft Outlook Express 6.00.2600.0000, that used no tags, just one long textual paragraph
  46. #counts SARE_HTML_EMPTY 226s/7h of 333405 corpus (262498s/70907h RM) 05/12/06
  47. #max SARE_HTML_EMPTY 506s/33h of 689155 corpus (348140s/341015h RM) 09/18/05
  48. #counts SARE_HTML_EMPTY 28s/1h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  49. #max SARE_HTML_EMPTY 32s/2h of 54283 corpus (17106s/37177h JH-3.01) 02/13/05
  50. #counts SARE_HTML_EMPTY 0s/0h of 57287 corpus (52272s/5015h MY) 09/22/05
  51. #max SARE_HTML_EMPTY 132s/2h of 26326 corpus (22886s/3440h MY) 02/15/05
  52. #counts SARE_HTML_EMPTY 0s/0h of 13284 corpus (7412s/5872h CT) 05/14/06
  53. #max SARE_HTML_EMPTY 12s/0h of 10826 corpus (6364s/4462h CT) 05/28/05
  54. #counts SARE_HTML_EMPTY 1s/173h of 7500 corpus (1767s/5733h ft) 09/18/05
  55. ######## ###################### ##################################################
  56. # <HTML> and <BODY> tag spamsign
  57. ######## ###################### ##################################################
  # Two closing </body> tags (each may carry extra text before its '>') within
  # one matched string, with any run of characters between them. There is no /s
  # flag, so the '.' between the two tags cannot cross a newline.
  # The scored rule is a meta that simply mirrors the double-underscore sub-test.
  58. rawbody __SARE_HTML_BODY_END2 m'</body[^>]*>.*</body[^>]*>'i
  59. meta SARE_HTML_BODY_END2 __SARE_HTML_BODY_END2
  60. describe SARE_HTML_BODY_END2 Double </body>
  61. score SARE_HTML_BODY_END2 0.444
  62. #hist SARE_HTML_BODY_END2 Contrib by Matt Keller June 7 2004
  63. #note SARE_HTML_BODY_END2 Add/remove HTML_MESSAGE test has no effect
  64. #counts SARE_HTML_BODY_END2 15s/1h of 333405 corpus (262498s/70907h RM) 05/12/06
  65. #max SARE_HTML_BODY_END2 163s/13h of 281655 corpus (110173s/171482h RM) 05/05/05
  66. #counts SARE_HTML_BODY_END2 2s/1h of 9988 corpus (5657s/4331h AxB) 05/14/06
  67. #counts SARE_HTML_BODY_END2 1s/1h of 13284 corpus (7412s/5872h CT) 05/14/06
  68. #max SARE_HTML_BODY_END2 6s/0h of 10826 corpus (6364s/4462h CT) 05/28/05
  69. #counts SARE_HTML_BODY_END2 6s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  70. #counts SARE_HTML_BODY_END2 0s/7h of 42328 corpus (34212s/8116h FVGT) 05/15/06
  71. #counts SARE_HTML_BODY_END2 15s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  72. #max SARE_HTML_BODY_END2 63s/0h of 38858 corpus (15368s/23490h JH-SA3.0rc1) 08/22/04
  73. #counts SARE_HTML_BODY_END2 13s/0h of 106399 corpus (73151s/33248h ML) 05/14/06
  74. #counts SARE_HTML_BODY_END2 52s/2h of 23053 corpus (17334s/5719h MY) 05/14/06
  75. #max SARE_HTML_BODY_END2 69s/2h of 57287 corpus (52272s/5015h MY) 09/22/05
  # Two <html ...> opening tags back to back: nothing, not even whitespace, may
  # sit between the first '>' and the second '<'. Attributes inside either tag
  # are allowed ([^>]*). Case-insensitive.
  76. rawbody SARE_HTML_HTML_DBL /<html[^>]*><html[^>]*>/i
  77. describe SARE_HTML_HTML_DBL Message body has very strange HTML sequence
  78. score SARE_HTML_HTML_DBL 0.639
  79. #ham SARE_HTML_HTML_DBL Verified (several), common to various opt-in lists.
  80. #hist SARE_HTML_HTML_DBL Fred T: FR_HTML_HTML
  81. #hist SARE_HTML_HTML_DBL 2004-06-11: [^>]* added by Bob Menschel
  82. #counts SARE_HTML_HTML_DBL 7s/1h of 333405 corpus (262498s/70907h RM) 05/12/06
  83. #max SARE_HTML_HTML_DBL 168s/0h of 65984 corpus (40739s/25245h RM) 08/21/04
  84. #counts SARE_HTML_HTML_DBL 1s/0h of 9988 corpus (5657s/4331h AxB) 05/14/06
  85. #counts SARE_HTML_HTML_DBL 0s/0h of 13284 corpus (7412s/5872h CT) 05/14/06
  86. #max SARE_HTML_HTML_DBL 9s/0h of 6944 corpus (3188s/3756h CT) 05/19/04
  87. #counts SARE_HTML_HTML_DBL 3s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  88. #counts SARE_HTML_HTML_DBL 25s/0h of 54283 corpus (17106s/37177h JH-3.01) 02/13/05
  89. #max SARE_HTML_HTML_DBL 75s/0h of 32906 corpus (9660s/23246h JH) 05/24/04
  90. #counts SARE_HTML_HTML_DBL 1s/0h of 106399 corpus (73151s/33248h ML) 05/14/06
  91. #counts SARE_HTML_HTML_DBL 8s/1h of 23053 corpus (17334s/5719h MY) 05/14/06
  92. #max SARE_HTML_HTML_DBL 10s/0h of 57287 corpus (52272s/5015h MY) 09/22/05
  93. ######## ###################### ##################################################
  94. # <TITLE> Tag Tests
  95. ######## ###################### ##################################################
  96. # Moved file 1 to 3: SARE_HTML_TITLE_MNY
  # An attribute-less <title> element whose text contains "Money" (any case),
  # with at most 25 characters before it and at most 25 after it, up to </title>.
  97. rawbody SARE_HTML_TITLE_MNY /<title>.{0,25}Money.{0,25}<\/title>/i
  98. describe SARE_HTML_TITLE_MNY HTML Title implies this may be spam
  99. score SARE_HTML_TITLE_MNY 0.458
  100. #ham SARE_HTML_TITLE_MNY confirmed
  101. #hist SARE_HTML_TITLE_MNY Fred T: FR_TITLE_MONEY
  102. #counts SARE_HTML_TITLE_MNY 16s/2h of 333405 corpus (262498s/70907h RM) 05/12/06
  103. #max SARE_HTML_TITLE_MNY 260s/11h of 689155 corpus (348140s/341015h RM) 09/18/05
  104. #counts SARE_HTML_TITLE_MNY 0s/0h of 13287 corpus (7414s/5873h CT) 05/14/06
  105. #max SARE_HTML_TITLE_MNY 0s/1h of 6944 corpus (3188s/3756h CT) 05/19/04
  106. #counts SARE_HTML_TITLE_MNY 0s/0h of 54283 corpus (17106s/37177h JH-3.01) 02/13/05
  107. #max SARE_HTML_TITLE_MNY 7s/0h of 38858 corpus (15368s/23490h JH-SA3.0rc1) 08/22/04
  108. #counts SARE_HTML_TITLE_MNY 2s/0h of 105856 corpus (72598s/33258h ML) 05/14/06
  109. #counts SARE_HTML_TITLE_MNY 15s/0h of 23074 corpus (17350s/5724h MY) 05/14/06
  110. #max SARE_HTML_TITLE_MNY 120s/0h of 57287 corpus (52272s/5015h MY) 09/22/05
  111. ######## ###################### ##################################################
  112. # <A> and HREF rules
  113. ######## ###################### ##################################################
  114. ######## ###################### ##################################################
  115. # Spamsign character sets and fonts
  116. ######## ###################### ##################################################
  # Looks after "style=" (optional opening quote) or after a <style ...> tag for
  # "color: rgb(r, g, b)" where each component is written as 2, then a digit
  # 2-5, then any digit (i.e. the text 220-259). No '>' or '"' may occur between
  # the style start and the colour declaration. The [^-] before "color" demands
  # a non-hyphen character there, so hyphenated properties such as
  # background-color do not match. The match must continue to a closing '>'.
  117. rawbody SARE_HTML_COLOR_B /(?:style="?|<style[^>]*>)[^>"]*[^-]color\s*:\s*rgb\(\s*2[2-5][0-9]\s*,\s*2[2-5][0-9]\s*,\s*2[2-5][0-9]\s*\)[^>]*>/i
  118. describe SARE_HTML_COLOR_B BAD STYLE: color: too light (rgb(n))
  119. score SARE_HTML_COLOR_B 0.621
  120. #ham SARE_HTML_COLOR_B Ticketmaster ticket confirmation emails
  121. #hist SARE_HTML_COLOR_B From Jesse Houwing May 14 2004
  122. #counts SARE_HTML_COLOR_B 20s/4h of 333405 corpus (262498s/70907h RM) 05/12/06
  123. #counts SARE_HTML_COLOR_B 2s/8h of 9988 corpus (5657s/4331h AxB) 05/14/06
  124. #counts SARE_HTML_COLOR_B 1s/1h of 13284 corpus (7412s/5872h CT) 05/14/06
  125. #counts SARE_HTML_COLOR_B 47s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  126. #counts SARE_HTML_COLOR_B 0s/1h of 42328 corpus (34212s/8116h FVGT) 05/15/06
  127. #counts SARE_HTML_COLOR_B 3s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  128. #max SARE_HTML_COLOR_B 5s/0h of 54283 corpus (17106s/37177h JH-3.01) 02/13/05
  129. #counts SARE_HTML_COLOR_B 12s/0h of 106399 corpus (73151s/33248h ML) 05/14/06
  130. #counts SARE_HTML_COLOR_B 8s/0h of 23053 corpus (17334s/5719h MY) 05/14/06
  # Literal, case-sensitive match for lang=PT-BR, optionally with "3D" right
  # after the equals sign (presumably the quoted-printable form "=3D" - confirm).
  # No quote is allowed between '=' and the language code.
  131. rawbody SARE_HTML_LANG_PTBR /lang=(?:3D)?PT-BR/
  132. describe SARE_HTML_LANG_PTBR Odd language
  133. score SARE_HTML_LANG_PTBR 0.189
  134. #hist SARE_HTML_LANG_PTBR LW_PT_BR, Loren Wilton
  135. #counts SARE_HTML_LANG_PTBR 11s/0h of 333405 corpus (262498s/70907h RM) 05/12/06
  136. #max SARE_HTML_LANG_PTBR 213s/0h of 70693 corpus (43127s/27566h RM) 10/02/04
  137. #counts SARE_HTML_LANG_PTBR 0s/1h of 56020 corpus (51687s/4333h AxB2) 05/15/06
  138. #counts SARE_HTML_LANG_PTBR 9s/25h of 13284 corpus (7412s/5872h CT) 05/14/06
  139. #counts SARE_HTML_LANG_PTBR 1s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  140. #counts SARE_HTML_LANG_PTBR 69s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  141. #counts SARE_HTML_LANG_PTBR 2s/0h of 106399 corpus (73151s/33248h ML) 05/14/06
  142. #counts SARE_HTML_LANG_PTBR 0s/0h of 47221 corpus (42968s/4253h MY) 06/18/05
  143. #max SARE_HTML_LANG_PTBR 10s/0h of 19448 corpus (16863s/2585h MY) 10/05/04
  144. ######## ###################### ##################################################
  145. # Invalid or Suspicious URI Tests
  146. ######## ###################### ##################################################
  # Flags URIs that contain "/default.asp?id=" (any letter case).
  # The dot is escaped so that it matches only a literal '.'; left unescaped it
  # matches any character (for example "/defaultXasp?id=").
  147. uri SARE_HTML_URI_DEFASP m'/default\.asp\?id='i
  148. describe SARE_HTML_URI_DEFASP URI to page name which suggests spammer's page
  149. score SARE_HTML_URI_DEFASP 0.093
  150. #hist SARE_HTML_URI_DEFASP Deleted SARE_HTML_URI_X1 = LW_URI_ID due to complete overlap: /\?id\x10\x30\x34\x35/i
  151. #counts SARE_HTML_URI_DEFASP 0s/8h of 333405 corpus (262498s/70907h RM) 05/12/06
  152. #max SARE_HTML_URI_DEFASP 130s/27h of 689155 corpus (348140s/341015h RM) 09/18/05
  153. #counts SARE_HTML_URI_DEFASP 0s/5h of 13287 corpus (7414s/5873h CT) 05/14/06
  154. #max SARE_HTML_URI_DEFASP 44s/0h of 6944 corpus (3188s/3756h CT) 05/19/04
  155. #counts SARE_HTML_URI_DEFASP 1s/1h of 42454 corpus (34336s/8118h FVGT) 05/15/06
  156. #counts SARE_HTML_URI_DEFASP 0s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  157. #max SARE_HTML_URI_DEFASP 361s/0h of 38858 corpus (15368s/23490h JH-SA3.0rc1) 08/22/04
  158. #counts SARE_HTML_URI_DEFASP 24s/0h of 23074 corpus (17350s/5724h MY) 05/14/06
  159. #max SARE_HTML_URI_DEFASP 24s/0h of 57287 corpus (52272s/5015h MY) 09/22/05
  160. ######## ###################### ##################################################
  161. # Image tag tests
  162. ######## ###################### ##################################################
  163. ######## ###################### ##################################################
  164. # Paragraphs, breaks, and spacings
  165. ######## ###################### ##################################################
  # Three bare <P> tags in immediate succession (no attributes and no
  # whitespace between them), in any letter case.
  166. rawbody SARE_HTML_P_MANY3 /<P><P><P>/i
  167. describe SARE_HTML_P_MANY3 Too many empty paragraph tags in a row
  168. score SARE_HTML_P_MANY3 1.108
  169. #hist SARE_HTML_P_MANY3 04/02/2004 http://www.rulesemporium.com/rules/99_FVGT_rawbody.cf
  170. #overlap SARE_HTML_P_MANY3 Total overlap within SARE_HTML_URI_MANYP2, but no ham hits here (until Feb 2005)
  171. #ham SARE_HTML_P_MANY3 From: Ticketmaster <support@reply.ticketmaster.com>, Tuesday, January 25, 2005, 4:00:27 PM
  172. #counts SARE_HTML_P_MANY3 78s/6h of 333405 corpus (262498s/70907h RM) 05/12/06
  173. #max SARE_HTML_P_MANY3 458s/28h of 689155 corpus (348140s/341015h RM) 09/18/05
  174. #counts SARE_HTML_P_MANY3 143s/0h of 56020 corpus (51687s/4333h AxB2) 05/15/06
  175. #counts SARE_HTML_P_MANY3 0s/0h of 11260 corpus (6568s/4692h CT) 06/17/05
  176. #max SARE_HTML_P_MANY3 9s/0h of 6944 corpus (3188s/3756h CT) 05/19/04
  177. #counts SARE_HTML_P_MANY3 412s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  178. #counts SARE_HTML_P_MANY3 50s/0h of 42328 corpus (34212s/8116h FVGT) 05/15/06
  179. #counts SARE_HTML_P_MANY3 4s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  180. #max SARE_HTML_P_MANY3 15s/0h of 32260 corpus (8983s/23277h JH) 05/14/04
  181. #counts SARE_HTML_P_MANY3 9s/0h of 23053 corpus (17334s/5719h MY) 05/14/06
  182. #max SARE_HTML_P_MANY3 41s/0h of 57287 corpus (52272s/5015h MY) 09/22/05
  183. ######## ###################### ##################################################
  184. # Javascript and object tests
  185. ######## ###################### ##################################################
  186. ######## ###################### ##################################################
  187. # Useless tags (tag structures that do nothing)
  188. # Largely submitted by Matt Yackley, with contributions by
  189. # Carl Friend, Jennifer Wheeler, Scott Sprunger, Larry Gilson
  190. ######## ###################### ##################################################
  # An empty element with a one-letter tag name, <x></x>, where the closing tag
  # must repeat the captured letter (\1). The leading negative lookahead rejects
  # the match when both letters come from the set b, i, o, p, u, so <b></b>,
  # <i></i>, <p></p> and similar pairs are ignored. Case-insensitive.
  191. rawbody SARE_HTML_USL_1CHAR m'(?!<[biopu]></[biopu]>)<([a-z])></\1>'i
  192. describe SARE_HTML_USL_1CHAR Invalid and empty 1-char tag - /tag combination
  193. score SARE_HTML_USL_1CHAR 0.029
  194. #counts SARE_HTML_USL_1CHAR 6s/14h of 333405 corpus (262498s/70907h RM) 05/12/06
  195. #max SARE_HTML_USL_1CHAR 46s/6h of 196718 corpus (96193s/100525h RM) 02/22/05
  196. #counts SARE_HTML_USL_1CHAR 3s/0h of 56020 corpus (51687s/4333h AxB2) 05/15/06
  197. #counts SARE_HTML_USL_1CHAR 0s/0h of 10826 corpus (6364s/4462h CT) 05/28/05
  198. #max SARE_HTML_USL_1CHAR 3s/0h of 6944 corpus (3188s/3756h CT) 05/19/04
  199. #counts SARE_HTML_USL_1CHAR 8s/30h of 155408 corpus (103805s/51603h DOC) 05/15/06
  200. #counts SARE_HTML_USL_1CHAR 2s/1h of 42328 corpus (34212s/8116h FVGT) 05/15/06
  201. #counts SARE_HTML_USL_1CHAR 3s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  202. #max SARE_HTML_USL_1CHAR 6s/0h of 54283 corpus (17106s/37177h JH-3.01) 02/13/05
  203. #counts SARE_HTML_USL_1CHAR 2s/0h of 23053 corpus (17334s/5719h MY) 05/14/06
  204. ######## ###################### ##################################################
  205. # Miscellaneous tag tests
  206. ######## ###################### ##################################################
  # Flags a "<body" tag that is followed by two consecutive spaces (any case).
  # The rule name (2SP) and its recorded origin, FR_BODY_2SPACES, show that two
  # spaces are intended; with a single space the pattern matches every <body>
  # tag that carries attributes. \x20{2} spells out the two spaces explicitly so
  # they cannot be lost to whitespace collapsing when the file is copied.
  207. rawbody SARE_HTML_BODY_2SP /<body\x20{2}/i
  208. describe SARE_HTML_BODY_2SP HTML tag is strangely formed
  209. score SARE_HTML_BODY_2SP 0.665
  210. #hist SARE_HTML_BODY_2SP FR_BODY_2SPACES
  211. #counts SARE_HTML_BODY_2SP 682s/152h of 333405 corpus (262498s/70907h RM) 05/12/06
  212. #counts SARE_HTML_BODY_2SP 678s/2h of 9988 corpus (5657s/4331h AxB) 05/14/06
  213. #counts SARE_HTML_BODY_2SP 48s/0h of 13284 corpus (7412s/5872h CT) 05/14/06
  214. #counts SARE_HTML_BODY_2SP 215s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  215. #counts SARE_HTML_BODY_2SP 1455s/8h of 42328 corpus (34212s/8116h FVGT) 05/15/06
  216. #counts SARE_HTML_BODY_2SP 62s/5h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  217. #max SARE_HTML_BODY_2SP 94s/0h of 38858 corpus (15368s/23490h JH-SA3.0rc1) 08/22/04
  218. #counts SARE_HTML_BODY_2SP 361s/2h of 106399 corpus (73151s/33248h ML) 05/14/06
  219. #counts SARE_HTML_BODY_2SP 21s/2h of 23053 corpus (17334s/5719h MY) 05/14/06
  220. #max SARE_HTML_BODY_2SP 66s/2h of 47221 corpus (42968s/4253h MY) 06/18/05
  # Inside a table cell: "<td", then 10-400 characters, then four <br> tags each
  # separated from the next by 1-7 characters, then up to 400 characters before
  # </td>. The /s flag lets '.' cross newlines; /i ignores case. This is
  # declared as a "full" rule, unlike the rawbody rules around it.
  221. full SARE_HTML_TD_BR m'<td.{10,400}<br>.{1,7}<br>.{1,7}<br>.{1,7}<br>.{0,400}</td>'is
  222. describe SARE_HTML_TD_BR Multiple line breaks in spammer pattern
  223. score SARE_HTML_TD_BR 0.934
  224. #hist SARE_HTML_TD_BR Fred T: FR_WICKED_SPAM_??
  225. #counts SARE_HTML_TD_BR 2757s/33h of 333405 corpus (262498s/70907h RM) 05/12/06
  226. #counts SARE_HTML_TD_BR 368s/0h of 56020 corpus (51687s/4333h AxB2) 05/15/06
  227. #counts SARE_HTML_TD_BR 40s/10h of 13284 corpus (7412s/5872h CT) 05/14/06
  228. #counts SARE_HTML_TD_BR 471s/0h of 155408 corpus (103805s/51603h DOC) 05/15/06
  229. #counts SARE_HTML_TD_BR 190s/10h of 42328 corpus (34212s/8116h FVGT) 05/15/06
  230. #counts SARE_HTML_TD_BR 36s/0h of 54067 corpus (16890s/37177h JH-3.01) 06/18/05
  231. #max SARE_HTML_TD_BR 182s/0h of 38858 corpus (15368s/23490h JH-SA3.0rc1) 08/22/04
  232. #counts SARE_HTML_TD_BR 700s/0h of 106399 corpus (73151s/33248h ML) 05/14/06
  233. #counts SARE_HTML_TD_BR 68s/14h of 23053 corpus (17334s/5719h MY) 05/14/06
  234. #max SARE_HTML_TD_BR 184s/15h of 47221 corpus (42968s/4253h MY) 06/18/05
  235. # EOF