LinkFilter.php 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. <?php
  2. /**
  3. * Functions to help implement an external link filter for spam control.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. */
  22. use Wikimedia\Rdbms\LikeMatch;
  23. /**
  24. * Some functions to help implement an external link filter for spam control.
  25. *
  26. * @todo implement the filter. Currently these are just some functions to help
  27. * maintenance/cleanupSpam.php remove links to a single specified domain. The
  28. * next thing is to implement functions for checking a given page against a big
  29. * list of domains.
  30. *
  31. * Another cool thing to do would be a web interface for fast spam removal.
  32. */
  33. class LinkFilter {
  34. /**
  35. * Increment this when makeIndexes output changes. It'll cause
  36. * maintenance/refreshExternallinksIndex.php to run from update.php.
  37. */
  38. const VERSION = 1;
  39. /**
  40. * Check whether $content contains a link to $filterEntry
  41. *
  42. * @param Content $content Content to check
  43. * @param string $filterEntry Domainparts, see makeRegex() for more details
  44. * @param string $protocol 'http://' or 'https://'
  45. * @return int 0 if no match or 1 if there's at least one match
  46. */
  47. public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
  48. if ( !( $content instanceof TextContent ) ) {
  49. // TODO: handle other types of content too.
  50. // Maybe create ContentHandler::matchFilter( LinkFilter ).
  51. // Think about a common base class for LinkFilter and MagicWord.
  52. return 0;
  53. }
  54. $text = $content->getText();
  55. $regex = self::makeRegex( $filterEntry, $protocol );
  56. return preg_match( $regex, $text );
  57. }
  58. /**
  59. * Builds a regex pattern for $filterEntry.
  60. *
  61. * @todo This doesn't match the rest of the functionality here.
  62. * @param string $filterEntry URL, if it begins with "*.", it'll be
  63. * replaced to match any subdomain
  64. * @param string $protocol 'http://' or 'https://'
  65. *
  66. * @return string Regex pattern, for preg_match()
  67. */
  68. private static function makeRegex( $filterEntry, $protocol ) {
  69. $regex = '!' . preg_quote( $protocol, '!' );
  70. if ( substr( $filterEntry, 0, 2 ) == '*.' ) {
  71. $regex .= '(?:[A-Za-z0-9.-]+\.|)';
  72. $filterEntry = substr( $filterEntry, 2 );
  73. }
  74. $regex .= preg_quote( $filterEntry, '!' ) . '!Si';
  75. return $regex;
  76. }
  77. /**
  78. * Indicate whether LinkFilter IDN support is available
  79. * @since 1.33
  80. * @return bool
  81. */
  82. public static function supportsIDN() {
  83. return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
  84. }
  85. /**
  86. * Canonicalize a hostname for el_index
  87. * @param string $host
  88. * @return string
  89. */
  90. private static function indexifyHost( $host ) {
  91. // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
  92. // Canonicalize.
  93. $host = rawurldecode( $host );
  94. if ( $host !== '' && self::supportsIDN() ) {
  95. // @todo Add a PHP fallback
  96. $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
  97. if ( $tmp !== false ) {
  98. $host = $tmp;
  99. }
  100. }
  101. $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
  102. if ( StringUtils::isUtf8( $host ) ) {
  103. // Save a little space by not percent-encoding valid UTF-8 bytes
  104. $okChars .= '\x80-\xf4';
  105. }
  106. $host = preg_replace_callback(
  107. '<[^' . $okChars . ']>',
  108. function ( $m ) {
  109. return rawurlencode( $m[0] );
  110. },
  111. strtolower( $host )
  112. );
  113. // IPv6? RFC 3986 syntax.
  114. if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
  115. $ip = $m[1];
  116. if ( IP::isValid( $ip ) ) {
  117. return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.';
  118. }
  119. if ( substr( $ip, -2 ) === ':*' ) {
  120. $cutIp = substr( $ip, 0, -2 );
  121. if ( IP::isValid( "{$cutIp}::" ) ) {
  122. // Wildcard IP doesn't contain "::", so multiple parts can be wild
  123. $ct = count( explode( ':', $ip ) ) - 1;
  124. return 'V6.' .
  125. implode( '.', array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
  126. '.*.';
  127. }
  128. if ( IP::isValid( "{$cutIp}:1" ) ) {
  129. // Wildcard IP does contain "::", so only the last part is wild
  130. return 'V6.' .
  131. substr( implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
  132. '*.';
  133. }
  134. }
  135. }
  136. // Regularlize explicit specification of the DNS root.
  137. // Browsers seem to do this for IPv4 literals too.
  138. if ( substr( $host, -1 ) === '.' ) {
  139. $host = substr( $host, 0, -1 );
  140. }
  141. // IPv4?
  142. $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
  143. if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
  144. return 'V4.' . implode( '.', array_map( function ( $v ) {
  145. return $v === '*' ? $v : (int)$v;
  146. }, explode( '.', $host ) ) ) . '.';
  147. }
  148. // Must be a host name.
  149. return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
  150. }
  151. /**
  152. * Converts a URL into a format for el_index
  153. * @since 1.33
  154. * @param string $url
  155. * @return string[] Usually one entry, but might be two in case of
  156. * protocol-relative URLs. Empty array on error.
  157. */
  158. public static function makeIndexes( $url ) {
  159. // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
  160. // NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
  161. // than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
  162. // versus "https://" prefix. If you change that, you'll likely need to update
  163. // refreshExternallinksIndex.php accordingly.
  164. $bits = wfParseUrl( $url );
  165. if ( !$bits ) {
  166. return [];
  167. }
  168. // Reverse the labels in the hostname, convert to lower case, unless it's an IP.
  169. // For emails turn it into "domain.reversed@localpart"
  170. if ( $bits['scheme'] == 'mailto' ) {
  171. $mailparts = explode( '@', $bits['host'], 2 );
  172. if ( count( $mailparts ) === 2 ) {
  173. $domainpart = self::indexifyHost( $mailparts[1] );
  174. } else {
  175. // No @, assume it's a local part with no domain
  176. $domainpart = '';
  177. }
  178. $bits['host'] = $domainpart . '@' . $mailparts[0];
  179. } else {
  180. $bits['host'] = self::indexifyHost( $bits['host'] );
  181. }
  182. // Reconstruct the pseudo-URL
  183. $index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
  184. // Leave out user and password. Add the port, path, query and fragment
  185. if ( isset( $bits['port'] ) ) {
  186. $index .= ':' . $bits['port'];
  187. }
  188. if ( isset( $bits['path'] ) ) {
  189. $index .= $bits['path'];
  190. } else {
  191. $index .= '/';
  192. }
  193. if ( isset( $bits['query'] ) ) {
  194. $index .= '?' . $bits['query'];
  195. }
  196. if ( isset( $bits['fragment'] ) ) {
  197. $index .= '#' . $bits['fragment'];
  198. }
  199. if ( $bits['scheme'] == '' ) {
  200. return [ "http:$index", "https:$index" ];
  201. } else {
  202. return [ $index ];
  203. }
  204. }
  205. /**
  206. * Return query conditions which will match the specified string. There are
  207. * several kinds of filter entry:
  208. *
  209. * *.domain.com - Matches domain.com and www.domain.com
  210. * domain.com - Matches domain.com or domain.com/ but not www.domain.com
  211. * *.domain.com/x - Matches domain.com/xy or www.domain.com/xy. Also probably matches
  212. * domain.com/foobar/xy due to limitations of LIKE syntax.
  213. * domain.com/x - Matches domain.com/xy but not www.domain.com/xy
  214. * 192.0.2.* - Matches any IP in 192.0.2.0/24. Can also have a path appended.
  215. * [2001:db8::*] - Matches any IP in 2001:db8::/112. Can also have a path appended.
  216. * [2001:db8:*] - Matches any IP in 2001:db8::/32. Can also have a path appended.
  217. * foo@domain.com - With protocol 'mailto:', matches the email address foo@domain.com.
  218. * *@domain.com - With protocol 'mailto:', matches any email address at domain.com, but
  219. * not subdomains like foo@mail.domain.com
  220. *
  221. * Asterisks in any other location are considered invalid.
  222. *
  223. * @since 1.33
  224. * @param string $filterEntry Filter entry, as described above
  225. * @param array $options Options are:
  226. * - protocol: (string) Protocol to query (default http://)
  227. * - oneWildcard: (bool) Stop at the first wildcard (default false)
  228. * - prefix: (string) Field prefix (default 'el'). The query will test
  229. * fields '{$prefix}_index' and '{$prefix}_index_60'
  230. * - db: (IDatabase|null) Database to use.
  231. * @return array|bool Conditions to be used for the query (to be ANDed) or
  232. * false on error. To determine if the query is constant on the
  233. * el_index_60 field, check whether key 'el_index_60' is set.
  234. */
  235. public static function getQueryConditions( $filterEntry, array $options = [] ) {
  236. $options += [
  237. 'protocol' => 'http://',
  238. 'oneWildcard' => false,
  239. 'prefix' => 'el',
  240. 'db' => null,
  241. ];
  242. // First, get the like array
  243. $like = self::makeLikeArray( $filterEntry, $options['protocol'] );
  244. if ( $like === false ) {
  245. return $like;
  246. }
  247. // Get the constant prefix (i.e. everything up to the first wildcard)
  248. $trimmedLike = self::keepOneWildcard( $like );
  249. if ( $options['oneWildcard'] ) {
  250. $like = $trimmedLike;
  251. }
  252. if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) {
  253. array_pop( $trimmedLike );
  254. }
  255. $index = implode( '', $trimmedLike );
  256. $p = $options['prefix'];
  257. $db = $options['db'] ?: wfGetDB( DB_REPLICA );
  258. // Build the query
  259. $l = strlen( $index );
  260. if ( $l >= 60 ) {
  261. // The constant prefix is larger than el_index_60, so we can use a
  262. // constant comparison.
  263. return [
  264. "{$p}_index_60" => substr( $index, 0, 60 ),
  265. "{$p}_index" . $db->buildLike( $like ),
  266. ];
  267. }
  268. // The constant prefix is smaller than el_index_60, so we use a LIKE
  269. // for a prefix search.
  270. return [
  271. "{$p}_index_60" . $db->buildLike( $index, $db->anyString() ),
  272. "{$p}_index" . $db->buildLike( $like ),
  273. ];
  274. }
  275. /**
  276. * Make an array to be used for calls to Database::buildLike(), which
  277. * will match the specified string.
  278. *
  279. * This function does the same as LinkFilter::makeIndexes(), except it also takes care
  280. * of adding wildcards
  281. *
  282. * @note You probably want self::getQueryConditions() instead
  283. * @param string $filterEntry Filter entry, @see self::getQueryConditions()
  284. * @param string $protocol Protocol (default http://)
  285. * @return array|bool Array to be passed to Database::buildLike() or false on error
  286. */
  287. public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
  288. $db = wfGetDB( DB_REPLICA );
  289. $like = [];
  290. $target = $protocol . $filterEntry;
  291. $bits = wfParseUrl( $target );
  292. if ( !$bits ) {
  293. return false;
  294. }
  295. $subdomains = false;
  296. if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
  297. // Email address with domain and non-empty local part
  298. $mailparts = explode( '@', $bits['host'], 2 );
  299. $domainpart = self::indexifyHost( $mailparts[1] );
  300. if ( $mailparts[0] === '*' ) {
  301. $subdomains = true;
  302. $bits['host'] = $domainpart . '@';
  303. } else {
  304. $bits['host'] = $domainpart . '@' . $mailparts[0];
  305. }
  306. } else {
  307. // Non-email, or email with only a domain part.
  308. $bits['host'] = self::indexifyHost( $bits['host'] );
  309. if ( substr( $bits['host'], -3 ) === '.*.' ) {
  310. $subdomains = true;
  311. $bits['host'] = substr( $bits['host'], 0, -2 );
  312. }
  313. }
  314. $like[] = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
  315. if ( $subdomains ) {
  316. $like[] = $db->anyString();
  317. }
  318. if ( isset( $bits['port'] ) ) {
  319. $like[] = ':' . $bits['port'];
  320. }
  321. if ( isset( $bits['path'] ) ) {
  322. $like[] = $bits['path'];
  323. } elseif ( !$subdomains ) {
  324. $like[] = '/';
  325. }
  326. if ( isset( $bits['query'] ) ) {
  327. $like[] = '?' . $bits['query'];
  328. }
  329. if ( isset( $bits['fragment'] ) ) {
  330. $like[] = '#' . $bits['fragment'];
  331. }
  332. // Check for stray asterisks: asterisk only allowed at the start of the domain
  333. foreach ( $like as $likepart ) {
  334. if ( !( $likepart instanceof LikeMatch ) && strpos( $likepart, '*' ) !== false ) {
  335. return false;
  336. }
  337. }
  338. if ( !( $like[count( $like ) - 1] instanceof LikeMatch ) ) {
  339. // Add wildcard at the end if there isn't one already
  340. $like[] = $db->anyString();
  341. }
  342. return $like;
  343. }
  344. /**
  345. * Filters an array returned by makeLikeArray(), removing everything past first
  346. * pattern placeholder.
  347. *
  348. * @note You probably want self::getQueryConditions() instead
  349. * @param array $arr Array to filter
  350. * @return array Filtered array
  351. */
  352. public static function keepOneWildcard( $arr ) {
  353. if ( !is_array( $arr ) ) {
  354. return $arr;
  355. }
  356. foreach ( $arr as $key => $value ) {
  357. if ( $value instanceof LikeMatch ) {
  358. return array_slice( $arr, 0, $key + 1 );
  359. }
  360. }
  361. return $arr;
  362. }
  363. }