Sanitizer.php 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386
  1. <?php
  2. /**
  3. * XHTML sanitizer for MediaWiki
  4. *
  5. * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
  6. * http://www.mediawiki.org/
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License along
  19. * with this program; if not, write to the Free Software Foundation, Inc.,
  20. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  21. * http://www.gnu.org/copyleft/gpl.html
  22. *
  23. * @file
  24. * @ingroup Parser
  25. */
  26. /**
  27. * Regular expression to match various types of character references in
  28. * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
  29. */
  /*
   * Alternatives, in capture-group order. The order is part of this
   * constant's contract for whatever code consumes the matches, so it is
   * kept exactly as before:
   *   1: named reference            &name;
   *   2: decimal reference          &#123;
   *   3: hex reference, lower "x"   &#x1f;
   *   4: hex reference, upper "X"   &#X1F;
   *   5: an ampersand that starts none of the above
   *
   * The hex alternatives accept real hexadecimal digits only. With the
   * former [0-9A-Za-z] class, input such as "&#xZZ;" or "&#x41G;" was
   * captured as a hex reference; it now falls through to alternative 5
   * and is treated as a bare ampersand.
   */
  define( 'MW_CHAR_REFS_REGEX',
      '/&([A-Za-z0-9\x80-\xff]+);
       |&\#([0-9]+);
       |&\#x([0-9A-Fa-f]+);
       |&\#X([0-9A-Fa-f]+);
       |(&)/x' );
  36. /**
  37. * Regular expression to match HTML/XML attribute pairs within a tag.
  38. * Allows some... latitude.
  39. * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
  40. */
  // Building blocks for the pattern below: a single attribute-name
  // character, and a single whitespace character (tab, LF, CR or space).
  $attrib = '[A-Za-z0-9]';
  $space = '[\x09\x0a\x0d\x20]';
  // Capture groups: 1 = attribute name, 2 = the whole "=value" part,
  // 3 = double-quoted value, 4 = single-quoted value, 5 = unquoted value,
  // 6 = unquoted "#hex" colour value. The pattern is compiled in extended
  // (x) mode, so the "#" lines inside it are regex comments.
  define( 'MW_ATTRIBS_REGEX',
      "/(?:^|{$space})({$attrib}+)
      ({$space}*={$space}*
      (?:
      # The attribute value: quoted or alone
      \"([^<\"]*)\"
      | '([^<']*)'
      | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
      | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
      # colors are specified like this.
      # We'll be normalizing it.
      )
      )?(?={$space}|\$)/sx" );
  56. /**
  57. * List of all named character entities defined in HTML 4.01
  58. * http://www.w3.org/TR/html4/sgml/entities.html
  59. * @private
  60. */
  global $wgHtmlEntities;
  // Entity name => numeric code point. Names are case-sensitive
  // ('Aacute' and 'aacute' are distinct entries). Four entries per row.
  $wgHtmlEntities = array(
      'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226,
      'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192,
      'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945,
      'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197,
      'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227,
      'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914,
      'beta' => 946, 'brvbar' => 166, 'bull' => 8226, 'cap' => 8745,
      'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162,
      'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827,
      'cong' => 8773, 'copy' => 169, 'crarr' => 8629, 'cup' => 8746,
      'curren' => 164, 'dagger' => 8224, 'Dagger' => 8225, 'darr' => 8595,
      'dArr' => 8659, 'deg' => 176, 'Delta' => 916, 'delta' => 948,
      'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233,
      'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232,
      'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917,
      'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951,
      'ETH' => 208, 'eth' => 240, 'Euml' => 203, 'euml' => 235,
      'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704,
      'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260,
      'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62,
      'harr' => 8596, 'hArr' => 8660, 'hearts' => 9829, 'hellip' => 8230,
      'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238,
      'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465,
      'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953,
      'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239,
      'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955,
      'lang' => 9001, 'laquo' => 171, 'larr' => 8592, 'lArr' => 8656,
      'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970,
      'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249,
      'lsquo' => 8216, 'lt' => 60, 'macr' => 175, 'mdash' => 8212,
      'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924,
      'mu' => 956, 'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211,
      'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713,
      'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925,
      'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212,
      'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210,
      'ograve' => 242, 'oline' => 8254, 'Omega' => 937, 'omega' => 969,
      'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744,
      'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248,
      'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214,
      'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240,
      'perp' => 8869, 'Phi' => 934, 'phi' => 966, 'Pi' => 928,
      'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163,
      'prime' => 8242, 'Prime' => 8243, 'prod' => 8719, 'prop' => 8733,
      'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730,
      'rang' => 9002, 'raquo' => 187, 'rarr' => 8594, 'rArr' => 8658,
      'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174,
      'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207,
      'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352,
      'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173,
      'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764,
      'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721,
      'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179,
      'supe' => 8839, 'szlig' => 223, 'Tau' => 932, 'tau' => 964,
      'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977,
      'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732,
      'times' => 215, 'trade' => 8482, 'Uacute' => 218, 'uacute' => 250,
      'uarr' => 8593, 'uArr' => 8657, 'Ucirc' => 219, 'ucirc' => 251,
      'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978,
      'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252,
      'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221,
      'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255,
      'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 );
  315. /**
  316. * Character entity aliases accepted by MediaWiki
  317. */
  global $wgHtmlEntityAliases;
  // Alias => canonical entity name. Both aliases (one in Hebrew script,
  // one in Arabic script) map to 'rlm'.
  $wgHtmlEntityAliases = array( 'רלמ' => 'rlm', 'رلم' => 'rlm' );
  323. /**
  324. * XHTML sanitizer for MediaWiki
  325. * @ingroup Parser
  326. */
  327. class Sanitizer {
  328. /**
  329. * Cleans up HTML, removes dangerous tags and attributes, and
  330. * removes HTML comments
  331. * @private
  332. * @param string $text
  333. * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
  334. * @param array $args for the processing callback
  335. * @return string
  336. */
  static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
      # $extratags: additional tag names to accept for this call only; they
      # are treated like the built-in "must be closed" tags.
      #
      # The built-in tag tables are built once and kept in function statics.
      # $extratags is merged in on every call: folding it into the statics
      # would freeze the first caller's list for all later callers.
      global $wgUseTidy;

      static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
          $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

      wfProfileIn( __METHOD__ );

      if ( !$staticInitialised ) {
          $htmlpairsStatic = array( # Tags that must be closed
              'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
              'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
              'strike', 'strong', 'tt', 'var', 'div', 'center',
              'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
              'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
          );
          $htmlsingle = array(
              'br', 'hr', 'li', 'dt', 'dd'
          );
          $htmlsingleonly = array( # Elements that cannot have close tags
              'br', 'hr'
          );
          $htmlnest = array( # Tags that can be nested--??
              'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
              'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
          );
          $tabletags = array( # Can only appear inside table, we will close them
              'td', 'th', 'tr',
          );
          $htmllist = array( # Tags used by list
              'ul','ol',
          );
          $listtags = array( # Tags that can appear in a list
              'li',
          );
          $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
          $htmlelementsStatic = array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest );

          # Convert them all to hashtables for faster lookup
          $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
              'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' );
          foreach ( $vars as $var ) {
              $$var = array_flip( $$var );
          }
          $staticInitialised = true;
      }

      # Per-call tables: built-in tags plus this caller's extra tags.
      # Only key presence is tested below, so the flipped values are unused.
      $extratags = array_flip( $extratags );
      $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
      $htmlelements = array_merge( $extratags, $htmlelementsStatic );

      # Remove HTML comments
      $text = Sanitizer::removeHTMLcomments( $text );

      # Split on '<'. The first piece precedes any tag and only needs '>'
      # escaped; every later piece starts where a '<' was.
      $bits = explode( '<', $text );
      $text = str_replace( '>', '&gt;', array_shift( $bits ) );

      if ( !$wgUseTidy ) {
          # $tagstack holds the currently open tags. Each <table> starts a
          # fresh stack; the outer one is parked on $tablestack.
          $tagstack = $tablestack = array();
          foreach ( $bits as $x ) {
              $regs = array();
              if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
              } else {
                  $slash = $t = $params = $brace = $rest = null;
              }

              $badtag = 0;
              if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                  # Check our stack
                  if ( $slash ) {
                      # Closing a tag...
                      if ( isset( $htmlsingleonly[$t] ) ) {
                          $badtag = 1;
                      } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
                          if ( isset( $htmlsingleallowed[$ot] ) ) {
                              # Pop all elements with an optional close tag
                              # and see if we find a match below them
                              # NOTE(review): when </table> is matched through
                              # this path, $tablestack is not popped - confirm
                              # whether that is intended.
                              $optstack = array();
                              array_push( $optstack, $ot );
                              while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
                                  isset( $htmlsingleallowed[$ot] ) )
                              {
                                  array_push( $optstack, $ot );
                              }
                              if ( $t != $ot ) {
                                  # No match. Push the optional elements back again
                                  $badtag = 1;
                                  while ( $ot = @array_pop( $optstack ) ) {
                                      array_push( $tagstack, $ot );
                                  }
                              }
                          } else {
                              @array_push( $tagstack, $ot );
                              # <li> can be nested in <ul> or <ol>, skip those cases:
                              if ( !( isset( $htmllist[$ot] ) && isset( $listtags[$t] ) ) ) {
                                  $badtag = 1;
                              }
                          }
                      } else {
                          if ( $t == 'table' ) {
                              # Leaving a table: restore the enclosing stack
                              $tagstack = array_pop( $tablestack );
                          }
                      }
                      $newparams = '';
                  } else {
                      # Keep track for later
                      if ( isset( $tabletags[$t] ) &&
                          !in_array( 'table', $tagstack ) ) {
                          $badtag = 1;
                      } else if ( in_array( $t, $tagstack ) &&
                          !isset( $htmlnest[$t] ) ) {
                          $badtag = 1;
                      # Is it a self closed htmlpair ? (bug 5487)
                      } else if ( $brace == '/>' &&
                          isset( $htmlpairs[$t] ) ) {
                          $badtag = 1;
                      } elseif ( isset( $htmlsingleonly[$t] ) ) {
                          # Hack to force empty tag for uncloseable elements
                          $brace = '/>';
                      } else if ( isset( $htmlsingle[$t] ) ) {
                          # Hack to not close $htmlsingle tags
                          $brace = NULL;
                      } else if ( isset( $tabletags[$t] )
                          && in_array( $t, $tagstack ) ) {
                          // New table tag but forgot to close the previous one
                          $text .= "</$t>";
                      } else {
                          if ( $t == 'table' ) {
                              array_push( $tablestack, $tagstack );
                              $tagstack = array();
                          }
                          array_push( $tagstack, $t );
                      }

                      # Replace any variables or template parameters with
                      # plaintext results.
                      if ( is_callable( $processCallback ) ) {
                          call_user_func_array( $processCallback, array( &$params, $args ) );
                      }

                      # Strip non-approved attributes from the tag
                      $newparams = Sanitizer::fixTagAttributes( $params, $t );
                  }
                  if ( !$badtag ) {
                      $rest = str_replace( '>', '&gt;', $rest );
                      $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                      $text .= "<$slash$t$newparams$close>$rest";
                      continue;
                  }
              }
              # Not an accepted tag (or a rejected one): emit it as plain text
              $text .= '&lt;' . str_replace( '>', '&gt;', $x );
          }
          # Close off any remaining tags
          while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
              $text .= "</$t>\n";
              if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
          }
      } else {
          # this might be possible using tidy itself
          # No tag-stack bookkeeping in this branch: tags are only filtered
          # by name and have their attributes cleaned.
          foreach ( $bits as $x ) {
              $regs = array();
              if ( preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', $x, $regs ) ) {
                  list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
              } else {
                  $slash = $t = $params = $brace = $rest = null;
              }
              if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                  if ( is_callable( $processCallback ) ) {
                      call_user_func_array( $processCallback, array( &$params, $args ) );
                  }
                  $newparams = Sanitizer::fixTagAttributes( $params, $t );
                  $rest = str_replace( '>', '&gt;', $rest );
                  $text .= "<$slash$t$newparams$brace$rest";
              } else {
                  $text .= '&lt;' . str_replace( '>', '&gt;', $x );
              }
          }
      }
      wfProfileOut( __METHOD__ );
      return $text;
  }
  502. /**
  503. * Remove '<!--', '-->', and everything between.
  504. * To avoid leaving blank lines, when a comment is both preceded
  505. * and followed by a newline (ignoring spaces), trim leading and
  506. * trailing spaces and one of the newlines.
  507. *
  508. * @private
  509. * @param string $text
  510. * @return string
  511. */
  static function removeHTMLcomments( $text ) {
      wfProfileIn( __METHOD__ );
      for ( ;; ) {
          $open = strpos( $text, '<!--' );
          if ( $open === false ) {
              break;
          }
          $close = strpos( $text, '-->', $open + 4 );
          if ( $close === false ) {
              # Unterminated comment: leave the rest of the text alone
              break;
          }
          # Make $close the offset just past '-->'
          $close += 3;

          # $left walks back over spaces before the comment (starting at
          # the character preceding it); $right walks forward over spaces
          # after the comment.
          $left = max( $open - 1, 0 );
          while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
              $left--;
          }
          $right = $close;
          while ( substr( $text, $right, 1 ) === ' ' ) {
              $right++;
          }

          $aloneOnLine = substr( $text, $left, 1 ) === "\n"
              && substr( $text, $right, 1 ) === "\n";
          if ( $aloneOnLine ) {
              # Newline on both sides: replace everything from the leading
              # newline through the trailing one with a single newline.
              $text = substr_replace( $text, "\n", $left, $right - $left + 1 );
          } else {
              # Otherwise cut out the comment itself and nothing else.
              $text = substr_replace( $text, '', $open, $close - $open );
          }
      }
      wfProfileOut( __METHOD__ );
      return $text;
  }
  544. /**
  545. * Take an array of attribute names and values and normalize or discard
  546. * illegal values for the given element type.
  547. *
  548. * - Discards attributes not on a whitelist for the given element
  549. * - Unsafe style attributes are discarded
  550. * - Invalid id attributes are reencoded
  551. *
  552. * @param array $attribs
  553. * @param string $element
  554. * @return array
  555. *
  556. * @todo Check for legal values where the DTD limits things.
  557. * @todo Check for unique id attribute :P
  558. */
  static function validateTagAttributes( $attribs, $element ) {
      # Look up the whitelist for this element, then filter against it.
      $whitelist = self::attributeWhitelist( $element );
      return self::validateAttributes( $attribs, $whitelist );
  }
  563. /**
  564. * Take an array of attribute names and values and normalize or discard
  565. * illegal values for the given whitelist.
  566. *
  567. * - Discards attributes not the given whitelist
  568. * - Unsafe style attributes are discarded
  569. * - Invalid id attributes are reencoded
  570. *
  571. * @param array $attribs
  572. * @param array $whitelist list of allowed attribute names
  573. * @return array
  574. *
  575. * @todo Check for legal values where the DTD limits things.
  576. * @todo Check for unique id attribute :P
  577. */
  static function validateAttributes( $attribs, $whitelist ) {
      # Flip so that allowed names can be tested with isset()
      $allowed = array_flip( $whitelist );
      $clean = array();
      foreach ( $attribs as $name => $val ) {
          if ( isset( $allowed[$name] ) ) {
              if ( $name == 'style' ) {
                  # checkCss() returns false for CSS it considers unsafe;
                  # in that case the whole attribute is dropped.
                  $val = self::checkCss( $val );
                  if ( $val === false ) {
                      continue;
                  }
              }
              if ( $name === 'id' ) {
                  global $wgEnforceHtmlIds;
                  $mode = $wgEnforceHtmlIds ? 'noninitial' : 'xml';
                  $val = self::escapeId( $val, $mode );
              }
              # A later occurrence of the same name replaces an earlier one,
              # so the result holds at most one value per attribute.
              $clean[$name] = $val;
          }
      }
      return $clean;
  }
  605. /**
  606. * Merge two sets of HTML attributes. Conflicting items in the second set
  607. * will override those in the first, except for 'class' attributes which
  608. * will be combined (if they're both strings).
  609. *
  610. * @todo implement merging for other attributes such as style
  611. * @param array $a
  612. * @param array $b
  613. * @return array
  614. */
  static function mergeAttributes( $a, $b ) {
      # Start from a plain merge: entries of $b win over entries of $a.
      $merged = array_merge( $a, $b );

      if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
          return $merged;
      }
      if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
          return $merged;
      }
      if ( $a['class'] === $b['class'] ) {
          return $merged;
      }

      # Two different class strings: combine their whitespace-separated
      # names, keeping the first occurrence of each.
      $names = preg_split( '/\s+/', $a['class'] . ' ' . $b['class'],
          -1, PREG_SPLIT_NO_EMPTY );
      $merged['class'] = implode( ' ', array_unique( $names ) );
      return $merged;
  }
  626. /**
  627. * Pick apart some CSS and check it for forbidden or unsafe structures.
  628. * Returns a sanitized string, or false if it was just too evil.
  629. *
  630. * Currently URL references, 'expression', 'tps' are forbidden.
  631. *
  632. * @param string $value
  633. * @return mixed
  634. */
  static function checkCss( $value ) {
      # Decode character references first so that entity-encoded text is
      # checked in its decoded form.
      $stripped = Sanitizer::decodeCharReferences( $value );

      // Remove any comments; IE gets token splitting wrong
      $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

      # The value handed back to the caller is the decoded, comment-free
      # text. The steps below only prepare a copy for the safety check.
      $value = $stripped;

      // ... and continue checks
      # Expand CSS hex escapes (backslash, 1-6 hex digits, optional single
      # whitespace character) into the characters they stand for. This uses
      # a callback rather than the "e" regex modifier, which PHP 7 removed;
      # there preg_replace() returns NULL and the check below would match
      # nothing.
      $stripped = preg_replace_callback(
          '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
          array( __CLASS__, 'checkCssDecodeCallback' ),
          $stripped );

      # Drop any remaining backslashes before matching.
      $stripped = str_replace( '\\', '', $stripped );
      if ( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
          $stripped ) ) {
          # haxx0r
          return false;
      }

      return $value;
  }

  /**
   * preg_replace_callback() helper for checkCss(): turn the hex digits of
   * one CSS escape sequence into the corresponding UTF-8 character.
   *
   * @param array $matches Match set; $matches[1] holds the hex digits
   * @return string
   * @private
   */
  private static function checkCssDecodeCallback( $matches ) {
      return codepointToUtf8( hexdec( $matches[1] ) );
  }
  651. /**
  652. * Take a tag soup fragment listing an HTML element's attributes
  653. * and normalize it to well-formed XML, discarding unwanted attributes.
  654. * Output is safe for further wikitext processing, with escaping of
  655. * values that could trigger problems.
  656. *
  657. * - Normalizes attribute names to lowercase
  658. * - Discards attributes not on a whitelist for the given element
  659. * - Turns broken or invalid entities into plaintext
  660. * - Double-quotes all attribute values
  661. * - Attributes without values are given the name as attribute
  662. * - Double attributes are discarded
  663. * - Unsafe style attributes are discarded
  664. * - Prepends space if there are attributes.
  665. *
  666. * @param string $text
  667. * @param string $element
  668. * @return string
  669. */
  static function fixTagAttributes( $text, $element ) {
      # Nothing but whitespace: no attributes at all
      if ( trim( $text ) == '' ) {
          return '';
      }

      $decoded = self::decodeTagAttributes( $text );
      $allowed = self::validateTagAttributes( $decoded, $element );

      # Emit each surviving attribute as  name="value"  with a leading
      # space, so the result can be appended directly after the tag name.
      $html = '';
      foreach ( $allowed as $name => $val ) {
          $encName = htmlspecialchars( $name );
          $encVal = self::safeEncodeAttribute( $val );
          $html .= " $encName=\"$encVal\"";
      }
      return $html;
  }
  684. /**
  685. * Encode an attribute value for HTML output.
  686. * @param $text
  687. * @return HTML-encoded text fragment
  688. */
  static function encodeAttribute( $text ) {
      $escaped = htmlspecialchars( $text, ENT_QUOTES );

      // Attribute decoding collapses whitespace, so newline, carriage
      // return and tab are written as numeric references here; otherwise
      // they would not survive a round trip.
      return str_replace(
          array( "\n", "\r", "\t" ),
          array( '&#10;', '&#13;', '&#9;' ),
          $escaped );
  }
  701. /**
  702. * Encode an attribute value for HTML tags, with extra armoring
  703. * against further wiki processing.
  704. * @param $text
  705. * @return HTML-encoded text fragment
  706. */
  static function safeEncodeAttribute( $text ) {
      # Sequences that later wiki parsing could act on (templates, links,
      # magic words, table pipes, ...) and their armoured replacements.
      $armor = array(
          '<' => '&lt;', // This should never happen,
          '>' => '&gt;', // we've received invalid input
          '"' => '&quot;', // which should have been escaped.
          '{' => '&#123;',
          '[' => '&#91;',
          "''" => '&#39;&#39;',
          'ISBN' => '&#73;SBN',
          'RFC' => '&#82;FC',
          'PMID' => '&#80;MID',
          '|' => '&#124;',
          '__' => '&#95;_',
      );
      $armored = strtr( self::encodeAttribute( $text ), $armor );

      # Stupid hack
      # Also break up anything matching a URL protocol by encoding its
      # colon (see armorLinksCallback).
      $protocols = '/(' . wfUrlProtocols() . ')/';
      return preg_replace_callback(
          $protocols,
          array( 'Sanitizer', 'armorLinksCallback' ),
          $armored );
  }
  731. /**
  732. * Given a value escape it so that it can be used in an id attribute and
  733. * return it, this does not validate the value however (see first link)
  734. *
  735. * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
  736. * in the id and
  737. * name attributes
  738. * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
  739. *
  740. * @param string $id Id to validate
  741. * @param mixed $options String or array of strings (default is array()):
  742. * 'noninitial': This is a non-initial fragment of an id, not a full id,
  743. * so don't pay attention if the first character isn't valid at the
  744. * beginning of an id.
  745. * 'xml': Don't restrict the id to be HTML4-compatible. This option
  746. * allows any alphabetic character to be used, per the XML standard.
  747. * Therefore, it also completely changes the type of escaping: instead
  748. * of weird dot-encoding, runs of invalid characters (mostly
  749. * whitespace) are just compressed into a single underscore.
  750. * @return string
  751. */
  static function escapeId( $id, $options = array() ) {
      $flags = (array)$options;
      # 'noninitial': the caller will prepend something, so the first
      # character does not have to be a valid id start character.
      $isFragment = in_array( 'noninitial', $flags );

      if ( in_array( 'xml', $flags ) ) {
          # XML-style escaping. For the patterns used, see the XML 1.0 standard,
          # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
          $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
              . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
              . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
          $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
              . '\x{203F}-\x{2040}';

          # Each run of invalid characters becomes one underscore; existing
          # underscores are part of the run so they never double up.
          $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
          $id = trim( $id, '_' );

          $startsValid = preg_match( "/^[$nameStartChar]/u", $id );
          if ( !$startsValid && !$isFragment ) {
              $id = "_$id";
          }
          return $id;
      }

      # HTML4-style escaping: spaces to underscores, URL-encode, then turn
      # the '%' of each escape into '.' (with encoded colons restored).
      $encoded = urlencode( self::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
      $encoded = str_replace( array( '%3A', '%' ), array( ':', '.' ), $encoded );

      if ( !preg_match( '/^[a-zA-Z]/', $encoded ) && !$isFragment ) {
          // Initial character must be a letter!
          $encoded = "x$encoded";
      }
      return $encoded;
  }
  785. /**
  786. * Given a value, escape it so that it can be used as a CSS class and
  787. * return it.
  788. *
  789. * @todo For extra validity, input should be validated UTF-8.
  790. *
  791. * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
  792. *
  793. * @param string $class
  794. * @return string
  795. */
  static function escapeClass( $class ) {
      // Step 1: a leading digit or hyphen, any control/space/punctuation
      // character from the list, and the byte pair C2 A0 each become '_'.
      $ugly = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
      $underscored = preg_replace( $ugly, '_', $class );

      // Step 2: collapse runs of underscores, then drop trailing ones.
      $collapsed = preg_replace( '/_+/', '_', $underscored );
      return rtrim( $collapsed, '_' );
  }
  803. /**
  804. * Given HTML input, escape with htmlspecialchars but un-escape entites.
  805. * This allows (generally harmless) entities like &nbsp; to survive.
  806. *
  807. * @param string $html String to escape
  808. * @return string Escaped input
  809. */
  static function escapeHtmlAllowEntities( $html ) {
      # Escape everything, quotes of both kinds included ...
      $escaped = htmlspecialchars( $html, ENT_QUOTES );
      # ... then turn '&amp;' back into '&' and let
      # normalizeCharReferences() process the result.
      $reopened = str_replace( '&amp;', '&', $escaped );
      return self::normalizeCharReferences( $reopened );
  }
  818. /**
  819. * Regex replace callback for armoring links against further processing.
  820. * @param array $matches
  821. * @return string
  822. * @private
  823. */
  824. private static function armorLinksCallback( $matches ) {
  825. return str_replace( ':', '&#58;', $matches[1] );
  826. }
  827. /**
  828. * Return an associative array of attribute names and values from
  829. * a partial tag string. Attribute names are forces to lowercase,
  830. * character references are decoded to UTF-8 text.
  831. *
  832. * @param string
  833. * @return array
  834. */
  835. public static function decodeTagAttributes( $text ) {
  836. $attribs = array();
  837. if( trim( $text ) == '' ) {
  838. return $attribs;
  839. }
  840. $pairs = array();
  841. if( !preg_match_all(
  842. MW_ATTRIBS_REGEX,
  843. $text,
  844. $pairs,
  845. PREG_SET_ORDER ) ) {
  846. return $attribs;
  847. }
  848. foreach( $pairs as $set ) {
  849. $attribute = strtolower( $set[1] );
  850. $value = Sanitizer::getTagAttributeCallback( $set );
  851. // Normalize whitespace
  852. $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
  853. $value = trim( $value );
  854. // Decode character references
  855. $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
  856. }
  857. return $attribs;
  858. }
  859. /**
  860. * Pick the appropriate attribute value from a match set from the
  861. * MW_ATTRIBS_REGEX matches.
  862. *
  863. * @param array $set
  864. * @return string
  865. * @private
  866. */
  867. private static function getTagAttributeCallback( $set ) {
  868. if( isset( $set[6] ) ) {
  869. # Illegal #XXXXXX color with no quotes.
  870. return $set[6];
  871. } elseif( isset( $set[5] ) ) {
  872. # No quotes.
  873. return $set[5];
  874. } elseif( isset( $set[4] ) ) {
  875. # Single-quoted
  876. return $set[4];
  877. } elseif( isset( $set[3] ) ) {
  878. # Double-quoted
  879. return $set[3];
  880. } elseif( !isset( $set[2] ) ) {
  881. # In XHTML, attributes must have a value.
  882. # For 'reduced' form, return explicitly the attribute name here.
  883. return $set[1];
  884. } else {
  885. throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
  886. }
  887. }
  888. /**
  889. * Normalize whitespace and character references in an XML source-
  890. * encoded text for an attribute value.
  891. *
  892. * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
  893. * but note that we're not returning the value, but are returning
  894. * XML source fragments that will be slapped into output.
  895. *
  896. * @param string $text
  897. * @return string
  898. * @private
  899. */
  900. private static function normalizeAttributeValue( $text ) {
  901. return str_replace( '"', '&quot;',
  902. self::normalizeWhitespace(
  903. Sanitizer::normalizeCharReferences( $text ) ) );
  904. }
  905. private static function normalizeWhitespace( $text ) {
  906. return preg_replace(
  907. '/\r\n|[\x20\x0d\x0a\x09]/',
  908. ' ',
  909. $text );
  910. }
  911. /**
  912. * Ensure that any entities and character references are legal
  913. * for XML and XHTML specifically. Any stray bits will be
  914. * &amp;-escaped to result in a valid text fragment.
  915. *
  916. * a. any named char refs must be known in XHTML
  917. * b. any numeric char refs must be legal chars, not invalid or forbidden
  918. * c. use &#x, not &#X
  919. * d. fix or reject non-valid attributes
  920. *
  921. * @param string $text
  922. * @return string
  923. * @private
  924. */
  925. static function normalizeCharReferences( $text ) {
  926. return preg_replace_callback(
  927. MW_CHAR_REFS_REGEX,
  928. array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
  929. $text );
  930. }
  931. /**
  932. * @param string $matches
  933. * @return string
  934. */
  935. static function normalizeCharReferencesCallback( $matches ) {
  936. $ret = null;
  937. if( $matches[1] != '' ) {
  938. $ret = Sanitizer::normalizeEntity( $matches[1] );
  939. } elseif( $matches[2] != '' ) {
  940. $ret = Sanitizer::decCharReference( $matches[2] );
  941. } elseif( $matches[3] != '' ) {
  942. $ret = Sanitizer::hexCharReference( $matches[3] );
  943. } elseif( $matches[4] != '' ) {
  944. $ret = Sanitizer::hexCharReference( $matches[4] );
  945. }
  946. if( is_null( $ret ) ) {
  947. return htmlspecialchars( $matches[0] );
  948. } else {
  949. return $ret;
  950. }
  951. }
  952. /**
  953. * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  954. * return the named entity reference as is. If the entity is a
  955. * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
  956. * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
  957. *
  958. * @param string $name
  959. * @return string
  960. * @static
  961. */
  962. static function normalizeEntity( $name ) {
  963. global $wgHtmlEntities, $wgHtmlEntityAliases;
  964. if ( isset( $wgHtmlEntityAliases[$name] ) ) {
  965. return "&{$wgHtmlEntityAliases[$name]};";
  966. } elseif( isset( $wgHtmlEntities[$name] ) ) {
  967. return "&$name;";
  968. } else {
  969. return "&amp;$name;";
  970. }
  971. }
  972. static function decCharReference( $codepoint ) {
  973. $point = intval( $codepoint );
  974. if( Sanitizer::validateCodepoint( $point ) ) {
  975. return sprintf( '&#%d;', $point );
  976. } else {
  977. return null;
  978. }
  979. }
  980. static function hexCharReference( $codepoint ) {
  981. $point = hexdec( $codepoint );
  982. if( Sanitizer::validateCodepoint( $point ) ) {
  983. return sprintf( '&#x%x;', $point );
  984. } else {
  985. return null;
  986. }
  987. }
  988. /**
  989. * Returns true if a given Unicode codepoint is a valid character in XML.
  990. * @param int $codepoint
  991. * @return bool
  992. */
  993. private static function validateCodepoint( $codepoint ) {
  994. return ($codepoint == 0x09)
  995. || ($codepoint == 0x0a)
  996. || ($codepoint == 0x0d)
  997. || ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
  998. || ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
  999. || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
  1000. }
  1001. /**
  1002. * Decode any character references, numeric or named entities,
  1003. * in the text and return a UTF-8 string.
  1004. *
  1005. * @param string $text
  1006. * @return string
  1007. * @public
  1008. * @static
  1009. */
  1010. public static function decodeCharReferences( $text ) {
  1011. return preg_replace_callback(
  1012. MW_CHAR_REFS_REGEX,
  1013. array( 'Sanitizer', 'decodeCharReferencesCallback' ),
  1014. $text );
  1015. }
  1016. /**
  1017. * @param string $matches
  1018. * @return string
  1019. */
  1020. static function decodeCharReferencesCallback( $matches ) {
  1021. if( $matches[1] != '' ) {
  1022. return Sanitizer::decodeEntity( $matches[1] );
  1023. } elseif( $matches[2] != '' ) {
  1024. return Sanitizer::decodeChar( intval( $matches[2] ) );
  1025. } elseif( $matches[3] != '' ) {
  1026. return Sanitizer::decodeChar( hexdec( $matches[3] ) );
  1027. } elseif( $matches[4] != '' ) {
  1028. return Sanitizer::decodeChar( hexdec( $matches[4] ) );
  1029. }
  1030. # Last case should be an ampersand by itself
  1031. return $matches[0];
  1032. }
  1033. /**
  1034. * Return UTF-8 string for a codepoint if that is a valid
  1035. * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
  1036. * @param int $codepoint
  1037. * @return string
  1038. * @private
  1039. */
  1040. static function decodeChar( $codepoint ) {
  1041. if( Sanitizer::validateCodepoint( $codepoint ) ) {
  1042. return codepointToUtf8( $codepoint );
  1043. } else {
  1044. return UTF8_REPLACEMENT;
  1045. }
  1046. }
  1047. /**
  1048. * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
  1049. * return the UTF-8 encoding of that character. Otherwise, returns
  1050. * pseudo-entity source (eg &foo;)
  1051. *
  1052. * @param string $name
  1053. * @return string
  1054. */
  1055. static function decodeEntity( $name ) {
  1056. global $wgHtmlEntities, $wgHtmlEntityAliases;
  1057. if ( isset( $wgHtmlEntityAliases[$name] ) ) {
  1058. $name = $wgHtmlEntityAliases[$name];
  1059. }
  1060. if( isset( $wgHtmlEntities[$name] ) ) {
  1061. return codepointToUtf8( $wgHtmlEntities[$name] );
  1062. } else {
  1063. return "&$name;";
  1064. }
  1065. }
  1066. /**
  1067. * Fetch the whitelist of acceptable attributes for a given
  1068. * element name.
  1069. *
  1070. * @param string $element
  1071. * @return array
  1072. */
  1073. static function attributeWhitelist( $element ) {
  1074. static $list;
  1075. if( !isset( $list ) ) {
  1076. $list = Sanitizer::setupAttributeWhitelist();
  1077. }
  1078. return isset( $list[$element] )
  1079. ? $list[$element]
  1080. : array();
  1081. }
  1082. /**
  1083. * Foreach array key (an allowed HTML element), return an array
  1084. * of allowed attributes
  1085. * @return array
  1086. */
  1087. static function setupAttributeWhitelist() {
  1088. $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
  1089. $block = array_merge( $common, array( 'align' ) );
  1090. $tablealign = array( 'align', 'char', 'charoff', 'valign' );
  1091. $tablecell = array( 'abbr',
  1092. 'axis',
  1093. 'headers',
  1094. 'scope',
  1095. 'rowspan',
  1096. 'colspan',
  1097. 'nowrap', # deprecated
  1098. 'width', # deprecated
  1099. 'height', # deprecated
  1100. 'bgcolor' # deprecated
  1101. );
  1102. # Numbers refer to sections in HTML 4.01 standard describing the element.
  1103. # See: http://www.w3.org/TR/html4/
  1104. $whitelist = array (
  1105. # 7.5.4
  1106. 'div' => $block,
  1107. 'center' => $common, # deprecated
  1108. 'span' => $block, # ??
  1109. # 7.5.5
  1110. 'h1' => $block,
  1111. 'h2' => $block,
  1112. 'h3' => $block,
  1113. 'h4' => $block,
  1114. 'h5' => $block,
  1115. 'h6' => $block,
  1116. # 7.5.6
  1117. # address
  1118. # 8.2.4
  1119. # bdo
  1120. # 9.2.1
  1121. 'em' => $common,
  1122. 'strong' => $common,
  1123. 'cite' => $common,
  1124. # dfn
  1125. 'code' => $common,
  1126. # samp
  1127. # kbd
  1128. 'var' => $common,
  1129. # abbr
  1130. # acronym
  1131. # 9.2.2
  1132. 'blockquote' => array_merge( $common, array( 'cite' ) ),
  1133. # q
  1134. # 9.2.3
  1135. 'sub' => $common,
  1136. 'sup' => $common,
  1137. # 9.3.1
  1138. 'p' => $block,
  1139. # 9.3.2
  1140. 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
  1141. # 9.3.4
  1142. 'pre' => array_merge( $common, array( 'width' ) ),
  1143. # 9.4
  1144. 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
  1145. 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
  1146. # 10.2
  1147. 'ul' => array_merge( $common, array( 'type' ) ),
  1148. 'ol' => array_merge( $common, array( 'type', 'start' ) ),
  1149. 'li' => array_merge( $common, array( 'type', 'value' ) ),
  1150. # 10.3
  1151. 'dl' => $common,
  1152. 'dd' => $common,
  1153. 'dt' => $common,
  1154. # 11.2.1
  1155. 'table' => array_merge( $common,
  1156. array( 'summary', 'width', 'border', 'frame',
  1157. 'rules', 'cellspacing', 'cellpadding',
  1158. 'align', 'bgcolor',
  1159. ) ),
  1160. # 11.2.2
  1161. 'caption' => array_merge( $common, array( 'align' ) ),
  1162. # 11.2.3
  1163. 'thead' => array_merge( $common, $tablealign ),
  1164. 'tfoot' => array_merge( $common, $tablealign ),
  1165. 'tbody' => array_merge( $common, $tablealign ),
  1166. # 11.2.4
  1167. 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
  1168. 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
  1169. # 11.2.5
  1170. 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
  1171. # 11.2.6
  1172. 'td' => array_merge( $common, $tablecell, $tablealign ),
  1173. 'th' => array_merge( $common, $tablecell, $tablealign ),
  1174. # 13.2
  1175. # Not usually allowed, but may be used for extension-style hooks
  1176. # such as <math> when it is rasterized
  1177. 'img' => array_merge( $common, array( 'alt' ) ),
  1178. # 15.2.1
  1179. 'tt' => $common,
  1180. 'b' => $common,
  1181. 'i' => $common,
  1182. 'big' => $common,
  1183. 'small' => $common,
  1184. 'strike' => $common,
  1185. 's' => $common,
  1186. 'u' => $common,
  1187. # 15.2.2
  1188. 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
  1189. # basefont
  1190. # 15.3
  1191. 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
  1192. # XHTML Ruby annotation text module, simple ruby only.
  1193. # http://www.w3c.org/TR/ruby/
  1194. 'ruby' => $common,
  1195. # rbc
  1196. # rtc
  1197. 'rb' => $common,
  1198. 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
  1199. 'rp' => $common,
  1200. # MathML root element, where used for extensions
  1201. # 'title' may not be 100% valid here; it's XHTML
  1202. # http://www.w3.org/TR/REC-MathML/
  1203. 'math' => array( 'class', 'style', 'id', 'title' ),
  1204. );
  1205. return $whitelist;
  1206. }
  1207. /**
  1208. * Take a fragment of (potentially invalid) HTML and return
  1209. * a version with any tags removed, encoded as plain text.
  1210. *
  1211. * Warning: this return value must be further escaped for literal
  1212. * inclusion in HTML output as of 1.10!
  1213. *
  1214. * @param string $text HTML fragment
  1215. * @return string
  1216. */
  1217. static function stripAllTags( $text ) {
  1218. # Actual <tags>
  1219. $text = StringUtils::delimiterReplace( '<', '>', '', $text );
  1220. # Normalize &entities and whitespace
  1221. $text = self::decodeCharReferences( $text );
  1222. $text = self::normalizeWhitespace( $text );
  1223. return $text;
  1224. }
  1225. /**
  1226. * Hack up a private DOCTYPE with HTML's standard entity declarations.
  1227. * PHP 4 seemed to know these if you gave it an HTML doctype, but
  1228. * PHP 5.1 doesn't.
  1229. *
  1230. * Use for passing XHTML fragments to PHP's XML parsing functions
  1231. *
  1232. * @return string
  1233. * @static
  1234. */
  1235. static function hackDocType() {
  1236. global $wgHtmlEntities;
  1237. $out = "<!DOCTYPE html [\n";
  1238. foreach( $wgHtmlEntities as $entity => $codepoint ) {
  1239. $out .= "<!ENTITY $entity \"&#$codepoint;\">";
  1240. }
  1241. $out .= "]>\n";
  1242. return $out;
  1243. }
  1244. static function cleanUrl( $url ) {
  1245. # Normalize any HTML entities in input. They will be
  1246. # re-escaped by makeExternalLink().
  1247. $url = Sanitizer::decodeCharReferences( $url );
  1248. # Escape any control characters introduced by the above step
  1249. $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
  1250. # Validate hostname portion
  1251. $matches = array();
  1252. if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
  1253. list( /* $whole */, $protocol, $host, $rest ) = $matches;
  1254. // Characters that will be ignored in IDNs.
  1255. // http://tools.ietf.org/html/3454#section-3.1
  1256. // Strip them before further processing so blacklists and such work.
  1257. $strip = "/
  1258. \\s| # general whitespace
  1259. \xc2\xad| # 00ad SOFT HYPHEN
  1260. \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
  1261. \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
  1262. \xe2\x81\xa0| # 2060 WORD JOINER
  1263. \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
  1264. \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
  1265. \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
  1266. \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
  1267. \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
  1268. \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
  1269. \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
  1270. [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
  1271. /xuD";
  1272. $host = preg_replace( $strip, '', $host );
  1273. // @fixme: validate hostnames here
  1274. return $protocol . $host . $rest;
  1275. } else {
  1276. return $url;
  1277. }
  1278. }
  1279. }