RemexCompatMunger.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531
  1. <?php
  2. namespace MediaWiki\Tidy;
  3. use RemexHtml\HTMLData;
  4. use RemexHtml\Serializer\Serializer;
  5. use RemexHtml\Serializer\SerializerNode;
  6. use RemexHtml\Tokenizer\Attributes;
  7. use RemexHtml\Tokenizer\PlainAttributes;
  8. use RemexHtml\TreeBuilder\TreeBuilder;
  9. use RemexHtml\TreeBuilder\TreeHandler;
  10. use RemexHtml\TreeBuilder\Element;
  11. /**
  12. * @internal
  13. */
  14. class RemexCompatMunger implements TreeHandler {
  /**
   * Names of the elements this class treats as inline. The array is used
   * as a set: insertElement() tests membership with isset() on the key,
   * so only the keys matter.
   *
   * @var array
   */
  private static $onlyInlineElements = [
    'a' => true,
    'abbr' => true,
    'acronym' => true,
    'applet' => true,
    'b' => true,
    'basefont' => true,
    'bdo' => true,
    'big' => true,
    'br' => true,
    'button' => true,
    'cite' => true,
    'code' => true,
    'del' => true,
    'dfn' => true,
    'em' => true,
    'font' => true,
    'i' => true,
    'iframe' => true,
    'img' => true,
    'input' => true,
    'ins' => true,
    'kbd' => true,
    'label' => true,
    'legend' => true,
    'map' => true,
    'object' => true,
    'param' => true,
    'q' => true,
    'rb' => true,
    'rbc' => true,
    'rp' => true,
    'rt' => true,
    'rtc' => true,
    'ruby' => true,
    's' => true,
    'samp' => true,
    'select' => true,
    'small' => true,
    'span' => true,
    'strike' => true,
    'strong' => true,
    'sub' => true,
    'sup' => true,
    'textarea' => true,
    'tt' => true,
    'u' => true,
    'var' => true,
    // Those defined in tidy.conf
    'video' => true,
    'audio' => true,
    'bdi' => true,
    'data' => true,
    'time' => true,
    'mark' => true,
  ];
  /**
   * "Metadata" elements, in the sense used by this class: elements that
   * must not start a p-wrapper and must not end one that is already open.
   * These are typically elements that a browser does not render visibly.
   * The list is deliberately partial and covers only the tags that are
   * likely to be encountered in practice. Used as a set, keyed by tag name.
   *
   * @var array
   */
  private static $metadataElements = [
    'link' => true,
    'meta' => true,
    'script' => true,
    'style' => true,
  ];
  /**
   * Names of formatting elements. insertElement() marks an element from
   * this set as splittable when its parent is a p-wrapper or is itself
   * splittable. Used as a set, keyed by tag name.
   *
   * @var array
   */
  private static $formattingElements = [
    'a' => true, 'b' => true, 'big' => true, 'code' => true,
    'em' => true, 'font' => true, 'i' => true, 'nobr' => true,
    's' => true, 'small' => true, 'strike' => true, 'strong' => true,
    'tt' => true, 'u' => true,
  ];
  /**
   * The downstream serializer that every tree event is forwarded to.
   *
   * @var Serializer
   */
  private $serializer;

  /**
   * Whether trace() should emit its messages.
   *
   * @var bool
   */
  private $trace;

  /**
   * @param Serializer $serializer Destination for the (possibly adjusted)
   *   tree events
   * @param bool $trace If true, trace() passes its messages to wfDebug()
   */
  public function __construct( Serializer $serializer, $trace = false ) {
    $this->trace = $trace;
    $this->serializer = $serializer;
  }
  /**
   * Start the document downstream, then attach fresh munger data to the
   * serializer's root node and flag that node as needing p-wrapping.
   *
   * @param string|null $fragmentNamespace Passed through to the serializer
   * @param string|null $fragmentName Passed through to the serializer
   */
  public function startDocument( $fragmentNamespace, $fragmentName ) {
    $this->serializer->startDocument( $fragmentNamespace, $fragmentName );

    $rootData = new RemexMungerData;
    $rootData->needsPWrapping = true;
    $this->serializer->getRootNode()->snData = $rootData;
  }
  /**
   * Forward the end-of-document event to the serializer unchanged.
   *
   * @param int $pos Source position, passed through
   */
  public function endDocument( $pos ) {
    $downstream = $this->serializer;
    $downstream->endDocument( $pos );
  }
  /**
   * Work out which serializer node an insertion is really aimed at.
   *
   * - ROOT: the parent is the serializer's root node and there is no
   *   reference node.
   * - BEFORE: the reference node is the node of $refElement, and the parent
   *   is that node's parent.
   * - otherwise: start from the node of $refElement. If it has been cloned,
   *   follow currentCloneElement links to the newest clone (and store that
   *   clone back on the starting node so the next lookup is a single hop).
   *   If it has not been cloned but has an open p-wrapper child, use the
   *   p-wrapper. The resulting node is returned as both parent and reference.
   *
   * @param int $preposition One of the TreeBuilder placement constants
   * @param Element|null $refElement Not dereferenced for ROOT
   * @return array Two-element list: [ parent node, reference node or null ]
   */
  private function getParentForInsert( $preposition, $refElement ) {
    if ( $preposition === TreeBuilder::ROOT ) {
      return [ $this->serializer->getRootNode(), null ];
    }

    $target = $refElement->userData;
    if ( $preposition === TreeBuilder::BEFORE ) {
      return [ $this->serializer->getParentNode( $target ), $target ];
    }

    $targetData = $target->snData;
    if ( $targetData->currentCloneElement ) {
      // Follow a chain of clone links if necessary
      $firstData = $targetData;
      do {
        $refElement = $targetData->currentCloneElement;
        $target = $refElement->userData;
        $targetData = $target->snData;
      } while ( $targetData->currentCloneElement );
      // Cache the end of the chain in the requested element
      $firstData->currentCloneElement = $refElement;
    } elseif ( $targetData->childPElement ) {
      // Divert into the open p-wrapper
      $target = $targetData->childPElement->userData;
    }

    return [ $target, $target ];
  }
  /**
   * Create an mw:p-wrap element under $parent and register it.
   *
   * The new wrapper's data records that it is a p-wrapper and that $parent
   * is its wrap base; $parent's data records the wrapper as its
   * childPElement.
   *
   * @param SerializerNode $parent Node that receives the wrapper
   * @param int $sourceStart Source position reported for the insertion
   * @return SerializerNode The serializer node of the new wrapper
   */
  private function insertPWrapper( SerializerNode $parent, $sourceStart ) {
    $wrapper = new Element( HTMLData::NS_HTML, 'mw:p-wrap', new PlainAttributes );
    $this->serializer->insertElement(
      TreeBuilder::UNDER, $parent, $wrapper, false, $sourceStart, 0
    );
    $wrapperNode = $wrapper->userData;

    $wrapperData = new RemexMungerData;
    $wrapperData->isPWrapper = true;
    $wrapperData->wrapBaseNode = $parent;
    $wrapperNode->snData = $wrapperData;

    $parent->snData->childPElement = $wrapper;

    return $wrapperNode;
  }
  /**
   * Forward a run of text to the serializer, opening a p-wrapper or
   * splitting the tag stack first where that is required.
   *
   * Text is "blank" when the whole [$start, $start + $length) range
   * consists of tab, LF, FF, CR and space characters. Non-blank text
   * increments the nonblankNodeCount of the node it ends up under.
   *
   * @param int $preposition One of the TreeBuilder placement constants
   * @param Element|null $refElement
   * @param string $text
   * @param int $start
   * @param int $length
   * @param int $sourceStart
   * @param int $sourceLength
   */
  public function characters(
    $preposition, $refElement, $text, $start, $length, $sourceStart, $sourceLength
  ) {
    list( $parent, $refNode ) = $this->getParentForInsert( $preposition, $refElement );
    $data = $parent->snData;
    $isBlank = strspn( $text, "\t\n\f\r ", $start, $length ) === $length;

    if ( $preposition === TreeBuilder::UNDER ) {
      $target = null;
      if ( $data->needsPWrapping && !$isBlank ) {
        // Add a p-wrapper for bare text under body/blockquote
        $target = $this->insertPWrapper( $refNode, $sourceStart );
      } elseif ( $data->isSplittable && !$data->ancestorPNode ) {
        // The parent is splittable and in block mode, so split the tag stack
        $target = $this->splitTagStack( $refNode, true, $sourceStart );
      }
      if ( $target !== null ) {
        $refNode = $target;
        $parent = $target;
        $data = $parent->snData;
      }
    }

    if ( !$isBlank ) {
      // Non-whitespace characters detected
      $data->nonblankNodeCount++;
    }

    $this->serializer->characters(
      $preposition, $refNode, $text, $start, $length, $sourceStart, $sourceLength
    );
  }
  /**
   * Emit a debug message via wfDebug(), prefixed with "[RCM] ", but only
   * when tracing was enabled in the constructor.
   *
   * @param string $msg
   */
  private function trace( $msg ) {
    if ( !$this->trace ) {
      return;
    }
    wfDebug( "[RCM] $msg" );
  }
  /**
   * Insert or reparent an element. Create p-wrappers or split the tag stack
   * as necessary.
   *
   * Consider the following insertion locations. The parent may be:
   *
   *   - A: A body or blockquote (!!needsPWrapping)
   *   - B: A p-wrapper (!!isPWrapper)
   *   - C: A descendant of a p-wrapper (!!ancestorPNode)
   *     - CS: With splittable formatting elements in the stack region up to
   *       the p-wrapper
   *     - CU: With one or more unsplittable elements in the stack region up
   *       to the p-wrapper
   *   - D: Not a descendant of a p-wrapper (!ancestorPNode)
   *     - DS: With splittable formatting elements in the stack region up to
   *       the body or blockquote
   *     - DU: With one or more unsplittable elements in the stack region up
   *       to the body or blockquote
   *
   * And consider that we may insert two types of element:
   *   - b: block
   *   - i: inline
   *
   * We handle the insertion as follows:
   *
   *   - A/i: Create a p-wrapper, insert under it
   *   - A/b: Insert as normal
   *   - B/i: Insert as normal
   *   - B/b: Close the p-wrapper, insert under the body/blockquote (the wrap
   *     base) instead
   *   - C/i: Insert as normal
   *   - CS/b: Split the tag stack, insert the block under cloned formatting
   *     elements which have the wrap base (the parent of the p-wrap) as
   *     their ultimate parent.
   *   - CU/b: Disable the p-wrap, by reparenting the currently open child
   *     of the p-wrap under the p-wrap's parent. Then insert the block as
   *     normal.
   *   - D/b: Insert as normal
   *   - DS/i: Split the tag stack, creating a new p-wrapper as the ultimate
   *     parent of the formatting elements thus cloned. The parent of the
   *     p-wrapper is the body or blockquote.
   *   - DU/i: Insert as normal
   *
   * Metadata elements (style, script, link, meta) bypass all of the above
   * and are always inserted as normal.
   *
   * After the insertion the new element's munger data is initialised: it
   * becomes splittable if it is a formatting element under a p-wrapper or
   * a splittable parent, it inherits ancestorPNode and wrapBaseNode from
   * its parent, and body/blockquote/html are flagged as needing p-wrapping.
   *
   * FIXME: fostering ($preposition == BEFORE) is mostly done by inserting as
   * normal, the full algorithm is not followed.
   *
   * @param int $preposition
   * @param Element|SerializerNode|null $refElement
   * @param Element $element
   * @param bool $void
   * @param int $sourceStart
   * @param int $sourceLength
   */
  public function insertElement( $preposition, $refElement, Element $element, $void,
    $sourceStart, $sourceLength
  ) {
    list( $parent, $newRef ) = $this->getParentForInsert( $preposition, $refElement );
    $parentData = $parent->snData;
    $elementName = $element->htmlName;

    $inline = isset( self::$onlyInlineElements[$elementName] );
    $under = $preposition === TreeBuilder::UNDER;

    // The order of these tests matters: each case assumes the earlier ones
    // did not apply.
    if ( isset( self::$metadataElements[$elementName] ) ) {
      // The element is a metadata element, that we allow to appear in
      // both inline and block contexts.
      $this->trace( 'insert metadata' );
    } elseif ( $under && $parentData->isPWrapper && !$inline ) {
      // [B/b] The element is non-inline and the parent is a p-wrapper,
      // close the parent and insert into its parent instead
      $this->trace( 'insert B/b' );
      $newParent = $this->serializer->getParentNode( $parent );
      $parent = $newParent;
      $parentData = $parent->snData;
      // Cancel the diversion into the p-wrapper
      $parentData->childPElement = null;
      $newRef = $refElement->userData;
    } elseif ( $under && $parentData->isSplittable
      && (bool)$parentData->ancestorPNode !== $inline
    ) {
      // [CS/b, DS/i] The parent is splittable and the current element is
      // inline in block context, or if the current element is a block
      // under a p-wrapper, split the tag stack.
      $this->trace( $inline ? 'insert DS/i' : 'insert CS/b' );
      $newRef = $this->splitTagStack( $newRef, $inline, $sourceStart );
      $parent = $newRef;
      $parentData = $parent->snData;
    } elseif ( $under && $parentData->needsPWrapping && $inline ) {
      // [A/i] If the element is inline and we are in body/blockquote,
      // we need to create a p-wrapper
      $this->trace( 'insert A/i' );
      $newRef = $this->insertPWrapper( $newRef, $sourceStart );
      $parent = $newRef;
      $parentData = $parent->snData;
    } elseif ( $parentData->ancestorPNode && !$inline ) {
      // [CU/b] If the element is non-inline and (despite attempting to
      // split above) there is still an ancestor p-wrap, disable that
      // p-wrap
      $this->trace( 'insert CU/b' );
      $this->disablePWrapper( $parent, $sourceStart );
    } else {
      // [A/b, B/i, C/i, D/b, DU/i] insert as normal
      $this->trace( 'insert normal' );
    }

    // An element with element children is a non-blank element
    $parentData->nonblankNodeCount++;

    // Insert the element downstream and so initialise its userData
    $this->serializer->insertElement( $preposition, $newRef,
      $element, $void, $sourceStart, $sourceLength );

    // Initialise snData, keeping any data the node already carries (as
    // happens when an existing node is being reparented)
    if ( !$element->userData->snData ) {
      $elementData = $element->userData->snData = new RemexMungerData;
    } else {
      $elementData = $element->userData->snData;
    }

    if ( ( $parentData->isPWrapper || $parentData->isSplittable )
      && isset( self::$formattingElements[$elementName] )
    ) {
      $elementData->isSplittable = true;
    }

    if ( $parentData->isPWrapper ) {
      $elementData->ancestorPNode = $parent;
    } elseif ( $parentData->ancestorPNode ) {
      $elementData->ancestorPNode = $parentData->ancestorPNode;
    }

    if ( $parentData->wrapBaseNode ) {
      $elementData->wrapBaseNode = $parentData->wrapBaseNode;
    } elseif ( $parentData->needsPWrapping ) {
      $elementData->wrapBaseNode = $parent;
    }

    if ( $elementName === 'body'
      || $elementName === 'blockquote'
      || $elementName === 'html'
    ) {
      $elementData->needsPWrapping = true;
    }
  }
  /**
   * Clone the open elements between $parentNode and the end of the clone
   * range, and return the innermost clone as the new parent.
   *
   * The clone range ends at the ancestor p-wrapper of $parentNode if it has
   * one, otherwise at its wrap base. Walking upwards from $parentNode, each
   * node in the range is collected; a node whose nonblankNodeCount is zero
   * is additionally queued for removal and its parent's count is
   * decremented.
   *
   * With $inline set, a new p-wrapper is inserted under the wrap base and
   * the clones are created beneath it. Otherwise any open p-wrapper
   * diversion on the wrap base is cancelled and the clones are created
   * directly under the wrap base. Clones are created outermost first, and
   * each original records its clone in currentCloneElement.
   *
   * @param SerializerNode $parentNode
   * @param bool $inline
   * @param int $pos The source position
   * @return SerializerNode The innermost clone, or the new p-wrapper / wrap
   *   base when the clone range was empty
   * @throws \Exception If the walk reaches a child of the root node without
   *   finding the end of the clone range
   */
  private function splitTagStack( SerializerNode $parentNode, $inline, $pos ) {
    $startData = $parentNode->snData;
    $wrapBase = $startData->wrapBaseNode;
    $pWrap = $startData->ancestorPNode;
    $cloneEnd = $pWrap ? $startData->ancestorPNode : $wrapBase;

    $serializer = $this->serializer;
    $root = $serializer->getRootNode();

    // Collect the range, innermost node first
    $stack = [];
    $emptyNodes = [];
    for ( $cursor = $parentNode; $cursor !== $cloneEnd; $cursor = $above ) {
      $above = $serializer->getParentNode( $cursor );
      if ( $above === $root ) {
        throw new \Exception( 'Did not find end of clone range' );
      }
      $stack[] = $cursor;
      if ( $cursor->snData->nonblankNodeCount === 0 ) {
        $emptyNodes[] = $cursor;
        $above->snData->nonblankNodeCount--;
      }
    }

    if ( $inline ) {
      $pWrap = $this->insertPWrapper( $wrapBase, $pos );
      $node = $pWrap;
    } else {
      if ( $pWrap ) {
        // End the p-wrap which was open, cancel the diversion
        $wrapBase->snData->childPElement = null;
      }
      $pWrap = null;
      $node = $wrapBase;
    }

    // Rebuild the range under the new location, outermost node first
    foreach ( array_reverse( $stack ) as $oldNode ) {
      $oldData = $oldNode->snData;
      $cloneParent = $node;

      $clone = new Element( $oldNode->namespace, $oldNode->name, $oldNode->attrs );
      $this->serializer->insertElement(
        TreeBuilder::UNDER, $cloneParent, $clone, false, $pos, 0
      );
      $oldData->currentCloneElement = $clone;

      $cloneNode = $clone->userData;
      $cloneData = new RemexMungerData;
      $cloneNode->snData = $cloneData;
      if ( $pWrap ) {
        $cloneData->ancestorPNode = $pWrap;
      }
      $cloneData->isSplittable = true;
      $cloneData->wrapBaseNode = $wrapBase;
      $cloneData->isPWrapper = $oldData->isPWrapper;

      $cloneParent->snData->nonblankNodeCount++;
      $node = $cloneNode;
    }

    // Drop the originals that never received any non-blank content. The
    // serializer's removeNode() takes an Element, so wrap each node in a
    // stand-in element that points at it.
    foreach ( $emptyNodes as $emptyNode ) {
      $standIn = new Element( $emptyNode->namespace, $emptyNode->name, $emptyNode->attrs );
      $standIn->userData = $emptyNode;
      $this->serializer->removeNode( $standIn, $pos );
    }

    return $node;
  }
  /**
   * Find the ancestor of $node which is a child of a p-wrapper, and
   * reparent that node so that it becomes a child of the p-wrapper's parent.
   *
   * Nothing is done unless the p-wrapper is the last child of its parent.
   * On the way up from $node, ancestorPNode is cleared on every node
   * visited. Afterwards the p-wrapper's nonblankNodeCount is decremented
   * and the diversion (childPElement) on its parent is cancelled.
   *
   * @param SerializerNode $node A node whose data has ancestorPNode set
   * @param int $sourceStart Source position reported for the reparenting
   */
  private function disablePWrapper( SerializerNode $node, $sourceStart ) {
    $serializer = $this->serializer;
    $wrapper = $node->snData->ancestorPNode;
    $wrapperParent = $serializer->getParentNode( $wrapper );

    if ( $serializer->getLastChild( $wrapperParent ) !== $wrapper ) {
      // Fostering or something? Abort!
      return;
    }

    // Climb until the node directly beneath the p-wrapper is found
    $cursor = $node;
    do {
      $victim = $cursor;
      $victim->snData->ancestorPNode = null;
      $cursor = $serializer->getParentNode( $victim );
    } while ( $cursor !== $wrapper );

    // Make a fake Element to use in a reparenting operation
    $standIn = new Element( $victim->namespace, $victim->name, $victim->attrs );
    $standIn->userData = $victim;

    // Reparent
    $serializer->insertElement(
      TreeBuilder::UNDER, $wrapperParent, $standIn, false, $sourceStart, 0
    );

    // Decrement nonblank node count
    $wrapper->snData->nonblankNodeCount--;

    // Cancel the diversion so that no more elements are inserted under this p-wrap
    $wrapperParent->snData->childPElement = null;
  }
  /**
   * End an element. If the element still has an open p-wrapper child, that
   * wrapper is ended first (with a zero source length). Once the serializer
   * has processed the end tag, the element's munger data and its link to
   * the serializer node are cleared.
   *
   * @param Element $element
   * @param int $sourceStart
   * @param int $sourceLength
   */
  public function endTag( Element $element, $sourceStart, $sourceLength ) {
    $openWrapper = $element->userData->snData->childPElement;
    if ( $openWrapper ) {
      $this->endTag( $openWrapper, $sourceStart, 0 );
    }

    $this->serializer->endTag( $element, $sourceStart, $sourceLength );

    $element->userData->snData = null;
    $element->userData = null;
  }
  /**
   * Forward a doctype event to the serializer unchanged.
   *
   * @param string $name
   * @param string $public
   * @param string $system
   * @param int $quirks
   * @param int $sourceStart
   * @param int $sourceLength
   */
  public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
    $downstream = $this->serializer;
    $downstream->doctype(
      $name, $public, $system, $quirks, $sourceStart, $sourceLength
    );
  }
  /**
   * Forward a comment to the serializer, using the reference node that
   * getParentForInsert() resolves for the requested position.
   *
   * @param int $preposition
   * @param Element|null $refElement
   * @param string $text
   * @param int $sourceStart
   * @param int $sourceLength
   */
  public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
    $placement = $this->getParentForInsert( $preposition, $refElement );
    // Only the reference node (second entry) is needed here
    $this->serializer->comment(
      $preposition, $placement[1], $text, $sourceStart, $sourceLength
    );
  }
  /**
   * Forward a parse error to the serializer unchanged.
   *
   * @param string $text
   * @param int $pos
   */
  public function error( $text, $pos ) {
    $downstream = $this->serializer;
    $downstream->error( $text, $pos );
  }
  /**
   * Forward an attribute merge to the serializer unchanged.
   *
   * @param Element $element
   * @param Attributes $attrs
   * @param int $sourceStart
   */
  public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
    $downstream = $this->serializer;
    $downstream->mergeAttributes( $element, $attrs, $sourceStart );
  }
  /**
   * Forward a node removal to the serializer unchanged.
   *
   * @param Element $element
   * @param int $sourceStart
   */
  public function removeNode( Element $element, $sourceStart ) {
    $downstream = $this->serializer;
    $downstream->removeNode( $element, $sourceStart );
  }
  /**
   * Move all children of $element under $newParent, and insert $newParent
   * as the only child of $element.
   *
   * If $element has an open p-wrapper child, the operation is applied to
   * that p-wrapper instead of to $element.
   *
   * @param Element $element
   * @param Element $newParent
   * @param int $sourceStart
   */
  public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
    $self = $element->userData;
    $openWrapper = $self->snData->childPElement;
    if ( $openWrapper ) {
      // Reparent under the p-wrapper instead, so that e.g.
      //   <blockquote><mw:p-wrap>...</mw:p-wrap></blockquote>
      // becomes
      //   <blockquote><mw:p-wrap><i>...</i></mw:p-wrap></blockquote>
      // The formatting element should not be the parent of the p-wrap.
      // Without this special case, the insertElement() of the <i> below
      // would be diverted into the p-wrapper, causing infinite recursion
      // (T178632)
      $this->reparentChildren( $openWrapper, $newParent, $sourceStart );
      return;
    }

    // Detach the children before inserting the new parent, so that the
    // new parent becomes the sole child of $element
    $children = $self->children;
    $self->children = [];
    $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );

    $adopter = $newParent->userData;
    $adopterId = $adopter->id;
    foreach ( $children as $child ) {
      if ( !is_object( $child ) ) {
        // Only object children carry a parentId to update
        continue;
      }
      $this->trace( "reparent <{$child->name}>" );
      $child->parentId = $adopterId;
    }
    $adopter->children = $children;
  }
  485. }