123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531 |
- <?php
- namespace MediaWiki\Tidy;
- use RemexHtml\HTMLData;
- use RemexHtml\Serializer\Serializer;
- use RemexHtml\Serializer\SerializerNode;
- use RemexHtml\Tokenizer\Attributes;
- use RemexHtml\Tokenizer\PlainAttributes;
- use RemexHtml\TreeBuilder\TreeBuilder;
- use RemexHtml\TreeBuilder\TreeHandler;
- use RemexHtml\TreeBuilder\Element;
- /**
- * @internal
- */
- class RemexCompatMunger implements TreeHandler {
- private static $onlyInlineElements = [
- "a" => true,
- "abbr" => true,
- "acronym" => true,
- "applet" => true,
- "b" => true,
- "basefont" => true,
- "bdo" => true,
- "big" => true,
- "br" => true,
- "button" => true,
- "cite" => true,
- "code" => true,
- "del" => true,
- "dfn" => true,
- "em" => true,
- "font" => true,
- "i" => true,
- "iframe" => true,
- "img" => true,
- "input" => true,
- "ins" => true,
- "kbd" => true,
- "label" => true,
- "legend" => true,
- "map" => true,
- "object" => true,
- "param" => true,
- "q" => true,
- "rb" => true,
- "rbc" => true,
- "rp" => true,
- "rt" => true,
- "rtc" => true,
- "ruby" => true,
- "s" => true,
- "samp" => true,
- "select" => true,
- "small" => true,
- "span" => true,
- "strike" => true,
- "strong" => true,
- "sub" => true,
- "sup" => true,
- "textarea" => true,
- "tt" => true,
- "u" => true,
- "var" => true,
- // Those defined in tidy.conf
- "video" => true,
- "audio" => true,
- "bdi" => true,
- "data" => true,
- "time" => true,
- "mark" => true,
- ];
- /**
- * For the purposes of this class, "metadata" elements are those that
- * should neither trigger p-wrapping nor stop an outer p-wrapping,
- * typically those that are themselves invisible in a browser's rendering.
- * This isn't a complete list, it's just the tags that we're likely to
- * encounter in practice.
- * @var array
- */
- private static $metadataElements = [
- 'style' => true,
- 'script' => true,
- 'link' => true,
- 'meta' => true,
- ];
- private static $formattingElements = [
- 'a' => true,
- 'b' => true,
- 'big' => true,
- 'code' => true,
- 'em' => true,
- 'font' => true,
- 'i' => true,
- 'nobr' => true,
- 's' => true,
- 'small' => true,
- 'strike' => true,
- 'strong' => true,
- 'tt' => true,
- 'u' => true,
- ];
- /** @var Serializer */
- private $serializer;
- /** @var bool */
- private $trace;
- /**
- * @param Serializer $serializer
- * @param bool $trace
- */
- public function __construct( Serializer $serializer, $trace = false ) {
- $this->serializer = $serializer;
- $this->trace = $trace;
- }
- public function startDocument( $fragmentNamespace, $fragmentName ) {
- $this->serializer->startDocument( $fragmentNamespace, $fragmentName );
- $root = $this->serializer->getRootNode();
- $root->snData = new RemexMungerData;
- $root->snData->needsPWrapping = true;
- }
- public function endDocument( $pos ) {
- $this->serializer->endDocument( $pos );
- }
- private function getParentForInsert( $preposition, $refElement ) {
- if ( $preposition === TreeBuilder::ROOT ) {
- return [ $this->serializer->getRootNode(), null ];
- } elseif ( $preposition === TreeBuilder::BEFORE ) {
- $refNode = $refElement->userData;
- return [ $this->serializer->getParentNode( $refNode ), $refNode ];
- } else {
- $refNode = $refElement->userData;
- $refData = $refNode->snData;
- if ( $refData->currentCloneElement ) {
- // Follow a chain of clone links if necessary
- $origRefData = $refData;
- while ( $refData->currentCloneElement ) {
- $refElement = $refData->currentCloneElement;
- $refNode = $refElement->userData;
- $refData = $refNode->snData;
- }
- // Cache the end of the chain in the requested element
- $origRefData->currentCloneElement = $refElement;
- } elseif ( $refData->childPElement ) {
- $refElement = $refData->childPElement;
- $refNode = $refElement->userData;
- }
- return [ $refNode, $refNode ];
- }
- }
- /**
- * Insert a p-wrapper
- *
- * @param SerializerNode $parent
- * @param int $sourceStart
- * @return SerializerNode
- */
- private function insertPWrapper( SerializerNode $parent, $sourceStart ) {
- $pWrap = new Element( HTMLData::NS_HTML, 'mw:p-wrap', new PlainAttributes );
- $this->serializer->insertElement( TreeBuilder::UNDER, $parent, $pWrap, false,
- $sourceStart, 0 );
- $data = new RemexMungerData;
- $data->isPWrapper = true;
- $data->wrapBaseNode = $parent;
- $pWrap->userData->snData = $data;
- $parent->snData->childPElement = $pWrap;
- return $pWrap->userData;
- }
- public function characters( $preposition, $refElement, $text, $start, $length,
- $sourceStart, $sourceLength
- ) {
- $isBlank = strspn( $text, "\t\n\f\r ", $start, $length ) === $length;
- list( $parent, $refNode ) = $this->getParentForInsert( $preposition, $refElement );
- $parentData = $parent->snData;
- if ( $preposition === TreeBuilder::UNDER ) {
- if ( $parentData->needsPWrapping && !$isBlank ) {
- // Add a p-wrapper for bare text under body/blockquote
- $refNode = $this->insertPWrapper( $refNode, $sourceStart );
- $parent = $refNode;
- $parentData = $parent->snData;
- } elseif ( $parentData->isSplittable && !$parentData->ancestorPNode ) {
- // The parent is splittable and in block mode, so split the tag stack
- $refNode = $this->splitTagStack( $refNode, true, $sourceStart );
- $parent = $refNode;
- $parentData = $parent->snData;
- }
- }
- if ( !$isBlank ) {
- // Non-whitespace characters detected
- $parentData->nonblankNodeCount++;
- }
- $this->serializer->characters( $preposition, $refNode, $text, $start,
- $length, $sourceStart, $sourceLength );
- }
- private function trace( $msg ) {
- if ( $this->trace ) {
- wfDebug( "[RCM] $msg" );
- }
- }
- /**
- * Insert or reparent an element. Create p-wrappers or split the tag stack
- * as necessary.
- *
- * Consider the following insertion locations. The parent may be:
- *
- * - A: A body or blockquote (!!needsPWrapping)
- * - B: A p-wrapper (!!isPWrapper)
- * - C: A descendant of a p-wrapper (!!ancestorPNode)
- * - CS: With splittable formatting elements in the stack region up to
- * the p-wrapper
- * - CU: With one or more unsplittable elements in the stack region up
- * to the p-wrapper
- * - D: Not a descendant of a p-wrapper (!ancestorNode)
- * - DS: With splittable formatting elements in the stack region up to
- * the body or blockquote
- * - DU: With one or more unsplittable elements in the stack region up
- * to the body or blockquote
- *
- * And consider that we may insert two types of element:
- * - b: block
- * - i: inline
- *
- * We handle the insertion as follows:
- *
- * - A/i: Create a p-wrapper, insert under it
- * - A/b: Insert as normal
- * - B/i: Insert as normal
- * - B/b: Close the p-wrapper, insert under the body/blockquote (wrap
- * base) instead)
- * - C/i: Insert as normal
- * - CS/b: Split the tag stack, insert the block under cloned formatting
- * elements which have the wrap base (the parent of the p-wrap) as
- * their ultimate parent.
- * - CU/b: Disable the p-wrap, by reparenting the currently open child
- * of the p-wrap under the p-wrap's parent. Then insert the block as
- * normal.
- * - D/b: Insert as normal
- * - DS/i: Split the tag stack, creating a new p-wrapper as the ultimate
- * parent of the formatting elements thus cloned. The parent of the
- * p-wrapper is the body or blockquote.
- * - DU/i: Insert as normal
- *
- * FIXME: fostering ($preposition == BEFORE) is mostly done by inserting as
- * normal, the full algorithm is not followed.
- *
- * @param int $preposition
- * @param Element|SerializerNode|null $refElement
- * @param Element $element
- * @param bool $void
- * @param int $sourceStart
- * @param int $sourceLength
- */
- public function insertElement( $preposition, $refElement, Element $element, $void,
- $sourceStart, $sourceLength
- ) {
- list( $parent, $newRef ) = $this->getParentForInsert( $preposition, $refElement );
- $parentData = $parent->snData;
- $elementName = $element->htmlName;
- $inline = isset( self::$onlyInlineElements[$elementName] );
- $under = $preposition === TreeBuilder::UNDER;
- $elementToEnd = null;
- if ( isset( self::$metadataElements[$elementName] ) ) {
- // The element is a metadata element, that we allow to appear in
- // both inline and block contexts.
- $this->trace( 'insert metadata' );
- } elseif ( $under && $parentData->isPWrapper && !$inline ) {
- // [B/b] The element is non-inline and the parent is a p-wrapper,
- // close the parent and insert into its parent instead
- $this->trace( 'insert B/b' );
- $newParent = $this->serializer->getParentNode( $parent );
- $parent = $newParent;
- $parentData = $parent->snData;
- $parentData->childPElement = null;
- $newRef = $refElement->userData;
- } elseif ( $under && $parentData->isSplittable
- && (bool)$parentData->ancestorPNode !== $inline
- ) {
- // [CS/b, DS/i] The parent is splittable and the current element is
- // inline in block context, or if the current element is a block
- // under a p-wrapper, split the tag stack.
- $this->trace( $inline ? 'insert DS/i' : 'insert CS/b' );
- $newRef = $this->splitTagStack( $newRef, $inline, $sourceStart );
- $parent = $newRef;
- $parentData = $parent->snData;
- } elseif ( $under && $parentData->needsPWrapping && $inline ) {
- // [A/i] If the element is inline and we are in body/blockquote,
- // we need to create a p-wrapper
- $this->trace( 'insert A/i' );
- $newRef = $this->insertPWrapper( $newRef, $sourceStart );
- $parent = $newRef;
- $parentData = $parent->snData;
- } elseif ( $parentData->ancestorPNode && !$inline ) {
- // [CU/b] If the element is non-inline and (despite attempting to
- // split above) there is still an ancestor p-wrap, disable that
- // p-wrap
- $this->trace( 'insert CU/b' );
- $this->disablePWrapper( $parent, $sourceStart );
- } else {
- // [A/b, B/i, C/i, D/b, DU/i] insert as normal
- $this->trace( 'insert normal' );
- }
- // An element with element children is a non-blank element
- $parentData->nonblankNodeCount++;
- // Insert the element downstream and so initialise its userData
- $this->serializer->insertElement( $preposition, $newRef,
- $element, $void, $sourceStart, $sourceLength );
- // Initialise snData
- if ( !$element->userData->snData ) {
- $elementData = $element->userData->snData = new RemexMungerData;
- } else {
- $elementData = $element->userData->snData;
- }
- if ( ( $parentData->isPWrapper || $parentData->isSplittable )
- && isset( self::$formattingElements[$elementName] )
- ) {
- $elementData->isSplittable = true;
- }
- if ( $parentData->isPWrapper ) {
- $elementData->ancestorPNode = $parent;
- } elseif ( $parentData->ancestorPNode ) {
- $elementData->ancestorPNode = $parentData->ancestorPNode;
- }
- if ( $parentData->wrapBaseNode ) {
- $elementData->wrapBaseNode = $parentData->wrapBaseNode;
- } elseif ( $parentData->needsPWrapping ) {
- $elementData->wrapBaseNode = $parent;
- }
- if ( $elementName === 'body'
- || $elementName === 'blockquote'
- || $elementName === 'html'
- ) {
- $elementData->needsPWrapping = true;
- }
- }
- /**
- * Clone nodes in a stack range and return the new parent
- *
- * @param SerializerNode $parentNode
- * @param bool $inline
- * @param int $pos The source position
- * @return SerializerNode
- */
- private function splitTagStack( SerializerNode $parentNode, $inline, $pos ) {
- $parentData = $parentNode->snData;
- $wrapBase = $parentData->wrapBaseNode;
- $pWrap = $parentData->ancestorPNode;
- if ( !$pWrap ) {
- $cloneEnd = $wrapBase;
- } else {
- $cloneEnd = $parentData->ancestorPNode;
- }
- $serializer = $this->serializer;
- $node = $parentNode;
- $root = $serializer->getRootNode();
- $nodes = [];
- $removableNodes = [];
- while ( $node !== $cloneEnd ) {
- $nextParent = $serializer->getParentNode( $node );
- if ( $nextParent === $root ) {
- throw new \Exception( 'Did not find end of clone range' );
- }
- $nodes[] = $node;
- if ( $node->snData->nonblankNodeCount === 0 ) {
- $removableNodes[] = $node;
- $nextParent->snData->nonblankNodeCount--;
- }
- $node = $nextParent;
- }
- if ( $inline ) {
- $pWrap = $this->insertPWrapper( $wrapBase, $pos );
- $node = $pWrap;
- } else {
- if ( $pWrap ) {
- // End the p-wrap which was open, cancel the diversion
- $wrapBase->snData->childPElement = null;
- }
- $pWrap = null;
- $node = $wrapBase;
- }
- for ( $i = count( $nodes ) - 1; $i >= 0; $i-- ) {
- $oldNode = $nodes[$i];
- $oldData = $oldNode->snData;
- $nodeParent = $node;
- $element = new Element( $oldNode->namespace, $oldNode->name, $oldNode->attrs );
- $this->serializer->insertElement( TreeBuilder::UNDER, $nodeParent,
- $element, false, $pos, 0 );
- $oldData->currentCloneElement = $element;
- $newNode = $element->userData;
- $newData = $newNode->snData = new RemexMungerData;
- if ( $pWrap ) {
- $newData->ancestorPNode = $pWrap;
- }
- $newData->isSplittable = true;
- $newData->wrapBaseNode = $wrapBase;
- $newData->isPWrapper = $oldData->isPWrapper;
- $nodeParent->snData->nonblankNodeCount++;
- $node = $newNode;
- }
- foreach ( $removableNodes as $rNode ) {
- $fakeElement = new Element( $rNode->namespace, $rNode->name, $rNode->attrs );
- $fakeElement->userData = $rNode;
- $this->serializer->removeNode( $fakeElement, $pos );
- }
- return $node;
- }
- /**
- * Find the ancestor of $node which is a child of a p-wrapper, and
- * reparent that node so that it is placed after the end of the p-wrapper
- */
- private function disablePWrapper( SerializerNode $node, $sourceStart ) {
- $nodeData = $node->snData;
- $pWrapNode = $nodeData->ancestorPNode;
- $newParent = $this->serializer->getParentNode( $pWrapNode );
- if ( $pWrapNode !== $this->serializer->getLastChild( $newParent ) ) {
- // Fostering or something? Abort!
- return;
- }
- $nextParent = $node;
- do {
- $victim = $nextParent;
- $victim->snData->ancestorPNode = null;
- $nextParent = $this->serializer->getParentNode( $victim );
- } while ( $nextParent !== $pWrapNode );
- // Make a fake Element to use in a reparenting operation
- $victimElement = new Element( $victim->namespace, $victim->name, $victim->attrs );
- $victimElement->userData = $victim;
- // Reparent
- $this->serializer->insertElement( TreeBuilder::UNDER, $newParent, $victimElement,
- false, $sourceStart, 0 );
- // Decrement nonblank node count
- $pWrapNode->snData->nonblankNodeCount--;
- // Cancel the diversion so that no more elements are inserted under this p-wrap
- $newParent->snData->childPElement = null;
- }
- public function endTag( Element $element, $sourceStart, $sourceLength ) {
- $data = $element->userData->snData;
- if ( $data->childPElement ) {
- $this->endTag( $data->childPElement, $sourceStart, 0 );
- }
- $this->serializer->endTag( $element, $sourceStart, $sourceLength );
- $element->userData->snData = null;
- $element->userData = null;
- }
- public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
- $this->serializer->doctype( $name, $public, $system, $quirks,
- $sourceStart, $sourceLength );
- }
- public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
- list( , $refNode ) = $this->getParentForInsert( $preposition, $refElement );
- $this->serializer->comment( $preposition, $refNode, $text, $sourceStart, $sourceLength );
- }
- public function error( $text, $pos ) {
- $this->serializer->error( $text, $pos );
- }
- public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
- $this->serializer->mergeAttributes( $element, $attrs, $sourceStart );
- }
- public function removeNode( Element $element, $sourceStart ) {
- $this->serializer->removeNode( $element, $sourceStart );
- }
- public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
- $self = $element->userData;
- if ( $self->snData->childPElement ) {
- // Reparent under the p-wrapper instead, so that e.g.
- // <blockquote><mw:p-wrap>...</mw:p-wrap></blockquote>
- // becomes
- // <blockquote><mw:p-wrap><i>...</i></mw:p-wrap></blockquote>
- // The formatting element should not be the parent of the p-wrap.
- // Without this special case, the insertElement() of the <i> below
- // would be diverted into the p-wrapper, causing infinite recursion
- // (T178632)
- $this->reparentChildren( $self->snData->childPElement, $newParent, $sourceStart );
- return;
- }
- $children = $self->children;
- $self->children = [];
- $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
- $newParentNode = $newParent->userData;
- $newParentId = $newParentNode->id;
- foreach ( $children as $child ) {
- if ( is_object( $child ) ) {
- $this->trace( "reparent <{$child->name}>" );
- $child->parentId = $newParentId;
- }
- }
- $newParentNode->children = $children;
- }
- }
|