123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182 |
- <?php
- /**
- * Takes a well formed list of tokens and fixes their nesting.
- *
- * HTML elements dictate which elements are allowed to be their children,
- * for example, you can't have a p tag in a span tag. Other elements have
- * much more rigorous definitions: tables, for instance, require a specific
- * order for their elements. There are also constraints not expressible by
- * document type definitions, such as the chameleon nature of ins/del
- * tags and global child exclusions.
- *
- * The first major objective of this strategy is to iterate through all
- * the nodes and determine whether or not their children conform to the
- * element's definition. If they do not, the child definition may
- * optionally supply an amended list of elements that is valid or
- * require that the entire node be deleted (and the previous node
- * rescanned).
- *
- * The second objective is to ensure that explicitly excluded elements of
- * an element do not appear in its children. Code that accomplishes this
- * task is pervasive through the strategy, though the two are distinct tasks
- * and could, theoretically, be separated (although it's not recommended).
- *
- * @note Whether or not unrecognized children are silently dropped or
- * translated into text depends on the child definitions.
- *
- * @todo Enable nodes to be bubbled out of the structure. This is
- * easier with our new algorithm.
- */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{
    /**
     * Validates the nesting of a token list and returns the corrected list.
     *
     * The tokens are first converted into a node tree. The tree is then
     * walked iteratively in post-order: once all element children of a node
     * have been handled, the node itself is either flagged dead (when its
     * name is in the active exclusion set) or has its surviving children
     * passed to its child definition's validateChildren(). Finally the
     * tree is flattened again and returned.
     *
     * While running, the context variables 'IsInline', 'CurrentNode' and
     * 'CurrentToken' are registered; they are destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        // definition applied to the root of the tree
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        //
        // NOTE(review): $is_inline, $node and $token are reassigned inside
        // the loop below. This presumably relies on register() binding its
        // argument by reference so the context always sees the current
        // values -- confirm against HTMLPurifier_Context before renaming or
        // replacing any of these three variables.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token (only the first element of the pair is needed)
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // Each stack frame is array($node, $is_inline, $excludes, $ix),
        // where $excludes is the set of element names excluded at this
        // point of the tree and $ix is the index of the next child of
        // $node that still has to be visited.
        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0,
            ),
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // The root frame is the only one that leaves the stack empty
            // once popped; it is checked against the parent definition,
            // every other node against the definition for its own name.
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // recursive call: descend into the next unvisited element child
            $go = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    // resume this node later, starting after $child
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes)
                            ? $excludes
                            : array_merge($excludes, $def->excludes),
                        0,
                    );
                    break;
                }
            }
            if ($go) {
                continue;
            }

            // all element children are done; refresh the current token
            // before validating so error reports point at this node
            list($token) = $node->toTokenPair();

            // base case
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
            } else {
                // XXX I suppose it would be slightly more efficient to
                // avoid the allocation here and have children
                // strategies handle it
                $children = array();
                foreach ($node->children as $child) {
                    if (!$child->dead) {
                        $children[] = $child;
                    }
                }

                // validateChildren() result, as handled below:
                //   true  - keep the (live) children as they are
                //   false - the node itself has to go
                //   other - replacement list of children
                $result = $def->child->validateChildren($children, $config, $context);
                if ($result === true) {
                    // nop, apart from dropping the dead children
                    $node->children = $children;
                } elseif ($result === false) {
                    $node->dead = true;
                    if ($e) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                } else {
                    $node->children = $result;
                    if ($e) {
                        // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                        if (empty($result) && !empty($children)) {
                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                        } elseif ($result != $children) {
                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                        }
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // the last frame popped above is the root frame, so $node is the
        // top node of the tree again at this point
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}
- // vim: et sw=4 sts=4
|