FixNesting.php 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. <?php
  2. /**
  3. * Takes a well formed list of tokens and fixes their nesting.
  4. *
  5. * HTML elements dictate which elements are allowed to be their children,
  6. * for example, you can't have a p tag in a span tag. Other elements have
  7. * much more rigorous definitions: tables, for instance, require a specific
  8. * order for their elements. There are also constraints not expressible by
  9. * document type definitions, such as the chameleon nature of ins/del
  10. * tags and global child exclusions.
  11. *
  12. * The first major objective of this strategy is to iterate through all
  13. * the nodes and determine whether or not their children conform to the
  14. * element's definition. If they do not, the child definition may
  15. * optionally supply an amended list of elements that is valid or
  16. * require that the entire node be deleted (and the previous node
  17. * rescanned).
  18. *
  19. * The second objective is to ensure that explicitly excluded elements of
  20. * an element do not appear in its children. Code that accomplishes this
  21. * task is pervasive through the strategy, though the two are distinct tasks
  22. * and could, theoretically, be seperated (although it's not recommended).
  23. *
  24. * @note Whether or not unrecognized children are silently dropped or
  25. * translated into text depends on the child definitions.
  26. *
  27. * @todo Enable nodes to be bubbled out of the structure. This is
  28. * easier with our new algorithm.
  29. */
  30. class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
  31. {
  32. /**
  33. * @param HTMLPurifier_Token[] $tokens
  34. * @param HTMLPurifier_Config $config
  35. * @param HTMLPurifier_Context $context
  36. * @return array|HTMLPurifier_Token[]
  37. */
  38. public function execute($tokens, $config, $context)
  39. {
  40. //####################################################################//
  41. // Pre-processing
  42. // O(n) pass to convert to a tree, so that we can efficiently
  43. // refer to substrings
  44. $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);
  45. // get a copy of the HTML definition
  46. $definition = $config->getHTMLDefinition();
  47. $excludes_enabled = !$config->get('Core.DisableExcludes');
  48. // setup the context variable 'IsInline', for chameleon processing
  49. // is 'false' when we are not inline, 'true' when it must always
  50. // be inline, and an integer when it is inline for a certain
  51. // branch of the document tree
  52. $is_inline = $definition->info_parent_def->descendants_are_inline;
  53. $context->register('IsInline', $is_inline);
  54. // setup error collector
  55. $e =& $context->get('ErrorCollector', true);
  56. //####################################################################//
  57. // Loop initialization
  58. // stack that contains all elements that are excluded
  59. // it is organized by parent elements, similar to $stack,
  60. // but it is only populated when an element with exclusions is
  61. // processed, i.e. there won't be empty exclusions.
  62. $exclude_stack = array($definition->info_parent_def->excludes);
  63. // variable that contains the start token while we are processing
  64. // nodes. This enables error reporting to do its job
  65. $node = $top_node;
  66. // dummy token
  67. list($token, $d) = $node->toTokenPair();
  68. $context->register('CurrentNode', $node);
  69. $context->register('CurrentToken', $token);
  70. //####################################################################//
  71. // Loop
  72. // We need to implement a post-order traversal iteratively, to
  73. // avoid running into stack space limits. This is pretty tricky
  74. // to reason about, so we just manually stack-ify the recursive
  75. // variant:
  76. //
  77. // function f($node) {
  78. // foreach ($node->children as $child) {
  79. // f($child);
  80. // }
  81. // validate($node);
  82. // }
  83. //
  84. // Thus, we will represent a stack frame as array($node,
  85. // $is_inline, stack of children)
  86. // e.g. array_reverse($node->children) - already processed
  87. // children.
  88. $parent_def = $definition->info_parent_def;
  89. $stack = array(
  90. array($top_node,
  91. $parent_def->descendants_are_inline,
  92. $parent_def->excludes, // exclusions
  93. 0)
  94. );
  95. while (!empty($stack)) {
  96. list($node, $is_inline, $excludes, $ix) = array_pop($stack);
  97. // recursive call
  98. $go = false;
  99. $def = empty($stack) ? $definition->info_parent_def : $definition->info[$node->name];
  100. while (isset($node->children[$ix])) {
  101. $child = $node->children[$ix++];
  102. if ($child instanceof HTMLPurifier_Node_Element) {
  103. $go = true;
  104. $stack[] = array($node, $is_inline, $excludes, $ix);
  105. $stack[] = array($child,
  106. // ToDo: I don't think it matters if it's def or
  107. // child_def, but double check this...
  108. $is_inline || $def->descendants_are_inline,
  109. empty($def->excludes) ? $excludes
  110. : array_merge($excludes, $def->excludes),
  111. 0);
  112. break;
  113. }
  114. };
  115. if ($go) continue;
  116. list($token, $d) = $node->toTokenPair();
  117. // base case
  118. if ($excludes_enabled && isset($excludes[$node->name])) {
  119. $node->dead = true;
  120. if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
  121. } else {
  122. // XXX I suppose it would be slightly more efficient to
  123. // avoid the allocation here and have children
  124. // strategies handle it
  125. $children = array();
  126. foreach ($node->children as $child) {
  127. if (!$child->dead) $children[] = $child;
  128. }
  129. $result = $def->child->validateChildren($children, $config, $context);
  130. if ($result === true) {
  131. // nop
  132. $node->children = $children;
  133. } elseif ($result === false) {
  134. $node->dead = true;
  135. if ($e) $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
  136. } else {
  137. $node->children = $result;
  138. if ($e) {
  139. // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
  140. if (empty($result) && !empty($children)) {
  141. $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
  142. } else if ($result != $children) {
  143. $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
  144. }
  145. }
  146. }
  147. }
  148. }
  149. //####################################################################//
  150. // Post-processing
  151. // remove context variables
  152. $context->destroy('IsInline');
  153. $context->destroy('CurrentNode');
  154. $context->destroy('CurrentToken');
  155. //####################################################################//
  156. // Return
  157. return HTMLPurifier_Arborize::flatten($node, $config, $context);
  158. }
  159. }
  160. // vim: et sw=4 sts=4