Nodes.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. <?php
  2. /** Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. * or see http://www.gnu.org/
  18. *
  19. */
  20. /**
  21. * Any element in the DOM tree of an HTML document.
  22. * @ingroup DifferenceEngine
  23. */
  24. class Node {
  25. public $parent;
  26. protected $parentTree;
  27. public $whiteBefore = false;
  28. public $whiteAfter = false;
  29. function __construct($parent) {
  30. $this->parent = $parent;
  31. }
  32. public function getParentTree() {
  33. if (!isset($this->parentTree)) {
  34. if (!is_null($this->parent)) {
  35. $this->parentTree = $this->parent->getParentTree();
  36. $this->parentTree[] = $this->parent;
  37. } else {
  38. $this->parentTree = array();
  39. }
  40. }
  41. return $this->parentTree;
  42. }
  43. public function getLastCommonParent(Node $other) {
  44. $result = new LastCommonParentResult();
  45. $myParents = $this->getParentTree();
  46. $otherParents = $other->getParentTree();
  47. $i = 1;
  48. $isSame = true;
  49. $nbMyParents = count($myParents);
  50. $nbOtherParents = count($otherParents);
  51. while ($isSame && $i < $nbMyParents && $i < $nbOtherParents) {
  52. if (!$myParents[$i]->openingTag === $otherParents[$i]->openingTag) {
  53. $isSame = false;
  54. } else {
  55. // After a while, the index i-1 must be the last common parent
  56. $i++;
  57. }
  58. }
  59. $result->lastCommonParentDepth = $i - 1;
  60. $result->parent = $myParents[$i - 1];
  61. if (!$isSame || $nbMyParents > $nbOtherParents) {
  62. // Not all tags matched, or all tags matched but
  63. // there are tags left in this tree
  64. $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($myParents[$i]);
  65. $result->splittingNeeded = true;
  66. } else if ($nbMyParents <= $nbOtherParents) {
  67. $result->indexInLastCommonParent = $myParents[$i - 1]->getIndexOf($this);
  68. }
  69. return $result;
  70. }
  71. public function setParent($parent) {
  72. $this->parent = $parent;
  73. unset($this->parentTree);
  74. }
  75. public function inPre() {
  76. $tree = $this->getParentTree();
  77. foreach ($tree as &$ancestor) {
  78. if ($ancestor->isPre()) {
  79. return true;
  80. }
  81. }
  82. return false;
  83. }
  84. }
  85. /**
  86. * Node that can contain other nodes. Represents an HTML tag.
  87. * @ingroup DifferenceEngine
  88. */
  89. class TagNode extends Node {
  90. public $children = array();
  91. public $qName;
  92. public $attributes = array();
  93. public $openingTag;
  94. function __construct($parent, $qName, /*array*/ $attributes) {
  95. parent::__construct($parent);
  96. $this->qName = strtolower($qName);
  97. foreach($attributes as $key => &$value){
  98. $this->attributes[strtolower($key)] = $value;
  99. }
  100. return $this->openingTag = Xml::openElement($this->qName, $this->attributes);
  101. }
  102. public function addChildAbsolute(Node $node, $index) {
  103. array_splice($this->children, $index, 0, array($node));
  104. }
  105. public function getIndexOf(Node $child) {
  106. // don't trust array_search with objects
  107. foreach ($this->children as $key => &$value){
  108. if ($value === $child) {
  109. return $key;
  110. }
  111. }
  112. return null;
  113. }
  114. public function getNbChildren() {
  115. return count($this->children);
  116. }
  117. public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
  118. $nodes = array();
  119. $allDeleted = false;
  120. $somethingDeleted = false;
  121. $hasNonDeletedDescendant = false;
  122. if (empty($this->children)) {
  123. return $nodes;
  124. }
  125. foreach ($this->children as &$child) {
  126. $allDeleted_local = false;
  127. $somethingDeleted_local = false;
  128. $childrenChildren = $child->getMinimalDeletedSet($id, $allDeleted_local, $somethingDeleted_local);
  129. if ($somethingDeleted_local) {
  130. $nodes = array_merge($nodes, $childrenChildren);
  131. $somethingDeleted = true;
  132. }
  133. if (!$allDeleted_local) {
  134. $hasNonDeletedDescendant = true;
  135. }
  136. }
  137. if (!$hasNonDeletedDescendant) {
  138. $nodes = array($this);
  139. $allDeleted = true;
  140. }
  141. return $nodes;
  142. }
  143. public function splitUntil(TagNode $parent, Node $split, $includeLeft) {
  144. $splitOccured = false;
  145. if ($parent !== $this) {
  146. $part1 = new TagNode(null, $this->qName, $this->attributes);
  147. $part2 = new TagNode(null, $this->qName, $this->attributes);
  148. $part1->setParent($this->parent);
  149. $part2->setParent($this->parent);
  150. $onSplit = false;
  151. $pastSplit = false;
  152. foreach ($this->children as &$child)
  153. {
  154. if ($child === $split) {
  155. $onSplit = true;
  156. }
  157. if(!$pastSplit || ($onSplit && $includeLeft)) {
  158. $child->setParent($part1);
  159. $part1->children[] = $child;
  160. } else {
  161. $child->setParent($part2);
  162. $part2->children[] = $child;
  163. }
  164. if ($onSplit) {
  165. $onSplit = false;
  166. $pastSplit = true;
  167. }
  168. }
  169. $myindexinparent = $this->parent->getIndexOf($this);
  170. if (!empty($part1->children)) {
  171. $this->parent->addChildAbsolute($part1, $myindexinparent);
  172. }
  173. if (!empty($part2->children)) {
  174. $this->parent->addChildAbsolute($part2, $myindexinparent);
  175. }
  176. if (!empty($part1->children) && !empty($part2->children)) {
  177. $splitOccured = true;
  178. }
  179. $this->parent->removeChild($myindexinparent);
  180. if ($includeLeft) {
  181. $this->parent->splitUntil($parent, $part1, $includeLeft);
  182. } else {
  183. $this->parent->splitUntil($parent, $part2, $includeLeft);
  184. }
  185. }
  186. return $splitOccured;
  187. }
  188. private function removeChild($index) {
  189. unset($this->children[$index]);
  190. $this->children = array_values($this->children);
  191. }
  192. public static $blocks = array('html', 'body','p','blockquote', 'h1',
  193. 'h2', 'h3', 'h4', 'h5', 'pre', 'div', 'ul', 'ol', 'li', 'table',
  194. 'tbody', 'tr', 'td', 'th', 'br');
  195. public function copyTree() {
  196. $newThis = new TagNode(null, $this->qName, $this->attributes);
  197. $newThis->whiteBefore = $this->whiteBefore;
  198. $newThis->whiteAfter = $this->whiteAfter;
  199. foreach ($this->children as &$child) {
  200. $newChild = $child->copyTree();
  201. $newChild->setParent($newThis);
  202. $newThis->children[] = $newChild;
  203. }
  204. return $newThis;
  205. }
  206. public function getMatchRatio(TagNode $other) {
  207. $txtComp = new TextOnlyComparator($other);
  208. return $txtComp->getMatchRatio(new TextOnlyComparator($this));
  209. }
  210. public function expandWhiteSpace() {
  211. $shift = 0;
  212. $spaceAdded = false;
  213. $nbOriginalChildren = $this->getNbChildren();
  214. for ($i = 0; $i < $nbOriginalChildren; ++$i) {
  215. $child = $this->children[$i + $shift];
  216. if ($child instanceof TagNode) {
  217. if (!$child->isPre()) {
  218. $child->expandWhiteSpace();
  219. }
  220. }
  221. if (!$spaceAdded && $child->whiteBefore) {
  222. $ws = new WhiteSpaceNode(null, ' ', $child->getLeftMostChild());
  223. $ws->setParent($this);
  224. $this->addChildAbsolute($ws,$i + ($shift++));
  225. }
  226. if ($child->whiteAfter) {
  227. $ws = new WhiteSpaceNode(null, ' ', $child->getRightMostChild());
  228. $ws->setParent($this);
  229. $this->addChildAbsolute($ws,$i + 1 + ($shift++));
  230. $spaceAdded = true;
  231. } else {
  232. $spaceAdded = false;
  233. }
  234. }
  235. }
  236. public function getLeftMostChild() {
  237. if (empty($this->children)) {
  238. return $this;
  239. }
  240. return $this->children[0]->getLeftMostChild();
  241. }
  242. public function getRightMostChild() {
  243. if (empty($this->children)) {
  244. return $this;
  245. }
  246. return $this->children[$this->getNbChildren() - 1]->getRightMostChild();
  247. }
  248. public function isPre() {
  249. return 0 == strcasecmp($this->qName,'pre');
  250. }
  251. public static function toDiffLine(TagNode $node) {
  252. return $node->openingTag;
  253. }
  254. }
  255. /**
  256. * Represents a piece of text in the HTML file.
  257. * @ingroup DifferenceEngine
  258. */
  259. class TextNode extends Node {
  260. public $text;
  261. public $modification;
  262. function __construct($parent, $text) {
  263. parent::__construct($parent);
  264. $this->modification = new Modification(Modification::NONE);
  265. $this->text = $text;
  266. }
  267. public function copyTree() {
  268. $clone = clone $this;
  269. $clone->setParent(null);
  270. return $clone;
  271. }
  272. public function getLeftMostChild() {
  273. return $this;
  274. }
  275. public function getRightMostChild() {
  276. return $this;
  277. }
  278. public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
  279. if ($this->modification->type == Modification::REMOVED
  280. && $this->modification->id == $id){
  281. $somethingDeleted = true;
  282. $allDeleted = true;
  283. return array($this);
  284. }
  285. return array();
  286. }
  287. public function isSameText($other) {
  288. if (is_null($other) || ! $other instanceof TextNode) {
  289. return false;
  290. }
  291. return str_replace('\n', ' ',$this->text) === str_replace('\n', ' ',$other->text);
  292. }
  293. public static function toDiffLine(TextNode $node) {
  294. return str_replace('\n', ' ',$node->text);
  295. }
  296. }
  297. /**
  298. * @todo Document
  299. * @ingroup DifferenceEngine
  300. */
  301. class WhiteSpaceNode extends TextNode {
  302. function __construct($parent, $s, Node $like = null) {
  303. parent::__construct($parent, $s);
  304. if(!is_null($like) && $like instanceof TextNode) {
  305. $newModification = clone $like->modification;
  306. $newModification->firstOfID = false;
  307. $this->modification = $newModification;
  308. }
  309. }
  310. }
  311. /**
  312. * Represents the root of a HTML document.
  313. * @ingroup DifferenceEngine
  314. */
  315. class BodyNode extends TagNode {
  316. function __construct() {
  317. parent::__construct(null, 'body', array());
  318. }
  319. public function copyTree() {
  320. $newThis = new BodyNode();
  321. foreach ($this->children as &$child) {
  322. $newChild = $child->copyTree();
  323. $newChild->setParent($newThis);
  324. $newThis->children[] = $newChild;
  325. }
  326. return $newThis;
  327. }
  328. public function getMinimalDeletedSet($id, &$allDeleted, &$somethingDeleted) {
  329. $nodes = array();
  330. foreach ($this->children as &$child) {
  331. $childrenChildren = $child->getMinimalDeletedSet($id,
  332. $allDeleted, $somethingDeleted);
  333. $nodes = array_merge($nodes, $childrenChildren);
  334. }
  335. return $nodes;
  336. }
  337. }
  338. /**
  339. * Represents an image in HTML. Even though images do not contain any text they
  340. * are independent visible objects on the page. They are logically a TextNode.
  341. * @ingroup DifferenceEngine
  342. */
  343. class ImageNode extends TextNode {
  344. public $attributes;
  345. function __construct(TagNode $parent, /*array*/ $attrs) {
  346. if(!array_key_exists('src', $attrs)) {
  347. HTMLDiffer::diffDebug( "Image without a source\n" );
  348. parent::__construct($parent, '<img></img>');
  349. }else{
  350. parent::__construct($parent, '<img>' . strtolower($attrs['src']) . '</img>');
  351. }
  352. $this->attributes = $attrs;
  353. }
  354. public function isSameText($other) {
  355. if (is_null($other) || ! $other instanceof ImageNode) {
  356. return false;
  357. }
  358. return $this->text === $other->text;
  359. }
  360. }
  361. /**
  362. * No-op node
  363. * @ingroup DifferenceEngine
  364. */
  365. class DummyNode extends Node {
  366. function __construct() {
  367. // no op
  368. }
  369. }