Diff.php 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. <?php
  2. /* Copyright (C) 2008 Guy Van den Broeck <guy@guyvdb.eu>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17. * or see http://www.gnu.org/
  18. */
  19. /**
  20. * This diff implementation is mainly lifted from the LCS algorithm of the Eclipse project which
  21. * in turn is based on Myers' "An O(ND) difference algorithm and its variations"
  22. * (http://citeseer.ist.psu.edu/myers86ond.html) with range compression (see Wu et al.'s
  23. * "An O(NP) Sequence Comparison Algorithm").
  24. *
  25. * This implementation supports an upper bound on the excution time.
  26. *
  27. * Complexity: O((M + N)D) worst case time, O(M + N + D^2) expected time, O(M + N) space
  28. *
  29. * @author Guy Van den Broeck
  30. * @ingroup DifferenceEngine
  31. */
  32. class WikiDiff3 {
  33. //Input variables
  34. private $from;
  35. private $to;
  36. private $m;
  37. private $n;
  38. private $tooLong;
  39. private $powLimit;
  40. //State variables
  41. private $maxDifferences;
  42. private $lcsLengthCorrectedForHeuristic = false;
  43. //Output variables
  44. public $length;
  45. public $removed;
  46. public $added;
  47. public $heuristicUsed;
  48. function __construct($tooLong = 2000000, $powLimit = 1.45){
  49. $this->tooLong = $tooLong;
  50. $this->powLimit = $powLimit;
  51. }
  52. public function diff(/*array*/ $from, /*array*/ $to){
  53. //remember initial lengths
  54. $m = sizeof($from);
  55. $n = count($to);
  56. $this->heuristicUsed = false;
  57. //output
  58. $removed = $m > 0 ? array_fill(0, $m, true) : array();
  59. $added = $n > 0 ? array_fill(0, $n, true) : array();
  60. //reduce the complexity for the next step (intentionally done twice)
  61. //remove common tokens at the start
  62. $i = 0;
  63. while($i < $m && $i < $n && $from[$i] === $to[$i]) {
  64. $removed[$i] = $added[$i] = false;
  65. unset($from[$i], $to[$i]);
  66. ++$i;
  67. }
  68. //remove common tokens at the end
  69. $j = 1;
  70. while($i + $j <= $m && $i + $j <= $n && $from[$m - $j] === $to[$n - $j]) {
  71. $removed[$m - $j] = $added[$n - $j] = false;
  72. unset($from[$m - $j], $to[$n - $j]);
  73. ++$j;
  74. }
  75. $this->from = $newFromIndex = $this->to = $newToIndex = array();
  76. //remove tokens not in both sequences
  77. $shared = array();
  78. foreach( $from as $key ) {
  79. $shared[$key] = false;
  80. }
  81. foreach($to as $index => &$el) {
  82. if(array_key_exists($el, $shared)) {
  83. //keep it
  84. $this->to[] = $el;
  85. $shared[$el] = true;
  86. $newToIndex[] = $index;
  87. }
  88. }
  89. foreach($from as $index => &$el) {
  90. if($shared[$el]) {
  91. //keep it
  92. $this->from[] = $el;
  93. $newFromIndex[] = $index;
  94. }
  95. }
  96. unset($shared, $from, $to);
  97. $this->m = count($this->from);
  98. $this->n = count($this->to);
  99. $this->removed = $this->m > 0 ? array_fill(0, $this->m, true) : array();
  100. $this->added = $this->n > 0 ? array_fill(0, $this->n, true) : array();
  101. if ($this->m == 0 || $this->n == 0) {
  102. $this->length = 0;
  103. } else {
  104. $this->maxDifferences = ceil(($this->m + $this->n) / 2.0);
  105. if ($this->m * $this->n > $this->tooLong) {
  106. // limit complexity to D^POW_LIMIT for long sequences
  107. $this->maxDifferences = floor(pow($this->maxDifferences, $this->powLimit - 1.0));
  108. wfDebug("Limiting max number of differences to $this->maxDifferences\n");
  109. }
  110. /*
  111. * The common prefixes and suffixes are always part of some LCS, include
  112. * them now to reduce our search space
  113. */
  114. $max = min($this->m, $this->n);
  115. for ($forwardBound = 0; $forwardBound < $max
  116. && $this->from[$forwardBound] === $this->to[$forwardBound];
  117. ++$forwardBound) {
  118. $this->removed[$forwardBound] = $this->added[$forwardBound] = false;
  119. }
  120. $backBoundL1 = $this->m - 1;
  121. $backBoundL2 = $this->n - 1;
  122. while ($backBoundL1 >= $forwardBound && $backBoundL2 >= $forwardBound
  123. && $this->from[$backBoundL1] === $this->to[$backBoundL2]) {
  124. $this->removed[$backBoundL1--] = $this->added[$backBoundL2--] = false;
  125. }
  126. $temp = array_fill(0, $this->m + $this->n + 1, 0);
  127. $V = array($temp, $temp);
  128. $snake = array(0, 0, 0);
  129. $this->length = $forwardBound + $this->m - $backBoundL1 - 1
  130. + $this->lcs_rec($forwardBound, $backBoundL1,
  131. $forwardBound, $backBoundL2, $V, $snake);
  132. }
  133. $this->m = $m;
  134. $this->n = $n;
  135. $this->length += $i + $j - 1;
  136. foreach($this->removed as $key => &$removed_elem) {
  137. if(!$removed_elem) {
  138. $removed[$newFromIndex[$key]] = false;
  139. }
  140. }
  141. foreach($this->added as $key => &$added_elem) {
  142. if(!$added_elem) {
  143. $added[$newToIndex[$key]] = false;
  144. }
  145. }
  146. $this->removed = $removed;
  147. $this->added = $added;
  148. }
  149. function diff_range($from_lines, $to_lines) {
  150. // Diff and store locally
  151. $this->diff($from_lines, $to_lines);
  152. unset($from_lines, $to_lines);
  153. $ranges = array();
  154. $xi = $yi = 0;
  155. while ($xi < $this->m || $yi < $this->n) {
  156. // Matching "snake".
  157. while ($xi < $this->m && $yi < $this->n
  158. && !$this->removed[$xi]
  159. && !$this->added[$yi]) {
  160. ++$xi;
  161. ++$yi;
  162. }
  163. // Find deletes & adds.
  164. $xstart = $xi;
  165. while ($xi < $this->m && $this->removed[$xi]) {
  166. ++$xi;
  167. }
  168. $ystart = $yi;
  169. while ($yi < $this->n && $this->added[$yi]) {
  170. ++$yi;
  171. }
  172. if ($xi > $xstart || $yi > $ystart) {
  173. $ranges[] = new RangeDifference($xstart, $xi,
  174. $ystart, $yi);
  175. }
  176. }
  177. return $ranges;
  178. }
  179. private function lcs_rec($bottoml1, $topl1, $bottoml2, $topl2, &$V, &$snake) {
  180. // check that both sequences are non-empty
  181. if ($bottoml1 > $topl1 || $bottoml2 > $topl2) {
  182. return 0;
  183. }
  184. $d = $this->find_middle_snake($bottoml1, $topl1, $bottoml2,
  185. $topl2, $V, $snake);
  186. // need to store these so we don't lose them when they're
  187. // overwritten by the recursion
  188. $len = $snake[2];
  189. $startx = $snake[0];
  190. $starty = $snake[1];
  191. // the middle snake is part of the LCS, store it
  192. for ($i = 0; $i < $len; ++$i) {
  193. $this->removed[$startx + $i] = $this->added[$starty + $i] = false;
  194. }
  195. if ($d > 1) {
  196. return $len
  197. + $this->lcs_rec($bottoml1, $startx - 1, $bottoml2,
  198. $starty - 1, $V, $snake)
  199. + $this->lcs_rec($startx + $len, $topl1, $starty + $len,
  200. $topl2, $V, $snake);
  201. } else if ($d == 1) {
  202. /*
  203. * In this case the sequences differ by exactly 1 line. We have
  204. * already saved all the lines after the difference in the for loop
  205. * above, now we need to save all the lines before the difference.
  206. */
  207. $max = min($startx - $bottoml1, $starty - $bottoml2);
  208. for ($i = 0; $i < $max; ++$i) {
  209. $this->removed[$bottoml1 + $i] =
  210. $this->added[$bottoml2 + $i] = false;
  211. }
  212. return $max + $len;
  213. }
  214. return $len;
  215. }
  216. private function find_middle_snake($bottoml1, $topl1, $bottoml2,$topl2, &$V, &$snake) {
  217. $from = &$this->from;
  218. $to = &$this->to;
  219. $V0 = &$V[0];
  220. $V1 = &$V[1];
  221. $snake0 = &$snake[0];
  222. $snake1 = &$snake[1];
  223. $snake2 = &$snake[2];
  224. $bottoml1_min_1 = $bottoml1-1;
  225. $bottoml2_min_1 = $bottoml2-1;
  226. $N = $topl1 - $bottoml1_min_1;
  227. $M = $topl2 - $bottoml2_min_1;
  228. $delta = $N - $M;
  229. $maxabsx = $N+$bottoml1;
  230. $maxabsy = $M+$bottoml2;
  231. $limit = min($this->maxDifferences, ceil(($N + $M ) / 2));
  232. //value_to_add_forward: a 0 or 1 that we add to the start
  233. // offset to make it odd/even
  234. if (($M & 1) == 1) {
  235. $value_to_add_forward = 1;
  236. } else {
  237. $value_to_add_forward = 0;
  238. }
  239. if (($N & 1) == 1) {
  240. $value_to_add_backward = 1;
  241. } else {
  242. $value_to_add_backward = 0;
  243. }
  244. $start_forward = -$M;
  245. $end_forward = $N;
  246. $start_backward = -$N;
  247. $end_backward = $M;
  248. $limit_min_1 = $limit - 1;
  249. $limit_plus_1 = $limit + 1;
  250. $V0[$limit_plus_1] = 0;
  251. $V1[$limit_min_1] = $N;
  252. $limit = min($this->maxDifferences, ceil(($N + $M ) / 2));
  253. if (($delta & 1) == 1) {
  254. for ($d = 0; $d <= $limit; ++$d) {
  255. $start_diag = max($value_to_add_forward + $start_forward, -$d);
  256. $end_diag = min($end_forward, $d);
  257. $value_to_add_forward = 1 - $value_to_add_forward;
  258. // compute forward furthest reaching paths
  259. for ($k = $start_diag; $k <= $end_diag; $k += 2) {
  260. if ($k == -$d || ($k < $d
  261. && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k])) {
  262. $x = $V0[$limit_plus_1 + $k];
  263. } else {
  264. $x = $V0[$limit_min_1 + $k] + 1;
  265. }
  266. $absx = $snake0 = $x + $bottoml1;
  267. $absy = $snake1 = $x - $k + $bottoml2;
  268. while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) {
  269. ++$absx;
  270. ++$absy;
  271. }
  272. $x = $absx-$bottoml1;
  273. $snake2 = $absx -$snake0;
  274. $V0[$limit + $k] = $x;
  275. if ($k >= $delta - $d + 1 && $k <= $delta + $d - 1
  276. && $x >= $V1[$limit + $k - $delta]) {
  277. return 2 * $d - 1;
  278. }
  279. // check to see if we can cut down the diagonal range
  280. if ($x >= $N && $end_forward > $k - 1) {
  281. $end_forward = $k - 1;
  282. } else if ($absy - $bottoml2 >= $M) {
  283. $start_forward = $k + 1;
  284. $value_to_add_forward = 0;
  285. }
  286. }
  287. $start_diag = max($value_to_add_backward + $start_backward, -$d);
  288. $end_diag = min($end_backward, $d);
  289. $value_to_add_backward = 1 - $value_to_add_backward;
  290. // compute backward furthest reaching paths
  291. for ($k = $start_diag; $k <= $end_diag; $k += 2) {
  292. if ($k == $d
  293. || ($k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k])) {
  294. $x = $V1[$limit_min_1 + $k];
  295. } else {
  296. $x = $V1[$limit_plus_1 + $k] - 1;
  297. }
  298. $y = $x - $k - $delta;
  299. $snake2 = 0;
  300. while ($x > 0 && $y > 0
  301. && $from[$x +$bottoml1_min_1] === $to[$y + $bottoml2_min_1]) {
  302. --$x;
  303. --$y;
  304. ++$snake2;
  305. }
  306. $V1[$limit + $k] = $x;
  307. // check to see if we can cut down our diagonal range
  308. if ($x <= 0) {
  309. $start_backward = $k + 1;
  310. $value_to_add_backward = 0;
  311. } else if ($y <= 0 && $end_backward > $k - 1) {
  312. $end_backward = $k - 1;
  313. }
  314. }
  315. }
  316. } else {
  317. for ($d = 0; $d <= $limit; ++$d) {
  318. $start_diag = max($value_to_add_forward + $start_forward, -$d);
  319. $end_diag = min($end_forward, $d);
  320. $value_to_add_forward = 1 - $value_to_add_forward;
  321. // compute forward furthest reaching paths
  322. for ($k = $start_diag; $k <= $end_diag; $k += 2) {
  323. if ($k == -$d
  324. || ($k < $d && $V0[$limit_min_1 + $k] < $V0[$limit_plus_1 + $k])) {
  325. $x = $V0[$limit_plus_1 + $k];
  326. } else {
  327. $x = $V0[$limit_min_1 + $k] + 1;
  328. }
  329. $absx = $snake0 = $x + $bottoml1;
  330. $absy = $snake1 = $x - $k + $bottoml2;
  331. while ($absx < $maxabsx && $absy < $maxabsy && $from[$absx] === $to[$absy]) {
  332. ++$absx;
  333. ++$absy;
  334. }
  335. $x = $absx-$bottoml1;
  336. $snake2 = $absx -$snake0;
  337. $V0[$limit + $k] = $x;
  338. // check to see if we can cut down the diagonal range
  339. if ($x >= $N && $end_forward > $k - 1) {
  340. $end_forward = $k - 1;
  341. } else if ($absy-$bottoml2 >= $M) {
  342. $start_forward = $k + 1;
  343. $value_to_add_forward = 0;
  344. }
  345. }
  346. $start_diag = max($value_to_add_backward + $start_backward, -$d);
  347. $end_diag = min($end_backward, $d);
  348. $value_to_add_backward = 1 - $value_to_add_backward;
  349. // compute backward furthest reaching paths
  350. for ($k = $start_diag; $k <= $end_diag; $k += 2) {
  351. if ($k == $d
  352. || ($k != -$d && $V1[$limit_min_1 + $k] < $V1[$limit_plus_1 + $k])) {
  353. $x = $V1[$limit_min_1 + $k];
  354. } else {
  355. $x = $V1[$limit_plus_1 + $k] - 1;
  356. }
  357. $y = $x - $k - $delta;
  358. $snake2 = 0;
  359. while ($x > 0 && $y > 0
  360. && $from[$x +$bottoml1_min_1] === $to[$y + $bottoml2_min_1]) {
  361. --$x;
  362. --$y;
  363. ++$snake2;
  364. }
  365. $V1[$limit + $k] = $x;
  366. if ($k >= -$delta - $d && $k <= $d - $delta
  367. && $x <= $V0[$limit + $k + $delta]) {
  368. $snake0 = $bottoml1 + $x;
  369. $snake1 = $bottoml2 + $y;
  370. return 2 * $d;
  371. }
  372. // check to see if we can cut down our diagonal range
  373. if ($x <= 0) {
  374. $start_backward = $k + 1;
  375. $value_to_add_backward = 0;
  376. } else if ($y <= 0 && $end_backward > $k - 1) {
  377. $end_backward = $k - 1;
  378. }
  379. }
  380. }
  381. }
  382. /*
  383. * computing the true LCS is too expensive, instead find the diagonal
  384. * with the most progress and pretend a midle snake of length 0 occurs
  385. * there.
  386. */
  387. $most_progress = self::findMostProgress($M, $N, $limit, $V);
  388. $snake0 = $bottoml1 + $most_progress[0];
  389. $snake1 = $bottoml2 + $most_progress[1];
  390. $snake2 = 0;
  391. wfDebug("Computing the LCS is too expensive. Using a heuristic.\n");
  392. $this->heuristicUsed = true;
  393. return 5; /*
  394. * HACK: since we didn't really finish the LCS computation
  395. * we don't really know the length of the SES. We don't do
  396. * anything with the result anyway, unless it's <=1. We know
  397. * for a fact SES > 1 so 5 is as good a number as any to
  398. * return here
  399. */
  400. }
  401. private static function findMostProgress($M, $N, $limit, $V) {
  402. $delta = $N - $M;
  403. if (($M & 1) == ($limit & 1)) {
  404. $forward_start_diag = max(-$M, -$limit);
  405. } else {
  406. $forward_start_diag = max(1 - $M, -$limit);
  407. }
  408. $forward_end_diag = min($N, $limit);
  409. if (($N & 1) == ($limit & 1)) {
  410. $backward_start_diag = max(-$N, -$limit);
  411. } else {
  412. $backward_start_diag = max(1 - $N, -$limit);
  413. }
  414. $backward_end_diag = -min($M, $limit);
  415. $temp = array(0, 0, 0);
  416. $max_progress = array_fill(0, ceil(max($forward_end_diag - $forward_start_diag,
  417. $backward_end_diag - $backward_start_diag) / 2), $temp);
  418. $num_progress = 0; // the 1st entry is current, it is initialized
  419. // with 0s
  420. // first search the forward diagonals
  421. for ($k = $forward_start_diag; $k <= $forward_end_diag; $k += 2) {
  422. $x = $V[0][$limit + $k];
  423. $y = $x - $k;
  424. if ($x > $N || $y > $M) {
  425. continue;
  426. }
  427. $progress = $x + $y;
  428. if ($progress > $max_progress[0][2]) {
  429. $num_progress = 0;
  430. $max_progress[0][0] = $x;
  431. $max_progress[0][1] = $y;
  432. $max_progress[0][2] = $progress;
  433. } else if ($progress == $max_progress[0][2]) {
  434. ++$num_progress;
  435. $max_progress[$num_progress][0] = $x;
  436. $max_progress[$num_progress][1] = $y;
  437. $max_progress[$num_progress][2] = $progress;
  438. }
  439. }
  440. $max_progress_forward = true; // initially the maximum
  441. // progress is in the forward
  442. // direction
  443. // now search the backward diagonals
  444. for ($k = $backward_start_diag; $k <= $backward_end_diag; $k += 2) {
  445. $x = $V[1][$limit + $k];
  446. $y = $x - $k - $delta;
  447. if ($x < 0 || $y < 0) {
  448. continue;
  449. }
  450. $progress = $N - $x + $M - $y;
  451. if ($progress > $max_progress[0][2]) {
  452. $num_progress = 0;
  453. $max_progress_forward = false;
  454. $max_progress[0][0] = $x;
  455. $max_progress[0][1] = $y;
  456. $max_progress[0][2] = $progress;
  457. } else if ($progress == $max_progress[0][2] && !$max_progress_forward) {
  458. ++$num_progress;
  459. $max_progress[$num_progress][0] = $x;
  460. $max_progress[$num_progress][1] = $y;
  461. $max_progress[$num_progress][2] = $progress;
  462. }
  463. }
  464. // return the middle diagonal with maximal progress.
  465. return $max_progress[floor($num_progress / 2)];
  466. }
  467. public function getLcsLength(){
  468. if($this->heuristicUsed && !$this->lcsLengthCorrectedForHeuristic){
  469. $this->lcsLengthCorrectedForHeuristic = true;
  470. $this->length = $this->m-array_sum($this->added);
  471. }
  472. return $this->length;
  473. }
  474. }
  475. /**
  476. * Alternative representation of a set of changes, by the index
  477. * ranges that are changed.
  478. *
  479. * @ingroup DifferenceEngine
  480. */
  481. class RangeDifference {
  482. public $leftstart;
  483. public $leftend;
  484. public $leftlength;
  485. public $rightstart;
  486. public $rightend;
  487. public $rightlength;
  488. function __construct($leftstart, $leftend, $rightstart, $rightend){
  489. $this->leftstart = $leftstart;
  490. $this->leftend = $leftend;
  491. $this->leftlength = $leftend - $leftstart;
  492. $this->rightstart = $rightstart;
  493. $this->rightend = $rightend;
  494. $this->rightlength = $rightend - $rightstart;
  495. }
  496. }