HistoryBlob.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. <?php
  /**
   * Contract for general text storage via the "object" flag in old_flags, or
   * two-part external storage URLs. Used to represent efficient concatenated
   * storage, and migration-related pointer objects.
   */
  interface HistoryBlob
  {
      /**
       * Adds an item of text and returns the key under which it can be fetched
       * again with getItem().
       *
       * Callers that store a stub object pointing at the item must call
       * setLocation() on that stub before writing it to the database.
       *
       * @param string $text Text to add
       * @return mixed The key for getItem()
       */
      public function addItem( $text );

      /**
       * Get item by key, or false if the key is not present.
       *
       * @param mixed $key A key previously returned by addItem()
       * @return string|false
       */
      public function getItem( $key );

      /**
       * Set the "default text".
       *
       * This concept is an odd property of the current DB schema, whereby each
       * text item has a revision associated with it. The default text is the
       * text of the associated revision. There may, however, be other
       * revisions in the same object.
       *
       * Default text is not required for two-part external storage URLs.
       *
       * @param string $text
       */
      public function setText( $text );

      /**
       * Get default text. This is called from Revision::getRevisionText().
       *
       * Declared public explicitly so that it matches the other methods of
       * the interface.
       *
       * @return string|false
       */
      public function getText();
  }
  /**
   * Concatenated gzip (CGZ) storage
   * Improves compression ratio by concatenating like objects before gzipping
   *
   * Items are keyed by the md5 of their text. While $mCompressed is true,
   * $mItems holds the deflated, serialized item array as a single string;
   * every accessor therefore uncompresses before touching $mItems.
   */
  class ConcatenatedGzipHistoryBlob implements HistoryBlob
  {
      public $mVersion = 0;
      public $mCompressed = false;
      public $mItems = array();
      public $mDefaultHash = '';
      /** Total length in bytes of all stored items */
      public $mSize = 0;
      public $mMaxSize = 10000000;
      public $mMaxCount = 100;

      /**
       * Constructor.
       *
       * Uses __construct() rather than a method named after the class: the
       * latter is not treated as a constructor by PHP 8, which would skip the
       * zlib check entirely.
       *
       * @throws MWException if zlib support is missing
       */
      public function __construct() {
          if ( !function_exists( 'gzdeflate' ) ) {
              throw new MWException( "Need zlib support to read or write this kind of history object (ConcatenatedGzipHistoryBlob)\n" );
          }
      }

      /**
       * Add an item, unless an item with identical text is already stored.
       *
       * @param string $text
       * @return string md5 hash of the text, usable as key for getItem()
       */
      public function addItem( $text ) {
          $this->uncompress();
          $hash = md5( $text );
          if ( !isset( $this->mItems[$hash] ) ) {
              $this->mItems[$hash] = $text;
              $this->mSize += strlen( $text );
          }
          return $hash;
      }

      /**
       * @param string $hash Key returned by addItem()
       * @return string|false The item text, or false if the key is not present
       */
      public function getItem( $hash ) {
          $this->uncompress();
          if ( array_key_exists( $hash, $this->mItems ) ) {
              return $this->mItems[$hash];
          }
          return false;
      }

      /**
       * Store $text as an item and remember it as the default text.
       *
       * @param string $text
       */
      public function setText( $text ) {
          $this->uncompress();
          $this->mDefaultHash = $this->addItem( $text );
      }

      /**
       * @return string|false The default text, or false if none was set
       */
      public function getText() {
          $this->uncompress();
          return $this->getItem( $this->mDefaultHash );
      }

      /**
       * Remove an item. Unknown keys are ignored.
       *
       * @param string $hash Key returned by addItem()
       */
      public function removeItem( $hash ) {
          // mItems is a string while compressed, so it must be expanded first.
          $this->uncompress();
          if ( !array_key_exists( $hash, $this->mItems ) ) {
              return;
          }
          $this->mSize -= strlen( $this->mItems[$hash] );
          unset( $this->mItems[$hash] );
      }

      /**
       * Compress the bulk data in the object
       */
      public function compress() {
          if ( !$this->mCompressed ) {
              $this->mItems = gzdeflate( serialize( $this->mItems ) );
              $this->mCompressed = true;
          }
      }

      /**
       * Uncompress bulk data
       */
      public function uncompress() {
          if ( $this->mCompressed ) {
              $this->mItems = unserialize( gzinflate( $this->mItems ) );
              $this->mCompressed = false;
          }
      }

      /**
       * Compress before serialization. mSize and the limits are deliberately
       * not part of the serialized form.
       *
       * @return array Names of the properties to serialize
       */
      public function __sleep() {
          $this->compress();
          return array( 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' );
      }

      /**
       * Expand the items after unserialization and rebuild mSize, which is
       * not stored in the serialized form and would otherwise stay at 0.
       */
      public function __wakeup() {
          $this->uncompress();
          if ( is_array( $this->mItems ) ) {
              $this->mSize = 0;
              foreach ( $this->mItems as $text ) {
                  $this->mSize += strlen( $text );
              }
          }
      }

      /**
       * Helper function for compression jobs
       * Returns true until the object is "full" and ready to be committed
       *
       * @return bool
       */
      public function isHappy() {
          // count() must see the item array, not the compressed string.
          $this->uncompress();
          return $this->mSize < $this->mMaxSize
              && count( $this->mItems ) < $this->mMaxCount;
      }
  }
  /**
   * Single-entry cache for the most recently opened base blob.
   *
   * Fetching several revisions in a row frequently hits the same blob
   * repeatedly; holding on to the last one opened saves unserializing and
   * decompressing it again each time.
   */
  global $wgBlobCache;
  $wgBlobCache = array();
  /**
   * Pointer object for an item within a CGZ blob stored in the text table.
   */
  class HistoryBlobStub {
      /** old_id of the text row holding the blob this stub points into */
      public $mOldId;
      /** Content hash identifying the item inside the blob */
      public $mHash;
      /** old_id of the referring object */
      public $mRef;

      /**
       * Uses __construct() rather than a method named after the class, which
       * PHP 8 no longer treats as a constructor.
       *
       * @param string $hash The content hash of the text
       * @param integer $oldid The old_id for the CGZ object; when omitted the
       *   location stays unset until setLocation() is called
       */
      public function __construct( $hash = '', $oldid = 0 ) {
          $this->mHash = $hash;
          if ( $oldid ) {
              // Previously this argument was accepted but silently dropped.
              $this->mOldId = $oldid;
          }
      }

      /**
       * Sets the location (old_id) of the main object to which this object
       * points
       */
      public function setLocation( $id ) {
          $this->mOldId = $id;
      }

      /**
       * Sets the location (old_id) of the referring object
       */
      public function setReferrer( $id ) {
          $this->mRef = $id;
      }

      /**
       * Gets the location of the referring object
       */
      public function getReferrer() {
          return $this->mRef;
      }

      /**
       * Fetch the text this stub points at.
       *
       * Looks in $wgBlobCache first; otherwise loads the text row, resolves
       * external storage if flagged, unserializes the blob and replaces the
       * cache content with it.
       *
       * @return string|false The item text, or false if the row is missing,
       *   is not an "object" row, cannot be deserialized into an object, or
       *   the blob does not contain the hash
       */
      public function getText() {
          global $wgBlobCache;

          if ( isset( $wgBlobCache[$this->mOldId] ) ) {
              $obj = $wgBlobCache[$this->mOldId];
          } else {
              $dbr = wfGetDB( DB_SLAVE );
              $row = $dbr->selectRow(
                  'text',
                  array( 'old_flags', 'old_text' ),
                  array( 'old_id' => $this->mOldId )
              );
              if ( !$row ) {
                  return false;
              }

              $flags = explode( ',', $row->old_flags );
              if ( in_array( 'external', $flags ) ) {
                  $url = $row->old_text;
                  // Require a non-empty path after "proto://"
                  $parts = explode( '://', $url, 2 );
                  if ( count( $parts ) < 2 || $parts[1] == '' ) {
                      return false;
                  }
                  $row->old_text = ExternalStore::fetchFromUrl( $url );
              }

              if ( !in_array( 'object', $flags ) ) {
                  return false;
              }

              if ( in_array( 'gzip', $flags ) ) {
                  // This shouldn't happen, but a bug in the compress script
                  // may at times gzip-compress a HistoryBlob object row.
                  $obj = unserialize( gzinflate( $row->old_text ) );
              } else {
                  $obj = unserialize( $row->old_text );
              }

              if ( !is_object( $obj ) && is_string( $obj ) ) {
                  // Correct for old double-serialization bug.
                  $obj = unserialize( $obj );
              }

              if ( !is_object( $obj ) ) {
                  // Nothing usable came back (failed fetch or corrupt data);
                  // calling a method on it would be a fatal error.
                  return false;
              }

              // Save this item for reference; if pulling many
              // items in a row we'll likely use it again.
              $obj->uncompress();
              $wgBlobCache = array( $this->mOldId => $obj );
          }

          return $obj->getItem( $this->mHash );
      }

      /**
       * Get the content hash
       */
      public function getHash() {
          return $this->mHash;
      }
  }
  /**
   * To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the
   * leftover cur table as the backend. This avoids expensively copying hundreds
   * of megabytes of data during the conversion downtime.
   *
   * Serialized HistoryBlobCurStub objects will be inserted into the text table
   * on conversion if $wgFastSchemaUpgrades is set to true.
   */
  class HistoryBlobCurStub {
      /** cur_id of the row in the cur table that holds the text */
      public $mCurId;

      /**
       * Uses __construct() rather than a method named after the class, which
       * PHP 8 no longer treats as a constructor.
       *
       * @param integer $curid The cur_id pointed to
       */
      public function __construct( $curid = 0 ) {
          $this->mCurId = $curid;
      }

      /**
       * Sets the location (cur_id) of the main object to which this object
       * points
       */
      public function setLocation( $id ) {
          $this->mCurId = $id;
      }

      /**
       * Read the text from the cur table.
       *
       * @return string|false cur_text of the row, or false if no row matches
       */
      public function getText() {
          $dbr = wfGetDB( DB_SLAVE );
          $row = $dbr->selectRow(
              'cur',
              array( 'cur_text' ),
              array( 'cur_id' => $this->mCurId )
          );
          if ( !$row ) {
              return false;
          }
          return $row->cur_text;
      }
  }
  /**
   * Diff-based history compression
   * Requires xdiff 1.5+ and zlib
   */
  class DiffHistoryBlob implements HistoryBlob {
      /** Uncompressed item cache */
      public $mItems = array();

      /** Total uncompressed size */
      public $mSize = 0;

      /**
       * Array of diffs. If a diff D from A to B is notated D = B - A, and Z is
       * an empty string:
       *
       *              { item[map[i]] - item[map[i-1]]   where i > 0
       *    diff[i] = {
       *              { item[map[i]] - Z                where i = 0
       */
      public $mDiffs;

      /** The diff map, see above */
      public $mDiffMap;

      /**
       * The key for getText()
       */
      public $mDefaultKey;

      /**
       * Compressed storage
       */
      public $mCompressed;

      /**
       * True if the object is locked against further writes
       */
      public $mFrozen = false;

      /**
       * The maximum uncompressed size before the object becomes sad
       * Should be less than max_allowed_packet
       */
      public $mMaxSize = 10000000;

      /**
       * The maximum number of text items before the object becomes sad
       */
      public $mMaxCount = 100;

      /** Constants from xdiff.h */
      const XDL_BDOP_INS = 1;
      const XDL_BDOP_CPY = 2;
      const XDL_BDOP_INSB = 3;

      /**
       * @throws MWException if zlib support is missing
       */
      public function __construct() {
          if ( !function_exists( 'gzdeflate' ) ) {
              throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
          }
      }

      /**
       * Append an item and invalidate any previously computed diffs.
       *
       * @param string $text
       * @return int Zero-based index of the item, usable as key for getItem()
       * @throws MWException if the object was restored by unserialization
       */
      public function addItem( $text ) {
          if ( $this->mFrozen ) {
              throw new MWException( __METHOD__.": Cannot add more items after sleep/wakeup" );
          }
          $this->mItems[] = $text;
          $this->mSize += strlen( $text );
          $this->mDiffs = null; // later
          return count( $this->mItems ) - 1;
      }

      /**
       * @param int $key Key returned by addItem()
       * @return string|false The item, or false if the key is not present
       */
      public function getItem( $key ) {
          if ( $key === null || !array_key_exists( $key, $this->mItems ) ) {
              return false;
          }
          return $this->mItems[$key];
      }

      /**
       * Store $text as an item and remember it as the default text.
       *
       * @param string $text
       */
      public function setText( $text ) {
          $this->mDefaultKey = $this->addItem( $text );
      }

      /**
       * @return string|false The default text, or false if none was set
       */
      public function getText() {
          return $this->getItem( $this->mDefaultKey );
      }

      /**
       * Build mDiffs and mDiffMap from mItems.
       *
       * Does nothing when the diffs already exist or there are no items, so
       * an object restored by unserialization can be serialized again without
       * xdiff being installed.
       *
       * @throws MWException if diffs have to be computed and xdiff is missing
       */
      public function compress() {
          if ( isset( $this->mDiffs ) ) {
              // Already compressed
              return;
          }
          if ( !count( $this->mItems ) ) {
              // Empty
              return;
          }
          if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
              throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
          }

          // Create two diff sequences: one for main text and one for small text
          $sequences = array(
              'small' => array(
                  'tail' => '',
                  'diffs' => array(),
                  'map' => array(),
              ),
              'main' => array(
                  'tail' => '',
                  'diffs' => array(),
                  'map' => array(),
              ),
          );
          $smallFactor = 0.5;

          $itemCount = count( $this->mItems );
          for ( $i = 0; $i < $itemCount; $i++ ) {
              $text = $this->mItems[$i];
              // The first item always seeds the main sequence; later items
              // go to "small" when much shorter than the current main tail.
              $seqName = 'main';
              if ( $i != 0 ) {
                  $mainTail = $sequences['main']['tail'];
                  if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
                      $seqName = 'small';
                  }
              }
              $seq =& $sequences[$seqName];
              $seq['diffs'][] = $this->diff( $seq['tail'], $text );
              $seq['map'][] = $i;
              $seq['tail'] = $text;
          }
          unset( $seq ); // unlink dangerous alias

          // Knit the sequences together
          $tail = '';
          $this->mDiffs = array();
          $this->mDiffMap = array();
          foreach ( $sequences as $seq ) {
              $diffCount = count( $seq['diffs'] );
              if ( !$diffCount ) {
                  continue;
              }
              if ( $tail === '' ) {
                  $this->mDiffs[] = $seq['diffs'][0];
              } else {
                  // Re-base the head of this sequence onto the previous tail
                  $head = $this->patch( '', $seq['diffs'][0] );
                  $this->mDiffs[] = $this->diff( $tail, $head );
              }
              $this->mDiffMap[] = $seq['map'][0];
              for ( $i = 1; $i < $diffCount; $i++ ) {
                  $this->mDiffs[] = $seq['diffs'][$i];
                  $this->mDiffMap[] = $seq['map'][$i];
              }
              $tail = $seq['tail'];
          }
      }

      /**
       * Compute a binary diff from $t1 to $t2 using xdiff.
       *
       * @param string $t1
       * @param string $t2
       * @return string
       */
      public function diff( $t1, $t2 ) {
          # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
          # "String is not zero-terminated"
          wfSuppressWarnings();
          $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
          wfRestoreWarnings();
          return $diff;
      }

      /**
       * Apply a binary diff to $base.
       *
       * Uses xdiff_string_bpatch() when available, otherwise a pure PHP
       * interpreter for the diff format: an 8-byte header (base checksum,
       * base length, both 32-bit little-endian) followed by INS / INSB / CPY
       * operations.
       *
       * @param string $base
       * @param string $diff
       * @return string|false The patched text, or false if the checksum or
       *   length of $base does not match the header, or an unknown operation
       *   is encountered
       */
      public function patch( $base, $diff ) {
          if ( function_exists( 'xdiff_string_bpatch' ) ) {
              wfSuppressWarnings();
              $text = xdiff_string_bpatch( $base, $diff ) . '';
              wfRestoreWarnings();
              return $text;
          }

          # Pure PHP implementation

          $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );

          # Check the checksum if the hash extension is available
          $ofp = $this->xdiffAdler32( $base );
          if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) {
              wfDebug( __METHOD__. ": incorrect base checksum\n" );
              return false;
          }
          if ( $header['csize'] != strlen( $base ) ) {
              wfDebug( __METHOD__. ": incorrect base length\n" );
              return false;
          }

          $p = 8;
          $out = '';
          $diffLength = strlen( $diff );
          while ( $p < $diffLength ) {
              $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
              $op = $x['op'];
              ++$p;
              switch ( $op ) {
              case self::XDL_BDOP_INS:
                  // Literal insert, 1-byte length
                  $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
                  $p++;
                  $out .= substr( $diff, $p, $x['size'] );
                  $p += $x['size'];
                  break;
              case self::XDL_BDOP_INSB:
                  // Literal insert, 4-byte length
                  $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
                  $p += 4;
                  $out .= substr( $diff, $p, $x['csize'] );
                  $p += $x['csize'];
                  break;
              case self::XDL_BDOP_CPY:
                  // Copy a range out of the base text
                  $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
                  $p += 8;
                  $out .= substr( $base, $x['off'], $x['csize'] );
                  break;
              default:
                  wfDebug( __METHOD__.": invalid op\n" );
                  return false;
              }
          }
          return $out;
      }

      /**
       * Compute the 4-byte base checksum in the form stored in a diff header:
       * Adler-32 with the running state started at 0 instead of 1, in
       * little-endian byte order. (The header built in __wakeup() for an
       * empty base uses checksum 0, which standard Adler-32 would not give.)
       *
       * @param string $s
       * @return string|false Binary checksum, or false if hash() is unavailable
       */
      private function xdiffAdler32( $s ) {
          if ( !function_exists( 'hash' ) ) {
              return false;
          }
          static $init;
          if ( $init === null ) {
              // The standard Adler-32 state after this prefix is (0, 0), so
              // hashing prefix + subject equals a zero-initialised Adler-32
              // of the subject alone.
              $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
          }
          // hash() yields big-endian bytes; the header is little-endian.
          return strrev( hash( 'adler32', $init . $s, true ) );
      }

      /**
       * Rebuild mItems by applying the diffs in order, each on top of the
       * previously reconstructed text.
       */
      public function uncompress() {
          if ( !$this->mDiffs ) {
              return;
          }
          $tail = '';
          $diffCount = count( $this->mDiffs );
          for ( $diffKey = 0; $diffKey < $diffCount; $diffKey++ ) {
              $textKey = $this->mDiffMap[$diffKey];
              $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
              $this->mItems[$textKey] = $text;
              $tail = $text;
          }
      }

      /**
       * Pack diffs, diff map and default key into mCompressed, the only
       * property that gets serialized.
       *
       * @return array Names of the properties to serialize
       */
      public function __sleep() {
          $this->compress();
          if ( !count( $this->mItems ) ) {
              // Empty object
              $info = false;
          } else {
              // Take forward differences to improve the compression ratio for sequences
              $deltas = array();
              $prev = 0;
              foreach ( $this->mDiffMap as $i ) {
                  $deltas[] = $i - $prev;
                  $prev = $i;
              }
              $info = array(
                  'diffs' => $this->mDiffs,
                  'map' => implode( ',', $deltas )
              );
              // Only set on the array form; a default key cannot exist
              // without at least one item.
              if ( isset( $this->mDefaultKey ) ) {
                  $info['default'] = $this->mDefaultKey;
              }
          }
          $this->mCompressed = gzdeflate( serialize( $info ) );
          return array( 'mCompressed' );
      }

      /**
       * Restore diffs, diff map and default key from mCompressed, then
       * rebuild the items. The object is frozen afterwards.
       */
      public function __wakeup() {
          // addItem() doesn't work if mItems is partially filled from mDiffs
          $this->mFrozen = true;
          $info = unserialize( gzinflate( $this->mCompressed ) );
          unset( $this->mCompressed );

          if ( !$info ) {
              // Empty object
              return;
          }

          if ( isset( $info['default'] ) ) {
              $this->mDefaultKey = $info['default'];
          }
          $this->mDiffs = $info['diffs'];
          if ( isset( $info['base'] ) ) {
              // Old format
              // NOTE(review): the map is sized before the base diff is
              // prepended, so it has one entry fewer than mDiffs — confirm
              // against the old serialization format.
              $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
              array_unshift( $this->mDiffs,
                  pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
                  $info['base'] );
          } else {
              // New format: undo the forward differences
              $map = explode( ',', $info['map'] );
              $cur = 0;
              $this->mDiffMap = array();
              foreach ( $map as $i ) {
                  $cur += $i;
                  $this->mDiffMap[] = $cur;
              }
          }
          $this->uncompress();
      }

      /**
       * Helper function for compression jobs
       * Returns true until the object is "full" and ready to be committed
       *
       * @return bool
       */
      public function isHappy() {
          return $this->mSize < $this->mMaxSize
              && count( $this->mItems ) < $this->mMaxCount;
      }
  }