HistoryBlob.php 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714
  1. <?php
  2. /**
  3. * Efficient concatenated text storage.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. */
  22. /**
  23. * Base class for general text storage via the "object" flag in old_flags, or
  24. * two-part external storage URLs. Used for represent efficient concatenated
  25. * storage, and migration-related pointer objects.
  26. */
  27. interface HistoryBlob {
  28. /**
  29. * Adds an item of text, returns a stub object which points to the item.
  30. * You must call setLocation() on the stub object before storing it to the
  31. * database
  32. *
  33. * @param string $text
  34. *
  35. * @return string The key for getItem()
  36. */
  37. function addItem( $text );
  38. /**
  39. * Get item by key, or false if the key is not present
  40. *
  41. * @param string $key
  42. *
  43. * @return string|bool
  44. */
  45. function getItem( $key );
  46. /**
  47. * Set the "default text"
  48. * This concept is an odd property of the current DB schema, whereby each text item has a revision
  49. * associated with it. The default text is the text of the associated revision. There may, however,
  50. * be other revisions in the same object.
  51. *
  52. * Default text is not required for two-part external storage URLs.
  53. *
  54. * @param string $text
  55. */
  56. function setText( $text );
  57. /**
  58. * Get default text. This is called from Revision::getRevisionText()
  59. *
  60. * @return string
  61. */
  62. function getText();
  63. }
  64. /**
  65. * Concatenated gzip (CGZ) storage
  66. * Improves compression ratio by concatenating like objects before gzipping
  67. */
  68. class ConcatenatedGzipHistoryBlob implements HistoryBlob {
  69. public $mVersion = 0, $mCompressed = false, $mItems = [], $mDefaultHash = '';
  70. public $mSize = 0;
  71. public $mMaxSize = 10000000;
  72. public $mMaxCount = 100;
  73. public function __construct() {
  74. if ( !function_exists( 'gzdeflate' ) ) {
  75. throw new MWException( "Need zlib support to read or write this "
  76. . "kind of history object (ConcatenatedGzipHistoryBlob)\n" );
  77. }
  78. }
  79. /**
  80. * @param string $text
  81. * @return string
  82. */
  83. public function addItem( $text ) {
  84. $this->uncompress();
  85. $hash = md5( $text );
  86. if ( !isset( $this->mItems[$hash] ) ) {
  87. $this->mItems[$hash] = $text;
  88. $this->mSize += strlen( $text );
  89. }
  90. return $hash;
  91. }
  92. /**
  93. * @param string $hash
  94. * @return array|bool
  95. */
  96. public function getItem( $hash ) {
  97. $this->uncompress();
  98. if ( array_key_exists( $hash, $this->mItems ) ) {
  99. return $this->mItems[$hash];
  100. } else {
  101. return false;
  102. }
  103. }
  104. /**
  105. * @param string $text
  106. * @return void
  107. */
  108. public function setText( $text ) {
  109. $this->uncompress();
  110. $this->mDefaultHash = $this->addItem( $text );
  111. }
  112. /**
  113. * @return array|bool
  114. */
  115. public function getText() {
  116. $this->uncompress();
  117. return $this->getItem( $this->mDefaultHash );
  118. }
  119. /**
  120. * Remove an item
  121. *
  122. * @param string $hash
  123. */
  124. public function removeItem( $hash ) {
  125. $this->mSize -= strlen( $this->mItems[$hash] );
  126. unset( $this->mItems[$hash] );
  127. }
  128. /**
  129. * Compress the bulk data in the object
  130. */
  131. public function compress() {
  132. if ( !$this->mCompressed ) {
  133. $this->mItems = gzdeflate( serialize( $this->mItems ) );
  134. $this->mCompressed = true;
  135. }
  136. }
  137. /**
  138. * Uncompress bulk data
  139. */
  140. public function uncompress() {
  141. if ( $this->mCompressed ) {
  142. $this->mItems = unserialize( gzinflate( $this->mItems ) );
  143. $this->mCompressed = false;
  144. }
  145. }
  146. /**
  147. * @return array
  148. */
  149. function __sleep() {
  150. $this->compress();
  151. return [ 'mVersion', 'mCompressed', 'mItems', 'mDefaultHash' ];
  152. }
  153. function __wakeup() {
  154. $this->uncompress();
  155. }
  156. /**
  157. * Helper function for compression jobs
  158. * Returns true until the object is "full" and ready to be committed
  159. *
  160. * @return bool
  161. */
  162. public function isHappy() {
  163. return $this->mSize < $this->mMaxSize
  164. && count( $this->mItems ) < $this->mMaxCount;
  165. }
  166. }
  167. /**
  168. * Pointer object for an item within a CGZ blob stored in the text table.
  169. */
  170. class HistoryBlobStub {
  171. /**
  172. * @var array One-step cache variable to hold base blobs; operations that
  173. * pull multiple revisions may often pull multiple times from the same
  174. * blob. By keeping the last-used one open, we avoid redundant
  175. * unserialization and decompression overhead.
  176. */
  177. protected static $blobCache = [];
  178. /** @var int */
  179. public $mOldId;
  180. /** @var string */
  181. public $mHash;
  182. /** @var string */
  183. public $mRef;
  184. /**
  185. * @param string $hash The content hash of the text
  186. * @param int $oldid The old_id for the CGZ object
  187. */
  188. function __construct( $hash = '', $oldid = 0 ) {
  189. $this->mHash = $hash;
  190. }
  191. /**
  192. * Sets the location (old_id) of the main object to which this object
  193. * points
  194. * @param int $id
  195. */
  196. function setLocation( $id ) {
  197. $this->mOldId = $id;
  198. }
  199. /**
  200. * Sets the location (old_id) of the referring object
  201. * @param string $id
  202. */
  203. function setReferrer( $id ) {
  204. $this->mRef = $id;
  205. }
  206. /**
  207. * Gets the location of the referring object
  208. * @return string
  209. */
  210. function getReferrer() {
  211. return $this->mRef;
  212. }
  213. /**
  214. * @return string|false
  215. */
  216. function getText() {
  217. if ( isset( self::$blobCache[$this->mOldId] ) ) {
  218. $obj = self::$blobCache[$this->mOldId];
  219. } else {
  220. $dbr = wfGetDB( DB_REPLICA );
  221. $row = $dbr->selectRow(
  222. 'text',
  223. [ 'old_flags', 'old_text' ],
  224. [ 'old_id' => $this->mOldId ]
  225. );
  226. if ( !$row ) {
  227. return false;
  228. }
  229. $flags = explode( ',', $row->old_flags );
  230. if ( in_array( 'external', $flags ) ) {
  231. $url = $row->old_text;
  232. $parts = explode( '://', $url, 2 );
  233. if ( !isset( $parts[1] ) || $parts[1] == '' ) {
  234. return false;
  235. }
  236. $row->old_text = ExternalStore::fetchFromURL( $url );
  237. }
  238. if ( !in_array( 'object', $flags ) ) {
  239. return false;
  240. }
  241. if ( in_array( 'gzip', $flags ) ) {
  242. // This shouldn't happen, but a bug in the compress script
  243. // may at times gzip-compress a HistoryBlob object row.
  244. $obj = unserialize( gzinflate( $row->old_text ) );
  245. } else {
  246. $obj = unserialize( $row->old_text );
  247. }
  248. if ( !is_object( $obj ) ) {
  249. // Correct for old double-serialization bug.
  250. $obj = unserialize( $obj );
  251. }
  252. // Save this item for reference; if pulling many
  253. // items in a row we'll likely use it again.
  254. $obj->uncompress();
  255. self::$blobCache = [ $this->mOldId => $obj ];
  256. }
  257. return $obj->getItem( $this->mHash );
  258. }
  259. /**
  260. * Get the content hash
  261. *
  262. * @return string
  263. */
  264. function getHash() {
  265. return $this->mHash;
  266. }
  267. }
  268. /**
  269. * To speed up conversion from 1.4 to 1.5 schema, text rows can refer to the
  270. * leftover cur table as the backend. This avoids expensively copying hundreds
  271. * of megabytes of data during the conversion downtime.
  272. *
  273. * Serialized HistoryBlobCurStub objects will be inserted into the text table
  274. * on conversion if $wgLegacySchemaConversion is set to true.
  275. */
  276. class HistoryBlobCurStub {
  277. /** @var int */
  278. public $mCurId;
  279. /**
  280. * @param int $curid The cur_id pointed to
  281. */
  282. function __construct( $curid = 0 ) {
  283. $this->mCurId = $curid;
  284. }
  285. /**
  286. * Sets the location (cur_id) of the main object to which this object
  287. * points
  288. *
  289. * @param int $id
  290. */
  291. function setLocation( $id ) {
  292. $this->mCurId = $id;
  293. }
  294. /**
  295. * @return string|bool
  296. */
  297. function getText() {
  298. $dbr = wfGetDB( DB_REPLICA );
  299. $row = $dbr->selectRow( 'cur', [ 'cur_text' ], [ 'cur_id' => $this->mCurId ] );
  300. if ( !$row ) {
  301. return false;
  302. }
  303. return $row->cur_text;
  304. }
  305. }
  306. /**
  307. * Diff-based history compression
  308. * Requires xdiff 1.5+ and zlib
  309. */
  310. class DiffHistoryBlob implements HistoryBlob {
  311. /** @var array Uncompressed item cache */
  312. public $mItems = [];
  313. /** @var int Total uncompressed size */
  314. public $mSize = 0;
  315. /**
  316. * @var array Array of diffs. If a diff D from A to B is notated D = B - A,
  317. * and Z is an empty string:
  318. *
  319. * { item[map[i]] - item[map[i-1]] where i > 0
  320. * diff[i] = {
  321. * { item[map[i]] - Z where i = 0
  322. */
  323. public $mDiffs;
  324. /** @var array The diff map, see above */
  325. public $mDiffMap;
  326. /** @var int The key for getText()
  327. */
  328. public $mDefaultKey;
  329. /** @var string Compressed storage */
  330. public $mCompressed;
  331. /** @var bool True if the object is locked against further writes */
  332. public $mFrozen = false;
  333. /**
  334. * @var int The maximum uncompressed size before the object becomes sad
  335. * Should be less than max_allowed_packet
  336. */
  337. public $mMaxSize = 10000000;
  338. /** @var int The maximum number of text items before the object becomes sad */
  339. public $mMaxCount = 100;
  340. /** Constants from xdiff.h */
  341. const XDL_BDOP_INS = 1;
  342. const XDL_BDOP_CPY = 2;
  343. const XDL_BDOP_INSB = 3;
  344. function __construct() {
  345. if ( !function_exists( 'gzdeflate' ) ) {
  346. throw new MWException( "Need zlib support to read or write DiffHistoryBlob\n" );
  347. }
  348. }
  349. /**
  350. * @throws MWException
  351. * @param string $text
  352. * @return int
  353. */
  354. function addItem( $text ) {
  355. if ( $this->mFrozen ) {
  356. throw new MWException( __METHOD__ . ": Cannot add more items after sleep/wakeup" );
  357. }
  358. $this->mItems[] = $text;
  359. $this->mSize += strlen( $text );
  360. $this->mDiffs = null; // later
  361. return count( $this->mItems ) - 1;
  362. }
  363. /**
  364. * @param string $key
  365. * @return string
  366. */
  367. function getItem( $key ) {
  368. return $this->mItems[$key];
  369. }
  370. /**
  371. * @param string $text
  372. */
  373. function setText( $text ) {
  374. $this->mDefaultKey = $this->addItem( $text );
  375. }
  376. /**
  377. * @return string
  378. */
  379. function getText() {
  380. return $this->getItem( $this->mDefaultKey );
  381. }
  382. /**
  383. * @throws MWException
  384. */
  385. function compress() {
  386. if ( !function_exists( 'xdiff_string_rabdiff' ) ) {
  387. throw new MWException( "Need xdiff 1.5+ support to write DiffHistoryBlob\n" );
  388. }
  389. if ( isset( $this->mDiffs ) ) {
  390. // Already compressed
  391. return;
  392. }
  393. if ( !count( $this->mItems ) ) {
  394. // Empty
  395. return;
  396. }
  397. // Create two diff sequences: one for main text and one for small text
  398. $sequences = [
  399. 'small' => [
  400. 'tail' => '',
  401. 'diffs' => [],
  402. 'map' => [],
  403. ],
  404. 'main' => [
  405. 'tail' => '',
  406. 'diffs' => [],
  407. 'map' => [],
  408. ],
  409. ];
  410. $smallFactor = 0.5;
  411. $mItemsCount = count( $this->mItems );
  412. for ( $i = 0; $i < $mItemsCount; $i++ ) {
  413. $text = $this->mItems[$i];
  414. if ( $i == 0 ) {
  415. $seqName = 'main';
  416. } else {
  417. $mainTail = $sequences['main']['tail'];
  418. if ( strlen( $text ) < strlen( $mainTail ) * $smallFactor ) {
  419. $seqName = 'small';
  420. } else {
  421. $seqName = 'main';
  422. }
  423. }
  424. $seq =& $sequences[$seqName];
  425. $tail = $seq['tail'];
  426. $diff = $this->diff( $tail, $text );
  427. $seq['diffs'][] = $diff;
  428. $seq['map'][] = $i;
  429. $seq['tail'] = $text;
  430. }
  431. unset( $seq ); // unlink dangerous alias
  432. // Knit the sequences together
  433. $tail = '';
  434. $this->mDiffs = [];
  435. $this->mDiffMap = [];
  436. foreach ( $sequences as $seq ) {
  437. if ( !count( $seq['diffs'] ) ) {
  438. continue;
  439. }
  440. if ( $tail === '' ) {
  441. $this->mDiffs[] = $seq['diffs'][0];
  442. } else {
  443. $head = $this->patch( '', $seq['diffs'][0] );
  444. $this->mDiffs[] = $this->diff( $tail, $head );
  445. }
  446. $this->mDiffMap[] = $seq['map'][0];
  447. $diffsCount = count( $seq['diffs'] );
  448. for ( $i = 1; $i < $diffsCount; $i++ ) {
  449. $this->mDiffs[] = $seq['diffs'][$i];
  450. $this->mDiffMap[] = $seq['map'][$i];
  451. }
  452. $tail = $seq['tail'];
  453. }
  454. }
  455. /**
  456. * @param string $t1
  457. * @param string $t2
  458. * @return string
  459. */
  460. function diff( $t1, $t2 ) {
  461. # Need to do a null concatenation with warnings off, due to bugs in the current version of xdiff
  462. # "String is not zero-terminated"
  463. Wikimedia\suppressWarnings();
  464. $diff = xdiff_string_rabdiff( $t1, $t2 ) . '';
  465. Wikimedia\restoreWarnings();
  466. return $diff;
  467. }
  468. /**
  469. * @param string $base
  470. * @param string $diff
  471. * @return bool|string
  472. */
  473. function patch( $base, $diff ) {
  474. if ( function_exists( 'xdiff_string_bpatch' ) ) {
  475. Wikimedia\suppressWarnings();
  476. $text = xdiff_string_bpatch( $base, $diff ) . '';
  477. Wikimedia\restoreWarnings();
  478. return $text;
  479. }
  480. # Pure PHP implementation
  481. $header = unpack( 'Vofp/Vcsize', substr( $diff, 0, 8 ) );
  482. # Check the checksum if hash extension is available
  483. $ofp = $this->xdiffAdler32( $base );
  484. if ( $ofp !== false && $ofp !== substr( $diff, 0, 4 ) ) {
  485. wfDebug( __METHOD__ . ": incorrect base checksum\n" );
  486. return false;
  487. }
  488. if ( $header['csize'] != strlen( $base ) ) {
  489. wfDebug( __METHOD__ . ": incorrect base length\n" );
  490. return false;
  491. }
  492. $p = 8;
  493. $out = '';
  494. while ( $p < strlen( $diff ) ) {
  495. $x = unpack( 'Cop', substr( $diff, $p, 1 ) );
  496. $op = $x['op'];
  497. ++$p;
  498. switch ( $op ) {
  499. case self::XDL_BDOP_INS:
  500. $x = unpack( 'Csize', substr( $diff, $p, 1 ) );
  501. $p++;
  502. $out .= substr( $diff, $p, $x['size'] );
  503. $p += $x['size'];
  504. break;
  505. case self::XDL_BDOP_INSB:
  506. $x = unpack( 'Vcsize', substr( $diff, $p, 4 ) );
  507. $p += 4;
  508. $out .= substr( $diff, $p, $x['csize'] );
  509. $p += $x['csize'];
  510. break;
  511. case self::XDL_BDOP_CPY:
  512. $x = unpack( 'Voff/Vcsize', substr( $diff, $p, 8 ) );
  513. $p += 8;
  514. $out .= substr( $base, $x['off'], $x['csize'] );
  515. break;
  516. default:
  517. wfDebug( __METHOD__ . ": invalid op\n" );
  518. return false;
  519. }
  520. }
  521. return $out;
  522. }
  523. /**
  524. * Compute a binary "Adler-32" checksum as defined by LibXDiff, i.e. with
  525. * the bytes backwards and initialised with 0 instead of 1. See T36428.
  526. *
  527. * @param string $s
  528. * @return string|bool False if the hash extension is not available
  529. */
  530. function xdiffAdler32( $s ) {
  531. if ( !function_exists( 'hash' ) ) {
  532. return false;
  533. }
  534. static $init;
  535. if ( $init === null ) {
  536. $init = str_repeat( "\xf0", 205 ) . "\xee" . str_repeat( "\xf0", 67 ) . "\x02";
  537. }
  538. // The real Adler-32 checksum of $init is zero, so it initialises the
  539. // state to zero, as it is at the start of LibXDiff's checksum
  540. // algorithm. Appending the subject string then simulates LibXDiff.
  541. return strrev( hash( 'adler32', $init . $s, true ) );
  542. }
  543. function uncompress() {
  544. if ( !$this->mDiffs ) {
  545. return;
  546. }
  547. $tail = '';
  548. $mDiffsCount = count( $this->mDiffs );
  549. for ( $diffKey = 0; $diffKey < $mDiffsCount; $diffKey++ ) {
  550. $textKey = $this->mDiffMap[$diffKey];
  551. $text = $this->patch( $tail, $this->mDiffs[$diffKey] );
  552. $this->mItems[$textKey] = $text;
  553. $tail = $text;
  554. }
  555. }
  556. /**
  557. * @return array
  558. */
  559. function __sleep() {
  560. $this->compress();
  561. if ( !count( $this->mItems ) ) {
  562. // Empty object
  563. $info = false;
  564. } else {
  565. // Take forward differences to improve the compression ratio for sequences
  566. $map = '';
  567. $prev = 0;
  568. foreach ( $this->mDiffMap as $i ) {
  569. if ( $map !== '' ) {
  570. $map .= ',';
  571. }
  572. $map .= $i - $prev;
  573. $prev = $i;
  574. }
  575. $info = [
  576. 'diffs' => $this->mDiffs,
  577. 'map' => $map
  578. ];
  579. }
  580. if ( isset( $this->mDefaultKey ) ) {
  581. $info['default'] = $this->mDefaultKey;
  582. }
  583. $this->mCompressed = gzdeflate( serialize( $info ) );
  584. return [ 'mCompressed' ];
  585. }
  586. function __wakeup() {
  587. // addItem() doesn't work if mItems is partially filled from mDiffs
  588. $this->mFrozen = true;
  589. $info = unserialize( gzinflate( $this->mCompressed ) );
  590. unset( $this->mCompressed );
  591. if ( !$info ) {
  592. // Empty object
  593. return;
  594. }
  595. if ( isset( $info['default'] ) ) {
  596. $this->mDefaultKey = $info['default'];
  597. }
  598. $this->mDiffs = $info['diffs'];
  599. if ( isset( $info['base'] ) ) {
  600. // Old format
  601. $this->mDiffMap = range( 0, count( $this->mDiffs ) - 1 );
  602. array_unshift( $this->mDiffs,
  603. pack( 'VVCV', 0, 0, self::XDL_BDOP_INSB, strlen( $info['base'] ) ) .
  604. $info['base'] );
  605. } else {
  606. // New format
  607. $map = explode( ',', $info['map'] );
  608. $cur = 0;
  609. $this->mDiffMap = [];
  610. foreach ( $map as $i ) {
  611. $cur += $i;
  612. $this->mDiffMap[] = $cur;
  613. }
  614. }
  615. $this->uncompress();
  616. }
  617. /**
  618. * Helper function for compression jobs
  619. * Returns true until the object is "full" and ready to be committed
  620. *
  621. * @return bool
  622. */
  623. function isHappy() {
  624. return $this->mSize < $this->mMaxSize
  625. && count( $this->mItems ) < $this->mMaxCount;
  626. }
  627. }
  628. // phpcs:ignore Generic.CodeAnalysis.UnconditionalIfStatement.Found
  629. if ( false ) {
  630. // Blobs generated by MediaWiki < 1.5 on PHP 4 were serialized with the
  631. // class name coerced to lowercase. We can improve efficiency by adding
  632. // autoload entries for the lowercase variants of these classes (T166759).
  633. // The code below is never executed, but it is picked up by the AutoloadGenerator
  634. // parser, which scans for class_alias() calls.
  635. class_alias( ConcatenatedGzipHistoryBlob::class, 'concatenatedgziphistoryblob' );
  636. class_alias( HistoryBlobCurStub::class, 'historyblobcurstub' );
  637. class_alias( HistoryBlobStub::class, 'historyblobstub' );
  638. }