purgeChangedPages.php 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. <?php
  2. /**
  3. * Send purge requests for pages edited in date range to squid/varnish.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Maintenance
  22. */
  23. require_once __DIR__ . '/Maintenance.php';
  /**
   * Maintenance script that sends purge requests to squid/varnish for pages
   * whose latest revision falls inside a given time window.
   *
   * Can be used to recover from an HTCP message partition or other major cache
   * layer interruption.
   *
   * @ingroup Maintenance
   */
  class PurgeChangedPages extends Maintenance {

      /**
       * Register the command line options and the default batch size (100).
       */
      public function __construct() {
          parent::__construct();
          $this->addDescription( 'Send purge requests for edits in date range to squid/varnish' );
          $this->addOption( 'starttime', 'Starting timestamp', true, true );
          $this->addOption( 'endtime', 'Ending timestamp', true, true );
          $this->addOption( 'htcp-dest', 'HTCP announcement destination (IP:port)', false, true );
          $this->addOption( 'sleep-per-batch', 'Milliseconds to sleep between batches', false, true );
          $this->addOption( 'dry-run', 'Do not send purge requests' );
          $this->addOption( 'verbose', 'Show more output', false, false, 'v' );
          $this->setBatchSize( 100 );
      }

      /**
       * Walk through the pages whose latest revision has a timestamp in the
       * window (starttime, endtime] in batches, oldest first, and send a
       * purge request for each page URL.
       *
       * Note that the lower bound is exclusive and the upper bound inclusive,
       * because the query selects "rev_timestamp > start" and
       * "rev_timestamp <= end".
       *
       * With --dry-run the URLs are only printed; nothing is purged and the
       * per-batch sleep is skipped.
       */
      public function execute() {
          if ( $this->hasOption( 'htcp-dest' ) ) {
              $this->routeHtcpTo( $this->getOption( 'htcp-dest' ) );
          }

          $dbr = $this->getDB( DB_SLAVE );
          $minTime = $dbr->timestamp( $this->getOption( 'starttime' ) );
          $maxTime = $dbr->timestamp( $this->getOption( 'endtime' ) );

          if ( $maxTime < $minTime ) {
              $this->error( "\nERROR: starttime after endtime\n" );
              $this->maybeHelp( true );
          }

          // These options cannot change while the loop runs, so read them once.
          $dryRun = $this->hasOption( 'dry-run' );
          $verbose = $this->hasOption( 'verbose' );
          // sleep-per-batch is milliseconds, usleep wants microseconds. Clamp to
          // zero so that a negative value cannot make usleep() warn or throw.
          $sleepMicros = $this->hasOption( 'sleep-per-batch' )
              ? max( 0, 1000 * (int)$this->getOption( 'sleep-per-batch' ) )
              : 0;

          $stuckCount = 0; // loop breaker
          while ( true ) {
              // Grow the batch size if we are stuck in a second that had many changes
              $bSize = ( $stuckCount + 1 ) * $this->mBatchSize;

              $res = $dbr->select(
                  [ 'page', 'revision' ],
                  [
                      'rev_timestamp',
                      'page_namespace',
                      'page_title',
                  ],
                  [
                      "rev_timestamp > " . $dbr->addQuotes( $minTime ),
                      "rev_timestamp <= " . $dbr->addQuotes( $maxTime ),
                      // Only get rows where the revision is the latest for the page.
                      // Other revisions would be duplicate and we don't need to purge if
                      // there has been an edit after the interesting time window.
                      "page_latest = rev_id",
                  ],
                  __METHOD__,
                  [ 'ORDER BY' => 'rev_timestamp', 'LIMIT' => $bSize ],
                  [
                      'page' => [ 'INNER JOIN', 'rev_page=page_id' ],
                  ]
              );

              if ( !$res->numRows() ) {
                  // Nothing more found, so we are done
                  break;
              }

              // Kludge to not get stuck in loops for batches with the same timestamp
              list( $rows, $lastTime ) = $this->pageableSortedRows( $res, 'rev_timestamp', $bSize );
              if ( !count( $rows ) ) {
                  // Every row shared one timestamp: retry the same window with a larger LIMIT
                  ++$stuckCount;
                  continue;
              }
              // Reset stuck counter
              $stuckCount = 0;

              $this->output( "Processing changes from {$minTime} to {$lastTime}.\n" );

              // Advance past the last row next time
              $minTime = $lastTime;

              $urls = $this->urlsForRows( $rows );

              if ( $dryRun || $verbose ) {
                  $this->output( implode( "\n", $urls ) . "\n" );
                  if ( $dryRun ) {
                      continue;
                  }
              }

              // Send batch of purge requests out to squids
              $squid = new CdnCacheUpdate( $urls, count( $urls ) );
              $squid->doUpdate();

              if ( $sleepMicros > 0 ) {
                  usleep( $sleepMicros );
              }
          }

          $this->output( "Done!\n" );
      }

      /**
       * Route all HTCP messages to a single destination by overriding $wgHTCPRouting.
       *
       * @param string $dest Destination as "host" or "host:port"; a missing or
       *   empty port falls back to the default HTCP port 4827
       */
      private function routeHtcpTo( $dest ) {
          global $wgHTCPRouting;

          $parts = explode( ':', $dest );
          if ( count( $parts ) < 2 || $parts[1] === '' ) {
              // No usable port given ("host" or "host:"): add default htcp port
              $parts[1] = '4827';
          }

          // Route all HTCP messages to provided host:port
          $wgHTCPRouting = [
              '' => [ 'host' => $parts[0], 'port' => $parts[1] ],
          ];

          if ( $this->hasOption( 'verbose' ) ) {
              $this->output( "HTCP broadcasts to {$parts[0]}:{$parts[1]}\n" );
          }
      }

      /**
       * Build the list of internal URLs for a batch of result rows.
       *
       * @param array $rows Rows carrying page_namespace and page_title
       * @return string[] One internal URL per row, in row order
       */
      private function urlsForRows( array $rows ) {
          $urls = [];
          foreach ( $rows as $row ) {
              $title = Title::makeTitle( $row->page_namespace, $row->page_title );
              $urls[] = $title->getInternalURL();
          }

          return $urls;
      }

      /**
       * Remove all the rows in a result set with the highest value for column
       * $column unless the number of rows is less than $limit. This returns the
       * new array of rows and the highest value of column $column for the rows
       * left. The ordering of rows is maintained.
       *
       * This is useful for paging on mostly-unique values that may sometimes
       * have large clumps of identical values. It should be safe to do the next
       * query on items with a value higher than the highest of the rows returned here.
       * If this returns an empty array for a non-empty query result, then all the rows
       * had the same column value and the query should be repeated with a higher LIMIT.
       *
       * @todo move this elsewhere
       *
       * @param ResultWrapper $res Query result sorted by $column (ascending)
       * @param string $column
       * @param int $limit
       * @return array (array of rows, string column value)
       */
      protected function pageableSortedRows( ResultWrapper $res, $column, $limit ) {
          $rows = iterator_to_array( $res, false );
          $count = count( $rows );
          if ( $count === 0 ) {
              return [ [], null ]; // nothing to do
          }

          $lastValue = $rows[$count - 1]->$column; // should be the highest
          if ( $count < $limit ) {
              return [ $rows, $lastValue ]; // no more rows left
          }

          // Find how many leading rows remain once the trailing run of rows
          // that share the highest value has been cut off.
          $keep = $count;
          while ( $keep > 0 && $rows[$keep - 1]->$column === $lastValue ) {
              --$keep;
          }

          if ( $keep === 0 ) {
              // All rows had the same value; the caller must retry with a higher LIMIT
              return [ [], null ];
          }

          $rows = array_slice( $rows, 0, $keep );

          return [ $rows, $rows[$keep - 1]->$column ];
      }
  }
  // Name the script class, then include the runner entry point.
  // NOTE(review): presumably the file behind RUN_MAINTENANCE_IF_MAIN reads
  // $maintClass to decide which class to instantiate and run; confirm.
  $maintClass = 'PurgeChangedPages';
  require_once RUN_MAINTENANCE_IF_MAIN;