BacklinkJobUtils.php 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. <?php
  2. /**
  3. * Job to update links for a given title.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup JobQueue
  22. */
  23. /**
  24. * Class with Backlink related Job helper methods
  25. *
  26. * When an asset changes, a base job can be inserted to update all assets that depend on it.
  27. * The base job splits into per-title "leaf" jobs and a "remnant" job to handle the remaining
  28. * range of backlinks. This recurs until the remnant job's backlink range is small enough that
  29. * only leaf jobs are created from it.
  30. *
  31. * For example, if templates A and B are edited (at the same time) the queue will have:
  32. * (A base, B base)
  33. * When these jobs run, the queue will have per-title and remnant partition jobs:
  34. * (titleX,titleY,titleZ,...,A remnant,titleM,titleN,titleO,...,B remnant)
  35. *
  36. * This works best when the queue is FIFO, for several reasons:
  37. * - a) Since the remnant jobs are enqueued after the leaf jobs, the slower leaf jobs have to
  38. * get popped prior to the fast remnant jobs. This avoids flooding the queue with leaf jobs
  39. * for every single backlink of widely used assets (which can be millions).
  40. * - b) Other jobs going in the queue still get a chance to run after a widely used asset changes.
  41. * This is due to the large remnant job pushing to the end of the queue with each division.
  42. *
 * The size of the queues used in this manner depends on the number of asset changes and the
 * number of workers. Also, with FIFO-per-partition queues, the queue size can be somewhat larger,
 * depending on the number of queue partitions.
  46. *
  47. * @ingroup JobQueue
  48. * @since 1.23
  49. */
  50. class BacklinkJobUtils {
  51. /**
  52. * Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition
  53. * job that covers the remaining backlink range (if needed). Jobs for the first $bSize
  54. * titles are collated ($cSize per job) into leaf jobs to do actual work. All the
  55. * resulting jobs are of the same class as $job. No partition job is returned if the
  56. * range covered by $job was less than $bSize, as the leaf jobs have full coverage.
  57. *
  58. * The leaf jobs have the 'pages' param set to a (<page ID>:(<namespace>,<DB key>),...)
  59. * map so that the run() function knows what pages to act on. The leaf jobs will keep
  60. * the same job title as the parent job (e.g. $job).
  61. *
  62. * The partition jobs have the 'range' parameter set to a map of the format
  63. * (start:<integer>, end:<integer>, batchSize:<integer>, subranges:((<start>,<end>),...)),
  64. * the 'table' parameter set to that of $job, and the 'recursive' parameter set to true.
  65. * This method can be called on the resulting job to repeat the process again.
  66. *
  67. * The job provided ($job) must have the 'recursive' parameter set to true and the 'table'
  68. * parameter must be set to a backlink table. The job title will be used as the title to
  69. * find backlinks for. Any 'range' parameter must follow the same format as mentioned above.
  70. * This should be managed by recursive calls to this method.
  71. *
  72. * The first jobs return are always the leaf jobs. This lets the caller use push() to
  73. * put them directly into the queue and works well if the queue is FIFO. In such a queue,
  74. * the leaf jobs have to get finished first before anything can resolve the next partition
  75. * job, which keeps the queue very small.
  76. *
  77. * $opts includes:
  78. * - params : extra job parameters to include in each job
  79. *
  80. * @param Job $job
  81. * @param int $bSize BacklinkCache partition size; usually $wgUpdateRowsPerJob
  82. * @param int $cSize Max titles per leaf job; Usually 1 or a modest value
  83. * @param array $opts Optional parameter map
  84. * @return Job[] List of Job objects
  85. */
  86. public static function partitionBacklinkJob( Job $job, $bSize, $cSize, $opts = [] ) {
  87. $class = get_class( $job );
  88. $title = $job->getTitle();
  89. $params = $job->getParams();
  90. if ( isset( $params['pages'] ) || empty( $params['recursive'] ) ) {
  91. $ranges = []; // sanity; this is a leaf node
  92. $realBSize = 0;
  93. wfWarn( __METHOD__ . " called on {$job->getType()} leaf job (explosive recursion)." );
  94. } elseif ( isset( $params['range'] ) ) {
  95. // This is a range job to trigger the insertion of partitioned/title jobs...
  96. $ranges = $params['range']['subranges'];
  97. $realBSize = $params['range']['batchSize'];
  98. } else {
  99. // This is a base job to trigger the insertion of partitioned jobs...
  100. $ranges = $title->getBacklinkCache()->partition( $params['table'], $bSize );
  101. $realBSize = $bSize;
  102. }
  103. $extraParams = $opts['params'] ?? [];
  104. $jobs = [];
  105. // Combine the first range (of size $bSize) backlinks into leaf jobs
  106. if ( isset( $ranges[0] ) ) {
  107. list( $start, $end ) = $ranges[0];
  108. $iter = $title->getBacklinkCache()->getLinks( $params['table'], $start, $end );
  109. $titles = iterator_to_array( $iter );
  110. /** @var Title[] $titleBatch */
  111. foreach ( array_chunk( $titles, $cSize ) as $titleBatch ) {
  112. $pages = [];
  113. foreach ( $titleBatch as $tl ) {
  114. $pages[$tl->getArticleID()] = [ $tl->getNamespace(), $tl->getDBkey() ];
  115. }
  116. $jobs[] = new $class(
  117. $title, // maintain parent job title
  118. [ 'pages' => $pages ] + $extraParams
  119. );
  120. }
  121. }
  122. // Take all of the remaining ranges and build a partition job from it
  123. if ( isset( $ranges[1] ) ) {
  124. $jobs[] = new $class(
  125. $title, // maintain parent job title
  126. [
  127. 'recursive' => true,
  128. 'table' => $params['table'],
  129. 'range' => [
  130. 'start' => $ranges[1][0],
  131. 'end' => $ranges[count( $ranges ) - 1][1],
  132. 'batchSize' => $realBSize,
  133. 'subranges' => array_slice( $ranges, 1 )
  134. ],
  135. // Track how many times the base job divided for debugging
  136. 'division' => isset( $params['division'] )
  137. ? ( $params['division'] + 1 )
  138. : 1
  139. ] + $extraParams
  140. );
  141. }
  142. return $jobs;
  143. }
  144. }