MediaWikiPageNameNormalizer.php

<?php

namespace MediaWiki\Site;

use FormatJson;
use Http;
use UtfNormal\Validator;

/**
 * Service for normalizing a page name using a MediaWiki API.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @since 1.27
 *
 * @license GPL-2.0-or-later
 * @author John Erling Blad < jeblad@gmail.com >
 * @author Daniel Kinzler
 * @author Jeroen De Dauw < jeroendedauw@gmail.com >
 * @author Marius Hoch
 */
class MediaWikiPageNameNormalizer {

	/**
	 * @var Http
	 */
	private $http;

	/**
	 * @param Http|null $http
	 */
	public function __construct( Http $http = null ) {
		if ( !$http ) {
			$http = new Http();
		}

		$this->http = $http;
	}
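
	// Typical usage, for illustration only. The endpoint URL below is just an
	// example of a MediaWiki api.php URL; this class does not mandate any
	// particular wiki:
	//
	//     $normalizer = new MediaWikiPageNameNormalizer();
	//     $title = $normalizer->normalizePageName(
	//         'berlin',
	//         'https://en.wikipedia.org/w/api.php'
	//     );
	//     // $title is the normalized title (e.g. 'Berlin'), or false on an
	//     // invalid title, a missing page, or a failed request.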

	/**
	 * Returns the normalized form of the given page title, using the
	 * normalization rules of the given site. If the given title is a redirect,
	 * the redirect will be resolved and the redirect target is returned.
	 * Only titles of existing pages will be returned.
	 *
	 * @note This actually makes an API request to the remote site, so beware
	 *   that this function is slow and depends on an external service.
	 *
	 * @see Site::normalizePageName
	 *
	 * @since 1.27
	 *
	 * @param string $pageName
	 * @param string $apiUrl
	 *
	 * @return string|false The normalized form of the title,
	 *   or false to indicate an invalid title, a missing page,
	 *   or some other kind of error.
	 * @throws \MWException
	 */
	public function normalizePageName( $pageName, $apiUrl ) {
		// Make sure we got a string as the page name argument.
		if ( !is_string( $pageName ) ) {
			throw new \MWException( '$pageName must be a string' );
		}

		// Make sure the string is normalized into NFC (due to T42017),
		// but leave the whitespace alone; that should work appropriately.
		// @see https://phabricator.wikimedia.org/T42017
		$pageName = Validator::cleanUp( $pageName );

		// Build the arguments for this specific call.
		$args = [
			'action' => 'query',
			'prop' => 'info',
			'redirects' => true,
			'converttitles' => true,
			'format' => 'json',
			'titles' => $pageName,
			// @todo options for maxlag and maxage
			// Note that maxlag will lead to a long delay before a reply is made,
			// but maxage can avoid that extreme delay. On the other hand,
			// maxage could be nice to use anyhow, as it avoids unnecessary requests.
			// Also consider smaxage if maxage is used.
		];

		$url = wfAppendQuery( $apiUrl, $args );
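		// For illustration: with $apiUrl = 'https://en.wikipedia.org/w/api.php' (an
		// example endpoint, not one assumed by this class) and $pageName = 'berlin',
		// the resulting URL looks roughly like:
		// https://en.wikipedia.org/w/api.php?action=query&prop=info&redirects=1&converttitles=1&format=json&titles=berlin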

		// Call the external site.
		// @todo we need a good way to specify a timeout here.
		$ret = $this->http->get( $url, [], __METHOD__ );

		if ( $ret === false ) {
			wfDebugLog( "MediaWikiSite", "call to external site failed: $url" );
			return false;
		}

		$data = FormatJson::decode( $ret, true );

		if ( !is_array( $data ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned bad json: " . $ret );
			return false;
		}

		$page = static::extractPageRecord( $data, $pageName );

		if ( isset( $page['missing'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for a missing page title! "
				. $ret );
			return false;
		}

		if ( isset( $page['invalid'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> returned a marker for an invalid page title! "
				. $ret );
			return false;
		}

		if ( !isset( $page['title'] ) ) {
			wfDebugLog( "MediaWikiSite", "call to <$url> did not return a page title! " . $ret );
			return false;
		}

		return $page['title'];
	}
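
	// For reference, a query API reply for a title that is both normalized and a
	// redirect has roughly the following shape (the values are made up for
	// illustration; only the parts read by extractPageRecord() are shown):
	//
	//     {
	//         "query": {
	//             "normalized": [ { "from": "berlin", "to": "Berlin" } ],
	//             "redirects": [ { "from": "Berlin", "to": "Berlin (city)" } ],
	//             "pages": {
	//                 "123": { "pageid": 123, "ns": 0, "title": "Berlin (city)" }
	//             }
	//         }
	//     }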

	/**
	 * Get the normalization record for a given page title from an API response.
	 *
	 * @param array $externalData A reply from the API on an external server.
	 * @param string $pageTitle Identifies the page at the external site, needing normalization.
	 *
	 * @return array|false A 'page' structure representing the page identified by $pageTitle,
	 *   or false if no such record could be extracted.
	 */
	private static function extractPageRecord( $externalData, $pageTitle ) {
		// In the special case where only one page is returned, we can take a
		// shortcut and simply return the single page from the "pages" substructure.
		if ( isset( $externalData['query']['pages'] ) ) {
			$pages = array_values( $externalData['query']['pages'] );
			if ( count( $pages ) === 1 ) {
				return $pages[0];
			}
		}

		// The full walk below is mainly exercised during internal testing, where
		// the complete (lossless) reply structure is assumed.
		// Do the initial checks and return if the prerequisites are not met.
		if ( !is_array( $externalData ) || !isset( $externalData['query'] ) ) {
			return false;
		}

		// Loop over the three differently named structures, which are otherwise similar.
		$structs = [
			'normalized' => 'from',
			'converted' => 'from',
			'redirects' => 'from',
			'pages' => 'title'
		];
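		// The lists are visited in this order on purpose: each "from"/"to" match in
		// normalized, converted or redirects rewrites $pageTitle, so that the final
		// lookup in "pages" is done with the fully normalized, converted and
		// redirect-resolved title.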
		foreach ( $structs as $listId => $fieldId ) {
			// Check if the substructure exists at all.
			if ( !isset( $externalData['query'][$listId] ) ) {
				continue;
			}

			// Filter the substructure down to what we are actually using.
			$collectedHits = array_filter(
				array_values( $externalData['query'][$listId] ),
				function ( $a ) use ( $fieldId, $pageTitle ) {
					return $a[$fieldId] === $pageTitle;
				}
			);

			// If we are still looping over normalization, conversion or redirects,
			// we need to keep the new page title for the later rounds.
			if ( $fieldId === 'from' && is_array( $collectedHits ) ) {
				switch ( count( $collectedHits ) ) {
					case 0:
						break;
					case 1:
						// array_filter() preserves keys, so take the single
						// remaining element rather than assuming index 0.
						$pageTitle = array_shift( $collectedHits )['to'];
						break;
					default:
						return false;
				}
			} elseif ( $fieldId === 'title' && is_array( $collectedHits ) ) {
				// If we are on the "pages" structure, prepare to return.
				switch ( count( $collectedHits ) ) {
					case 0:
						return false;
					case 1:
						return array_shift( $collectedHits );
					default:
						return false;
				}
			}
		}

		// Should never get here.
		return false;
	}

}