WikiTextStructure.php 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. <?php
  2. use HtmlFormatter\HtmlFormatter;
  3. /**
  4. * Class that allows exploring the structure of parsed wikitext.
  5. */
  6. class WikiTextStructure {
  7. /**
  8. * @var string
  9. */
  10. private $openingText;
  11. /**
  12. * @var string
  13. */
  14. private $allText;
  15. /**
  16. * @var string[]
  17. */
  18. private $auxText = [];
  19. /**
  20. * @var ParserOutput
  21. */
  22. private $parserOutput;
  23. /**
  24. * @var string[] selectors to elements that are excluded entirely from search
  25. */
  26. private $excludedElementSelectors = [
  27. // "it looks like you don't have javascript enabled..." – do not need to index
  28. 'audio', 'video',
  29. // CSS stylesheets aren't content
  30. 'style',
  31. // The [1] for references
  32. 'sup.reference',
  33. // The ↑ next to references in the references section
  34. '.mw-cite-backlink',
  35. // Headings are already indexed in their own field.
  36. 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  37. // Collapsed fields are hidden by default so we don't want them showing up.
  38. '.autocollapse',
  39. // Content explicitly decided to be not searchable by editors such
  40. // as custom navigation templates.
  41. '.navigation-not-searchable'
  42. ];
  43. /**
  44. * @var string[] selectors to elements that are considered auxiliary to article text for search
  45. */
  46. private $auxiliaryElementSelectors = [
  47. // Thumbnail captions aren't really part of the text proper
  48. '.thumbcaption',
  49. // Neither are tables
  50. 'table',
  51. // Common style for "See also:".
  52. '.rellink',
  53. // Common style for calling out helpful links at the top of the article.
  54. '.dablink',
  55. // New class users can use to mark stuff as auxiliary to searches.
  56. '.searchaux',
  57. ];
  58. /**
  59. * @param ParserOutput $parserOutput
  60. */
  61. public function __construct( ParserOutput $parserOutput ) {
  62. $this->parserOutput = $parserOutput;
  63. }
  64. /**
  65. * Get headings on the page.
  66. * @return string[]
  67. * First strip out things that look like references. We can't use HTML filtering because
  68. * the references come back as <sup> tags without a class. To keep from breaking stuff like
  69. * ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
  70. * we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
  71. * everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
  72. * or something. Whatever. So we only strip things that look like <sup> tags wrapping a
  73. * reference. And since the data looks like:
  74. * Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
  75. * we can not really use HtmlFormatter as we have no suitable selector.
  76. */
  77. public function headings() {
  78. $headings = [];
  79. $ignoredHeadings = $this->getIgnoredHeadings();
  80. foreach ( $this->parserOutput->getSections() as $heading ) {
  81. $heading = $heading[ 'line' ];
  82. // Some wikis wrap the brackets in a span:
  83. // https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
  84. $heading = preg_replace( '/<\/?span>/', '', $heading );
  85. // Normalize [] so the following regexp would work.
  86. $heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
  87. $heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
  88. // Strip tags from the heading or else we'll display them (escaped) in search results
  89. $heading = trim( Sanitizer::stripAllTags( $heading ) );
  90. // Note that we don't take the level of the heading into account - all headings are equal.
  91. // Except the ones we ignore.
  92. if ( !in_array( $heading, $ignoredHeadings ) ) {
  93. $headings[] = $heading;
  94. }
  95. }
  96. return $headings;
  97. }
  98. /**
  99. * Parse a message content into an array. This function is generally used to
  100. * parse settings stored as i18n messages (see search-ignored-headings).
  101. *
  102. * @param string $message
  103. * @return string[]
  104. */
  105. public static function parseSettingsInMessage( $message ) {
  106. $lines = explode( "\n", $message );
  107. $lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
  108. $lines = array_map( 'trim', $lines ); // Remove extra spaces
  109. $lines = array_filter( $lines ); // Remove empty lines
  110. return $lines;
  111. }
  112. /**
  113. * Get list of heading to ignore.
  114. * @return string[]
  115. */
  116. private function getIgnoredHeadings() {
  117. static $ignoredHeadings = null;
  118. if ( $ignoredHeadings === null ) {
  119. $ignoredHeadings = [];
  120. $source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
  121. if ( $source->isBlank() ) {
  122. // Try old version too, just in case
  123. $source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
  124. }
  125. if ( !$source->isDisabled() ) {
  126. $lines = self::parseSettingsInMessage( $source->plain() );
  127. $ignoredHeadings = $lines; // Now we just have headings!
  128. }
  129. }
  130. return $ignoredHeadings;
  131. }
  132. /**
  133. * Extract parts of the text - opening, main and auxiliary.
  134. */
  135. private function extractWikitextParts() {
  136. if ( !is_null( $this->allText ) ) {
  137. return;
  138. }
  139. $text = $this->parserOutput->getText( [
  140. 'enableSectionEditTokens' => false,
  141. 'allowTOC' => false,
  142. ] );
  143. if ( strlen( $text ) == 0 ) {
  144. $this->allText = "";
  145. // empty text - nothing to seek here
  146. return;
  147. }
  148. $opening = null;
  149. $this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
  150. // Add extra spacing around break tags so text crammed together like<br>this
  151. // doesn't make one word.
  152. $text = str_replace( '<br', "\n<br", $text );
  153. $formatter = new HtmlFormatter( $text );
  154. // Strip elements from the page that we never want in the search text.
  155. $formatter->remove( $this->excludedElementSelectors );
  156. $formatter->filterContent();
  157. // Strip elements from the page that are auxiliary text. These will still be
  158. // searched but matches will be ranked lower and non-auxiliary matches will be
  159. // preferred in highlighting.
  160. $formatter->remove( $this->auxiliaryElementSelectors );
  161. $auxiliaryElements = $formatter->filterContent();
  162. $this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
  163. foreach ( $auxiliaryElements as $auxiliaryElement ) {
  164. $this->auxText[] =
  165. trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
  166. }
  167. }
  168. /**
  169. * Get text before first heading.
  170. * @param string $text
  171. * @return string|null
  172. */
  173. private function extractHeadingBeforeFirstHeading( $text ) {
  174. $matches = [];
  175. if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
  176. // There isn't a first heading so we interpret this as the article
  177. // being entirely without heading.
  178. return null;
  179. }
  180. $text = substr( $text, 0, $matches[ 0 ][ 1 ] );
  181. if ( !$text ) {
  182. // There isn't any text before the first heading so we declare there isn't
  183. // a first heading.
  184. return null;
  185. }
  186. $formatter = new HtmlFormatter( $text );
  187. $formatter->remove( $this->excludedElementSelectors );
  188. $formatter->remove( $this->auxiliaryElementSelectors );
  189. $formatter->filterContent();
  190. $text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
  191. if ( !$text ) {
  192. // There isn't any text after filtering before the first heading so we declare
  193. // that there isn't a first heading.
  194. return null;
  195. }
  196. return $text;
  197. }
  198. /**
  199. * Get opening text
  200. * @return string
  201. */
  202. public function getOpeningText() {
  203. $this->extractWikitextParts();
  204. return $this->openingText;
  205. }
  206. /**
  207. * Get main text
  208. * @return string
  209. */
  210. public function getMainText() {
  211. $this->extractWikitextParts();
  212. return $this->allText;
  213. }
  214. /**
  215. * Get auxiliary text
  216. * @return string[]
  217. */
  218. public function getAuxiliaryText() {
  219. $this->extractWikitextParts();
  220. return $this->auxText;
  221. }
  222. /**
  223. * Get the defaultsort property
  224. * @return string|null
  225. */
  226. public function getDefaultSort() {
  227. return $this->parserOutput->getProperty( 'defaultsort' );
  228. }
  229. }