123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255 |
- <?php
- use HtmlFormatter\HtmlFormatter;
- /**
- * Class allowing to explore structure of parsed wikitext.
- */
class WikiTextStructure {
	/**
	 * Text found before the first heading; null when there is none or when
	 * extraction has not run yet.
	 * @var string|null
	 */
	private $openingText;
	/**
	 * Main page text with excluded and auxiliary elements removed; null until
	 * extractWikitextParts() has run.
	 * @var string|null
	 */
	private $allText;
	/**
	 * Text of every element that matched an auxiliary selector.
	 * @var string[]
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;
	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		// "it looks like you don't have javascript enabled..." - do not need to index
		'audio', 'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references
		'sup.reference',
		// The ↑ next to references in the references section
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
		// Collapsed fields are hidden by default so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable'
	];
	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parsed page whose structure is explored.
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get headings on the page.
	 *
	 * Each section's 'line' is cleaned of reference markers, stripped of all
	 * remaining tags and trimmed. Headings listed in the ignored-headings
	 * message are left out. The heading level is not taken into account.
	 *
	 * @return string[]
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $section ) {
			$heading = self::stripReferenceMarkers( $section['line'] );
			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );
			// Strict comparison: both sides are strings, and loose comparison would
			// treat numeric-looking headings such as "1e3" and "1000" as equal.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Remove things that look like reference markers from a heading's HTML.
	 *
	 * We can't use HTML filtering because the references come back as <sup> tags
	 * without a class. To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove every <sup> tag, and we don't remove everything that looks
	 * like [2] either, because that could be legitimate heading text. Only <sup>
	 * tags wrapping a bracketed number are removed. Since the data looks like:
	 *  Reference in heading <sup>[1]</sup><sup>[2]</sup>
	 * there is no suitable selector for HtmlFormatter.
	 *
	 * @param string $heading Heading HTML
	 * @return string Heading HTML without reference markers
	 */
	private static function stripReferenceMarkers( $heading ) {
		// Some wikis wrap the brackets in a span:
		// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
		$heading = preg_replace( '/<\/?span>/', '', $heading );
		// Brackets may arrive as numeric entities; turn them into literal brackets
		// so the pattern below sees them. A plain string replacement is used on
		// purpose: a bare "[" is not a valid regular expression.
		$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );
		return preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Everything from a '#' to the end of a line is a comment. Lines are trimmed
	 * and lines that end up empty are dropped. The keys of the returned array are
	 * the original line indexes (it is not re-indexed).
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );
		// Remove empty lines. An explicit callback is needed because the default
		// array_filter() behaviour would also throw away a line that is just "0".
		return array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
	}

	/**
	 * Get list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message, falling back
	 * to 'cirrussearch-ignored-headings' when the former is blank. The result is
	 * kept in a function-level static variable after the first call.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings !== null ) {
			return $ignoredHeadings;
		}
		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( $source->isBlank() ) {
			// Try old version too, just in case
			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
		}
		if ( !$source->isDisabled() ) {
			// After parsing, all that is left are the headings themselves.
			$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Fills $openingText, $allText and $auxText. Does nothing when $allText has
	 * already been set by an earlier call.
	 */
	private function extractWikitextParts() {
		if ( $this->allText !== null ) {
			return;
		}
		$text = $this->parserOutput->getText( [
			'enableSectionEditTokens' => false,
			'allowTOC' => false,
		] );
		if ( strlen( $text ) === 0 ) {
			// empty text - nothing to seek here
			$this->allText = "";
			return;
		}
		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );
		$formatter = new HtmlFormatter( $text );
		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();
		// Strip elements from the page that are auxiliary text. These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * Only an attribute-less opening tag (<h1> ... <h6>) counts as a heading here.
	 *
	 * @param string $text Page HTML
	 * @return string|null Plain text before the first heading, or null when there
	 *  is no heading or no text is left in front of it.
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[0][1] );
		if ( (string)$text === '' ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}
		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}
		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null Text before the first heading, null when there is none
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the defaultsort property
	 * @return string|null
	 */
	public function getDefaultSort() {
		return $this->parserOutput->getProperty( 'defaultsort' );
	}
}
|