SearchEngine.php 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206
  1. <?php
  2. /**
  3. * @defgroup Search Search
  4. *
  5. * @file
  6. * @ingroup Search
  7. */
  /**
   * Base class for search backends.
   *
   * The query and index-update methods defined here are stubs: they return
   * null or do nothing. create() instantiates the concrete class named by
   * $wgSearchType, or the one reported by the database connection.
   *
   * @ingroup Search
   */
  class SearchEngine {
      /** Maximum number of results to return. */
      var $limit = 10;
      /** Number of results to skip before the first one returned. */
      var $offset = 0;
      var $prefix = '';
      var $searchTerms = array();
      /** Namespace index numbers to search; replacePrefixes() sets this to null for the "all" keyword. */
      var $namespaces = array( NS_MAIN );
      var $showRedirects = false;

      /**
       * Perform a full text search query and return a result set.
       * This base implementation always returns null.
       *
       * @param string $term - Raw search term
       * @return SearchResultSet
       * @access public
       * @abstract
       */
      function searchText( $term ) {
          return null;
      }

      /**
       * Perform a title-only search query and return a result set.
       * This base implementation always returns null.
       *
       * @param string $term - Raw search term
       * @return SearchResultSet
       * @access public
       * @abstract
       */
      function searchTitle( $term ) {
          return null;
      }

      /**
       * If this search backend can list/unlist redirects.
       *
       * @return bool always true in this base class
       */
      function acceptListRedirects() {
          return true;
      }

      /**
       * Transform search term in cases when parts of the query came as different GET params (when supported)
       * e.g. for prefix queries: search=test&prefix=Main_Page/Archive -> test prefix:Main Page/Archive
       *
       * @param string $term
       * @return string the term, unchanged in this base class
       */
      function transformSearchTerm( $term ) {
          return $term;
      }

      /**
       * If an exact title match can be found, or a very slightly close match,
       * return the title. If no match, returns null.
       *
       * @param string $searchterm
       * @return Title
       */
      public static function getNearMatch( $searchterm ) {
          global $wgContLang;

          $allSearchTerms = array( $searchterm );
          if ( $wgContLang->hasVariants() ) {
              $allSearchTerms = array_merge( $allSearchTerms, $wgContLang->convertLinkToAllVariants( $searchterm ) );
          }

          foreach ( $allSearchTerms as $term ) {
              # Exact match? No need to look further.
              $title = Title::newFromText( $term );
              if ( is_null( $title ) ) {
                  return null;
              }

              if ( $title->getNamespace() == NS_SPECIAL || $title->isExternal() || $title->exists() ) {
                  return $title;
              }

              # See if it still otherwise has content is some sane sense
              $article = MediaWiki::articleFromTitle( $title );
              if ( $article->hasViewableContent() ) {
                  return $title;
              }

              # Now try all lower case (i.e. first letter capitalized)
              $title = Title::newFromText( $wgContLang->lc( $term ) );
              if ( $title && $title->exists() ) {
                  return $title;
              }

              # Now try capitalized string
              $title = Title::newFromText( $wgContLang->ucwords( $term ) );
              if ( $title && $title->exists() ) {
                  return $title;
              }

              # Now try all upper case
              $title = Title::newFromText( $wgContLang->uc( $term ) );
              if ( $title && $title->exists() ) {
                  return $title;
              }

              # Now try Word-Caps-Breaking-At-Word-Breaks, for hyphenated names etc
              $title = Title::newFromText( $wgContLang->ucwordbreaks( $term ) );
              if ( $title && $title->exists() ) {
                  return $title;
              }

              // Give hooks a chance at better match variants
              $title = null;
              if ( !wfRunHooks( 'SearchGetNearMatch', array( $term, &$title ) ) ) {
                  return $title;
              }
          }

          // $searchterm is the first entry of $allSearchTerms, so the loop above
          // has already returned if it could not be parsed into a title.
          $title = Title::newFromText( $searchterm );

          # Entering an IP address goes to the contributions page
          if ( ( $title->getNamespace() == NS_USER && User::isIP( $title->getText() ) )
              || User::isIP( trim( $searchterm ) ) ) {
              return SpecialPage::getTitleFor( 'Contributions', $title->getDBkey() );
          }

          # Entering a user goes to the user page whether it's there or not
          if ( $title->getNamespace() == NS_USER ) {
              return $title;
          }

          # Go to images that exist even if there's no local page.
          # There may have been a funny upload, or it may be on a shared
          # file repository such as Wikimedia Commons.
          if ( $title->getNamespace() == NS_FILE ) {
              $image = wfFindFile( $title );
              if ( $image ) {
                  return $title;
              }
          }

          # MediaWiki namespace? Page may be "implied" if not customized.
          # Just return it, with caps forced as the message system likes it.
          if ( $title->getNamespace() == NS_MEDIAWIKI ) {
              return Title::makeTitle( NS_MEDIAWIKI, $wgContLang->ucfirst( $title->getText() ) );
          }

          # Quoted term? Try without the quotes...
          $matches = array();
          if ( preg_match( '/^"([^"]+)"$/', $searchterm, $matches ) ) {
              return SearchEngine::getNearMatch( $matches[1] );
          }

          return null;
      }

      /**
       * Character-class body (without the surrounding brackets) listing the
       * characters that filter() keeps in a search string.
       *
       * @return string
       */
      public static function legalSearchChars() {
          return "A-Za-z_'.0-9\\x80-\\xFF\\-";
      }

      /**
       * Set the maximum number of results to return
       * and how many to skip before returning the first.
       * Both values are coerced with intval().
       *
       * @param int $limit
       * @param int $offset
       * @access public
       */
      function setLimitOffset( $limit, $offset = 0 ) {
          $this->limit = intval( $limit );
          $this->offset = intval( $offset );
      }

      /**
       * Set which namespaces the search should include.
       * Give an array of namespace index numbers.
       *
       * @param array $namespaces
       * @access public
       */
      function setNamespaces( $namespaces ) {
          $this->namespaces = $namespaces;
      }

      /**
       * Parse some common prefixes: all (search everything)
       * or namespace names.
       *
       * As a side effect $this->namespaces is updated when a prefix is
       * recognised, even if the original query ends up being returned because
       * nothing was left after the prefix.
       *
       * @param string $query
       * @return string the query with a recognised prefix stripped, or the
       *   original query
       */
      function replacePrefixes( $query ) {
          global $wgContLang;

          $colon = strpos( $query, ':' );
          if ( $colon === false )
              return $query; // nothing to do

          $parsed = $query;
          $allkeyword = wfMsgForContent( 'searchall' ) . ":";
          if ( strncmp( $query, $allkeyword, strlen( $allkeyword ) ) == 0 ) {
              $this->namespaces = null;
              $parsed = substr( $query, strlen( $allkeyword ) );
          } else {
              // A colon is known to be present here, so everything before the
              // first one is a candidate namespace name.
              $prefix = substr( $query, 0, $colon );
              $index = $wgContLang->getNsIndex( $prefix );
              if ( $index !== false ) {
                  $this->namespaces = array( $index );
                  $parsed = substr( $query, strlen( $prefix ) + 1 );
              }
          }

          if ( trim( $parsed ) == '' )
              return $query; // prefix was the whole query

          return $parsed;
      }

      /**
       * Make a list of searchable namespaces and their canonical names.
       * Only namespaces with an index >= NS_MAIN are included.
       *
       * @return array namespace index => name
       */
      public static function searchableNamespaces() {
          global $wgContLang;
          $arr = array();
          foreach ( $wgContLang->getNamespaces() as $ns => $name ) {
              if ( $ns >= NS_MAIN ) {
                  $arr[$ns] = $name;
              }
          }
          return $arr;
      }

      /**
       * Extract default namespaces to search from the given user's
       * settings, returning a list of index numbers.
       *
       * @param User $user
       * @return array
       * @static
       */
      public static function userNamespaces( &$user ) {
          $arr = array();
          foreach ( SearchEngine::searchableNamespaces() as $ns => $name ) {
              if ( $user->getOption( 'searchNs' . $ns ) ) {
                  $arr[] = $ns;
              }
          }
          return $arr;
      }

      /**
       * Find snippet highlight settings for a given user.
       * The user's own options are currently ignored; both values are
       * hard-coded.
       *
       * @param User $user
       * @return array contextlines, contextchars
       * @static
       */
      public static function userHighlightPrefs( &$user ) {
          //$contextlines = $user->getOption( 'contextlines', 5 );
          //$contextchars = $user->getOption( 'contextchars', 50 );
          $contextlines = 2; // Hardcode this. Old defaults sucked. :)
          $contextchars = 75; // same as above.... :P
          return array( $contextlines, $contextchars );
      }

      /**
       * An array of namespaces indexes to be searched by default
       *
       * @return array
       * @static
       */
      public static function defaultNamespaces() {
          global $wgNamespacesToBeSearchedDefault;

          return array_keys( $wgNamespacesToBeSearchedDefault, true );
      }

      /**
       * Get a list of namespace names useful for showing in tooltips
       * and preferences. Empty names are replaced by the 'blanknamespace'
       * message.
       *
       * @param array $namespaces namespace index numbers
       * @return array formatted names, keyed like the input
       */
      public static function namespacesAsText( $namespaces ) {
          global $wgContLang;

          $formatted = array_map( array( $wgContLang, 'getFormattedNsText' ), $namespaces );
          foreach ( $formatted as $key => $ns ) {
              if ( empty( $ns ) )
                  $formatted[$key] = wfMsg( 'blanknamespace' );
          }
          return $formatted;
      }

      /**
       * An array of "project" namespaces indexes typically searched
       * by logged-in users
       *
       * @return array
       * @static
       */
      public static function projectNamespaces() {
          global $wgNamespacesToBeSearchedProject;

          return array_keys( $wgNamespacesToBeSearchedProject, true );
      }

      /**
       * An array of "project" namespaces indexes typically searched
       * by logged-in users in addition to the default namespaces
       *
       * A namespace is included when it is enabled in either configuration
       * array. Disabled entries are dropped from each array before the two
       * are combined, so an explicit false in one array cannot hide a true
       * in the other.
       *
       * @return array
       * @static
       */
      public static function defaultAndProjectNamespaces() {
          global $wgNamespacesToBeSearchedDefault, $wgNamespacesToBeSearchedProject;

          return array_keys(
              array_filter( $wgNamespacesToBeSearchedDefault ) +
              array_filter( $wgNamespacesToBeSearchedProject )
          );
      }

      /**
       * Return a 'cleaned up' search string: every character outside
       * legalSearchChars() becomes a space, then the result is trimmed.
       *
       * @param string $text
       * @return string
       * @access public
       */
      function filter( $text ) {
          $lc = $this->legalSearchChars();
          return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
      }

      /**
       * Load up the appropriate search engine class for the currently
       * active database backend, and return a configured instance.
       *
       * $wgSearchType, when set, overrides the class suggested by the
       * database connection.
       *
       * @return SearchEngine
       */
      public static function create() {
          global $wgSearchType;
          $dbr = wfGetDB( DB_SLAVE );
          if ( $wgSearchType ) {
              $class = $wgSearchType;
          } else {
              $class = $dbr->getSearchEngine();
          }
          $search = new $class( $dbr );
          // Limit and offset are both reset to 0 here.
          // NOTE(review): presumably callers set a real limit afterwards - confirm.
          $search->setLimitOffset( 0, 0 );
          return $search;
      }

      /**
       * Create or update the search index record for the given page.
       * Title and text should be pre-processed.
       *
       * @param int $id
       * @param string $title
       * @param string $text
       * @abstract
       */
      function update( $id, $title, $text ) {
          // no-op
      }

      /**
       * Update a search index record's title only.
       * Title should be pre-processed.
       *
       * @param int $id
       * @param string $title
       * @abstract
       */
      function updateTitle( $id, $title ) {
          // no-op
      }

      /**
       * Get OpenSearch suggestion template
       *
       * Returns $wgOpenSearchTemplate when configured; otherwise builds an
       * api.php URL restricted to the default namespaces.
       *
       * @return string
       * @static
       */
      public static function getOpenSearchTemplate() {
          global $wgOpenSearchTemplate, $wgServer, $wgScriptPath;

          if ( $wgOpenSearchTemplate ) {
              return $wgOpenSearchTemplate;
          }

          $ns = implode( '|', SearchEngine::defaultNamespaces() );
          if ( !$ns ) $ns = "0";
          return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace=' . $ns;
      }

      /**
       * Get internal MediaWiki Suggest template
       *
       * Returns $wgMWSuggestTemplate when configured; otherwise builds an
       * api.php URL with a {namespaces} placeholder.
       *
       * @return string
       * @static
       */
      public static function getMWSuggestTemplate() {
          global $wgMWSuggestTemplate, $wgServer, $wgScriptPath;

          if ( $wgMWSuggestTemplate )
              return $wgMWSuggestTemplate;

          return $wgServer . $wgScriptPath . '/api.php?action=opensearch&search={searchTerms}&namespace={namespaces}&suggest';
      }
  }
  /**
   * Default (empty) result set. Every accessor in this class reports
   * "no results"; backends are expected to override what they support.
   *
   * @ingroup Search
   */
  class SearchResultSet {
      /**
       * Regular expression fragments matching the search terms as this
       * engine parsed them, for use on a text extract.
       *
       * @return array empty in this base class
       * @access public
       * @abstract
       */
      public function termMatches() {
          return array();
      }

      /**
       * @return int number of rows in this set; 0 in this base class
       */
      public function numRows() {
          return 0;
      }

      /**
       * Whether this result set contains any results.
       *
       * @return bool false in this base class
       * @abstract
       */
      public function hasResults() {
          return false;
      }

      /**
       * Total hit count for the query across the whole article database,
       * for search modes that provide one. The figure may include pages in
       * namespaces that the current settings would not match.
       *
       * @return int null when no total is supported (as here)
       * @access public
       */
      public function getTotalHits() {
          return null;
      }

      /**
       * Whether the search mode produced a suggested alternate term
       * (typically when there are no exact hits).
       *
       * @return bool false in this base class
       * @access public
       */
      public function hasSuggestion() {
          return false;
      }

      /**
       * @return string suggested query, null if none
       */
      public function getSuggestionQuery() {
          return null;
      }

      /**
       * @return string HTML highlighted suggested query, '' if none
       */
      public function getSuggestionSnippet() {
          return '';
      }

      /**
       * Diagnostic information about how and from where the results were
       * fetched.
       *
       * @return string null in this base class
       */
      public function getInfo() {
          return null;
      }

      /**
       * Result set of hits on other (multiple) wikis associated with this one.
       *
       * @return SearchResultSet null in this base class
       */
      public function getInterwikiResults() {
          return null;
      }

      /**
       * Check if there are results on other wikis, i.e. whether
       * getInterwikiResults() yields something that is not loosely null.
       *
       * @return boolean
       */
      public function hasInterwikiResults() {
          $interwiki = $this->getInterwikiResults();
          return $interwiki != null;
      }

      /**
       * Fetch the next search result.
       *
       * @return SearchResult false when there is none (always, here)
       * @access public
       * @abstract
       */
      public function next() {
          return false;
      }

      /**
       * Free the result set, if applicable. Nothing to release here.
       *
       * @access public
       */
      public function free() {
          // ...
      }
  }
  /**
   * Marker class with no members.
   * Some search engines may bail out if too many matches are found.
   *
   * @ingroup Search
   */
  class SearchResultTooMany {
  }
  /**
   * A single search hit, built from a database row carrying page_namespace
   * and page_title.
   *
   * @fixme This class is horribly factored. It would probably be better to have
   * a useful base class to which you pass some standard information, then let
   * the fancy self-highlighters extend that.
   * @ingroup Search
   */
  class SearchResult {
      /** Revision loaded for the title, or null. */
      var $mRevision = null;
      /** File object for hits in NS_FILE, or null. */
      var $mImage = null;
      /** Title of the hit; declared explicitly rather than created dynamically. */
      var $mTitle = null;
      /** Cached article text; stays null until initText() fills it. */
      var $mText = null;

      /**
       * @param object $row row with page_namespace and page_title members
       */
      function __construct( $row ) {
          $this->mTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
          if ( !is_null( $this->mTitle ) ) {
              $this->mRevision = Revision::newFromTitle( $this->mTitle );
              if ( $this->mTitle->getNamespace() === NS_FILE )
                  $this->mImage = wfFindFile( $this->mTitle );
          }
      }

      /**
       * Check if this result points to an invalid title
       *
       * @return boolean
       * @access public
       */
      function isBrokenTitle() {
          return is_null( $this->mTitle );
      }

      /**
       * Check if target page is missing, happens when index is out of date
       *
       * @return boolean true when neither a revision nor a file was found
       * @access public
       */
      function isMissingRevision() {
          return !$this->mRevision && !$this->mImage;
      }

      /**
       * @return Title
       * @access public
       */
      function getTitle() {
          return $this->mTitle;
      }

      /**
       * @return double or null if not supported
       */
      function getScore() {
          return null;
      }

      /**
       * Lazy initialization of article text from DB.
       * Text is taken from the revision when there is one, otherwise it is
       * set to the empty string.
       */
      protected function initText() {
          if ( !isset( $this->mText ) ) {
              if ( $this->mRevision != null )
                  $this->mText = $this->mRevision->getText();
              else // TODO: can we fetch raw wikitext for commons images?
                  $this->mText = '';
          }
      }

      /**
       * @param array $terms terms to highlight
       * @return string highlighted text snippet, null (and not '') if not supported
       */
      function getTextSnippet( $terms ) {
          global $wgUser, $wgAdvancedSearchHighlighting;

          $this->initText();
          list( $contextlines, $contextchars ) = SearchEngine::userHighlightPrefs( $wgUser );
          $h = new SearchHighlighter();
          if ( $wgAdvancedSearchHighlighting )
              return $h->highlightText( $this->mText, $terms, $contextlines, $contextchars );
          else
              return $h->highlightSimple( $this->mText, $terms, $contextlines, $contextchars );
      }

      /**
       * @param array $terms terms to highlight
       * @return string highlighted title, '' if not supported
       */
      function getTitleSnippet( $terms ) {
          return '';
      }

      /**
       * @param array $terms terms to highlight
       * @return string highlighted redirect name (redirect to this page), '' if none or not supported
       */
      function getRedirectSnippet( $terms ) {
          return '';
      }

      /**
       * @return Title object for the redirect to this page, null if none or not supported
       */
      function getRedirectTitle() {
          return null;
      }

      /**
       * @return string highlighted relevant section name, '' if none or not supported
       */
      function getSectionSnippet() {
          return '';
      }

      /**
       * @return Title object (pagename+fragment) for the section, null if none or not supported
       */
      function getSectionTitle() {
          return null;
      }

      /**
       * @return string timestamp of the revision, else of the file, else ''
       */
      function getTimestamp() {
          if ( $this->mRevision )
              return $this->mRevision->getTimestamp();
          else if ( $this->mImage )
              return $this->mImage->getTimestamp();
          return '';
      }

      /**
       * @return int number of words, as counted by str_word_count()
       */
      function getWordCount() {
          $this->initText();
          return str_word_count( $this->mText );
      }

      /**
       * @return int size in bytes
       */
      function getByteSize() {
          $this->initText();
          return strlen( $this->mText );
      }

      /**
       * @return boolean if hit has related articles
       */
      function hasRelated() {
          return false;
      }

      /**
       * @return interwiki prefix of the title (return iw even if title is broken)
       */
      function getInterwikiPrefix() {
          return '';
      }
  }
  620. /**
  621. * Highlight bits of wikitext
  622. *
  623. * @ingroup Search
  624. */
  625. class SearchHighlighter {
  626. var $mCleanWikitext = true;
  /**
   * Uses __construct rather than a class-named method: PHP 8 no longer
   * treats the latter as a constructor, so the argument would be ignored.
   *
   * @param bool $cleanupWikitext stored in $mCleanWikitext; splitAndAdd()
   *   runs text through removeWiki() only when it is true
   */
  function __construct( $cleanupWikitext = true ) {
      $this->mCleanWikitext = $cleanupWikitext;
  }
  /**
   * Default implementation of wikitext highlighting
   *
   * The text is first split into plain-text lines and "other" lines
   * (templates, file links, tables and, when wfCite exists, <ref> blocks).
   * Snippets are then picked around term matches, padded to a roughly
   * constant size, joined, and the terms wrapped in
   * <span class='searchmatch'>.
   *
   * NOTE(review): the snippet text is not HTML-escaped in this method (the
   * htmlspecialchars call is commented out); confirm that removeWiki() or
   * the caller takes care of escaping.
   *
   * @param string $text
   * @param array $terms Terms to highlight; they are inserted into regular
   *   expressions without quoting. May be empty.
   * @param int $contextlines
   * @param int $contextchars
   * @return string
   */
  public function highlightText( $text, $terms, $contextlines, $contextchars ) {
      global $wgContLang, $wgSearchHighlightBoundaries;
      $fname = __METHOD__;

      if ( $text == '' )
          return '';

      // split text into text + templates/links/tables
      $spat = "/(\\{\\{)|(\\[\\[[^\\]:]+:)|(\n\\{\\|)";
      // first capture group is for detecting nested templates/links/tables/references
      $endPatterns = array(
          1 => '/(\{\{)|(\}\})/', // template
          2 => '/(\[\[)|(\]\])/', // image
          3 => "/(\n\\{\\|)|(\n\\|\\})/" ); // table

      // FIXME: this should prolly be a hook or something
      if ( function_exists( 'wfCite' ) ) {
          $spat .= '|(<ref>)'; // references via cite extension
          $endPatterns[4] = '/(<ref>)|(<\/ref>)/';
      }
      $spat .= '/';
      $textExt = array(); // text extracts
      $otherExt = array(); // other extracts
      wfProfileIn( "$fname-split" );
      $start = 0;
      $textLen = strlen( $text );
      $count = 0; // sequence number to maintain ordering
      while ( $start < $textLen ) {
          // find start of template/image/table
          if ( preg_match( $spat, $text, $matches, PREG_OFFSET_CAPTURE, $start ) ) {
              $epat = '';
              foreach ( $matches as $key => $val ) {
                  if ( $key > 0 && $val[1] != -1 ) {
                      if ( $key == 2 ) {
                          // see if this is an image link
                          $ns = substr( $val[0], 2, -1 );
                          if ( $wgContLang->getNsIndex( $ns ) != NS_FILE )
                              break;
                      }
                      $epat = $endPatterns[$key];
                      $this->splitAndAdd( $textExt, $count, substr( $text, $start, $val[1] - $start ) );
                      $start = $val[1];
                      break;
                  }
              }
              if ( $epat ) {
                  // find end (and detect any nested elements)
                  $level = 0;
                  $offset = $start + 1;
                  $found = false;
                  while ( preg_match( $epat, $text, $endMatches, PREG_OFFSET_CAPTURE, $offset ) ) {
                      if ( array_key_exists( 2, $endMatches ) ) {
                          // found end
                          if ( $level == 0 ) {
                              $len = strlen( $endMatches[2][0] );
                              $off = $endMatches[2][1];
                              $this->splitAndAdd( $otherExt, $count,
                                  substr( $text, $start, $off + $len - $start ) );
                              $start = $off + $len;
                              $found = true;
                              break;
                          } else {
                              // end of nested element
                              $level -= 1;
                          }
                      } else {
                          // nested
                          $level += 1;
                      }
                      $offset = $endMatches[0][1] + strlen( $endMatches[0][0] );
                  }
                  if ( ! $found ) {
                      // couldn't find appropriate closing tag, skip
                      $this->splitAndAdd( $textExt, $count, substr( $text, $start, strlen( $matches[0][0] ) ) );
                      $start += strlen( $matches[0][0] );
                  }
                  continue;
              }
          }
          // else: add as text extract
          $this->splitAndAdd( $textExt, $count, substr( $text, $start ) );
          break;
      }

      $all = $textExt + $otherExt; // these have disjunct key sets

      wfProfileOut( "$fname-split" );

      // prepare regexps
      foreach ( $terms as $index => $term ) {
          // manually do upper/lowercase stuff for utf-8 since PHP won't do it
          if ( preg_match( '/[\x80-\xff]/', $term ) ) {
              $terms[$index] = preg_replace_callback( '/./us', array( $this, 'caseCallback' ), $terms[$index] );
          } else {
              $terms[$index] = $term;
          }
      }
      $anyterm = implode( '|', $terms );
      $phrase = implode( "$wgSearchHighlightBoundaries+", $terms );

      // FIXME: a hack to scale contextchars, a correct solution
      // would be to have contextchars actually be char and not byte
      // length, and do proper utf-8 substrings and lengths everywhere,
      // but PHP is making that very hard and unclean to implement :(
      // With no terms $anyterm is empty; fall back to a scale of 1 instead
      // of dividing by zero.
      $anytermChars = mb_strlen( $anyterm );
      $scale = $anytermChars > 0 ? strlen( $anyterm ) / $anytermChars : 1;
      $contextchars = intval( $contextchars * $scale );

      $patPre = "(^|$wgSearchHighlightBoundaries)";
      $patPost = "($wgSearchHighlightBoundaries|$)";

      $pat1 = "/(" . $phrase . ")/ui";
      $pat2 = "/$patPre(" . $anyterm . ")$patPost/ui";

      wfProfileIn( "$fname-extract" );

      $left = $contextlines;

      $snippets = array();
      $offsets = array();

      // show beginning only if it contains all words
      $first = 0;
      $firstText = '';
      foreach ( $textExt as $index => $line ) {
          if ( strlen( $line ) > 0 && $line[0] != ';' && $line[0] != ':' ) {
              $firstText = $this->extract( $line, 0, $contextchars * $contextlines );
              $first = $index;
              break;
          }
      }
      if ( $firstText ) {
          $succ = true;
          // check if first text contains all terms
          foreach ( $terms as $term ) {
              if ( ! preg_match( "/$patPre" . $term . "$patPost/ui", $firstText ) ) {
                  $succ = false;
                  break;
              }
          }
          if ( $succ ) {
              $snippets[$first] = $firstText;
              $offsets[$first] = 0;
          }
      }
      if ( ! $snippets ) {
          // match whole query on text
          $this->process( $pat1, $textExt, $left, $contextchars, $snippets, $offsets );
          // match whole query on templates/tables/images
          $this->process( $pat1, $otherExt, $left, $contextchars, $snippets, $offsets );
          // match any words on text
          $this->process( $pat2, $textExt, $left, $contextchars, $snippets, $offsets );
          // match any words on templates/tables/images
          $this->process( $pat2, $otherExt, $left, $contextchars, $snippets, $offsets );

          ksort( $snippets );
      }

      // add extra chars to each snippet to make snippets constant size
      $extended = array();
      if ( count( $snippets ) == 0 ) {
          // couldn't find the target words, just show beginning of article
          $targetchars = $contextchars * $contextlines;
          $snippets[$first] = '';
          $offsets[$first] = 0;
      } else {
          // if begin of the article contains the whole phrase, show only that !!
          if ( array_key_exists( $first, $snippets ) && preg_match( $pat1, $snippets[$first] )
              && $offsets[$first] < $contextchars * 2 ) {
              $snippets = array ( $first => $snippets[$first] );
          }

          // calc by how much to extend existing snippets
          $targetchars = intval( ( $contextchars * $contextlines ) / count ( $snippets ) );
      }

      foreach ( $snippets as $index => $line ) {
          $extended[$index] = $line;
          $len = strlen( $line );
          if ( $len < $targetchars - 20 ) {
              // complete this line; $all[$index] is absent when the text
              // produced no extracts at all, hence the isset()
              if ( isset( $all[$index] ) && $len < strlen( $all[$index] ) ) {
                  $extended[$index] = $this->extract( $all[$index], $offsets[$index], $offsets[$index] + $targetchars, $offsets[$index] );
                  $len = strlen( $extended[$index] );
              }

              // add more lines
              $add = $index + 1;
              while ( $len < $targetchars - 20
                  && array_key_exists( $add, $all )
                  && !array_key_exists( $add, $snippets ) ) {
                  $offsets[$add] = 0;
                  $tt = "\n" . $this->extract( $all[$add], 0, $targetchars - $len, $offsets[$add] );
                  $extended[$add] = $tt;
                  $len += strlen( $tt );
                  $add++;
              }
          }
      }

      //$snippets = array_map('htmlspecialchars', $extended);
      $snippets = $extended;
      $last = -1;
      $extract = '';
      foreach ( $snippets as $index => $line ) {
          if ( $last == -1 )
              $extract .= $line; // first line
          elseif ( $last + 1 == $index && $offsets[$last] + strlen( $snippets[$last] ) >= strlen( $all[$last] ) )
              $extract .= " " . $line; // continous lines
          else
              $extract .= '<b> ... </b>' . $line;

          $last = $index;
      }
      if ( $extract )
          $extract .= '<b> ... </b>';

      $processed = array();
      foreach ( $terms as $term ) {
          if ( ! isset( $processed[$term] ) ) {
              $pat3 = "/$patPre(" . $term . ")$patPost/ui"; // highlight word
              $extract = preg_replace( $pat3,
                  "\\1<span class='searchmatch'>\\2</span>\\3", $extract );
              $processed[$term] = true;
          }
      }

      wfProfileOut( "$fname-extract" );

      return $extract;
  }
  /**
   * Split text into lines and add it to extracts array
   *
   * Each line is trimmed and lines that are empty after trimming are
   * skipped. The check is against the empty string, so a line consisting
   * only of "0" is kept. When $mCleanWikitext is set, the text goes through
   * removeWiki() before it is split.
   *
   * @param array $extracts index -> $line (appended to, by reference)
   * @param int $count next index to use; incremented for every line added
   * @param string $text
   */
  function splitAndAdd( &$extracts, &$count, $text ) {
      $split = explode( "\n", $this->mCleanWikitext ? $this->removeWiki( $text ) : $text );
      foreach ( $split as $line ) {
          $tt = trim( $line );
          if ( $tt !== '' )
              $extracts[$count++] = $tt;
      }
  }
  /**
   * Do manual case conversion for non-ascii chars
   *
   * preg_replace_callback handler: a multi-byte match is turned into a
   * character class holding its lower- and upper-case forms; a single-byte
   * match is returned untouched.
   *
   * @param array $matches match array; only element 0 is used
   * @return string
   */
  function caseCallback( $matches ) {
      global $wgContLang;

      $char = $matches[0];
      if ( strlen( $char ) <= 1 ) {
          return $char;
      }

      $lower = $wgContLang->lc( $char );
      $upper = $wgContLang->uc( $char );
      return '[' . $lower . $upper . ']';
  }
  /**
   * Extract part of the text from start to end, but by
   * not chopping up words.
   *
   * A non-zero $start is moved to one byte past the position reported by
   * position(); an $end inside the text is moved to the position reported
   * by position(), and an $end at or past the end of the text is clamped
   * to the text length.
   *
   * The out parameters are only written when the variable passed by the
   * caller is not null, so callers must initialise them (e.g. to 0) to
   * receive a value.
   *
   * @param string $text
   * @param int $start requested start offset in bytes
   * @param int $end requested end offset in bytes
   * @param int $posStart (out) actual start position
   * @param int $posEnd (out) actual end position
   * @return string the extracted text, or '' if the adjusted range is empty
   */
  function extract($text, $start, $end, &$posStart = null, &$posEnd = null ){
      $length = strlen($text);
      if( $start != 0 )
          $start = $this->position( $text, $start, 1 );
      $end = ( $end >= $length ) ? $length : $this->position( $text, $end );
      if( !is_null($posStart) )
          $posStart = $start;
      if( !is_null($posEnd) )
          $posEnd = $end;
      return ( $end > $start ) ? substr($text, $start, $end - $start) : '';
  }
  /**
   * Find a nonletter near a point (index) in the text.
   *
   * The text is searched in a window of 10 bytes on either side of
   * $point, and the index of the first nonletter (space or ASCII
   * punctuation) in that window is returned, with $offset added. If the
   * window holds no nonletter, $point itself is returned after being
   * advanced past any UTF-8 continuation bytes, so the result never lands
   * in the middle of a multi-byte character; $offset is not applied in
   * that case. A $point at or past the end of the text yields the text
   * length.
   *
   * @param string $text
   * @param int $point byte index to search around
   * @param int $offset added to the found index
   * @return int index of a nonletter in the window, or beginning of utf8 char if none
   */
  function position($text, $point, $offset=0 ){
      $tolerance = 10;
      $length = strlen($text);
      $s = max( 0, $point - $tolerance );
      $l = min( $length, $point + $tolerance ) - $s;
      $m = array();
      if( preg_match('/[ ,.!?~!@#$%^&*\(\)+=\-\\\|\[\]"\'<>]/', substr($text,$s,$l), $m, PREG_OFFSET_CAPTURE ) ){
          return $m[0][1] + $s + $offset;
      }
      // Nothing to index at or past the end of the text (this also covers
      // empty text); reading $text[$point] there would raise a warning.
      if( $point >= $length )
          return $length;
      // check if point is on a valid first UTF8 char
      $char = ord( $text[$point] );
      while( $char >= 0x80 && $char < 0xc0 ) {
          // skip trailing bytes
          $point++;
          if( $point >= $length )
              return $length;
          $char = ord( $text[$point] );
      }
      return $point;
  }
  /**
   * Search extracts for a pattern, and return snippets.
   *
   * Lines already present in $out are skipped. For each remaining line
   * that matches, a window of $contextchars bytes is chosen: it starts at
   * 0 when the match ends before $contextchars, at the match itself when
   * the match is longer than $contextchars, and otherwise so that the
   * match sits in the middle of the window. The window is cut out with
   * extract() and stored in $out, its actual start is stored in $offsets,
   * and $linesleft is decremented. Processing stops once $linesleft
   * reaches 0.
   *
   * @param string $pattern regexp for matching lines
   * @param array $extracts extracts to search
   * @param int $linesleft number of extracts to make
   * @param int $contextchars length of snippet
   * @param array $out map for highlighted snippets
   * @param array $offsets map of starting points of snippets
   * @protected
   */
  function process( $pattern, $extracts, &$linesleft, &$contextchars, &$out, &$offsets ){
      if( $linesleft == 0 )
          return; // nothing to do
      foreach( $extracts as $index => $line ){
          $match = array();
          // skip lines that are already highlighted or do not match
          if( array_key_exists($index,$out)
              || !preg_match( $pattern, $line, $match, PREG_OFFSET_CAPTURE ) )
              continue;

          list( $matched, $matchStart ) = $match[0];
          $matchLen = strlen($matched);
          if( $matchStart + $matchLen < $contextchars ){
              $begin = 0;
          } elseif( $matchLen > $contextchars ){
              $begin = $matchStart;
          } else {
              $begin = $matchStart + intval( ($matchLen - $contextchars) / 2 );
          }

          // basic snippet from this line; extract() reports where it really starts
          $actualBegin = $begin;
          $out[$index] = $this->extract( $line, $begin, $begin + $contextchars, $actualBegin );
          $offsets[$index] = $actualBegin;

          if( --$linesleft == 0 )
              return;
      }
  }
  /**
   * Basic wikitext removal.
   *
   * Applies, in order: removal of {{...}} without a pipe; replacement of
   * {{a|b}} by the part after the first pipe; replacement of [[x]] by x;
   * replacement of [[target|caption]] through linkReplace(); removal of
   * anything that looks like a tag; removal of the quote markup ''''',
   * ''' and '' and of i/u/b tags.
   *
   * @param string $text
   * @return string
   * @protected
   */
  function removeWiki($text) {
      $fname = __METHOD__;
      wfProfileIn( $fname );

      // braces and unpiped links: pattern => replacement, applied in order
      $beforeLinks = array(
          array( "/\\{\\{([^|]+?)\\}\\}/", "" ),
          array( "/\\{\\{([^|]+\\|)(.*?)\\}\\}/", "\\2" ),
          array( "/\\[\\[([^|]+?)\\]\\]/", "\\1" ),
      );
      foreach( $beforeLinks as $rule ){
          $text = preg_replace( $rule[0], $rule[1], $text );
      }

      // piped links need the callback to decide what to keep
      $text = preg_replace_callback("/\\[\\[([^|]+\\|)(.*?)\\]\\]/", array($this,'linkReplace'), $text);

      // tags and quote markup, all simply removed
      $stripped = array(
          "/<\/?[^>]+>/",
          "/'''''/",
          "/('''|<\/?[iIuUbB]>)/",
          "/''/",
      );
      foreach( $stripped as $pattern ){
          $text = preg_replace( $pattern, "", $text );
      }

      wfProfileOut( $fname );
      return $text;
  }
  /**
   * Callback to replace [[target|caption]] kind of links. If the target
   * is in the file or category namespace the whole link is left as is,
   * otherwise it is replaced with the caption.
   *
   * @param array $matches 0 = whole link, 1 = target part, 2 = caption
   * @return string
   */
  function linkReplace($matches){
      $target = $matches[1];
      $colon = strpos( $target, ':' );
      if( $colon !== false ){
          global $wgContLang;
          // look up the namespace named by the text before the colon
          $index = $wgContLang->getNsIndex( substr( $target, 0, $colon ) );
          if( $index !== false && ($index == NS_FILE || $index == NS_CATEGORY) )
              return $matches[0]; // return the whole thing
      }
      return $matches[2]; // replace with caption
  }
  /**
   * Simple & fast snippet extraction, but gives completely irrelevant
   * snippets.
   *
   * The text is scanned line by line. Every line matching one of the
   * terms contributes one snippet: the text before the match and after
   * the match is truncated by $wgContLang to $contextchars, the result is
   * HTML-escaped, and every term occurrence in it is wrapped in a
   * searchmatch span. At most $contextlines snippets are produced.
   *
   * NOTE(review): the terms are joined with '|' and inserted into the
   * regexps unescaped, so they appear to be expected to be regexp
   * fragments already -- confirm against callers.
   *
   * @param string $text
   * @param array $terms
   * @param int $contextlines maximum number of matching lines to use
   * @param int $contextchars number of context chars kept around a match
   * @return string snippets, each terminated by a newline
   */
  public function highlightSimple( $text, $terms, $contextlines, $contextchars ) {
      global $wgContLang;
      $fname = __METHOD__;

      $lines = explode( "\n", $text );
      $terms = implode( '|', $terms );
      $max = intval( $contextchars ) + 1;

      $pat1 = "/(.*)($terms)(.{0,$max})/i";
      // the highlight pattern does not depend on the line, build it once
      $pat2 = '/(' . $terms . ")/i";

      $extract = "";
      wfProfileIn( "$fname-extract" );
      foreach ( $lines as $line ) {
          if ( 0 == $contextlines ) {
              break;
          }
          $m = array();
          if ( ! preg_match( $pat1, $line, $m ) ) {
              continue;
          }
          --$contextlines;
          $pre = $wgContLang->truncate( $m[1], -$contextchars );
          // Test for the trailing group itself: a check on count( $m )
          // against 3 would still let a missing $m[3] be read.
          $post = isset( $m[3] ) ? $wgContLang->truncate( $m[3], $contextchars ) : '';
          $found = $m[2];

          $line = htmlspecialchars( $pre . $found . $post );
          $line = preg_replace( $pat2,
              "<span class='searchmatch'>\\1</span>", $line );

          $extract .= $line . "\n";
      }
      wfProfileOut( "$fname-extract" );

      return $extract;
  }
  1056. }
  /**
   * Placeholder search engine, to be used when the database engine in use
   * is not supported. It adds nothing on top of SearchEngine.
   *
   * @todo The dummy class should probably try something at least mildly
   *   useful, such as a LIKE search through titles.
   * @ingroup Search
   */
  class SearchEngineDummy extends SearchEngine {
      // intentionally empty: everything is inherited from SearchEngine
  }