LanguageConverter.php 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165
  1. <?php
  2. /**
  3. * This program is free software; you can redistribute it and/or modify
  4. * it under the terms of the GNU General Public License as published by
  5. * the Free Software Foundation; either version 2 of the License, or
  6. * (at your option) any later version.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public License along
  14. * with this program; if not, write to the Free Software Foundation, Inc.,
  15. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16. * http://www.gnu.org/copyleft/gpl.html
  17. *
  18. * @file
  19. * @ingroup Language
  20. */
  21. use MediaWiki\Logger\LoggerFactory;
  22. /**
  23. * Base class for language conversion.
  24. * @ingroup Language
  25. *
  26. * @author Zhengzhu Feng <zhengzhu@gmail.com>
  27. * @author fdcn <fdcn64@gmail.com>
  28. * @author shinjiman <shinjiman@gmail.com>
  29. * @author PhiLiP <philip.npc@gmail.com>
  30. */
  31. class LanguageConverter {
  32. /**
  33. * languages supporting variants
  34. * @since 1.20
  35. * @var array
  36. */
  37. static public $languagesWithVariants = [
  38. 'gan',
  39. 'iu',
  40. 'kk',
  41. 'ku',
  42. 'shi',
  43. 'sr',
  44. 'tg',
  45. 'uz',
  46. 'zh',
  47. ];
  48. public $mMainLanguageCode;
  49. public $mVariants, $mVariantFallbacks, $mVariantNames;
  50. public $mTablesLoaded = false;
  51. public $mTables;
  52. // 'bidirectional' 'unidirectional' 'disable' for each variant
  53. public $mManualLevel;
  54. /**
  55. * @var string Memcached key name
  56. */
  57. public $mCacheKey;
  58. public $mLangObj;
  59. public $mFlags;
  60. public $mDescCodeSep = ':', $mDescVarSep = ';';
  61. public $mUcfirst = false;
  62. public $mConvRuleTitle = false;
  63. public $mURLVariant;
  64. public $mUserVariant;
  65. public $mHeaderVariant;
  66. public $mMaxDepth = 10;
  67. public $mVarSeparatorPattern;
  68. const CACHE_VERSION_KEY = 'VERSION 7';
  69. /**
  70. * Constructor
  71. *
  72. * @param Language $langobj
  73. * @param string $maincode The main language code of this language
  74. * @param array $variants The supported variants of this language
  75. * @param array $variantfallbacks The fallback language of each variant
  76. * @param array $flags Defining the custom strings that maps to the flags
  77. * @param array $manualLevel Limit for supported variants
  78. */
  public function __construct( $langobj, $maincode, $variants = [],
      $variantfallbacks = [], $flags = [],
      $manualLevel = [] ) {
      global $wgDisabledVariants;

      $this->mLangObj = $langobj;
      $this->mMainLanguageCode = $maincode;
      // Variants disabled site-wide are dropped up front.
      $this->mVariants = array_diff( $variants, $wgDisabledVariants );
      $this->mVariantFallbacks = $variantfallbacks;
      $this->mVariantNames = Language::fetchLanguageNames();
      $this->mCacheKey = wfMemcKey( 'conversiontables', $maincode );

      // Flags 'S' (show converted text), '+' (add rules for all text) and
      // 'E' (the given flags are erroneous) are reserved for internal use
      // and therefore do not appear in this map.
      $builtinFlags = [
          'A' => 'A', // add rule for convert code (all text convert)
          'T' => 'T', // title convert
          'R' => 'R', // raw content
          'D' => 'D', // convert description (subclass implement)
          '-' => '-', // remove convert (not implement)
          'H' => 'H', // add rule for convert code (but no display in placed code)
          'N' => 'N' // current variant name
      ];
      // Caller-supplied flags override the built-in ones on key collision.
      $this->mFlags = array_merge( $builtinFlags, $flags );

      foreach ( $this->mVariants as $code ) {
          // Variants without an explicit manual level are 'bidirectional'.
          $this->mManualLevel[$code] = array_key_exists( $code, $manualLevel )
              ? $manualLevel[$code]
              : 'bidirectional';
          // Every variant code is itself a valid flag.
          $this->mFlags[$code] = $code;
      }
  }
  112. /**
  113. * Get all valid variants.
  114. * Call this instead of using $this->mVariants directly.
  115. *
  116. * @return array Contains all valid variants
  117. */
  public function getVariants() {
      // mVariants is the constructor's variant list minus $wgDisabledVariants.
      $validVariants = $this->mVariants;
      return $validVariants;
  }
  121. /**
  122. * In case some variant is not defined in the markup, we need
  123. * to have some fallback. For example, in zh, normally people
  124. * will define zh-hans and zh-hant, but less so for zh-sg or zh-hk.
  125. * when zh-sg is preferred but not defined, we will pick zh-hans
  126. * in this case. Right now this is only used by zh.
  127. *
  128. * @param string $variant The language code of the variant
  129. * @return string|array The code of the fallback language or the
  130. * main code if there is no fallback
  131. */
  public function getVariantFallbacks( $variant ) {
      // A variant with no configured fallback falls back to the main language code.
      return isset( $this->mVariantFallbacks[$variant] )
          ? $this->mVariantFallbacks[$variant]
          : $this->mMainLanguageCode;
  }
  138. /**
  139. * Get the title produced by the conversion rule.
  140. * @return string The converted title text
  141. */
  public function getConvRuleTitle() {
      // Stays false until a manual conversion rule has supplied a title.
      $ruleTitle = $this->mConvRuleTitle;
      return $ruleTitle;
  }
  145. /**
  146. * Get preferred language variant.
  147. * @return string The preferred language code
  148. */
  public function getPreferredVariant() {
      global $wgDefaultLanguageVariant, $wgUser;

      // Sources are consulted in order of priority: the URL first, then the
      // user preference (logged-in users) or the Accept-Language header
      // (everybody else), and finally the site-wide default variant.
      $candidate = $this->getURLVariant();

      if ( $wgUser->isSafeToLoad() && $wgUser->isLoggedIn() && !$candidate ) {
          $candidate = $this->getUserVariant();
      } elseif ( !$candidate ) {
          $candidate = $this->getHeaderVariant();
      }

      if ( !$candidate && $wgDefaultLanguageVariant ) {
          $candidate = $this->validateVariant( $wgDefaultLanguageVariant );
      }

      // Unlike the other get*Variant functions, the result is deliberately
      // not memoized: new information might appear during processing after
      // the first call.
      return $this->validateVariant( $candidate )
          ? $candidate
          : $this->mMainLanguageCode;
  }
  169. /**
  170. * Get default variant.
  171. * This function would not be affected by user's settings
  172. * @return string The default variant code
  173. */
  public function getDefaultVariant() {
      global $wgDefaultLanguageVariant;

      // URL variant wins; otherwise fall back to the Accept-Language header.
      $candidate = $this->getURLVariant() ?: $this->getHeaderVariant();

      // Neither source produced a variant: try the configured site default.
      if ( !$candidate && $wgDefaultLanguageVariant ) {
          $candidate = $this->validateVariant( $wgDefaultLanguageVariant );
      }

      // Last resort is the main language code.
      return $candidate ?: $this->mMainLanguageCode;
  }
  188. /**
  189. * Validate the variant
  190. * @param string $variant The variant to validate
  191. * @return mixed Returns the variant if it is valid, null otherwise
  192. */
  public function validateVariant( $variant = null ) {
      // Strict membership test: with PHP's loose comparison a non-string
      // argument such as boolean true (or integer 0 on PHP < 8) compares
      // equal to a variant code and would wrongly be reported as valid.
      if ( $variant !== null && in_array( $variant, $this->mVariants, true ) ) {
          return $variant;
      }
      // Anything that is not one of this converter's variants is rejected.
      return null;
  }
  199. /**
  200. * Get the variant specified in the URL
  201. *
  202. * @return mixed Variant if one found, null otherwise.
  203. */
  public function getURLVariant() {
      global $wgRequest;

      // Only consult the request while no valid variant has been cached yet.
      if ( !$this->mURLVariant ) {
          // 'variant' takes precedence; 'uselang' is the fallback parameter.
          $requested = $wgRequest->getText( 'variant' );
          if ( !$requested ) {
              $requested = $wgRequest->getVal( 'uselang' );
          }
          $this->mURLVariant = $this->validateVariant( $requested );
      }

      return $this->mURLVariant;
  }
  217. /**
  218. * Determine if the user has a variant set.
  219. *
  220. * @return mixed Variant if one found; false if the user object is not safe to load, null otherwise.
  221. */
  protected function getUserVariant() {
      global $wgUser, $wgContLang;

      // The result is intentionally not served from $this->mUserVariant:
      // memoizing this function wreaks havoc on parserTest.php.

      // Don't touch stub user objects, because that causes infinite
      // recursion during initialisation.
      if ( !$wgUser->isSafeToLoad() ) {
          return false;
      }

      if ( !$wgUser->isLoggedIn() ) {
          // Figure out the user language without constructing wgLang,
          // to avoid infinite recursion.
          $optionName = 'language';
      } elseif ( $this->mMainLanguageCode == $wgContLang->getCode() ) {
          // Converter for the content language: plain 'variant' preference.
          $optionName = 'variant';
      } else {
          // Any other language has its own per-language preference key.
          $optionName = 'variant-' . $this->mMainLanguageCode;
      }

      $this->mUserVariant = $this->validateVariant( $wgUser->getOption( $optionName ) );
      return $this->mUserVariant;
  }
  250. /**
  251. * Determine the language variant from the Accept-Language header.
  252. *
  253. * @return mixed Variant if one found, null otherwise.
  254. */
  protected function getHeaderVariant() {
      global $wgRequest;

      if ( $this->mHeaderVariant ) {
          return $this->mHeaderVariant;
      }

      // Language codes announced by the client in the HTTP header.
      $acceptedLangs = array_keys( $wgRequest->getAcceptLang() );
      if ( empty( $acceptedLangs ) ) {
          return null;
      }

      // First pass: look for a header language that is itself a valid
      // variant, collecting the fallbacks of the others along the way.
      $deferred = [];
      foreach ( $acceptedLangs as $langCode ) {
          $this->mHeaderVariant = $this->validateVariant( $langCode );
          if ( $this->mHeaderVariant ) {
              return $this->mHeaderVariant;
          }

          $fallbacks = $this->getVariantFallbacks( $langCode );
          if ( is_array( $fallbacks ) ) {
              $deferred = array_merge( $deferred, $fallbacks );
          } elseif ( is_string( $fallbacks ) && $fallbacks !== $this->mMainLanguageCode ) {
              // The main language code only means "no fallback configured".
              $deferred[] = $fallbacks;
          }
      }

      // Second pass: no direct hit, so try the recorded fallbacks in order.
      foreach ( array_unique( $deferred ) as $langCode ) {
          $this->mHeaderVariant = $this->validateVariant( $langCode );
          if ( $this->mHeaderVariant ) {
              break;
          }
      }

      return $this->mHeaderVariant;
  }
  295. /**
  296. * Dictionary-based conversion.
  297. * This function would not parse the conversion rules.
  298. * If you want to parse rules, try to use convert() or
  299. * convertTo().
  300. *
  301. * @param string $text The text to be converted
  302. * @param bool|string $toVariant The target language code
  303. * @return string The converted text
  304. */
  public function autoConvert( $text, $toVariant = false ) {
      $this->loadTables();

      // Without an explicit target, use the preferred variant.
      if ( !$toVariant ) {
          $toVariant = $this->getPreferredVariant();
          if ( !$toVariant ) {
              return $text;
          }
      }

      // Text that already appears to be in the target variant is left alone.
      if ( $this->guessVariant( $text, $toVariant ) ) {
          return $text;
      }

      /* we convert everything except:
         1. HTML markups (anything between < and >)
         2. HTML entities
         3. placeholders created by the parser
         IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
         Minimize use of backtracking where possible.
      */
      $marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';

      // this one is needed when the text is inside an HTML markup
      $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';

      // Optimize for the common case where these tags have
      // few or no children. Thus try and possessively get as much as
      // possible, and only engage in backtracking when we hit a '<'.

      // disable convert to variants between <code> tags
      $codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
      // disable conversion of <script> tags
      $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
      // disable conversion of <pre> tags
      $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
      // The "|.*+)" at the end, is in case we missed some part of html syntax,
      // we will fail securely (hopefully) by matching the rest of the string.
      $htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';

      $reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
          '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
      $startPos = 0;
      // Text runs to translate and literal elements to keep, both "\000"-joined.
      $sourceBlob = '';
      $literalBlob = '';

      // Guard against delimiter nulls in the input
      $text = str_replace( "\000", '', $text );
      $text = str_replace( "\004", '', $text );

      $markupMatches = null;
      $elementMatches = null;

      // We add a marker (\004) at the end of text, to ensure we always match the
      // entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
      while ( $startPos < strlen( $text ) ) {
          if ( preg_match( $reg, $text . "\004", $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
              $elementPos = $markupMatches[0][1];
              $element = $markupMatches[0][0];
              if ( $element === "\004" ) {
                  // We hit the end.
                  $elementPos = strlen( $text );
                  $element = '';
              } elseif ( substr( $element, -1 ) === "\004" ) {
                  // This can sometimes happen if we have
                  // unclosed html tags (For example
                  // when converting a title attribute
                  // during a recursive call that contains
                  // a &lt; e.g. <div title="&lt;">.
                  $element = substr( $element, 0, -1 );
              }
          } else {
              // If we hit here, then Language Converter could be tricked
              // into doing an XSS, so we refuse to translate.
              // If non-crazy input manages to reach this code path,
              // we should consider it a bug.
              $log = LoggerFactory::getInstance( 'languageconverter' );
              $log->error( "Hit pcre.backtrack_limit in " . __METHOD__
                  . ". Disabling language conversion for this page.",
                  [
                      "method" => __METHOD__,
                      "variant" => $toVariant,
                      "startOfText" => substr( $text, 0, 500 )
                  ]
              );
              return $text;
          }

          // Queue the part before the markup for translation in a batch
          $sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";

          // Advance to the next position
          $startPos = $elementPos + strlen( $element );

          // Translate any alt or title attributes inside the matched element
          if ( $element !== ''
              && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
          ) {
              // FIXME, this decodes entities, so if you have something
              // like <div title="foo&lt;bar"> the bar won't get
              // translated since after entity decoding it looks like
              // unclosed html and we call this method recursively
              // on attributes.
              $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
              // Ensure self-closing tags stay self-closing.
              $close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : '';
              $changed = false;
              foreach ( [ 'title', 'alt' ] as $attrName ) {
                  if ( !isset( $attrs[$attrName] ) ) {
                      continue;
                  }
                  $attr = $attrs[$attrName];
                  // Don't convert URLs. The check must be "=== false":
                  // strpos() returns 0 for a match at the very start, which
                  // a plain negation would mistake for "no match".
                  if ( strpos( $attr, '://' ) === false ) {
                      $attr = $this->recursiveConvertTopLevel( $attr, $toVariant );
                  }

                  // Remove HTML tags to avoid disrupting the layout
                  $attr = preg_replace( '/<[^>]++>/', '', $attr );
                  if ( $attr !== $attrs[$attrName] ) {
                      $attrs[$attrName] = $attr;
                      $changed = true;
                  }
              }
              // Rebuild the tag only when an attribute value actually changed.
              if ( $changed ) {
                  $element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
                      $close . $elementMatches[3];
              }
          }
          $literalBlob .= $element . "\000";
      }

      // Do the main translation batch
      $translatedBlob = $this->translate( $sourceBlob, $toVariant );

      // Put the output back together by interleaving translated text
      // segments with the literal elements that followed them.
      $translatedIter = StringUtils::explode( "\000", $translatedBlob );
      $literalIter = StringUtils::explode( "\000", $literalBlob );

      $output = '';
      while ( $translatedIter->valid() && $literalIter->valid() ) {
          $output .= $translatedIter->current();
          $output .= $literalIter->current();
          $translatedIter->next();
          $literalIter->next();
      }

      return $output;
  }
  436. /**
  437. * Translate a string to a variant.
  438. * Doesn't parse rules or do any of that other stuff, for that use
  439. * convert() or convertTo().
  440. *
  441. * @param string $text Text to convert
  442. * @param string $variant Variant language code
  443. * @return string Translated text
  444. */
  public function translate( $text, $variant ) {
      // If $text is empty or only includes whitespace, do nothing;
      // otherwise translate it. The comparison against '' is deliberate:
      // a plain truthiness test would also skip the string "0".
      if ( trim( $text ) !== '' ) {
          $this->loadTables();
          $text = $this->mTables[$variant]->replace( $text );
      }
      return $text;
  }
  454. /**
  455. * Call translate() to convert text to all valid variants.
  456. *
  457. * @param string $text The text to be converted
  458. * @return array Variant => converted text
  459. */
  public function autoConvertToAllVariants( $text ) {
      $this->loadTables();

      // Build a map of variant code => text translated into that variant.
      $converted = [];
      foreach ( $this->mVariants as $code ) {
          $converted[$code] = $this->translate( $text, $code );
      }

      return $converted;
  }
  468. /**
  469. * Apply manual conversion rules.
  470. *
  471. * @param ConverterRule $convRule
  472. */
  protected function applyManualConv( $convRule ) {
      // The syntax -{T|zh-cn:TitleCN; zh-tw:TitleTw}- customizes title
      // conversion. Bug 24072: only overwrite $mConvRuleTitle when this rule
      // actually carries a title, otherwise later non-title rules would
      // wipe it and break the title conversion.
      $ruleTitle = $convRule->getTitle();
      if ( $ruleTitle ) {
          $this->mConvRuleTitle = $ruleTitle;
      }

      // Merge the rule's pairs into, or remove them from, the global tables.
      $ruleTable = $convRule->getConvTable();
      $action = $convRule->getRulesAction();
      foreach ( $ruleTable as $variant => $pairs ) {
          if ( $this->validateVariant( $variant ) ) {
              if ( $action == 'add' ) {
                  // Setting pairs one by one is more efficient than
                  // array_merge(), about 2.5 times.
                  foreach ( $pairs as $from => $to ) {
                      $this->mTables[$variant]->setPair( $from, $to );
                  }
              } elseif ( $action == 'remove' ) {
                  $this->mTables[$variant]->removeArray( $pairs );
              }
          }
      }
  }
  500. /**
  501. * Auto convert a Title object to a readable string in the
  502. * preferred variant.
  503. *
  504. * @param Title $title A Title object
  505. * @return string Converted title text
  506. */
  public function convertTitle( $title ) {
      $variant = $this->getPreferredVariant();
      $nsId = $title->getNamespace();

      // Titles outside the main namespace get a converted "Namespace:" prefix.
      $nsPrefix = ( $nsId !== NS_MAIN )
          ? $this->convertNamespace( $nsId, $variant ) . ':'
          : '';

      return $nsPrefix . $this->translate( $title->getText(), $variant );
  }
  518. /**
  519. * Get the namespace display name in the preferred variant.
  520. *
  521. * @param int $index Namespace id
  522. * @param string|null $variant Variant code or null for preferred variant
  523. * @return string Namespace name for display
  524. */
  public function convertNamespace( $index, $variant = null ) {
      // The main namespace has no display name.
      if ( $index === NS_MAIN ) {
          return '';
      }

      if ( $variant === null ) {
          $variant = $this->getPreferredVariant();
      }

      $cache = ObjectCache::newAccelerator( CACHE_NONE );
      $cacheKey = wfMemcKey( 'languageconverter', 'namespace-text', $index, $variant );
      $nsText = $cache->get( $cacheKey );
      if ( $nsText !== false ) {
          return $nsText;
      }

      // 1) A message may give the converted name directly in the target variant.
      $variantMsg = wfMessage( 'conversion-ns' . $index )->inLanguage( $variant );
      if ( $variantMsg->exists() ) {
          $nsText = $variantMsg->plain();
      }

      // 2) Otherwise a content-language message may exist, which still has to
      //    be translated into the target variant.
      if ( $nsText === false ) {
          $contentMsg = wfMessage( 'conversion-ns' . $index )->inContentLanguage();
          if ( $contentMsg->exists() ) {
              $nsText = $this->translate( $contentMsg->plain(), $variant );
          }
      }

      // 3) No message at all: take the name from the target variant's own
      //    namespace names.
      if ( $nsText === false ) {
          $variantLang = $this->mLangObj->factory( $variant );
          $nsText = $variantLang->getFormattedNsText( $index );
      }

      $cache->set( $cacheKey, $nsText, 60 );

      return $nsText;
  }
  559. /**
  560. * Convert text to different variants of a language. The automatic
  561. * conversion is done in autoConvert(). Here we parse the text
  562. * marked with -{}-, which specifies special conversions of the
  563. * text that can not be accomplished in autoConvert().
  564. *
  565. * Syntax of the markup:
  566. * -{code1:text1;code2:text2;...}- or
  567. * -{flags|code1:text1;code2:text2;...}- or
  568. * -{text}- in which case no conversion should take place for text
  569. *
  570. * @param string $text Text to be converted
  571. * @return string Converted text
  572. */
  public function convert( $text ) {
      // Convenience wrapper: convert into the currently preferred variant.
      return $this->convertTo( $text, $this->getPreferredVariant() );
  }
  577. /**
  578. * Same as convert(), except that the target variant is passed explicitly.
  579. *
  580. * @param string $text Text to be converted
  581. * @param string $variant The target variant code
  582. * @return string Converted text
  583. */
  public function convertTo( $text, $variant ) {
      global $wgDisableLangConversion;

      if ( !$wgDisableLangConversion ) {
          // Reset converter state for a new converter run.
          $this->mConvRuleTitle = false;
          $text = $this->recursiveConvertTopLevel( $text, $variant );
      }

      // With conversion disabled the input is handed back untouched.
      return $text;
  }
  593. /**
  594. * Recursively convert text on the outside. Allow to use nested
  595. * markups to custom rules.
  596. *
  597. * @param string $text Text to be converted
  598. * @param string $variant The target variant code
  599. * @param int $depth Depth of recursion
  600. * @return string Converted text
  601. */
  protected function recursiveConvertTopLevel( $text, $variant, $depth = 0 ) {
      $startPos = 0;
      $out = '';
      $length = strlen( $text );
      // Text that already looks like the target variant is passed through
      // without automatic conversion; manual rules are still processed.
      $shouldConvert = !$this->guessVariant( $text, $variant );

      $noScript = '<script.*?>.*?<\/script>(*SKIP)(*FAIL)';
      $noStyle = '<style.*?>.*?<\/style>(*SKIP)(*FAIL)';
      // @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
      $noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)';
      // @codingStandardsIgnoreEnd

      while ( $startPos < $length ) {
          // Only match -{ outside of html.
          $found = preg_match(
              "/$noScript|$noStyle|$noHtml|-\{/",
              $text,
              $m,
              PREG_OFFSET_CAPTURE,
              $startPos
          );

          if ( !$found ) {
              // No more markup, append final segment
              $tail = substr( $text, $startPos );
              $out .= $shouldConvert ? $this->autoConvert( $tail, $variant ) : $tail;
              return $out;
          }

          // Offset of the -{ marker that was found (not inside an attribute).
          $markerPos = $m[0][1];

          // Append the plain segment in front of the marker.
          $head = substr( $text, $startPos, $markerPos - $startPos );
          $out .= $shouldConvert ? $this->autoConvert( $head, $variant ) : $head;

          // Advance position up to the -{ marker, then do the recursive
          // conversion of the rule.
          // Note: $startPos is passed by reference and advanced by the callee.
          $startPos = $markerPos;
          $out .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
      }

      return $out;
  }
  642. /**
  643. * Recursively convert text on the inside.
  644. *
  645. * @param string $text Text to be converted
  646. * @param string $variant The target variant code
  647. * @param int $startPos
  648. * @param int $depth Depth of recursion
  649. *
  650. * @throws MWException
  651. * @return string Converted text
  652. */
  protected function recursiveConvertRule( $text, $variant, &$startPos, $depth = 0 ) {
      // Quick sanity check (no function calls): we must be sitting on "-{".
      if ( $text[$startPos] !== '-' || $text[$startPos + 1] !== '{' ) {
          throw new MWException( __METHOD__ . ': invalid input string' );
      }

      // Skip the opening "-{".
      $startPos += 2;
      // Accumulates the rule's inner text, with nested rules already resolved.
      $inner = '';
      // The depth warning is emitted at most once per rule.
      $warningDone = false;
      $length = strlen( $text );

      while ( $startPos < $length ) {
          $m = false;
          preg_match( '/-\{|\}-/', $text, $m, PREG_OFFSET_CAPTURE, $startPos );
          if ( !$m ) {
              // Unclosed rule
              break;
          }

          $token = $m[0][0];
          $pos = $m[0][1];

          // Markup found
          // Append initial segment
          $inner .= substr( $text, $startPos, $pos - $startPos );

          // Advance position
          $startPos = $pos;

          switch ( $token ) {
              case '-{':
                  // Check max depth
                  if ( $depth >= $this->mMaxDepth ) {
                      // Too deep: keep the marker as literal text.
                      $inner .= '-{';
                      if ( !$warningDone ) {
                          $inner .= '<span class="error">' .
                              wfMessage( 'language-converter-depth-warning' )
                                  ->numParams( $this->mMaxDepth )->inContentLanguage()->text() .
                              '</span>';
                          $warningDone = true;
                      }
                      $startPos += 2;
                      // "continue 2" targets the enclosing while loop. A bare
                      // "continue" inside a switch behaves like "break" and
                      // raises an E_WARNING on PHP 7.3+.
                      continue 2;
                  }
                  // Recursively parse another rule
                  $inner .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
                  break;
              case '}-':
                  // Apply the rule
                  $startPos += 2;
                  $rule = new ConverterRule( $inner, $this );
                  $rule->parse( $variant );
                  $this->applyManualConv( $rule );
                  return $rule->getDisplay();
              default:
                  throw new MWException( __METHOD__ . ': invalid regex match' );
          }
      }

      // Unclosed rule: take the remaining text, consume the whole input and
      // emit the opening marker literally, followed by the converted content.
      if ( $startPos < $length ) {
          $inner .= substr( $text, $startPos );
      }
      $startPos = $length;
      return '-{' . $this->autoConvert( $inner, $variant );
  }
  712. /**
  713. * If a language supports multiple variants, it is possible that
  714. * non-existing link in one variant actually exists in another variant.
  715. * This function tries to find it. See e.g. LanguageZh.php
  716. * The input parameters may be modified upon return
  717. *
  718. * @param string &$link The name of the link
  719. * @param Title &$nt The title object of the link
  720. * @param bool $ignoreOtherCond To disable other conditions when
  721. * we need to transclude a template or update a category's link
  722. */
  public function findVariantLink( &$link, &$nt, $ignoreOtherCond = false ) {
      # If the article already exists, there is no need to check it again;
      # doing so may cause a fault.
      if ( is_object( $nt ) && $nt->exists() ) {
          return;
      }

      global $wgDisableLangConversion, $wgDisableTitleConversion, $wgRequest;

      $isredir = $wgRequest->getText( 'redirect', 'yes' );
      $action = $wgRequest->getText( 'action' );
      // Following a red link is treated like a plain view.
      if ( $action == 'edit' && $wgRequest->getBool( 'redlink' ) ) {
          $action = 'view';
      }
      $linkconvert = $wgRequest->getText( 'linkconvert', 'yes' );

      $conversionDisabled = $wgDisableLangConversion || $wgDisableTitleConversion;
      $batch = new LinkBatch();

      // Request-level reasons not to look for a variant title.
      $requestOptsOut = $isredir == 'no'
          || $action == 'edit'
          || $action == 'submit'
          || $linkconvert == 'no';

      if ( $conversionDisabled || ( !$ignoreOtherCond && $requestOptsOut ) ) {
          return;
      }

      $ns = is_object( $nt ) ? $nt->getNamespace() : NS_MAIN;

      $variantTexts = $this->autoConvertToAllVariants( $link );
      if ( !$variantTexts ) { // give up
          return;
      }

      // Collect a Title for every variant spelling that differs from the link.
      $candidates = [];
      foreach ( $variantTexts as $variantText ) {
          if ( $variantText == $link ) {
              continue;
          }
          $candidate = Title::newFromText( $variantText, $ns );
          if ( $candidate !== null ) {
              $batch->addObj( $candidate );
              $candidates[] = $candidate;
          }
      }

      // fetch all variants in single query
      $batch->execute();

      // The first candidate that exists replaces both in/out parameters.
      foreach ( $candidates as $candidate ) {
          if ( $candidate->getArticleID() > 0 ) {
              $nt = $candidate;
              $link = $candidate->getText();
              break;
          }
      }
  }
  775. /**
  776. * Returns language specific hash options.
  777. *
  778. * @return string
  779. */
  public function getExtraHashOptions() {
      // The hash option is the preferred variant code prefixed with '!'.
      return '!' . $this->getPreferredVariant();
  }
  784. /**
  785. * Guess if a text is written in a variant. This should be implemented in subclasses.
  786. *
  787. * @param string $text The text to be checked
  788. * @param string $variant Language code of the variant to be checked for
  789. * @return bool True if $text appears to be written in $variant, false if not
  790. *
  791. * @author Nikola Smolenski <smolensk@eunet.rs>
  792. * @since 1.19
  793. */
  public function guessVariant( $text, $variant ) {
      // Base implementation makes no guess: it always answers false and
      // leaves any real detection to subclasses that override this method.
      return false;
  }
  797. /**
  798. * Load default conversion tables.
  799. * This method must be implemented in derived class.
  800. *
  801. * @private
  802. * @throws MWException
  803. */
  function loadDefaultTables() {
      // This base version always throws, naming the concrete class that
      // failed to provide its own implementation.
      $className = get_class( $this );
      throw new MWException(
          "Must implement loadDefaultTables() method in class {$className}"
      );
  }
  808. /**
  809. * Load conversion tables either from the cache or the disk.
  810. * @private
  811. * @param bool $fromCache Load from memcached? Defaults to true.
  812. */
  function loadTables( $fromCache = true ) {
      global $wgLanguageConverterCacheType;

      // Only ever load once per instance until the flag is cleared again.
      if ( $this->mTablesLoaded ) {
          return;
      }
      $this->mTablesLoaded = true;
      $this->mTables = false;

      $cache = ObjectCache::getInstance( $wgLanguageConverterCacheType );
      if ( $fromCache ) {
          $cacheSection = __METHOD__ . '-cache';
          wfProfileIn( $cacheSection );
          $this->mTables = $cache->get( $this->mCacheKey );
          wfProfileOut( $cacheSection );
      }

      // A cached value is accepted only when it is non-empty and carries
      // the cache version marker.
      $cacheUsable = $this->mTables
          && array_key_exists( self::CACHE_VERSION_KEY, $this->mTables );
      if ( $cacheUsable ) {
          return;
      }

      $recacheSection = __METHOD__ . '-recache';
      wfProfileIn( $recacheSection );
      // Not in cache, or we need a fresh reload: start from the default
      // tables, then update them using things in MediaWiki:Conversiontable/*
      $this->loadDefaultTables();
      foreach ( $this->mVariants as $variantCode ) {
          $overrides = $this->parseCachedTable( $variantCode );
          $this->mTables[$variantCode]->mergeArray( $overrides );
      }
      $this->postLoadTables();
      $this->mTables[self::CACHE_VERSION_KEY] = true;
      // Store the rebuilt tables with an expiry of 43200 seconds (12 hours).
      $cache->set( $this->mCacheKey, $this->mTables, 43200 );
      wfProfileOut( $recacheSection );
  }
  842. /**
  843. * Hook for post processing after conversion tables are loaded.
  844. */
  function postLoadTables() {
      // Intentionally empty: loadTables() calls this after the default and
      // on-wiki tables have been merged, before the result is cached.
  }
  847. /**
  848. * Reload the conversion tables.
  849. *
  850. * @private
  851. */
  function reloadTables() {
      // Discard whatever tables are currently held.
      if ( $this->mTables ) {
          unset( $this->mTables );
      }

      // Clear the "already loaded" guard so loadTables() runs again, and
      // pass false so it skips the cache read and rebuilds the tables.
      $this->mTablesLoaded = false;
      $this->loadTables( false );
  }
  859. /**
  860. * Parse the conversion table stored in the cache.
  861. *
  862. * The tables should be in blocks of the following form:
  863. * -{
  864. * word => word ;
  865. * word => word ;
  866. * ...
  867. * }-
  868. *
  869. * To make the tables more manageable, subpages are allowed
  870. * and will be parsed recursively if $recursive == true.
  871. *
  872. * @param string $code Language code
  873. * @param string $subpage Subpage name
  874. * @param bool $recursive Parse subpages recursively? Defaults to true.
  875. *
  876. * @return array
  877. */
  function parseCachedTable( $code, $subpage = '', $recursive = true ) {
      // Pages visited so far. Each key is parsed at most once, which also
      // stops subpages that link to each other from recursing forever.
      // NOTE(review): being static, this list outlives a single load, so a
      // later reloadTables() in the same process gets [] for pages already
      // seen here - confirm whether that is intended.
      static $parsed = [];

      $key = 'Conversiontable/' . $code;
      // Test explicitly for "no subpage" instead of relying on truthiness:
      // a subpage literally named "0" is falsy and would otherwise share
      // the parent page's key and never be parsed.
      if ( $subpage !== '' && $subpage !== null && $subpage !== false ) {
          $key .= '/' . $subpage;
      }
      if ( array_key_exists( $key, $parsed ) ) {
          return [];
      }
      $parsed[$key] = true;

      if ( $subpage === '' ) {
          // Top-level table: fetched through the message cache.
          $txt = MessageCache::singleton()->getMsgFromNamespace( $key, $code );
      } else {
          // Subpage: read the raw wikitext of the MediaWiki-namespace page.
          $txt = false;
          $title = Title::makeTitleSafe( NS_MEDIAWIKI, $key );
          if ( $title && $title->exists() ) {
              $revision = Revision::newFromTitle( $title );
              if ( $revision ) {
                  if ( $revision->getContentModel() == CONTENT_MODEL_WIKITEXT ) {
                      $txt = $revision->getContent( Revision::RAW )->getNativeData();
                  }
                  // @todo in the future, use a specialized content model, perhaps based on json!
              }
          }
      }

      # Nothing to parse if there's no text
      if ( $txt === false || $txt === null || $txt === '' ) {
          return [];
      }

      // Get all subpage links of the form
      // [[MediaWiki:Conversiontable/zh-xx/...|...]]
      $linkhead = $this->mLangObj->getNsText( NS_MEDIAWIKI ) .
          ':Conversiontable';
      $sublinks = [];
      foreach ( StringUtils::explode( '[[', $txt ) as $chunk ) {
          $link = explode( ']]', $chunk, 2 );
          if ( count( $link ) != 2 ) {
              // No closing brackets, so this chunk is not a link.
              continue;
          }
          // Drop the link caption, then split the target into
          // "<head>/<code>/<rest>".
          $target = explode( '|', $link[0], 2 );
          $parts = explode( '/', trim( $target[0] ), 3 );
          // A usable link needs at least "<head>/<code>". Checking the
          // count first avoids reading a missing $parts[1] for a bare
          // [[MediaWiki:Conversiontable]] link.
          if ( count( $parts ) < 2 || $parts[0] !== $linkhead || $parts[1] != $code ) {
              continue;
          }
          $sublinks[] = count( $parts ) == 3 ? $parts[2] : '';
      }

      // Parse the mappings in this page
      $ret = [];
      $first = true;
      foreach ( StringUtils::explode( '-{', $txt ) as $block ) {
          if ( $first ) {
              // Skip the part before the first -{
              $first = false;
              continue;
          }
          // Keep only what lies between this -{ and the next closing marker
          $mappings = explode( '}-', $block, 2 )[0];
          // Quotes and wikitext list markers carry no meaning in a table
          $stripped = str_replace( [ "'", '"', '*', '#' ], '', $mappings );
          foreach ( StringUtils::explode( ';', $stripped ) as $entry ) {
              $m = explode( '=>', $entry, 3 );
              if ( count( $m ) != 2 ) {
                  // Not exactly one "from => to" pair
                  continue;
              }
              // Trim any trailing comment starting with '//'
              $tt = explode( '//', $m[1], 2 );
              $ret[trim( $m[0] )] = trim( $tt[0] );
          }
      }

      // Recursively parse the subpages; on a duplicate key the subpage's
      // mapping takes precedence over the one from this page.
      if ( $recursive ) {
          foreach ( $sublinks as $sublink ) {
              $s = $this->parseCachedTable( $code, $sublink, $recursive );
              $ret = $s + $ret;
          }
      }

      // Optionally add a first-letter-uppercased copy of every mapping.
      if ( $this->mUcfirst ) {
          foreach ( $ret as $k => $v ) {
              $ret[$this->mLangObj->ucfirst( $k )] = $this->mLangObj->ucfirst( $v );
          }
      }
      return $ret;
  }
  966. /**
  967. * Enclose a string with the "no conversion" tag. This is used by
  968. * various functions in the Parser.
  969. *
  970. * @param string $text Text to be tagged for no conversion
  971. * @param bool $noParse Unused
  972. * @return string The tagged text
  973. */
  public function markNoConversion( $text, $noParse = false ) {
      // Wrap $text in a raw "-{R|...}-" block unless it already contains a
      // conversion marker. $noParse is accepted but not used.

      # Don't mark if already marked. strpos() returns 0 (falsy) when the
      # marker sits at the very start of the text, so the result must be
      # compared against false rather than tested for truthiness.
      if ( strpos( $text, '-{' ) !== false || strpos( $text, '}-' ) !== false ) {
          return $text;
      }

      return "-{R|$text}-";
  }
  982. /**
  983. * Convert the sorting key for category links. This should make different
  984. * keys that are variants of each other map to the same key.
  985. *
  986. * @param string $key
  987. *
  988. * @return string
  989. */
  function convertCategoryKey( $key ) {
      // Base implementation performs no conversion and hands the sort key
      // back unchanged.
      return $key;
  }
  993. /**
  994. * Refresh the cache of conversion tables when
  995. * MediaWiki:Conversiontable* is updated.
  996. *
  997. * @param Title $titleobj The Title of the page being updated
  998. */
  public function updateConversionTable( Title $titleobj ) {
      // Only pages in the MediaWiki namespace can hold conversion tables.
      if ( $titleobj->getNamespace() != NS_MEDIAWIKI ) {
          return;
      }

      // Expected shape of the DB key: Conversiontable/<variant>[/<subpage>]
      $segments = explode( '/', $titleobj->getDBkey(), 3 );
      if ( count( $segments ) <= 1 || $segments[0] != 'Conversiontable' ) {
          return;
      }

      // Rebuild the tables only when the second segment is a valid variant.
      if ( $this->validateVariant( $segments[1] ) ) {
          $this->reloadTables();
      }
  }
  1011. /**
  1012. * Get the cached separator pattern for ConverterRule::parseRules()
  1013. * @return string
  1014. */
  function getVarSeparatorPattern() {
      // Built once and then memoized on the instance.
      if ( $this->mVarSeparatorPattern === null ) {
          // Separator pattern for preg_split: the text should be split at
          // ";" only if a valid variant name follows it, for example:
          // -{zh-hans:<span style="font-size:120%;">xxx</span>;zh-hant:\
          // <span style="font-size:120%;">yyy</span>;}-
          // should be split as:
          // array(
          // [0] => 'zh-hans:<span style="font-size:120%;">xxx</span>'
          // [1] => 'zh-hant:<span style="font-size:120%;">yyy</span>'
          // [2] => ''
          // )
          $alternatives = [];
          foreach ( $this->mVariants as $variantCode ) {
              // zh-hans:xxx;zh-hant:yyy
              $alternatives[] = $variantCode . '\s*:';
              // xxx=>zh-hans:yyy; xxx=>zh-hant:zzz
              $alternatives[] = '[^;]*?=>\s*' . $variantCode . '\s*:';
          }
          // A trailing ";" followed only by whitespace also separates.
          $alternatives[] = '\s*$';

          $this->mVarSeparatorPattern =
              '/;\s*(?=' . implode( '|', $alternatives ) . ')/';
      }
      return $this->mVarSeparatorPattern;
  }
  1040. }