MagicWordArray.php 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. <?php
  2. /**
  3. * See docs/magicword.txt.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Parser
  22. */
  23. use MediaWiki\Logger\LoggerFactory;
  24. /**
  25. * Class for handling an array of magic words
  26. * @ingroup Parser
  27. */
  28. class MagicWordArray {
  29. /** @var array */
  30. public $names = [];
  31. /** @var array */
  32. private $hash;
  33. private $baseRegex;
  34. private $regex;
  35. /**
  36. * @param array $names
  37. */
  38. public function __construct( $names = [] ) {
  39. $this->names = $names;
  40. }
  41. /**
  42. * Add a magic word by name
  43. *
  44. * @param string $name
  45. */
  46. public function add( $name ) {
  47. $this->names[] = $name;
  48. $this->hash = $this->baseRegex = $this->regex = null;
  49. }
  50. /**
  51. * Add a number of magic words by name
  52. *
  53. * @param array $names
  54. */
  55. public function addArray( $names ) {
  56. $this->names = array_merge( $this->names, array_values( $names ) );
  57. $this->hash = $this->baseRegex = $this->regex = null;
  58. }
  59. /**
  60. * Get a 2-d hashtable for this array
  61. * @return array
  62. */
  63. public function getHash() {
  64. if ( is_null( $this->hash ) ) {
  65. global $wgContLang;
  66. $this->hash = [ 0 => [], 1 => [] ];
  67. foreach ( $this->names as $name ) {
  68. $magic = MagicWord::get( $name );
  69. $case = intval( $magic->isCaseSensitive() );
  70. foreach ( $magic->getSynonyms() as $syn ) {
  71. if ( !$case ) {
  72. $syn = $wgContLang->lc( $syn );
  73. }
  74. $this->hash[$case][$syn] = $name;
  75. }
  76. }
  77. }
  78. return $this->hash;
  79. }
  80. /**
  81. * Get the base regex
  82. * @return array
  83. */
  84. public function getBaseRegex() {
  85. if ( is_null( $this->baseRegex ) ) {
  86. $this->baseRegex = [ 0 => '', 1 => '' ];
  87. $allGroups = [];
  88. foreach ( $this->names as $name ) {
  89. $magic = MagicWord::get( $name );
  90. $case = intval( $magic->isCaseSensitive() );
  91. foreach ( $magic->getSynonyms() as $i => $syn ) {
  92. // Group name must start with a non-digit in PCRE 8.34+
  93. $it = strtr( $i, '0123456789', 'abcdefghij' );
  94. $groupName = $it . '_' . $name;
  95. $group = '(?P<' . $groupName . '>' . preg_quote( $syn, '/' ) . ')';
  96. // look for same group names to avoid same named subpatterns in the regex
  97. if ( isset( $allGroups[$groupName] ) ) {
  98. throw new MWException(
  99. __METHOD__ . ': duplicate internal name in magic word array: ' . $name
  100. );
  101. }
  102. $allGroups[$groupName] = true;
  103. if ( $this->baseRegex[$case] === '' ) {
  104. $this->baseRegex[$case] = $group;
  105. } else {
  106. $this->baseRegex[$case] .= '|' . $group;
  107. }
  108. }
  109. }
  110. }
  111. return $this->baseRegex;
  112. }
  113. /**
  114. * Get an unanchored regex that does not match parameters
  115. * @return array
  116. */
  117. public function getRegex() {
  118. if ( is_null( $this->regex ) ) {
  119. $base = $this->getBaseRegex();
  120. $this->regex = [ '', '' ];
  121. if ( $this->baseRegex[0] !== '' ) {
  122. $this->regex[0] = "/{$base[0]}/iuS";
  123. }
  124. if ( $this->baseRegex[1] !== '' ) {
  125. $this->regex[1] = "/{$base[1]}/S";
  126. }
  127. }
  128. return $this->regex;
  129. }
  130. /**
  131. * Get a regex for matching variables with parameters
  132. *
  133. * @return string
  134. */
  135. public function getVariableRegex() {
  136. return str_replace( "\\$1", "(.*?)", $this->getRegex() );
  137. }
  138. /**
  139. * Get a regex anchored to the start of the string that does not match parameters
  140. *
  141. * @return array
  142. */
  143. public function getRegexStart() {
  144. $base = $this->getBaseRegex();
  145. $newRegex = [ '', '' ];
  146. if ( $base[0] !== '' ) {
  147. $newRegex[0] = "/^(?:{$base[0]})/iuS";
  148. }
  149. if ( $base[1] !== '' ) {
  150. $newRegex[1] = "/^(?:{$base[1]})/S";
  151. }
  152. return $newRegex;
  153. }
  154. /**
  155. * Get an anchored regex for matching variables with parameters
  156. *
  157. * @return array
  158. */
  159. public function getVariableStartToEndRegex() {
  160. $base = $this->getBaseRegex();
  161. $newRegex = [ '', '' ];
  162. if ( $base[0] !== '' ) {
  163. $newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
  164. }
  165. if ( $base[1] !== '' ) {
  166. $newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
  167. }
  168. return $newRegex;
  169. }
  170. /**
  171. * @since 1.20
  172. * @return array
  173. */
  174. public function getNames() {
  175. return $this->names;
  176. }
  177. /**
  178. * Parse a match array from preg_match
  179. * Returns array(magic word ID, parameter value)
  180. * If there is no parameter value, that element will be false.
  181. *
  182. * @param array $m
  183. *
  184. * @throws MWException
  185. * @return array
  186. */
  187. public function parseMatch( $m ) {
  188. reset( $m );
  189. while ( ( $key = key( $m ) ) !== null ) {
  190. $value = current( $m );
  191. next( $m );
  192. if ( $key === 0 || $value === '' ) {
  193. continue;
  194. }
  195. $parts = explode( '_', $key, 2 );
  196. if ( count( $parts ) != 2 ) {
  197. // This shouldn't happen
  198. // continue;
  199. throw new MWException( __METHOD__ . ': bad parameter name' );
  200. }
  201. list( /* $synIndex */, $magicName ) = $parts;
  202. $paramValue = next( $m );
  203. return [ $magicName, $paramValue ];
  204. }
  205. // This shouldn't happen either
  206. throw new MWException( __METHOD__ . ': parameter not found' );
  207. }
  208. /**
  209. * Match some text, with parameter capture
  210. * Returns an array with the magic word name in the first element and the
  211. * parameter in the second element.
  212. * Both elements are false if there was no match.
  213. *
  214. * @param string $text
  215. *
  216. * @return array
  217. */
  218. public function matchVariableStartToEnd( $text ) {
  219. $regexes = $this->getVariableStartToEndRegex();
  220. foreach ( $regexes as $regex ) {
  221. if ( $regex !== '' ) {
  222. $m = [];
  223. if ( preg_match( $regex, $text, $m ) ) {
  224. return $this->parseMatch( $m );
  225. }
  226. }
  227. }
  228. return [ false, false ];
  229. }
  230. /**
  231. * Match some text, without parameter capture
  232. * Returns the magic word name, or false if there was no capture
  233. *
  234. * @param string $text
  235. *
  236. * @return string|bool False on failure
  237. */
  238. public function matchStartToEnd( $text ) {
  239. $hash = $this->getHash();
  240. if ( isset( $hash[1][$text] ) ) {
  241. return $hash[1][$text];
  242. }
  243. global $wgContLang;
  244. $lc = $wgContLang->lc( $text );
  245. if ( isset( $hash[0][$lc] ) ) {
  246. return $hash[0][$lc];
  247. }
  248. return false;
  249. }
  250. /**
  251. * Returns an associative array, ID => param value, for all items that match
  252. * Removes the matched items from the input string (passed by reference)
  253. *
  254. * @param string &$text
  255. *
  256. * @return array
  257. */
  258. public function matchAndRemove( &$text ) {
  259. $found = [];
  260. $regexes = $this->getRegex();
  261. foreach ( $regexes as $regex ) {
  262. if ( $regex === '' ) {
  263. continue;
  264. }
  265. $matches = [];
  266. $res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
  267. if ( $res === false ) {
  268. LoggerFactory::getInstance( 'parser' )->warning( 'preg_match_all returned false', [
  269. 'code' => preg_last_error(),
  270. 'regex' => $regex,
  271. 'text' => $text,
  272. ] );
  273. } elseif ( $res ) {
  274. foreach ( $matches as $m ) {
  275. list( $name, $param ) = $this->parseMatch( $m );
  276. $found[$name] = $param;
  277. }
  278. }
  279. $res = preg_replace( $regex, '', $text );
  280. if ( $res === null ) {
  281. LoggerFactory::getInstance( 'parser' )->warning( 'preg_replace returned null', [
  282. 'code' => preg_last_error(),
  283. 'regex' => $regex,
  284. 'text' => $text,
  285. ] );
  286. }
  287. $text = $res;
  288. }
  289. return $found;
  290. }
  291. /**
  292. * Return the ID of the magic word at the start of $text, and remove
  293. * the prefix from $text.
  294. * Return false if no match found and $text is not modified.
  295. * Does not match parameters.
  296. *
  297. * @param string &$text
  298. *
  299. * @return int|bool False on failure
  300. */
  301. public function matchStartAndRemove( &$text ) {
  302. $regexes = $this->getRegexStart();
  303. foreach ( $regexes as $regex ) {
  304. if ( $regex === '' ) {
  305. continue;
  306. }
  307. if ( preg_match( $regex, $text, $m ) ) {
  308. list( $id, ) = $this->parseMatch( $m );
  309. if ( strlen( $m[0] ) >= strlen( $text ) ) {
  310. $text = '';
  311. } else {
  312. $text = substr( $text, strlen( $m[0] ) );
  313. }
  314. return $id;
  315. }
  316. }
  317. return false;
  318. }
  319. }