MagicWordArray.php 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. <?php
  2. /**
  3. * See docs/magicword.txt.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Parser
  22. */
  23. use MediaWiki\Logger\LoggerFactory;
  24. use MediaWiki\MediaWikiServices;
  /**
   * Class for handling an array of magic words.
   *
   * Holds a list of magic word names and lazily derives from it a lookup
   * hash and a set of regular expressions. The derived data is cached and
   * discarded again whenever names are added.
   *
   * @ingroup Parser
   */
  class MagicWordArray {
      /** @var array Names of the magic words contained in this array */
      public $names = [];

      /** @var MagicWordFactory Used to look up magic words and the content language */
      private $factory;

      /** @var array|null Cache for getHash(); null until built or after add()/addArray() */
      private $hash;

      /** @var array|null Cache for getBaseRegex(); null until built or after add()/addArray() */
      private $baseRegex;

      /** @var array|null Cache for getRegex(); null until built or after add()/addArray() */
      private $regex;

      /**
       * @param array $names Initial magic word names
       * @param MagicWordFactory|null $factory Factory to use; when omitted (or
       *   otherwise falsy) the one from MediaWikiServices is used
       */
      public function __construct( $names = [], MagicWordFactory $factory = null ) {
          $this->names = $names;
          $this->factory = $factory;
          if ( !$factory ) {
              $this->factory = MediaWikiServices::getInstance()->getMagicWordFactory();
          }
      }

      /**
       * Add a magic word by name
       *
       * Invalidates the cached hash and regexes.
       *
       * @param string $name
       */
      public function add( $name ) {
          $this->names[] = $name;
          $this->hash = $this->baseRegex = $this->regex = null;
      }

      /**
       * Add a number of magic words by name
       *
       * Only the values of $names are used; its keys are ignored.
       * Invalidates the cached hash and regexes.
       *
       * @param array $names
       */
      public function addArray( $names ) {
          $this->names = array_merge( $this->names, array_values( $names ) );
          $this->hash = $this->baseRegex = $this->regex = null;
      }

      /**
       * Get a 2-d hashtable for this array
       *
       * Index 0 maps the synonyms of case-insensitive magic words (lower-cased
       * with the content language) to the magic word name; index 1 maps the
       * synonyms of case-sensitive magic words, unchanged, to the name.
       *
       * @return array
       */
      public function getHash() {
          if ( $this->hash === null ) {
              // Build into a local and cache only once complete, so that an
              // exception part-way through never leaves a partial hash cached.
              $hash = [ 0 => [], 1 => [] ];
              foreach ( $this->names as $name ) {
                  $magic = $this->factory->get( $name );
                  $case = intval( $magic->isCaseSensitive() );
                  foreach ( $magic->getSynonyms() as $syn ) {
                      if ( !$case ) {
                          $syn = $this->factory->getContentLanguage()->lc( $syn );
                      }
                      $hash[$case][$syn] = $name;
                  }
              }
              $this->hash = $hash;
          }
          return $this->hash;
      }

      /**
       * Get the base regex
       *
       * The result has two string elements: index 0 is an alternation of named
       * groups for the synonyms of case-insensitive magic words, index 1 the
       * same for case-sensitive ones. Either is '' when there are no such
       * synonyms. The strings carry no delimiters or modifiers.
       *
       * @throws MWException If two synonyms would produce the same group name
       * @return array
       */
      public function getBaseRegex() {
          if ( $this->baseRegex === null ) {
              // Collect into locals and cache only on success. Assigning to
              // $this->baseRegex up front would leave a truncated regex cached
              // when the duplicate check below throws, and a later call would
              // silently return it.
              $groups = [ 0 => [], 1 => [] ];
              $allGroups = [];
              foreach ( $this->names as $name ) {
                  $magic = $this->factory->get( $name );
                  $case = intval( $magic->isCaseSensitive() );
                  foreach ( $magic->getSynonyms() as $i => $syn ) {
                      // Group name must start with a non-digit in PCRE 8.34+
                      $it = strtr( (string)$i, '0123456789', 'abcdefghij' );
                      $groupName = $it . '_' . $name;
                      // look for same group names to avoid same named subpatterns in the regex
                      if ( isset( $allGroups[$groupName] ) ) {
                          throw new MWException(
                              __METHOD__ . ': duplicate internal name in magic word array: ' . $name
                          );
                      }
                      $allGroups[$groupName] = true;
                      $groups[$case][] = '(?P<' . $groupName . '>' . preg_quote( $syn, '/' ) . ')';
                  }
              }
              $this->baseRegex = [
                  0 => implode( '|', $groups[0] ),
                  1 => implode( '|', $groups[1] ),
              ];
          }
          return $this->baseRegex;
      }

      /**
       * Get an unanchored regex that does not match parameters
       *
       * Index 0 is the case-insensitive regex (modifiers "iuS"), index 1 the
       * case-sensitive one (modifier "S"). An element is '' when the
       * corresponding base regex is empty.
       *
       * @return array
       */
      public function getRegex() {
          if ( $this->regex === null ) {
              $base = $this->getBaseRegex();
              $regex = [ '', '' ];
              if ( $base[0] !== '' ) {
                  $regex[0] = "/{$base[0]}/iuS";
              }
              if ( $base[1] !== '' ) {
                  $regex[1] = "/{$base[1]}/S";
              }
              $this->regex = $regex;
          }
          return $this->regex;
      }

      /**
       * Get a regex for matching variables with parameters
       *
       * Takes the regexes from getRegex() and replaces every escaped "\$1"
       * placeholder in them with the non-greedy capture group "(.*?)".
       *
       * @return array Same two-element layout as getRegex()
       */
      public function getVariableRegex() {
          // str_replace() on an array subject returns an array.
          return str_replace( "\\$1", "(.*?)", $this->getRegex() );
      }

      /**
       * Get a regex anchored to the start of the string that does not match parameters
       *
       * @return array Two elements, case-insensitive then case-sensitive;
       *   '' where the corresponding base regex is empty
       */
      public function getRegexStart() {
          $base = $this->getBaseRegex();
          $newRegex = [ '', '' ];
          if ( $base[0] !== '' ) {
              $newRegex[0] = "/^(?:{$base[0]})/iuS";
          }
          if ( $base[1] !== '' ) {
              $newRegex[1] = "/^(?:{$base[1]})/S";
          }
          return $newRegex;
      }

      /**
       * Get an anchored regex for matching variables with parameters
       *
       * The regexes are anchored at both ends and every escaped "\$1"
       * placeholder is replaced with the capture group "(.*?)".
       *
       * @return array Two elements, case-insensitive then case-sensitive;
       *   '' where the corresponding base regex is empty
       */
      public function getVariableStartToEndRegex() {
          $base = $this->getBaseRegex();
          $newRegex = [ '', '' ];
          if ( $base[0] !== '' ) {
              $newRegex[0] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[0]})$/iuS" );
          }
          if ( $base[1] !== '' ) {
              $newRegex[1] = str_replace( "\\$1", "(.*?)", "/^(?:{$base[1]})$/S" );
          }
          return $newRegex;
      }

      /**
       * @since 1.20
       * @return array The magic word names in this array
       */
      public function getNames() {
          return $this->names;
      }

      /**
       * Parse a match array from preg_match
       * Returns array(magic word ID, parameter value)
       * If there is no parameter value, that element will be false.
       *
       * The array is walked in order; the whole-match entry (key 0) and
       * entries whose value is '' are skipped. The first remaining key is
       * split at its first underscore and the part after it is the magic
       * word name.
       *
       * @param array $m
       *
       * @throws MWException If the first usable key has no underscore, or if
       *   no usable entry exists
       * @return array
       */
      public function parseMatch( $m ) {
          reset( $m );
          while ( ( $key = key( $m ) ) !== null ) {
              $value = current( $m );
              // Advance now: for a named group this steps onto the numeric
              // copy of the same capture that preg_match() stores after it.
              next( $m );
              if ( $key === 0 || $value === '' ) {
                  continue;
              }
              $parts = explode( '_', $key, 2 );
              if ( count( $parts ) != 2 ) {
                  // This shouldn't happen
                  throw new MWException( __METHOD__ . ': bad parameter name' );
              }
              list( /* $synIndex */, $magicName ) = $parts;
              // One more step takes us to the element after the numeric copy;
              // next() yields false when there is no such element.
              $paramValue = next( $m );
              return [ $magicName, $paramValue ];
          }
          // This shouldn't happen either
          throw new MWException( __METHOD__ . ': parameter not found' );
      }

      /**
       * Match some text, with parameter capture
       * Returns an array with the magic word name in the first element and the
       * parameter in the second element.
       * Both elements are false if there was no match.
       *
       * @param string $text
       *
       * @throws MWException Propagated from parseMatch()
       * @return array
       */
      public function matchVariableStartToEnd( $text ) {
          $regexes = $this->getVariableStartToEndRegex();
          foreach ( $regexes as $regex ) {
              if ( $regex !== '' ) {
                  $m = [];
                  if ( preg_match( $regex, $text, $m ) ) {
                      return $this->parseMatch( $m );
                  }
              }
          }
          return [ false, false ];
      }

      /**
       * Match some text, without parameter capture
       * Returns the magic word name, or false if there was no capture
       *
       * The case-sensitive table is consulted first with $text as given,
       * then the case-insensitive table with $text lower-cased by the
       * content language.
       *
       * @param string $text
       *
       * @return string|bool False on failure
       */
      public function matchStartToEnd( $text ) {
          $hash = $this->getHash();
          if ( isset( $hash[1][$text] ) ) {
              return $hash[1][$text];
          }
          $lc = $this->factory->getContentLanguage()->lc( $text );
          if ( isset( $hash[0][$lc] ) ) {
              return $hash[0][$lc];
          }
          return false;
      }

      /**
       * Returns an associative array, ID => param value, for all items that match
       * Removes the matched items from the input string (passed by reference)
       *
       * If a PCRE call fails, a warning is logged to the 'parser' channel.
       * When the removal itself fails, $text is left as it was for that regex
       * instead of being overwritten with null.
       *
       * @param string &$text
       *
       * @throws MWException Propagated from parseMatch()
       * @return array
       */
      public function matchAndRemove( &$text ) {
          $found = [];
          $regexes = $this->getRegex();
          foreach ( $regexes as $regex ) {
              if ( $regex === '' ) {
                  continue;
              }
              $matches = [];
              $res = preg_match_all( $regex, $text, $matches, PREG_SET_ORDER );
              if ( $res === false ) {
                  LoggerFactory::getInstance( 'parser' )->warning( 'preg_match_all returned false', [
                      'code' => preg_last_error(),
                      'regex' => $regex,
                      'text' => $text,
                  ] );
              } elseif ( $res ) {
                  foreach ( $matches as $m ) {
                      list( $name, $param ) = $this->parseMatch( $m );
                      $found[$name] = $param;
                  }
              }
              $res = preg_replace( $regex, '', $text );
              if ( $res === null ) {
                  LoggerFactory::getInstance( 'parser' )->warning( 'preg_replace returned null', [
                      'code' => preg_last_error(),
                      'regex' => $regex,
                      'text' => $text,
                  ] );
                  // preg_replace() signals an error with null. Keep the
                  // caller's string rather than replacing it with null.
                  continue;
              }
              $text = $res;
          }
          return $found;
      }

      /**
       * Return the ID of the magic word at the start of $text, and remove
       * the prefix from $text.
       * Return false if no match found and $text is not modified.
       * Does not match parameters.
       *
       * @param string &$text
       *
       * @throws MWException Propagated from parseMatch()
       * @return string|bool The magic word name, or false on failure
       */
      public function matchStartAndRemove( &$text ) {
          $regexes = $this->getRegexStart();
          foreach ( $regexes as $regex ) {
              if ( $regex === '' ) {
                  continue;
              }
              $m = [];
              if ( preg_match( $regex, $text, $m ) ) {
                  list( $id, ) = $this->parseMatch( $m );
                  // Handle "whole string matched" explicitly instead of
                  // relying on substr() with a start equal to the length.
                  if ( strlen( $m[0] ) >= strlen( $text ) ) {
                      $text = '';
                  } else {
                      $text = substr( $text, strlen( $m[0] ) );
                  }
                  return $id;
              }
          }
          return false;
      }
  }