StringUtils.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. <?php
  2. /**
  3. * A collection of static methods to play with strings.
  4. */
  5. class StringUtils {
  6. /**
  7. * Perform an operation equivalent to
  8. *
  9. * preg_replace( "!$startDelim(.*?)$endDelim!", $replace, $subject );
  10. *
  11. * except that it's worst-case O(N) instead of O(N^2)
  12. *
  13. * Compared to delimiterReplace(), this implementation is fast but memory-
  14. * hungry and inflexible. The memory requirements are such that I don't
  15. * recommend using it on anything but guaranteed small chunks of text.
  16. */
  17. static function hungryDelimiterReplace( $startDelim, $endDelim, $replace, $subject ) {
  18. $segments = explode( $startDelim, $subject );
  19. $output = array_shift( $segments );
  20. foreach ( $segments as $s ) {
  21. $endDelimPos = strpos( $s, $endDelim );
  22. if ( $endDelimPos === false ) {
  23. $output .= $startDelim . $s;
  24. } else {
  25. $output .= $replace . substr( $s, $endDelimPos + strlen( $endDelim ) );
  26. }
  27. }
  28. return $output;
  29. }
  30. /**
  31. * Perform an operation equivalent to
  32. *
  33. * preg_replace_callback( "!$startDelim(.*)$endDelim!s$flags", $callback, $subject )
  34. *
  35. * This implementation is slower than hungryDelimiterReplace but uses far less
  36. * memory. The delimiters are literal strings, not regular expressions.
  37. *
  38. * @param string $flags Regular expression flags
  39. */
  40. # If the start delimiter ends with an initial substring of the end delimiter,
  41. # e.g. in the case of C-style comments, the behaviour differs from the model
  42. # regex. In this implementation, the end must share no characters with the
  43. # start, so e.g. /*/ is not considered to be both the start and end of a
  44. # comment. /*/xy/*/ is considered to be a single comment with contents /xy/.
  45. static function delimiterReplaceCallback( $startDelim, $endDelim, $callback, $subject, $flags = '' ) {
  46. $inputPos = 0;
  47. $outputPos = 0;
  48. $output = '';
  49. $foundStart = false;
  50. $encStart = preg_quote( $startDelim, '!' );
  51. $encEnd = preg_quote( $endDelim, '!' );
  52. $strcmp = strpos( $flags, 'i' ) === false ? 'strcmp' : 'strcasecmp';
  53. $endLength = strlen( $endDelim );
  54. $m = array();
  55. while ( $inputPos < strlen( $subject ) &&
  56. preg_match( "!($encStart)|($encEnd)!S$flags", $subject, $m, PREG_OFFSET_CAPTURE, $inputPos ) )
  57. {
  58. $tokenOffset = $m[0][1];
  59. if ( $m[1][0] != '' ) {
  60. if ( $foundStart &&
  61. $strcmp( $endDelim, substr( $subject, $tokenOffset, $endLength ) ) == 0 )
  62. {
  63. # An end match is present at the same location
  64. $tokenType = 'end';
  65. $tokenLength = $endLength;
  66. } else {
  67. $tokenType = 'start';
  68. $tokenLength = strlen( $m[0][0] );
  69. }
  70. } elseif ( $m[2][0] != '' ) {
  71. $tokenType = 'end';
  72. $tokenLength = strlen( $m[0][0] );
  73. } else {
  74. throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
  75. }
  76. if ( $tokenType == 'start' ) {
  77. $inputPos = $tokenOffset + $tokenLength;
  78. # Only move the start position if we haven't already found a start
  79. # This means that START START END matches outer pair
  80. if ( !$foundStart ) {
  81. # Found start
  82. # Write out the non-matching section
  83. $output .= substr( $subject, $outputPos, $tokenOffset - $outputPos );
  84. $outputPos = $tokenOffset;
  85. $contentPos = $inputPos;
  86. $foundStart = true;
  87. }
  88. } elseif ( $tokenType == 'end' ) {
  89. if ( $foundStart ) {
  90. # Found match
  91. $output .= call_user_func( $callback, array(
  92. substr( $subject, $outputPos, $tokenOffset + $tokenLength - $outputPos ),
  93. substr( $subject, $contentPos, $tokenOffset - $contentPos )
  94. ));
  95. $foundStart = false;
  96. } else {
  97. # Non-matching end, write it out
  98. $output .= substr( $subject, $inputPos, $tokenOffset + $tokenLength - $outputPos );
  99. }
  100. $inputPos = $outputPos = $tokenOffset + $tokenLength;
  101. } else {
  102. throw new MWException( 'Invalid delimiter given to ' . __METHOD__ );
  103. }
  104. }
  105. if ( $outputPos < strlen( $subject ) ) {
  106. $output .= substr( $subject, $outputPos );
  107. }
  108. return $output;
  109. }
  110. /*
  111. * Perform an operation equivalent to
  112. *
  113. * preg_replace( "!$startDelim(.*)$endDelim!$flags", $replace, $subject )
  114. *
  115. * @param string $startDelim Start delimiter regular expression
  116. * @param string $endDelim End delimiter regular expression
  117. * @param string $replace Replacement string. May contain $1, which will be
  118. * replaced by the text between the delimiters
  119. * @param string $subject String to search
  120. * @return string The string with the matches replaced
  121. */
  122. static function delimiterReplace( $startDelim, $endDelim, $replace, $subject, $flags = '' ) {
  123. $replacer = new RegexlikeReplacer( $replace );
  124. return self::delimiterReplaceCallback( $startDelim, $endDelim,
  125. $replacer->cb(), $subject, $flags );
  126. }
  127. /**
  128. * More or less "markup-safe" explode()
  129. * Ignores any instances of the separator inside <...>
  130. * @param string $separator
  131. * @param string $text
  132. * @return array
  133. */
  134. static function explodeMarkup( $separator, $text ) {
  135. $placeholder = "\x00";
  136. // Remove placeholder instances
  137. $text = str_replace( $placeholder, '', $text );
  138. // Replace instances of the separator inside HTML-like tags with the placeholder
  139. $replacer = new DoubleReplacer( $separator, $placeholder );
  140. $cleaned = StringUtils::delimiterReplaceCallback( '<', '>', $replacer->cb(), $text );
  141. // Explode, then put the replaced separators back in
  142. $items = explode( $separator, $cleaned );
  143. foreach( $items as $i => $str ) {
  144. $items[$i] = str_replace( $placeholder, $separator, $str );
  145. }
  146. return $items;
  147. }
  148. /**
  149. * Escape a string to make it suitable for inclusion in a preg_replace()
  150. * replacement parameter.
  151. *
  152. * @param string $string
  153. * @return string
  154. */
  155. static function escapeRegexReplacement( $string ) {
  156. $string = str_replace( '\\', '\\\\', $string );
  157. $string = str_replace( '$', '\\$', $string );
  158. return $string;
  159. }
  160. /**
  161. * Workalike for explode() with limited memory usage.
  162. * Returns an Iterator
  163. */
  164. static function explode( $separator, $subject ) {
  165. if ( substr_count( $subject, $separator ) > 1000 ) {
  166. return new ExplodeIterator( $separator, $subject );
  167. } else {
  168. return new ArrayIterator( explode( $separator, $subject ) );
  169. }
  170. }
  171. }
  172. /**
  173. * Base class for "replacers", objects used in preg_replace_callback() and
  174. * StringUtils::delimiterReplaceCallback()
  175. */
  176. class Replacer {
  177. function cb() {
  178. return array( &$this, 'replace' );
  179. }
  180. }
  181. /**
  182. * Class to replace regex matches with a string similar to that used in preg_replace()
  183. */
  184. class RegexlikeReplacer extends Replacer {
  185. var $r;
  186. function __construct( $r ) {
  187. $this->r = $r;
  188. }
  189. function replace( $matches ) {
  190. $pairs = array();
  191. foreach ( $matches as $i => $match ) {
  192. $pairs["\$$i"] = $match;
  193. }
  194. return strtr( $this->r, $pairs );
  195. }
  196. }
  197. /**
  198. * Class to perform secondary replacement within each replacement string
  199. */
  200. class DoubleReplacer extends Replacer {
  201. function __construct( $from, $to, $index = 0 ) {
  202. $this->from = $from;
  203. $this->to = $to;
  204. $this->index = $index;
  205. }
  206. function replace( $matches ) {
  207. return str_replace( $this->from, $this->to, $matches[$this->index] );
  208. }
  209. }
  210. /**
  211. * Class to perform replacement based on a simple hashtable lookup
  212. */
  213. class HashtableReplacer extends Replacer {
  214. var $table, $index;
  215. function __construct( $table, $index = 0 ) {
  216. $this->table = $table;
  217. $this->index = $index;
  218. }
  219. function replace( $matches ) {
  220. return $this->table[$matches[$this->index]];
  221. }
  222. }
  223. /**
  224. * Replacement array for FSS with fallback to strtr()
  225. * Supports lazy initialisation of FSS resource
  226. */
  227. class ReplacementArray {
  228. /*mostly private*/ var $data = false;
  229. /*mostly private*/ var $fss = false;
  230. /**
  231. * Create an object with the specified replacement array
  232. * The array should have the same form as the replacement array for strtr()
  233. */
  234. function __construct( $data = array() ) {
  235. $this->data = $data;
  236. }
  237. function __sleep() {
  238. return array( 'data' );
  239. }
  240. function __wakeup() {
  241. $this->fss = false;
  242. }
  243. /**
  244. * Set the whole replacement array at once
  245. */
  246. function setArray( $data ) {
  247. $this->data = $data;
  248. $this->fss = false;
  249. }
  250. function getArray() {
  251. return $this->data;
  252. }
  253. /**
  254. * Set an element of the replacement array
  255. */
  256. function setPair( $from, $to ) {
  257. $this->data[$from] = $to;
  258. $this->fss = false;
  259. }
  260. function mergeArray( $data ) {
  261. $this->data = array_merge( $this->data, $data );
  262. $this->fss = false;
  263. }
  264. function merge( $other ) {
  265. $this->data = array_merge( $this->data, $other->data );
  266. $this->fss = false;
  267. }
  268. function removePair( $from ) {
  269. unset($this->data[$from]);
  270. $this->fss = false;
  271. }
  272. function removeArray( $data ) {
  273. foreach( $data as $from => $to )
  274. $this->removePair( $from );
  275. $this->fss = false;
  276. }
  277. function replace( $subject ) {
  278. if ( function_exists( 'fss_prep_replace' ) ) {
  279. wfProfileIn( __METHOD__.'-fss' );
  280. if ( $this->fss === false ) {
  281. $this->fss = fss_prep_replace( $this->data );
  282. }
  283. $result = fss_exec_replace( $this->fss, $subject );
  284. wfProfileOut( __METHOD__.'-fss' );
  285. } else {
  286. wfProfileIn( __METHOD__.'-strtr' );
  287. $result = strtr( $subject, $this->data );
  288. wfProfileOut( __METHOD__.'-strtr' );
  289. }
  290. return $result;
  291. }
  292. }
  293. /**
  294. * An iterator which works exactly like:
  295. *
  296. * foreach ( explode( $delim, $s ) as $element ) {
  297. * ...
  298. * }
  299. *
  300. * Except it doesn't use 193 byte per element
  301. */
  302. class ExplodeIterator implements Iterator {
  303. // The subject string
  304. var $subject, $subjectLength;
  305. // The delimiter
  306. var $delim, $delimLength;
  307. // The position of the start of the line
  308. var $curPos;
  309. // The position after the end of the next delimiter
  310. var $endPos;
  311. // The current token
  312. var $current;
  313. /**
  314. * Construct a DelimIterator
  315. */
  316. function __construct( $delim, $s ) {
  317. $this->subject = $s;
  318. $this->delim = $delim;
  319. // Micro-optimisation (theoretical)
  320. $this->subjectLength = strlen( $s );
  321. $this->delimLength = strlen( $delim );
  322. $this->rewind();
  323. }
  324. function rewind() {
  325. $this->curPos = 0;
  326. $this->endPos = strpos( $this->subject, $this->delim );
  327. $this->refreshCurrent();
  328. }
  329. function refreshCurrent() {
  330. if ( $this->curPos === false ) {
  331. $this->current = false;
  332. } elseif ( $this->curPos >= $this->subjectLength ) {
  333. $this->current = '';
  334. } elseif ( $this->endPos === false ) {
  335. $this->current = substr( $this->subject, $this->curPos );
  336. } else {
  337. $this->current = substr( $this->subject, $this->curPos, $this->endPos - $this->curPos );
  338. }
  339. }
  340. function current() {
  341. return $this->current;
  342. }
  343. function key() {
  344. return $this->curPos;
  345. }
  346. function next() {
  347. if ( $this->endPos === false ) {
  348. $this->curPos = false;
  349. } else {
  350. $this->curPos = $this->endPos + $this->delimLength;
  351. if ( $this->curPos >= $this->subjectLength ) {
  352. $this->endPos = false;
  353. } else {
  354. $this->endPos = strpos( $this->subject, $this->delim, $this->curPos );
  355. }
  356. }
  357. $this->refreshCurrent();
  358. return $this->current;
  359. }
  360. function valid() {
  361. return $this->curPos !== false;
  362. }
  363. }