Preprocessor_Hash.php 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. <?php
  2. /**
  3. * Preprocessor using PHP arrays
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Parser
  22. */
  /**
   * Preprocessor that builds its document tree out of plain PHP arrays.
   *
   * Differences from DOM schema:
   *   * attribute nodes are children
   *   * "<h>" nodes that aren't at the top are replaced with <possible-h>
   *
   * Nodes are stored in a recursive array data structure. A node store is an
   * array where each element may be either a scalar (representing a text node)
   * or a "descriptor", which is a two-element array where the first element is
   * the node name and the second element is the node store for the children.
   *
   * Attributes are represented as children that have a node name starting with
   * "@", and a single text node child.
   *
   * @todo Consider replacing descriptor arrays with objects of a new class.
   * Benchmark and measure resulting memory impact.
   *
   * @ingroup Parser
   */
  // phpcs:ignore Squiz.Classes.ValidClassName.NotCamelCaps
  class Preprocessor_Hash extends Preprocessor {
      const CACHE_PREFIX = 'preprocess-hash';
      const CACHE_VERSION = 2;

      /**
       * Store the parser this preprocessor works for.
       *
       * @param Parser $parser
       */
      public function __construct( $parser ) {
          $this->parser = $parser;
      }

      /**
       * Create a new frame bound to this preprocessor.
       *
       * @return PPFrame_Hash
       */
      public function newFrame() {
          return new PPFrame_Hash( $this );
      }

      /**
       * Create a new custom frame bound to this preprocessor.
       *
       * @param array $args Passed through unchanged to PPCustomFrame_Hash
       * @return PPCustomFrame_Hash
       */
      public function newCustomFrame( $args ) {
          return new PPCustomFrame_Hash( $this, $args );
      }

      /**
       * Build a node array of "part" nodes from a PHP array.
       *
       * Integer keys produce a part whose name holds an "@index" attribute;
       * any other key produces a part with a literal name, an "=" text node
       * and the value. Values (and non-integer keys) are cast to string.
       *
       * @param array $values
       * @return PPNode_Hash_Array
       */
      public function newPartNodeArray( $values ) {
          $list = [];

          foreach ( $values as $k => $val ) {
              if ( is_int( $k ) ) {
                  $store = [ [ 'part', [
                      [ 'name', [ [ '@index', [ $k ] ] ] ],
                      [ 'value', [ strval( $val ) ] ],
                  ] ] ];
              } else {
                  $store = [ [ 'part', [
                      [ 'name', [ strval( $k ) ] ],
                      '=',
                      [ 'value', [ strval( $val ) ] ],
                  ] ] ];
              }

              $list[] = new PPNode_Hash_Tree( $store, 0 );
          }

          return new PPNode_Hash_Array( $list );
      }

      /**
       * Preprocess some wikitext and return the document tree.
       *
       * The generated DOM tree must depend only on the input text and the flags.
       * The DOM tree must be the same in OT_HTML and OT_WIKI mode, to avoid a
       * regression of T6899.
       *
       * Any flag added to the $flags parameter here, or any other parameter liable
       * to cause a change in the DOM tree for a given text, must be passed through
       * the section identifier in the section edit link and thus back to
       * extractSections().
       *
       * The serialized tree is looked up through cacheGetTree() first and, when it
       * has to be rebuilt, stored again through cacheSetTree() as JSON.
       *
       * @param string $text The text to parse
       * @param int $flags Bitwise combination of:
       *   Parser::PTD_FOR_INCLUSION  Handle "<noinclude>" and "<includeonly>" as if
       *                              the text is being included. Default is to
       *                              assume a direct page view.
       *
       * @throws MWException
       * @return PPNode_Hash_Tree
       */
      public function preprocessToObj( $text, $flags = 0 ) {
          global $wgDisableLangConversion;

          // Reuse a cached tree when one is available and decodes to a node store
          $tree = $this->cacheGetTree( $text, $flags );
          if ( $tree !== false ) {
              $store = json_decode( $tree );
              if ( is_array( $store ) ) {
                  return new PPNode_Hash_Tree( $store, 0 );
              }
          }

          $forInclusion = $flags & Parser::PTD_FOR_INCLUSION;

          $xmlishElements = $this->parser->getStripList();
          $xmlishAllowMissingEndTag = [ 'includeonly', 'noinclude', 'onlyinclude' ];
          $enableOnlyinclude = false;
          if ( $forInclusion ) {
              $ignoredTags = [ 'includeonly', '/includeonly' ];
              $ignoredElements = [ 'noinclude' ];
              $xmlishElements[] = 'noinclude';
              if ( strpos( $text, '<onlyinclude>' ) !== false
                  && strpos( $text, '</onlyinclude>' ) !== false
              ) {
                  $enableOnlyinclude = true;
              }
          } else {
              $ignoredTags = [ 'noinclude', '/noinclude', 'onlyinclude', '/onlyinclude' ];
              $ignoredElements = [ 'includeonly' ];
              $xmlishElements[] = 'includeonly';
          }
          $xmlishRegex = implode( '|', array_merge( $xmlishElements, $ignoredTags ) );

          // Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset
          $elementsRegex = "~($xmlishRegex)(?:\s|\/>|>)|(!--)~iA";

          $stack = new PPDStack_Hash;

          $searchBase = "[{<\n";
          if ( !$wgDisableLangConversion ) {
              $searchBase .= '-';
          }

          // For fast reverse searches
          $revText = strrev( $text );
          $lengthText = strlen( $text );

          // Input pointer, starts out pointing to a pseudo-newline before the start
          $i = 0;
          // Current accumulator. See the doc comment for Preprocessor_Hash for the format.
          $accum =& $stack->getAccum();
          // True to find equals signs in arguments
          $findEquals = false;
          // True to take notice of pipe characters
          $findPipe = false;
          $headingIndex = 1;
          // True if $i is inside a possible heading
          $inHeading = false;
          // True if there are no more greater-than (>) signs right of $i
          $noMoreGT = false;
          // Map of lower-cased tag name => true if there are no more closing tags
          // of given type right of $i
          $noMoreClosingTag = [];
          // True to ignore all input up to the next <onlyinclude>
          $findOnlyinclude = $enableOnlyinclude;
          // Do a line-start run without outputting an LF character
          $fakeLineStart = true;

          while ( true ) {
              // $this->memCheck();

              if ( $findOnlyinclude ) {
                  // Ignore all input up to the next <onlyinclude>
                  $startPos = strpos( $text, '<onlyinclude>', $i );
                  if ( $startPos === false ) {
                      // Ignored section runs to the end
                      $accum[] = [ 'ignore', [ substr( $text, $i ) ] ];
                      break;
                  }
                  $tagEndPos = $startPos + strlen( '<onlyinclude>' ); // past-the-end
                  $accum[] = [ 'ignore', [ substr( $text, $i, $tagEndPos - $i ) ] ];
                  $i = $tagEndPos;
                  $findOnlyinclude = false;
              }

              if ( $fakeLineStart ) {
                  $found = 'line-start';
                  $curChar = '';
              } else {
                  // Find next opening brace, closing brace or pipe
                  $search = $searchBase;
                  if ( $stack->top === false ) {
                      $currentClosing = '';
                  } else {
                      $currentClosing = $stack->top->close;
                      $search .= $currentClosing;
                  }
                  if ( $findPipe ) {
                      $search .= '|';
                  }
                  if ( $findEquals ) {
                      // First equals will be for the template
                      $search .= '=';
                  }
                  $rule = null;
                  // Output literal section, advance input counter
                  $literalLength = strcspn( $text, $search, $i );
                  if ( $literalLength > 0 ) {
                      self::addLiteral( $accum, substr( $text, $i, $literalLength ) );
                      $i += $literalLength;
                  }
                  if ( $i >= $lengthText ) {
                      if ( $currentClosing == "\n" ) {
                          // Do a past-the-end run to finish off the heading
                          $curChar = '';
                          $found = 'line-end';
                      } else {
                          // All done
                          break;
                      }
                  } else {
                      $curChar = $curTwoChar = $text[$i];
                      if ( ( $i + 1 ) < $lengthText ) {
                          $curTwoChar .= $text[$i + 1];
                      }
                      if ( $curChar == '|' ) {
                          $found = 'pipe';
                      } elseif ( $curChar == '=' ) {
                          $found = 'equals';
                      } elseif ( $curChar == '<' ) {
                          $found = 'angle';
                      } elseif ( $curChar == "\n" ) {
                          if ( $inHeading ) {
                              $found = 'line-end';
                          } else {
                              $found = 'line-start';
                          }
                      } elseif ( $curTwoChar == $currentClosing ) {
                          $found = 'close';
                          $curChar = $curTwoChar;
                      } elseif ( $curChar == $currentClosing ) {
                          $found = 'close';
                      } elseif ( isset( $this->rules[$curTwoChar] ) ) {
                          $curChar = $curTwoChar;
                          $found = 'open';
                          $rule = $this->rules[$curChar];
                      } elseif ( isset( $this->rules[$curChar] ) ) {
                          $found = 'open';
                          $rule = $this->rules[$curChar];
                      } else {
                          // Some versions of PHP have a strcspn which stops on
                          // null characters; ignore these and continue.
                          // We also may get '-' and '}' characters here which
                          // don't match -{ or $currentClosing. Add these to
                          // output and continue.
                          if ( $curChar == '-' || $curChar == '}' ) {
                              self::addLiteral( $accum, $curChar );
                          }
                          ++$i;
                          continue;
                      }
                  }
              }

              if ( $found == 'angle' ) {
                  $matches = false;
                  // Handle </onlyinclude>
                  if ( $enableOnlyinclude
                      && substr( $text, $i, strlen( '</onlyinclude>' ) ) == '</onlyinclude>'
                  ) {
                      $findOnlyinclude = true;
                      continue;
                  }

                  // Determine element name
                  if ( !preg_match( $elementsRegex, $text, $matches, 0, $i + 1 ) ) {
                      // Element name missing or not listed
                      self::addLiteral( $accum, '<' );
                      ++$i;
                      continue;
                  }

                  // Handle comments
                  if ( isset( $matches[2] ) && $matches[2] == '!--' ) {
                      // To avoid leaving blank lines, when a sequence of
                      // space-separated comments is both preceded and followed by
                      // a newline (ignoring spaces), then
                      // trim leading and trailing spaces and the trailing newline.

                      // Find the end
                      $endPos = strpos( $text, '-->', $i + 4 );
                      if ( $endPos === false ) {
                          // Unclosed comment in input, runs to end
                          $inner = substr( $text, $i );
                          $accum[] = [ 'comment', [ $inner ] ];
                          $i = $lengthText;
                      } else {
                          // Search backwards for leading whitespace
                          $wsStart = $i ? ( $i - strspn( $revText, " \t", $lengthText - $i ) ) : 0;

                          // Search forwards for trailing whitespace
                          // $wsEnd will be the position of the last space (or the '>' if there's none)
                          $wsEnd = $endPos + 2 + strspn( $text, " \t", $endPos + 3 );

                          // Keep looking forward as long as we're finding more
                          // comments.
                          $comments = [ [ $wsStart, $wsEnd ] ];
                          while ( substr( $text, $wsEnd + 1, 4 ) == '<!--' ) {
                              $c = strpos( $text, '-->', $wsEnd + 4 );
                              if ( $c === false ) {
                                  break;
                              }
                              $c = $c + 2 + strspn( $text, " \t", $c + 3 );
                              $comments[] = [ $wsEnd + 1, $c ];
                              $wsEnd = $c;
                          }

                          // Eat the line if possible
                          // TODO: This could theoretically be done if $wsStart == 0, i.e. for comments at
                          // the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but
                          // it's a possible beneficial b/c break.
                          if ( $wsStart > 0 && substr( $text, $wsStart - 1, 1 ) == "\n"
                              && substr( $text, $wsEnd + 1, 1 ) == "\n"
                          ) {
                              // Remove leading whitespace from the end of the accumulator
                              $wsLength = $i - $wsStart;
                              $endIndex = count( $accum ) - 1;

                              // Sanity check
                              if ( $wsLength > 0
                                  && $endIndex >= 0
                                  && is_string( $accum[$endIndex] )
                                  && strspn( $accum[$endIndex], " \t", -$wsLength ) === $wsLength
                              ) {
                                  $accum[$endIndex] = substr( $accum[$endIndex], 0, -$wsLength );
                              }

                              // Dump all but the last comment to the accumulator
                              foreach ( $comments as $j => $com ) {
                                  $startPos = $com[0];
                                  $endPos = $com[1] + 1;
                                  if ( $j == ( count( $comments ) - 1 ) ) {
                                      break;
                                  }
                                  $inner = substr( $text, $startPos, $endPos - $startPos );
                                  $accum[] = [ 'comment', [ $inner ] ];
                              }

                              // Do a line-start run next time to look for headings after the comment
                              $fakeLineStart = true;
                          } else {
                              // No line to eat, just take the comment itself
                              $startPos = $i;
                              $endPos += 2;
                          }

                          if ( $stack->top ) {
                              $part = $stack->top->getCurrentPart();
                              if ( !( isset( $part->commentEnd ) && $part->commentEnd == $wsStart - 1 ) ) {
                                  $part->visualEnd = $wsStart;
                              }
                              // Else comments abutting, no change in visual end
                              $part->commentEnd = $endPos;
                          }
                          $i = $endPos + 1;
                          $inner = substr( $text, $startPos, $endPos - $startPos + 1 );
                          $accum[] = [ 'comment', [ $inner ] ];
                      }
                      continue;
                  }

                  $name = $matches[1];
                  $lowerName = strtolower( $name );
                  $attrStart = $i + strlen( $name ) + 1;

                  // Find end of tag
                  $tagEndPos = $noMoreGT ? false : strpos( $text, '>', $attrStart );
                  if ( $tagEndPos === false ) {
                      // Infinite backtrack
                      // Disable tag search to prevent worst-case O(N^2) performance
                      $noMoreGT = true;
                      self::addLiteral( $accum, '<' );
                      ++$i;
                      continue;
                  }

                  // Handle ignored tags
                  if ( in_array( $lowerName, $ignoredTags, true ) ) {
                      $accum[] = [ 'ignore', [ substr( $text, $i, $tagEndPos - $i + 1 ) ] ];
                      $i = $tagEndPos + 1;
                      continue;
                  }

                  $tagStartPos = $i;
                  if ( $text[$tagEndPos - 1] == '/' ) {
                      // Short end tag
                      $attrEnd = $tagEndPos - 1;
                      $inner = null;
                      $i = $tagEndPos + 1;
                      $close = null;
                  } else {
                      $attrEnd = $tagEndPos;
                      // Find closing tag. The search is case-insensitive, so the
                      // negative-result cache is keyed by the lower-cased name too;
                      // otherwise mixed-case repeats of one tag would each rescan the text.
                      if (
                          !isset( $noMoreClosingTag[$lowerName] ) &&
                          preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i",
                              $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
                      ) {
                          $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 );
                          $i = $matches[0][1] + strlen( $matches[0][0] );
                          $close = $matches[0][0];
                      } else {
                          // No end tag
                          // NOTE(review): this lookup uses the raw-case $name, so an unclosed
                          // "<INCLUDEONLY>" is treated as literal text while "<includeonly>"
                          // runs to the end. Left as is because changing it alters parser
                          // output; confirm whether $lowerName was intended.
                          if ( in_array( $name, $xmlishAllowMissingEndTag, true ) ) {
                              // Let it run out to the end of the text.
                              $inner = substr( $text, $tagEndPos + 1 );
                              $i = $lengthText;
                              $close = null;
                          } else {
                              // Don't match the tag, treat opening tag as literal and resume parsing.
                              $i = $tagEndPos + 1;
                              self::addLiteral( $accum,
                                  substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
                              // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>...
                              $noMoreClosingTag[$lowerName] = true;
                              continue;
                          }
                      }
                  }

                  // <includeonly> and <noinclude> just become <ignore> tags
                  if ( in_array( $lowerName, $ignoredElements, true ) ) {
                      $accum[] = [ 'ignore', [ substr( $text, $tagStartPos, $i - $tagStartPos ) ] ];
                      continue;
                  }

                  if ( $attrEnd <= $attrStart ) {
                      $attr = '';
                  } else {
                      // Note that the attr element contains the whitespace between name and attribute,
                      // this is necessary for precise reconstruction during pre-save transform.
                      $attr = substr( $text, $attrStart, $attrEnd - $attrStart );
                  }

                  $children = [
                      [ 'name', [ $name ] ],
                      [ 'attr', [ $attr ] ] ];
                  if ( $inner !== null ) {
                      $children[] = [ 'inner', [ $inner ] ];
                  }
                  if ( $close !== null ) {
                      $children[] = [ 'close', [ $close ] ];
                  }
                  $accum[] = [ 'ext', $children ];
              } elseif ( $found == 'line-start' ) {
                  // Is this the start of a heading?
                  // Line break belongs before the heading element in any case
                  if ( $fakeLineStart ) {
                      $fakeLineStart = false;
                  } else {
                      self::addLiteral( $accum, $curChar );
                      $i++;
                  }

                  $count = strspn( $text, '=', $i, 6 );
                  if ( $count == 1 && $findEquals ) {
                      // DWIM: This looks kind of like a name/value separator.
                      // Let's let the equals handler have it and break the potential
                      // heading. This is heuristic, but AFAICT the methods for
                      // completely correct disambiguation are very complex.
                  } elseif ( $count > 0 ) {
                      $piece = [
                          'open' => "\n",
                          'close' => "\n",
                          'parts' => [ new PPDPart_Hash( str_repeat( '=', $count ) ) ],
                          'startPos' => $i,
                          'count' => $count ];
                      $stack->push( $piece );
                      $accum =& $stack->getAccum();
                      $stackFlags = $stack->getFlags();
                      if ( isset( $stackFlags['findEquals'] ) ) {
                          $findEquals = $stackFlags['findEquals'];
                      }
                      if ( isset( $stackFlags['findPipe'] ) ) {
                          $findPipe = $stackFlags['findPipe'];
                      }
                      if ( isset( $stackFlags['inHeading'] ) ) {
                          $inHeading = $stackFlags['inHeading'];
                      }
                      $i += $count;
                  }
              } elseif ( $found == 'line-end' ) {
                  $piece = $stack->top;
                  // A heading must be open, otherwise \n wouldn't have been in the search list
                  // FIXME: Don't use assert()
                  // phpcs:ignore MediaWiki.Usage.ForbiddenFunctions.assert
                  assert( $piece->open === "\n" );
                  $part = $piece->getCurrentPart();
                  // Search back through the input to see if it has a proper close.
                  // Do this using the reversed string since the other solutions
                  // (end anchor, etc.) are inefficient.
                  $wsLength = strspn( $revText, " \t", $lengthText - $i );
                  $searchStart = $i - $wsLength;
                  if ( isset( $part->commentEnd ) && $searchStart - 1 == $part->commentEnd ) {
                      // Comment found at line end
                      // Search for equals signs before the comment
                      $searchStart = $part->visualEnd;
                      $searchStart -= strspn( $revText, " \t", $lengthText - $searchStart );
                  }
                  $count = $piece->count;
                  $equalsLength = strspn( $revText, '=', $lengthText - $searchStart );
                  if ( $equalsLength > 0 ) {
                      if ( $searchStart - $equalsLength == $piece->startPos ) {
                          // This is just a single string of equals signs on its own line
                          // Replicate the doHeadings behavior /={count}(.+)={count}/
                          // First find out how many equals signs there really are (don't stop at 6)
                          $count = $equalsLength;
                          if ( $count < 3 ) {
                              $count = 0;
                          } else {
                              $count = min( 6, intval( ( $count - 1 ) / 2 ) );
                          }
                      } else {
                          $count = min( $equalsLength, $count );
                      }
                      if ( $count > 0 ) {
                          // Normal match, output <h>
                          $element = [ [ 'possible-h',
                              array_merge(
                                  [
                                      [ '@level', [ $count ] ],
                                      [ '@i', [ $headingIndex++ ] ]
                                  ],
                                  $accum
                              )
                          ] ];
                      } else {
                          // Single equals sign on its own line, count=0
                          $element = $accum;
                      }
                  } else {
                      // No match, no <h>, just pass down the inner text
                      $element = $accum;
                  }
                  // Unwind the stack
                  $stack->pop();
                  $accum =& $stack->getAccum();
                  $stackFlags = $stack->getFlags();
                  if ( isset( $stackFlags['findEquals'] ) ) {
                      $findEquals = $stackFlags['findEquals'];
                  }
                  if ( isset( $stackFlags['findPipe'] ) ) {
                      $findPipe = $stackFlags['findPipe'];
                  }
                  if ( isset( $stackFlags['inHeading'] ) ) {
                      $inHeading = $stackFlags['inHeading'];
                  }

                  // Append the result to the enclosing accumulator
                  array_splice( $accum, count( $accum ), 0, $element );

                  // Note that we do NOT increment the input pointer.
                  // This is because the closing linebreak could be the opening linebreak of
                  // another heading. Infinite loops are avoided because the next iteration MUST
                  // hit the heading open case above, which unconditionally increments the
                  // input pointer.
              } elseif ( $found == 'open' ) {
                  // Count opening brace characters
                  $curLen = strlen( $curChar );
                  $count = ( $curLen > 1 ) ?
                      // allow the final character to repeat
                      strspn( $text, $curChar[$curLen - 1], $i + 1 ) + 1 :
                      strspn( $text, $curChar, $i );

                  $savedPrefix = '';
                  $lineStart = ( $i > 0 && $text[$i - 1] == "\n" );

                  if ( $curChar === "-{" && $count > $curLen ) {
                      // -{ => {{ transition because rightmost wins
                      $savedPrefix = '-';
                      $i++;
                      $curChar = '{';
                      $count--;
                      $rule = $this->rules[$curChar];
                  }

                  // We need to add to stack only if opening brace count is enough for one of the rules
                  if ( $count >= $rule['min'] ) {
                      // Add it to the stack
                      $piece = [
                          'open' => $curChar,
                          'close' => $rule['end'],
                          'savedPrefix' => $savedPrefix,
                          'count' => $count,
                          'lineStart' => $lineStart,
                      ];

                      $stack->push( $piece );
                      $accum =& $stack->getAccum();
                      $stackFlags = $stack->getFlags();
                      if ( isset( $stackFlags['findEquals'] ) ) {
                          $findEquals = $stackFlags['findEquals'];
                      }
                      if ( isset( $stackFlags['findPipe'] ) ) {
                          $findPipe = $stackFlags['findPipe'];
                      }
                      if ( isset( $stackFlags['inHeading'] ) ) {
                          $inHeading = $stackFlags['inHeading'];
                      }
                  } else {
                      // Add literal brace(s)
                      self::addLiteral( $accum, $savedPrefix . str_repeat( $curChar, $count ) );
                  }
                  $i += $count;
              } elseif ( $found == 'close' ) {
                  /** @var PPDStackElement_Hash $piece */
                  $piece = $stack->top;
                  '@phan-var PPDStackElement_Hash $piece';
                  // Let's check if there are enough characters for closing brace
                  $maxCount = $piece->count;
                  if ( $piece->close === '}-' && $curChar === '}' ) {
                      $maxCount--; // don't try to match closing '-' as a '}'
                  }
                  $curLen = strlen( $curChar );
                  $count = ( $curLen > 1 ) ? $curLen :
                      strspn( $text, $curChar, $i, $maxCount );

                  // Check for maximum matching characters (if there are 5 closing
                  // characters, we will probably need only 3 - depending on the rules)
                  $rule = $this->rules[$piece->open];
                  if ( $count > $rule['max'] ) {
                      // The specified maximum exists in the callback array, unless the caller
                      // has made an error
                      $matchingCount = $rule['max'];
                  } else {
                      // Count is less than the maximum
                      // Skip any gaps in the callback array to find the true largest match
                      // Need to use array_key_exists not isset because the callback can be null
                      $matchingCount = $count;
                      while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names'] ) ) {
                          --$matchingCount;
                      }
                  }

                  if ( $matchingCount <= 0 ) {
                      // No matching element found in callback array
                      // Output a literal closing brace and continue
                      $endText = substr( $text, $i, $count );
                      self::addLiteral( $accum, $endText );
                      $i += $count;
                      continue;
                  }
                  $name = $rule['names'][$matchingCount];
                  if ( $name === null ) {
                      // No element, just literal text
                      $endText = substr( $text, $i, $matchingCount );
                      $element = $piece->breakSyntax( $matchingCount );
                      self::addLiteral( $element, $endText );
                  } else {
                      // Create XML element
                      $parts = $piece->parts;
                      $titleAccum = $parts[0]->out;
                      unset( $parts[0] );

                      $children = [];

                      // The invocation is at the start of the line if lineStart is set in
                      // the stack, and all opening brackets are used up.
                      if ( $maxCount == $matchingCount &&
                          !empty( $piece->lineStart ) &&
                          strlen( $piece->savedPrefix ) == 0 ) {
                          $children[] = [ '@lineStart', [ 1 ] ];
                      }
                      $titleNode = [ 'title', $titleAccum ];
                      $children[] = $titleNode;
                      $argIndex = 1;
                      foreach ( $parts as $part ) {
                          if ( isset( $part->eqpos ) ) {
                              $equalsNode = $part->out[$part->eqpos];
                              $nameNode = [ 'name', array_slice( $part->out, 0, $part->eqpos ) ];
                              $valueNode = [ 'value', array_slice( $part->out, $part->eqpos + 1 ) ];
                              $partNode = [ 'part', [ $nameNode, $equalsNode, $valueNode ] ];
                              $children[] = $partNode;
                          } else {
                              $nameNode = [ 'name', [ [ '@index', [ $argIndex++ ] ] ] ];
                              $valueNode = [ 'value', $part->out ];
                              $partNode = [ 'part', [ $nameNode, $valueNode ] ];
                              $children[] = $partNode;
                          }
                      }
                      $element = [ [ $name, $children ] ];
                  }

                  // Advance input pointer
                  $i += $matchingCount;

                  // Unwind the stack
                  $stack->pop();
                  $accum =& $stack->getAccum();

                  // Re-add the old stack element if it still has unmatched opening characters remaining
                  if ( $matchingCount < $piece->count ) {
                      $piece->parts = [ new PPDPart_Hash ];
                      $piece->count -= $matchingCount;
                      // Do we still qualify for any callback with remaining count?
                      $min = $this->rules[$piece->open]['min'];
                      if ( $piece->count >= $min ) {
                          $stack->push( $piece );
                          $accum =& $stack->getAccum();
                      } elseif ( $piece->count == 1 && $piece->open === '{' && $piece->savedPrefix === '-' ) {
                          $piece->savedPrefix = '';
                          $piece->open = '-{';
                          $piece->count = 2;
                          $piece->close = $this->rules[$piece->open]['end'];
                          $stack->push( $piece );
                          $accum =& $stack->getAccum();
                      } else {
                          $s = substr( $piece->open, 0, -1 );
                          $s .= str_repeat(
                              substr( $piece->open, -1 ),
                              $piece->count - strlen( $s )
                          );
                          self::addLiteral( $accum, $piece->savedPrefix . $s );
                      }
                  } elseif ( $piece->savedPrefix !== '' ) {
                      self::addLiteral( $accum, $piece->savedPrefix );
                  }

                  $stackFlags = $stack->getFlags();
                  if ( isset( $stackFlags['findEquals'] ) ) {
                      $findEquals = $stackFlags['findEquals'];
                  }
                  if ( isset( $stackFlags['findPipe'] ) ) {
                      $findPipe = $stackFlags['findPipe'];
                  }
                  if ( isset( $stackFlags['inHeading'] ) ) {
                      $inHeading = $stackFlags['inHeading'];
                  }

                  // Add XML element to the enclosing accumulator
                  array_splice( $accum, count( $accum ), 0, $element );
              } elseif ( $found == 'pipe' ) {
                  $findEquals = true; // shortcut for getFlags()
                  $stack->addPart();
                  $accum =& $stack->getAccum();
                  ++$i;
              } elseif ( $found == 'equals' ) {
                  $findEquals = false; // shortcut for getFlags()
                  $accum[] = [ 'equals', [ '=' ] ];
                  $stack->getCurrentPart()->eqpos = count( $accum ) - 1;
                  ++$i;
              }
          }

          // Output any remaining unclosed brackets
          foreach ( $stack->stack as $piece ) {
              array_splice( $stack->rootAccum, count( $stack->rootAccum ), 0, $piece->breakSyntax() );
          }

          // Enable top-level headings
          foreach ( $stack->rootAccum as &$node ) {
              if ( is_array( $node ) && $node[PPNode_Hash_Tree::NAME] === 'possible-h' ) {
                  $node[PPNode_Hash_Tree::NAME] = 'h';
              }
          }
          // Break the by-reference binding so the last root element is not left
          // aliased to $node when the accumulator is copied below.
          unset( $node );

          $rootStore = [ [ 'root', $stack->rootAccum ] ];
          $rootNode = new PPNode_Hash_Tree( $rootStore, 0 );

          // Cache
          $tree = json_encode( $rootStore, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE );
          if ( $tree !== false ) {
              $this->cacheSetTree( $text, $flags, $tree );
          }

          return $rootNode;
      }

      /**
       * Append literal text to an accumulator.
       *
       * When the last element of the accumulator is already a string the text is
       * concatenated onto it, so that adjacent literals form a single text node;
       * otherwise the text is pushed as a new element.
       *
       * @param array &$accum Node store to append to, modified in place
       * @param string $text Literal text to add
       */
      private static function addLiteral( array &$accum, $text ) {
          $n = count( $accum );
          if ( $n && is_string( $accum[$n - 1] ) ) {
              $accum[$n - 1] .= $text;
          } else {
              $accum[] = $text;
          }
      }
  }