BlockLevelPass.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. <?php
  2. /**
  3. * This is the part of the wikitext parser which handles automatic paragraphs
  4. * and conversion of start-of-line prefixes to HTML lists.
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with this program; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. * http://www.gnu.org/copyleft/gpl.html
  20. *
  21. * @file
  22. * @ingroup Parser
  23. */
  24. class BlockLevelPass {
  /** @var bool True while the open definition-list item is a <dt> rather than a <dd> */
  private $DTopen = false;

  /** @var bool True while the pass is inside a <pre> element */
  private $inPre = false;

  /** @var string Name of the open paragraph-level tag ('p' or 'pre'), or '' if none is open */
  private $lastSection = '';

  /**
   * @var bool Whether the text starts at the beginning of a line.
   *
   * The constructor and execute() access this as $this->lineStart. PHP
   * property names are case-sensitive, so the declaration has to use the
   * same spelling; otherwise the declared property stays unused and the
   * accesses create a separate dynamic property.
   */
  private $lineStart;

  /** @var string The text being processed */
  private $text;

  # State constants for the definition list colon extraction
  # (used by findColonNoLinks()).

  # Plain text, outside of any tag, comment or converter markup
  const COLON_STATE_TEXT = 0;
  # Inside a tag, after its first character
  const COLON_STATE_TAG = 1;
  # Immediately after a '<'
  const COLON_STATE_TAGSTART = 2;
  # Inside a closing tag, after '</'
  const COLON_STATE_CLOSETAG = 3;
  # A '/' was seen inside a tag; a following '>' makes it self-closed
  const COLON_STATE_TAGSLASH = 4;
  # Inside a comment, entered after '<!'
  const COLON_STATE_COMMENT = 5;
  # Inside a comment, one '-' seen
  const COLON_STATE_COMMENTDASH = 6;
  # Inside a comment, '--' seen; a following '>' ends the comment
  const COLON_STATE_COMMENTDASHDASH = 7;
  # Inside language converter markup -{ ... }-
  const COLON_STATE_LC = 8;
  /**
   * Make lists from lines starting with ':', '*', '#', etc.
   *
   * Thin static entry point: builds a pass object for the given text and
   * runs it.
   *
   * @param string $text
   * @param bool $lineStart Whether or not this is at the start of a line.
   * @return string The lists rendered as HTML
   */
  public static function doBlockLevels( $text, $lineStart ) {
      return ( new self( $text, $lineStart ) )->execute();
  }
  /**
   * Private constructor; instances are only created by doBlockLevels().
   *
   * @param string $text The text to process
   * @param bool $lineStart Whether $text starts at the beginning of a line
   */
  private function __construct( $text, $lineStart ) {
      $this->lineStart = $lineStart;
      $this->text = $text;
  }
  /**
   * Build the close tag for whichever of <p> / <pre> is currently open and
   * reset the paragraph state (lastSection and inPre). When nothing is open
   * the state is still reset and an empty string is returned.
   *
   * @return string The close tag followed by a newline, or ''
   */
  private function closeParagraph() {
      $openSection = $this->lastSection;

      $this->lastSection = '';
      $this->inPre = false;

      return $openSection !== '' ? '</' . $openSection . ">\n" : '';
  }
  /**
   * Return the length of the common prefix of the two arguments, i.e. the
   * number of leading bytes that are identical in both strings.
   *
   * @param string $st1
   * @param string $st2
   *
   * @return int
   */
  private function getCommon( $st1, $st2 ) {
      # The common prefix can never be longer than the shorter string.
      $limit = min( strlen( $st1 ), strlen( $st2 ) );

      $matched = 0;
      while ( $matched < $limit && $st1[$matched] === $st2[$matched] ) {
          ++$matched;
      }

      return $matched;
  }
  /**
   * Open the list and its first item for the given prefix character.
   *
   * Any open paragraph is closed first. Opening a ';' item also marks a
   * <dt> as open.
   *
   * @param string $char One of '*', '#', ':' or ';'
   *
   * @return string The HTML to emit, or an error comment for an unknown character
   */
  private function openList( $char ) {
      $openers = [
          '*' => '<ul><li>',
          '#' => '<ol><li>',
          ':' => '<dl><dd>',
          ';' => '<dl><dt>',
      ];

      # The paragraph state is reset even when $char turns out to be unknown.
      $closedParagraph = $this->closeParagraph();

      if ( !isset( $openers[$char] ) ) {
          # Note: the error marker replaces, rather than follows, whatever
          # closeParagraph() returned.
          return '<!-- ERR 1 -->';
      }

      if ( $char === ';' ) {
          $this->DTopen = true;
      }

      return $closedParagraph . $openers[$char];
  }
  /**
   * Close the current list item and open the next one at the same level.
   *
   * For definition lists the close tag depends on whether a <dt> is
   * currently open, and the DTopen flag is updated to reflect the item
   * that is opened.
   *
   * @param string $char One of '*', '#', ':' or ';'
   *
   * @return string The HTML to emit, or an error comment for an unknown character
   */
  private function nextItem( $char ) {
      if ( $char === '*' || $char === '#' ) {
          return "</li>\n<li>";
      }

      if ( $char !== ':' && $char !== ';' ) {
          return '<!-- ERR 2 -->';
      }

      # Close whichever definition-list item is open right now...
      $closeTag = $this->DTopen ? "</dt>\n" : "</dd>\n";

      # ...then open a term for ';' or a description for ':'.
      $this->DTopen = ( $char === ';' );

      return $closeTag . ( $this->DTopen ? '<dt>' : '<dd>' );
  }
  /**
   * Close the current list item and its list for the given prefix character.
   *
   * Only '*', '#' and ':' are recognised here; any other character
   * (including ';') yields the error comment.
   *
   * @param string $char
   *
   * @return string The HTML to emit, or an error comment for an unknown character
   */
  private function closeList( $char ) {
      if ( $char === '*' ) {
          return '</li></ul>';
      }
      if ( $char === '#' ) {
          return '</li></ol>';
      }
      if ( $char !== ':' ) {
          return '<!-- ERR 3 -->';
      }

      # Definition list: the item to close is a <dt> if one is still open,
      # otherwise a <dd>. Either way no <dt> is open afterwards.
      $itemClose = $this->DTopen ? '</dt>' : '</dd>';
      $this->DTopen = false;

      return $itemClose . '</dl>';
  }
  /**
   * Execute the pass.
   *
   * Walks the stored text line by line, turning start-of-line prefixes
   * ('*', '#', ':', ';') into HTML lists and wrapping other lines in <p>
   * or <pre> where appropriate.
   *
   * @return string The resulting HTML
   */
  private function execute() {
      $lines = StringUtils::explode( "\n", $this->text );

      $html = '';
      # Prefix of the previous line, with ';' normalised to ':'
      $previousPrefix = '';
      $inBlockElem = false;
      $this->DTopen = false;
      $prefixLen = 0;
      # Either false, or a paragraph tag whose output is being held back
      $deferredPTag = false;
      $inBlockquote = false;

      foreach ( $lines as $line ) {
          # Fix up $lineStart: when the text does not begin at the start of
          # a line, the first line is passed through untouched.
          if ( !$this->lineStart ) {
              $html .= $line;
              $this->lineStart = true;
              continue;
          }

          # * = ul
          # # = ol
          # ; = dt
          # : = dd

          $previousPrefixLen = strlen( $previousPrefix );
          $hasPreClose = preg_match( '/<\\/pre/i', $line );
          $hasPreOpen = preg_match( '/<pre/i', $line );

          if ( $this->inPre ) {
              # Don't interpret any prefixes in preformatted text
              $prefixLen = 0;
              $prefix = $normalizedPrefix = '';
              $body = $line;
          } else {
              # Multiple prefixes may abut each other for nested lists.
              $prefixLen = strspn( $line, '*#:;' );
              $prefix = substr( $line, 0, $prefixLen );

              # ; and : both belong to definition lists, so they are treated
              # as equivalent when deciding whether elements have to be
              # opened or closed.
              $normalizedPrefix = str_replace( ';', ':', $prefix );
              $body = substr( $line, $prefixLen );
              $this->inPre = (bool)$hasPreOpen;
          }

          # List generation
          if ( $prefixLen && $previousPrefix === $normalizedPrefix ) {
              # Same nesting as the previous line: just start the next item.
              $lastChar = substr( $prefix, -1 );
              $html .= $this->nextItem( $lastChar );
              $deferredPTag = false;

              if ( $lastChar === ';' ) {
                  # The one nasty exception: definition lists work like this:
                  # ; title : definition text
                  # So look for a ':' in the remainder to split title and
                  # definition, without breaking links.
                  $term = $definition = '';
                  if ( $this->findColonNoLinks( $body, $term, $definition ) !== false ) {
                      $body = $definition;
                      $html .= $term . $this->nextItem( ':' );
                  }
              }
          } elseif ( $prefixLen || $previousPrefixLen ) {
              # The nesting changed: close levels, open levels, or both.
              $sharedLen = $this->getCommon( $prefix, $previousPrefix );
              $deferredPTag = false;

              # Close every level of the previous line that is not shared.
              while ( $sharedLen < $previousPrefixLen ) {
                  $html .= $this->closeList( $previousPrefix[$previousPrefixLen - 1] );
                  --$previousPrefixLen;
              }

              # Continue the current prefix if appropriate.
              if ( $prefixLen <= $sharedLen && $sharedLen > 0 ) {
                  $html .= $this->nextItem( $prefix[$sharedLen - 1] );
              }

              # Close an open <dt> if we have a <dd> (":") starting on this line
              if ( $this->DTopen && $sharedLen > 0 && $prefix[$sharedLen - 1] === ':' ) {
                  $html .= $this->nextItem( ':' );
              }

              # Open the levels that are new on this line.
              if ( $previousPrefix && $prefixLen > $sharedLen ) {
                  $html .= "\n";
              }
              for ( ; $sharedLen < $prefixLen; ++$sharedLen ) {
                  $char = $prefix[$sharedLen];
                  $html .= $this->openList( $char );

                  if ( ';' === $char ) {
                      # @todo FIXME: This is dupe of code above
                      if ( $this->findColonNoLinks( $body, $term, $definition ) !== false ) {
                          $body = $definition;
                          $html .= $term . $this->nextItem( ':' );
                      }
                  }
              }

              if ( !$prefixLen && $previousPrefix ) {
                  $html .= "\n";
              }
              $previousPrefix = $normalizedPrefix;
          }

          # If we have no prefixes, go to paragraph mode.
          if ( $prefixLen == 0 ) {
              # @todo consider using a stack for nestable elements like span, table and div
              $openMatch = preg_match(
                  '/(?:<table|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|'
                      . '<p|<ul|<ol|<dl|<li|<\\/tr|<\\/td|<\\/th)\\b/iS',
                  $body
              );
              $closeMatch = preg_match(
                  '/(?:<\\/table|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|'
                      . '<td|<th|<\\/?blockquote|<\\/?div|<hr|<\\/pre|<\\/p|<\\/mw:|'
                      . Parser::MARKER_PREFIX
                      . '-pre|<\\/li|<\\/ul|<\\/ol|<\\/dl|<\\/?center)\\b/iS',
                  $body
              );

              if ( $openMatch || $closeMatch ) {
                  $deferredPTag = false;
                  # @todo T7718: paragraph closed
                  $html .= $this->closeParagraph();
                  if ( $hasPreOpen && !$hasPreClose ) {
                      $this->inPre = true;
                  }

                  # Scan every blockquote tag on the line; the last one
                  # decides whether we end up inside a blockquote.
                  $bqOffset = 0;
                  while ( preg_match( '/<(\\/?)blockquote[\s>]/i', $body,
                      $bqMatch, PREG_OFFSET_CAPTURE, $bqOffset )
                  ) {
                      # An empty first group means an opening tag.
                      $inBlockquote = !$bqMatch[1][0];
                      $bqOffset = $bqMatch[0][1] + strlen( $bqMatch[0][0] );
                  }
                  $inBlockElem = !$closeMatch;
              } elseif ( !$inBlockElem && !$this->inPre ) {
                  if ( ' ' == substr( $body, 0, 1 )
                      && ( $this->lastSection === 'pre' || trim( $body ) != '' )
                      && !$inBlockquote
                  ) {
                      # Leading space: preformatted text
                      if ( $this->lastSection !== 'pre' ) {
                          $deferredPTag = false;
                          $html .= $this->closeParagraph() . '<pre>';
                          $this->lastSection = 'pre';
                      }
                      $body = substr( $body, 1 );
                  } elseif ( trim( $body ) === '' ) {
                      # Blank line in paragraph mode
                      if ( $deferredPTag ) {
                          $html .= $deferredPTag . '<br />';
                          $deferredPTag = false;
                          $this->lastSection = 'p';
                      } elseif ( $this->lastSection !== 'p' ) {
                          $html .= $this->closeParagraph();
                          $this->lastSection = '';
                          $deferredPTag = '<p>';
                      } else {
                          $deferredPTag = '</p><p>';
                      }
                  } elseif ( $deferredPTag ) {
                      # Text line: flush the held-back paragraph tag
                      $html .= $deferredPTag;
                      $deferredPTag = false;
                      $this->lastSection = 'p';
                  } elseif ( $this->lastSection !== 'p' ) {
                      # Text line with no paragraph open yet
                      $html .= $this->closeParagraph() . '<p>';
                      $this->lastSection = 'p';
                  }
              }
          }

          # somewhere above we forget to get out of pre block (T2785)
          if ( $hasPreClose && $this->inPre ) {
              $this->inPre = false;
          }

          # While a paragraph tag is held back, the line itself is not output.
          if ( $deferredPTag === false ) {
              $html .= $body;
              if ( $prefixLen === 0 ) {
                  $html .= "\n";
              }
          }
      }

      # Close whatever list levels the last line left open.
      while ( $prefixLen ) {
          $html .= $this->closeList( $normalizedPrefix[$prefixLen - 1] );
          --$prefixLen;
          if ( !$prefixLen ) {
              $html .= "\n";
          }
      }

      # Close a trailing <p> or <pre>.
      if ( $this->lastSection !== '' ) {
          $html .= '</' . $this->lastSection . '>';
          $this->lastSection = '';
      }

      return $html;
  }
  /**
   * Split up a string on ':', ignoring any occurrences inside tags
   * to prevent illegal overlapping.
   *
   * A colon is only accepted in plain text at tag nesting level zero;
   * colons inside tags, comments, open elements or language converter
   * markup -{ ... }- are skipped. $before and $after are only written
   * when a colon is found.
   *
   * @param string $str The string to split
   * @param string &$before Set to everything before the ':'
   * @param string &$after Set to everything after the ':'
   * @throws MWException If the state machine reaches an unknown state
   * @return int|bool The position of the ':', or false if none found
   */
  private function findColonNoLinks( $str, &$before, &$after ) {
      if ( !preg_match( '/:|<|-\{/', $str, $hit, PREG_OFFSET_CAPTURE ) ) {
          # Nothing to find!
          return false;
      }

      if ( $hit[0][0] === ':' ) {
          # Easy; the colon comes before any tag or converter markup
          $colonPos = $hit[0][1];
          $before = substr( $str, 0, $colonPos );
          $after = substr( $str, $colonPos + 1 );
          return $colonPos;
      }

      # Ugly state machine to walk through avoiding tags.
      $state = self::COLON_STATE_TEXT;
      $tagDepth = 0;
      $convDepth = 0;
      $length = strlen( $str );

      for ( $pos = $hit[0][1]; $pos < $length; $pos++ ) {
          $ch = $str[$pos];

          switch ( $state ) {
              case self::COLON_STATE_TEXT:
                  if ( $ch === '<' ) {
                      # Could be either a <start> tag or an </end> tag
                      $state = self::COLON_STATE_TAGSTART;
                  } elseif ( $ch === ':' ) {
                      if ( $tagDepth === 0 ) {
                          # We found it!
                          $before = substr( $str, 0, $pos );
                          $after = substr( $str, $pos + 1 );
                          return $pos;
                      }
                      # Otherwise it is embedded in an element; don't break it.
                  } else {
                      # Skip ahead looking for something interesting
                      if ( !preg_match( '/:|<|-\{/', $str, $hit, PREG_OFFSET_CAPTURE, $pos ) ) {
                          # Nothing else interesting
                          return false;
                      }
                      if ( $hit[0][0] === '-{' ) {
                          # Step onto the '{'; the loop increment moves past it.
                          $state = self::COLON_STATE_LC;
                          $convDepth++;
                          $pos = $hit[0][1] + 1;
                      } else {
                          # Land just before the match so that the loop
                          # increment puts us on the interesting character.
                          $pos = $hit[0][1] - 1;
                      }
                  }
                  break;

              case self::COLON_STATE_LC:
                  # In language converter markup -{ ... }-
                  if ( !preg_match( '/-\{|\}-/', $str, $hit, PREG_OFFSET_CAPTURE, $pos ) ) {
                      # We're nested in language converter markup, but there
                      # are no close tags left. Abort!
                      break 2;
                  }
                  # Step onto the second character of the two-character token.
                  $pos = $hit[0][1] + 1;
                  if ( $hit[0][0] === '-{' ) {
                      $convDepth++;
                  } elseif ( $hit[0][0] === '}-' ) {
                      $convDepth--;
                      if ( $convDepth === 0 ) {
                          $state = self::COLON_STATE_TEXT;
                      }
                  }
                  break;

              case self::COLON_STATE_TAG:
                  # In a <tag>
                  if ( $ch === '>' ) {
                      $tagDepth++;
                      $state = self::COLON_STATE_TEXT;
                  } elseif ( $ch === '/' ) {
                      # Slash may be followed by >?
                      $state = self::COLON_STATE_TAGSLASH;
                  }
                  # Anything else is ignored.
                  break;

              case self::COLON_STATE_TAGSTART:
                  if ( $ch === '/' ) {
                      $state = self::COLON_STATE_CLOSETAG;
                  } elseif ( $ch === '!' ) {
                      $state = self::COLON_STATE_COMMENT;
                  } elseif ( $ch === '>' ) {
                      # Illegal early close? This shouldn't happen D:
                      $state = self::COLON_STATE_TEXT;
                  } else {
                      $state = self::COLON_STATE_TAG;
                  }
                  break;

              case self::COLON_STATE_CLOSETAG:
                  # In a </tag>
                  if ( $ch === '>' ) {
                      if ( $tagDepth > 0 ) {
                          $tagDepth--;
                      } else {
                          # ignore the excess close tag, but keep looking for
                          # colons. (This matches Parsoid behavior.)
                          wfDebug( __METHOD__ . ": Invalid input; too many close tags\n" );
                      }
                      $state = self::COLON_STATE_TEXT;
                  }
                  break;

              case self::COLON_STATE_TAGSLASH:
                  # '>' here means a self-closed tag <blah/>; anything else
                  # means the slash was part of an attribute.
                  $state = ( $ch === '>' )
                      ? self::COLON_STATE_TEXT
                      : self::COLON_STATE_TAG;
                  break;

              case self::COLON_STATE_COMMENT:
                  if ( $ch === '-' ) {
                      $state = self::COLON_STATE_COMMENTDASH;
                  }
                  break;

              case self::COLON_STATE_COMMENTDASH:
                  $state = ( $ch === '-' )
                      ? self::COLON_STATE_COMMENTDASHDASH
                      : self::COLON_STATE_COMMENT;
                  break;

              case self::COLON_STATE_COMMENTDASHDASH:
                  $state = ( $ch === '>' )
                      ? self::COLON_STATE_TEXT
                      : self::COLON_STATE_COMMENT;
                  break;

              default:
                  throw new MWException( "State machine error in " . __METHOD__ );
          }
      }

      # Ran off the end of the string without finding a usable colon.
      if ( $tagDepth > 0 || $convDepth > 0 ) {
          wfDebug(
              __METHOD__ . ": Invalid input; not enough close tags " .
              "(level $tagDepth/$convDepth, state $state)\n"
          );
      }

      return false;
  }
  523. }