TokenizerTest.php 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981
  1. <?php
  2. namespace Masterminds\HTML5\Tests\Parser;
  3. use Masterminds\HTML5\Parser\UTF8Utils;
  4. use Masterminds\HTML5\Parser\Scanner;
  5. use Masterminds\HTML5\Parser\Tokenizer;
  6. class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase
  7. {
  8. // ================================================================
  9. // Additional assertions.
  10. // ================================================================
  /**
   * Asserts that an event has the expected type and the expected payload.
   *
   * When $expects is an array it is compared against the event's entire
   * 'data' entry; otherwise it is compared against the first element of
   * 'data' only.
   *
   * @param string $type
   *   Expected event type (compared against $event['name']).
   * @param mixed $expects
   *   Either the full expected 'data' array, or the single value expected
   *   in $event['data'][0]. May be null.
   * @param array $event
   *   The event under test; read through its 'name' and 'data' keys.
   */
  public function assertEventEquals($type, $expects, $event)
  {
      $this->assertEquals($type, $event['name'], "Event $type for " . print_r($event, true));

      if (is_array($expects)) {
          $this->assertEquals($expects, $event['data'], "Event $type should equal " . print_r($expects, true) . ': ' . print_r($event, true));

          return;
      }

      // Tests in this class assert 'eof' events against null, i.e. events
      // that may carry no payload. Indexing a null or empty 'data' entry
      // can raise a notice/warning on newer PHP versions, so a missing
      // first element is read as null instead.
      $actual = isset($event['data'][0]) ? $event['data'][0] : null;
      $this->assertEquals($expects, $actual, "Event $type should equal $expects: " . print_r($event, true));
  }
  /**
   * Asserts that the given event is an 'error' event.
   *
   * @param array $event
   *   The event under test; only its 'name' key is compared.
   */
  public function assertEventError($event)
  {
      $message = 'Expected error for event: ' . print_r($event, true);

      $this->assertEquals('error', $event['name'], $message);
  }
  /**
   * Runs a batch of tokenizer expectations that share one event name.
   *
   * Every key of $tests is parsed on its own. For each parse result this
   * checks:
   * - the value reported by depth(), but only when $depth is > 0
   * - that event 0 has the name $name
   * - that event 0 carries the expected value
   *
   * @param string $name
   *   Event name expected for event 0.
   * @param int $depth
   *   Expected depth; a value <= 0 skips the depth check.
   * @param array $tests
   *   Map of input markup => expected value for event 0.
   * @param bool $debug
   *   When true, each case is printed to STDOUT before it is run.
   */
  protected function isAllGood($name, $depth, $tests, $debug = false)
  {
      $checkDepth = $depth > 0;

      foreach ($tests as $input => $expected) {
          if ($debug) {
              fprintf(STDOUT, "%s expects %s\n", $input, print_r($expected, true));
          }

          $stack = $this->parse($input);

          if ($checkDepth) {
              $this->assertEquals($depth, $stack->depth(), "Expected depth $depth for test $input." . print_r($stack, true));
          }

          $this->assertEventEquals($name, $expected, $stack->get(0));
      }
  }
  // ================================================================
  // Test cases.
  // ================================================================
  /**
   * Parsing an empty document must yield a single event named 'eof'.
   */
  public function testParse()
  {
      list($tok, $events) = $this->createTokenizer('');
      $tok->parse();

      $e1 = $events->get(0);

      // Spelled depth() (lower case) like every other call in this class.
      $this->assertEquals(1, $events->depth());
      $this->assertEquals('eof', $e1['name']);
  }
  /**
   * Whitespace-only input must come back as a 'text' event holding the
   * input unchanged, with a depth of 2.
   */
  public function testWhitespace()
  {
      $spaces = ' ';
      $stack = $this->parse($spaces);

      $this->assertEquals(2, $stack->depth());

      $first = $stack->get(0);
      $this->assertEquals('text', $first['name']);
      $this->assertEquals($spaces, $first['data'][0]);
  }
  /**
   * Character references in text: valid ones are decoded, broken ones
   * must report an error as their first event.
   */
  public function testCharacterReference()
  {
      $decoded = array(
          '&amp;' => '&',
          '&#x0003c;' => '<',
          '&#38;' => '&',
          '&' => '&',
      );
      $this->isAllGood('text', 2, $decoded);

      // Broken character references (named, hex and decimal).
      foreach (array('&foo', '&#xfoo', '&#foo') as $broken) {
          $first = $this->parse($broken)->get(0);
          $this->assertEquals('error', $first['name']);
      }

      // FIXME: Once the text processor is done, need to verify that the
      // tokens are transformed correctly into text.
  }
  /**
   * Malformed markup declarations must produce an error event followed by
   * a 'comment' event whose value is the raw input.
   */
  public function testBogusComment()
  {
      $inputs = array(
          '</+this is a bogus comment. +>',
          '<!+this is a bogus comment. !>',
          '<!D OCTYPE foo bar>',
          '<!DOCTYEP foo bar>',
          '<![CADATA[ TEST ]]>',
          '<![CDATA Hello ]]>',
          '<![CDATA[ Hello [[>',
          '<!CDATA[[ test ]]>',
          '<![CDATA[',
          '<![CDATA[hellooooo hello',
          '<? Hello World ?>',
          '<? Hello World',
      );

      foreach ($inputs as $markup) {
          $stack = $this->parse($markup);

          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('comment', $markup, $stack->get(1));
      }
  }
  /**
   * End tags: clean ones, recoverable malformed ones, and inputs that
   * are reported as bogus comments instead.
   */
  public function testEndTag()
  {
      $clean = array(
          '</a>' => 'a',
          '</test>' => 'test',
          "</test\n>" => 'test',
          '</thisIsTheTagThatDoesntEndItJustGoesOnAndOnMyFriend>' => 'thisisthetagthatdoesntenditjustgoesonandonmyfriend',
          // See 8.2.4.10, which requires this and does not say error.
          '</a<b>' => 'a<b',
      );
      $this->isAllGood('endTag', 2, $clean);

      // Both groups below share one shape: an error first, then the
      // recovered event, for a depth of 3.
      $recovered = array(
          // Recoverable failures.
          'endTag' => array(
              '</a class="monkey">' => 'a',
              '</a <b>' => 'a',
              '</a <b <c>' => 'a',
              '</a is the loneliest letter>' => 'a',
              '</a' => 'a',
          ),
          // BogoComments.
          'comment' => array(
              '</>' => '</>',
              '</ >' => '</ >',
              '</ a>' => '</ a>',
          ),
      );

      foreach ($recovered as $eventName => $cases) {
          foreach ($cases as $markup => $value) {
              $stack = $this->parse($markup);
              $this->assertEquals(3, $stack->depth());
              // Should have triggered an error.
              $this->assertEventError($stack->get(0));
              // Should have tried to parse anyway.
              $this->assertEventEquals($eventName, $value, $stack->get(1));
          }
      }
  }
  /**
   * Comments: well-formed ones yield a 'comment' event; unterminated or
   * malformed ones yield an error first and then the comment.
   */
  public function testComment()
  {
      $wellFormed = array(
          '<!--easy-->' => 'easy',
          '<!-- 1 > 0 -->' => ' 1 > 0 ',
          '<!-- --$i -->' => ' --$i ',
          '<!----$i-->' => '--$i',
          "<!--\nHello World.\na-->" => "\nHello World.\na",
          '<!-- <!-- -->' => ' <!-- ',
      );
      foreach ($wellFormed as $markup => $body) {
          $stack = $this->parse($markup);
          $this->assertEventEquals('comment', $body, $stack->get(0));
      }

      $malformed = array(
          '<!-->' => '',
          '<!--Hello' => 'Hello',
          "<!--\0Hello" => UTF8Utils::FFFD . 'Hello',
          '<!--' => '',
      );
      foreach ($malformed as $markup => $body) {
          $stack = $this->parse($markup);
          $this->assertEquals(3, $stack->depth());
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('comment', $body, $stack->get(1));
      }
  }
  /**
   * CDATA sections must yield a 'cdata' event with the enclosed text.
   */
  public function testCDATASection()
  {
      $sections = array(
          '<![CDATA[ This is a test. ]]>' => ' This is a test. ',
          '<![CDATA[CDATA]]>' => 'CDATA',
          '<![CDATA[ ]] > ]]>' => ' ]] > ',
          '<![CDATA[ ]]>' => ' ',
      );

      $this->isAllGood('cdata', 2, $sections);
  }
  /**
   * DOCTYPE declarations.
   *
   * Each expectation is the full event payload, in the order
   * (name, id type, id, fourth flag). The fourth element is false for all
   * well-formed inputs and true for all malformed ones below.
   */
  public function testDoctype()
  {
      // Payloads shared by several inputs.
      $plainHtml = array('html', 0, null, false);
      $publicFooBar = array('html', EventStack::DOCTYPE_PUBLIC, 'foo bar', false);
      $systemFooBar = array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', false);

      $wellFormed = array(
          '<!DOCTYPE html>' => $plainHtml,
          '<!doctype html>' => $plainHtml,
          '<!DocType html>' => $plainHtml,
          "<!DOCTYPE\nhtml>" => $plainHtml,
          "<!DOCTYPE\fhtml>" => $plainHtml,
          '<!DOCTYPE html PUBLIC "foo bar">' => $publicFooBar,
          "<!DOCTYPE html PUBLIC 'foo bar'>" => $publicFooBar,
          '<!DOCTYPE html PUBLIC "foo bar" >' => $publicFooBar,
          "<!DOCTYPE html \nPUBLIC\n'foo bar'>" => $publicFooBar,
          '<!DOCTYPE html SYSTEM "foo bar">' => $systemFooBar,
          "<!DOCTYPE html SYSTEM 'foo bar'>" => $systemFooBar,
          '<!DOCTYPE html SYSTEM "foo/bar" >' => array('html', EventStack::DOCTYPE_SYSTEM, 'foo/bar', false),
          "<!DOCTYPE html \nSYSTEM\n'foo bar'>" => $systemFooBar,
      );
      $this->isAllGood('doctype', 2, $wellFormed);

      $nameless = array(null, EventStack::DOCTYPE_NONE, null, true);
      $fooOnly = array('foo', EventStack::DOCTYPE_NONE, null, true);
      $flaggedSystem = array('html', EventStack::DOCTYPE_SYSTEM, 'foo bar', true);

      $malformed = array(
          '<!DOCTYPE>' => $nameless,
          '<!DOCTYPE >' => $nameless,
          '<!DOCTYPE foo' => $fooOnly,
          '<!DOCTYPE foo PUB' => $fooOnly,
          '<!DOCTYPE foo PUB>' => $fooOnly,
          '<!DOCTYPE foo PUB "Looks good">' => $fooOnly,
          '<!DOCTYPE foo SYSTME "Looks good"' => $fooOnly,
          // Can't tell whether these are ids or ID types, since the context is chopped.
          '<!DOCTYPE foo PUBLIC' => $fooOnly,
          '<!DOCTYPE foo PUBLIC>' => $fooOnly,
          '<!DOCTYPE foo SYSTEM' => $fooOnly,
          '<!DOCTYPE foo SYSTEM>' => $fooOnly,
          '<!DOCTYPE html SYSTEM "foo bar"' => $flaggedSystem,
          '<!DOCTYPE html SYSTEM "foo bar" more stuff>' => $flaggedSystem,
      );

      // Malformed input: an error comes first, then the doctype event.
      foreach ($malformed as $markup => $payload) {
          $stack = $this->parse($markup);
          $this->assertEquals(3, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('doctype', $payload, $stack->get(1));
      }
  }
  /**
   * Processing instructions must yield a 'pi' event carrying the target
   * and, when present, the instruction body.
   */
  public function testProcessorInstruction()
  {
      $instructions = array(
          '<?hph ?>' => 'hph',
          '<?hph echo "Hello World"; ?>' => array('hph', 'echo "Hello World"; '),
          "<?hph \necho 'Hello World';\n?>" => array('hph', "echo 'Hello World';\n"),
      );

      $this->isAllGood('pi', 2, $instructions);
  }
  /**
   * Start tags without attributes: plain, self-closing and unterminated.
   */
  public function testSimpleTags()
  {
      $plain = array(
          '<foo>' => 'foo',
          '<FOO>' => 'foo',
          '<fOO>' => 'foo',
          '<foo >' => 'foo',
          "<foo\n\n\n\n>" => 'foo',
          '<foo:bar>' => 'foo:bar',
      );
      $this->isAllGood('startTag', 2, $plain);

      $selfClosing = array(
          '<foo/>' => 'foo',
          '<FOO/>' => 'foo',
          '<foo />' => 'foo',
          "<foo\n\n\n\n/>" => 'foo',
          '<foo:bar/>' => 'foo:bar',
      );
      foreach ($selfClosing as $markup => $tagName) {
          $stack = $this->parse($markup);
          $this->assertEquals(2, $stack->depth(), "Counting events for '$markup'" . print_r($stack, true));
          $this->assertEventEquals('startTag', $tagName, $stack->get(0));

          // The third payload element must be true for these inputs.
          $startTag = $stack->get(0);
          $this->assertTrue($startTag['data'][2]);
      }

      // Unterminated tags: an error first, then the start tag.
      $unterminated = array(
          '<foo' => 'foo',
          '<foo ' => 'foo',
          '<foo/' => 'foo',
          '<foo /' => 'foo',
      );
      foreach ($unterminated as $markup => $tagName) {
          $stack = $this->parse($markup);
          $this->assertEquals(3, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $tagName, $stack->get(1));
      }
  }
  /**
   * Tags written as a bare attribute (`<id="x">`): three errors are
   * expected, then a start tag named after the attribute, then 'eof'.
   */
  public function testTagsWithAttributeAndMissingName()
  {
      $inputs = array(
          '<id="top_featured">' => 'id',
          '<color="white">' => 'color',
          "<class='neaktivni_stranka'>" => 'class',
          '<bgcolor="white">' => 'bgcolor',
          '<class="nom">' => 'class',
      );

      foreach ($inputs as $markup => $tagName) {
          $stack = $this->parse($markup);

          for ($index = 0; $index < 3; ++$index) {
              $this->assertEventError($stack->get($index));
          }

          $this->assertEventEquals('startTag', $tagName, $stack->get(3));
          $this->assertEventEquals('eof', null, $stack->get(4));
      }
  }
  /**
   * Start tags that run straight into another `<` without being closed.
   */
  public function testTagNotClosedAfterTagName()
  {
      // One error, then both tags as start tags, then 'eof'.
      $pairs = array(
          '<noscript<img>' => array('noscript', 'img'),
          '<center<a>' => array('center', 'a'),
          '<br<br>' => array('br', 'br'),
      );
      foreach ($pairs as $markup => $tagNames) {
          $stack = $this->parse($markup);
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $tagNames[0], $stack->get(1));
          $this->assertEventEquals('startTag', $tagNames[1], $stack->get(2));
          $this->assertEventEquals('eof', null, $stack->get(3));
      }

      $stack = $this->parse('<span<>02</span>');
      $this->assertEventError($stack->get(0));
      $this->assertEventEquals('startTag', 'span', $stack->get(1));
      $this->assertEventError($stack->get(2));
      $this->assertEventEquals('text', '>02', $stack->get(3));
      $this->assertEventEquals('endTag', 'span', $stack->get(4));
      $this->assertEventEquals('eof', null, $stack->get(5));

      $stack = $this->parse('<p</p>');
      $this->assertEventError($stack->get(0));
      $this->assertEventEquals('startTag', 'p', $stack->get(1));
      $this->assertEventEquals('endTag', 'p', $stack->get(2));
      $this->assertEventEquals('eof', null, $stack->get(3));

      $stack = $this->parse('<strong><WordPress</strong>');
      $this->assertEventEquals('startTag', 'strong', $stack->get(0));
      $this->assertEventError($stack->get(1));
      $this->assertEventEquals('startTag', 'wordpress', $stack->get(2));
      $this->assertEventEquals('endTag', 'strong', $stack->get(3));
      $this->assertEventEquals('eof', null, $stack->get(4));

      $stack = $this->parse('<src=<a>');
      for ($index = 0; $index < 3; ++$index) {
          $this->assertEventError($stack->get($index));
      }
      $this->assertEventEquals('startTag', 'src', $stack->get(3));
      $this->assertEventEquals('startTag', 'a', $stack->get(4));
      $this->assertEventEquals('eof', null, $stack->get(5));

      $stack = $this->parse('<br...<a>');
      $this->assertEventError($stack->get(0));
      $this->assertEventEquals('startTag', 'br', $stack->get(1));
      $this->assertEventEquals('eof', null, $stack->get(2));
  }
  /**
   * Tag names containing an illegal character: an error is expected
   * first, then a start tag named by the part before that character.
   */
  public function testIllegalTagNames()
  {
      $inputs = array(
          '<li">' => 'li',
          '<p">' => 'p',
          '<b&nbsp; >' => 'b',
          '<static*all>' => 'static',
          '<h*0720/>' => 'h',
          '<st*ATTRIBUTE />' => 'st',
      );

      foreach ($inputs as $markup => $tagName) {
          $stack = $this->parse($markup);
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $tagName, $stack->get(1));
      }
  }
  /**
   * Attribute parsing on start tags.
   *
   * Each expectation is the full 'startTag' payload, in the order
   * (tag name, attribute map, third flag). The third element is true for
   * the inputs ending in "/>" in the self-closing group and for
   * '<foo////>'.
   *
   * @depends testCharacterReference
   */
  public function testTagAttributes()
  {
      // Opening tags.
      $opening = array(
          '<foo bar="baz">' => array('foo', array('bar' => 'baz'), false),
          '<foo bar=" baz ">' => array('foo', array('bar' => ' baz '), false),
          "<foo bar=\"\nbaz\n\">" => array('foo', array('bar' => "\nbaz\n"), false),
          "<foo bar='baz'>" => array('foo', array('bar' => 'baz'), false),
          '<foo bar="A full sentence.">' => array('foo', array('bar' => 'A full sentence.'), false),
          "<foo a='1' b=\"2\">" => array('foo', array('a' => '1', 'b' => '2'), false),
          "<foo ns:bar='baz'>" => array('foo', array('ns:bar' => 'baz'), false),
          "<foo a='blue&red'>" => array('foo', array('a' => 'blue&red'), false),
          "<foo a='blue&amp;red'>" => array('foo', array('a' => 'blue&red'), false),
          "<foo a='blue&&amp;&red'>" => array('foo', array('a' => 'blue&&&red'), false),
          "<foo a='blue&&amp;red'>" => array('foo', array('a' => 'blue&&red'), false),
          "<foo\nbar='baz'\n>" => array('foo', array('bar' => 'baz'), false),
          '<doe a deer>' => array('doe', array('a' => null, 'deer' => null), false),
          '<foo bar=baz>' => array('foo', array('bar' => 'baz'), false),
          // Updated for 8.1.2.3
          '<foo bar = "baz" >' => array('foo', array('bar' => 'baz'), false),
          // The spec allows an unquoted value '/'. This will not be a closing
          // tag.
          '<foo bar=/>' => array('foo', array('bar' => '/'), false),
          '<foo bar=baz/>' => array('foo', array('bar' => 'baz/'), false),
      );
      $this->isAllGood('startTag', 2, $opening);

      // Self-closing tags.
      $selfClosing = array(
          '<foo bar="baz"/>' => array('foo', array('bar' => 'baz'), true),
          '<foo BAR="baz"/>' => array('foo', array('bar' => 'baz'), true),
          '<foo BAR="BAZ"/>' => array('foo', array('bar' => 'BAZ'), true),
          "<foo a='1' b=\"2\" c=3 d/>" => array(
              'foo',
              array('a' => '1', 'b' => '2', 'c' => '3', 'd' => null),
              true,
          ),
      );
      $this->isAllGood('startTag', 2, $selfClosing);

      // Cause a parse error.
      $singleError = array(
          // This will emit an entity lookup failure for &+dark.
          "<foo a='blue&+dark'>" => array('foo', array('a' => 'blue&+dark'), false),
          '<foo bar=>' => array('foo', array('bar' => null), false),
          '<foo bar="oh' => array('foo', array('bar' => 'oh'), false),
          '<foo bar=oh">' => array('foo', array('bar' => 'oh"'), false),
          // these attributes are ignored because of current implementation
          // of method "DOMElement::setAttribute"
          // see issue #23: https://github.com/Masterminds/html5-php/issues/23
          '<foo b"="baz">' => array('foo', array(), false),
          '<foo 2abc="baz">' => array('foo', array(), false),
          '<foo ?="baz">' => array('foo', array(), false),
          '<foo foo?bar="baz">' => array('foo', array(), false),
      );
      foreach ($singleError as $markup => $payload) {
          $stack = $this->parse($markup);
          $this->assertEquals(3, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventError($stack->get(0));
          $this->assertEventEquals('startTag', $payload, $stack->get(1));
      }

      // Cause multiple parse errors.
      $multipleErrors = array(
          '<foo ="bar">' => array('foo', array('=' => null, '"bar"' => null), false),
          '<foo////>' => array('foo', array(), true),
          // character "&" in unquoted attribute shouldn't cause an infinite loop
          '<foo bar=index.php?str=1&amp;id=29>' => array('foo', array('bar' => 'index.php?str=1&id=29'), false),
      );
      // Only the two leading errors are checked here; the payloads listed
      // above are not asserted against any event.
      foreach ($multipleErrors as $markup => $payload) {
          $stack = $this->parse($markup);
          $this->assertEventError($stack->get(0));
          $this->assertEventError($stack->get(1));
      }

      // Regression: Malformed elements should be detected.
      $stack = $this->parse('<foo baz="1" <bar></foo>');
      $this->assertEventError($stack->get(0));

      $expectedFoo = array('foo', array('baz' => '1'), false);
      $this->assertEventEquals('startTag', $expectedFoo, $stack->get(1));

      $expectedBar = array('bar', array(), false);
      $this->assertEventEquals('startTag', $expectedBar, $stack->get(2));

      $this->assertEventEquals('endTag', array('foo'), $stack->get(3));
  }
  /**
   * Raw-text elements: content between the tags must come back as one
   * 'text' event, without being interpreted as markup or entities.
   */
  public function testRawText()
  {
      $terminated = array(
          '<script>abcd efg hijk lmnop</script> ' => 'abcd efg hijk lmnop',
          '<script><not/><the/><tag></script>' => '<not/><the/><tag>',
          '<script><<<<<<<<</script>' => '<<<<<<<<',
          '<script>hello</script</script>' => 'hello</script',
          "<script>\nhello</script\n</script>" => "\nhello</script\n",
          '<script>&amp;</script>' => '&amp;',
          '<script><!--not a comment--></script>' => '<!--not a comment-->',
          '<script><![CDATA[not a comment]]></script>' => '<![CDATA[not a comment]]>',
      );
      foreach ($terminated as $markup => $content) {
          $stack = $this->parse($markup);
          $this->assertEventEquals('startTag', 'script', $stack->get(0));
          $this->assertEventEquals('text', $content, $stack->get(1));
          $this->assertEventEquals('endTag', 'script', $stack->get(2));
      }

      // No proper closing tag: start tag, error, then the remaining text.
      $unterminated = array(
          '<script>&amp;</script' => '&amp;</script',
          '<script>Hello world' => 'Hello world',
      );
      foreach ($unterminated as $markup => $content) {
          $stack = $this->parse($markup);
          $this->assertEquals(4, $stack->depth(), "Counting events for '$markup': " . print_r($stack, true));
          $this->assertEventEquals('startTag', 'script', $stack->get(0));
          $this->assertEventError($stack->get(1));
          $this->assertEventEquals('text', $content, $stack->get(2));
      }

      // Testing case sensitivity
      $upperCase = $this->parse('<TITLE>a test</TITLE>');
      $this->assertEventEquals('startTag', 'title', $upperCase->get(0));
      $this->assertEventEquals('text', 'a test', $upperCase->get(1));
      $this->assertEventEquals('endTag', 'title', $upperCase->get(2));

      // Testing end tags with whitespaces
      $spacedEnd = $this->parse('<title>Whitespaces are tasty</title >');
      $this->assertEventEquals('startTag', 'title', $spacedEnd->get(0));
      $this->assertEventEquals('text', 'Whitespaces are tasty', $spacedEnd->get(1));
      $this->assertEventEquals('endTag', 'title', $spacedEnd->get(2));
  }
  /**
   * With the tokenizer switched to RCDATA text mode for 'title', event 1
   * must be the text with the character reference decoded and the
   * comment-like markup kept verbatim.
   */
  public function testRcdata()
  {
      $pair = $this->createTokenizer('<title>&#x27;<!-- not a comment --></TITLE>');
      $tokenizer = $pair[0];
      $stack = $pair[1];

      $tokenizer->setTextMode(\Masterminds\HTML5\Elements::TEXT_RCDATA, 'title');
      $tokenizer->parse();

      $this->assertEventEquals('text', "'<!-- not a comment -->", $stack->get(1));
  }
  /**
   * Text mixed with tags, CDATA, comments and character references.
   *
   * Each case lists the expected depth and the leading events, in order,
   * as (event name, expected value) pairs.
   */
  public function testText()
  {
      $cases = array(
          'a<br>b' => array(4, array(
              array('text', 'a'),
              array('startTag', 'br'),
              array('text', 'b'),
          )),
          '<a>Test</a>' => array(4, array(
              array('startTag', 'a'),
              array('text', 'Test'),
              array('endTag', 'a'),
          )),
          '<p>0</p><p>1</p>' => array(7, array(
              array('startTag', 'p'),
              array('text', '0'),
              array('endTag', 'p'),
              array('startTag', 'p'),
              array('text', '1'),
              array('endTag', 'p'),
          )),
          'a<![CDATA[test]]>b' => array(4, array(
              array('text', 'a'),
              array('cdata', 'test'),
              array('text', 'b'),
          )),
          'a<!--test-->b' => array(4, array(
              array('text', 'a'),
              array('comment', 'test'),
              array('text', 'b'),
          )),
          'a&amp;b' => array(2, array(
              array('text', 'a&b'),
          )),
          'a&sup2;b' => array(2, array(
              array('text', 'a²b'),
          )),
      );

      foreach ($cases as $markup => $case) {
          $expectedDepth = $case[0];
          $expectedEvents = $case[1];

          $stack = $this->parse($markup);
          $this->assertEquals($expectedDepth, $stack->depth(), 'Events: ' . print_r($stack, true));

          foreach ($expectedEvents as $index => $pair) {
              $this->assertEventEquals($pair[0], $pair[1], $stack->get($index));
          }
      }
  }
  886. // ================================================================
  887. // Utility functions.
  888. // ================================================================
  /**
   * Builds a tokenizer over $string together with the event stack it
   * reports to. Nothing is parsed yet.
   *
   * @param string $string
   *   Input handed to the scanner.
   * @param bool $debug
   *   Value assigned to the scanner's debug property.
   *
   * @return array
   *   Two elements: the Tokenizer, then the EventStack it was given.
   */
  protected function createTokenizer($string, $debug = false)
  {
      $scanner = new Scanner($string);
      $scanner->debug = $debug;

      $eventHandler = new EventStack();
      $tokenizer = new Tokenizer($scanner, $eventHandler);

      return array($tokenizer, $eventHandler);
  }
  /**
   * Tokenizes $string and returns the event stack that was filled.
   *
   * @param string $string
   *   Input to tokenize.
   * @param bool $debug
   *   Passed through to createTokenizer().
   *
   * @return EventStack
   *   The event handler the tokenizer reported to.
   */
  public function parse($string, $debug = false)
  {
      $pair = $this->createTokenizer($string, $debug);
      $pair[0]->parse();

      return $pair[1];
  }
  905. }