<?php
/**
 * @file
 * @ingroup Testing
 */
  6. class ParserTestResultNormalizer {
  7. protected $doc, $xpath, $invalid;
  8. public static function normalize( $text, $funcs ) {
  9. $norm = new self( $text );
  10. if ( $norm->invalid ) {
  11. return $text;
  12. }
  13. foreach ( $funcs as $func ) {
  14. $norm->$func();
  15. }
  16. return $norm->serialize();
  17. }
  18. protected function __construct( $text ) {
  19. $this->doc = new DOMDocument( '1.0', 'utf-8' );
  20. // Note: parsing a supposedly XHTML document with an XML parser is not
  21. // guaranteed to give accurate results. For example, it may introduce
  22. // differences in the number of line breaks in <pre> tags.
  23. Wikimedia\suppressWarnings();
  24. if ( !$this->doc->loadXML( '<html><body>' . $text . '</body></html>' ) ) {
  25. $this->invalid = true;
  26. }
  27. Wikimedia\restoreWarnings();
  28. $this->xpath = new DOMXPath( $this->doc );
  29. $this->body = $this->xpath->query( '//body' )->item( 0 );
  30. }
  31. protected function removeTbody() {
  32. foreach ( $this->xpath->query( '//tbody' ) as $tbody ) {
  33. while ( $tbody->firstChild ) {
  34. $child = $tbody->firstChild;
  35. $tbody->removeChild( $child );
  36. $tbody->parentNode->insertBefore( $child, $tbody );
  37. }
  38. $tbody->parentNode->removeChild( $tbody );
  39. }
  40. }
  41. /**
  42. * The point of this function is to produce a normalized DOM in which
  43. * Tidy's output matches the output of html5depurate. Tidy both trims
  44. * and pretty-prints, so this requires fairly aggressive treatment.
  45. *
  46. * In particular, note that Tidy converts <pre>x</pre> to <pre>\nx\n</pre>,
  47. * which theoretically affects display since the second line break is not
  48. * ignored by compliant HTML parsers.
  49. *
  50. * This function also removes empty elements, as does Tidy.
  51. */
  52. protected function trimWhitespace() {
  53. foreach ( $this->xpath->query( '//text()' ) as $child ) {
  54. if ( strtolower( $child->parentNode->nodeName ) === 'pre' ) {
  55. // Just trim one line break from the start and end
  56. if ( substr_compare( $child->data, "\n", 0 ) === 0 ) {
  57. $child->data = substr( $child->data, 1 );
  58. }
  59. if ( substr_compare( $child->data, "\n", -1 ) === 0 ) {
  60. $child->data = substr( $child->data, 0, -1 );
  61. }
  62. } else {
  63. // Trim all whitespace
  64. $child->data = trim( $child->data );
  65. }
  66. if ( $child->data === '' ) {
  67. $child->parentNode->removeChild( $child );
  68. }
  69. }
  70. }
  71. /**
  72. * Serialize the XML DOM for comparison purposes. This does not generate HTML.
  73. * @return string
  74. */
  75. protected function serialize() {
  76. return strtr( $this->doc->saveXML( $this->body ),
  77. [ '<body>' => '', '</body>' => '' ] );
  78. }
  79. }