Tidy.php 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. <?php
  2. /**
  3. * Class to interact with HTML tidy
  4. *
  5. * Either the external tidy program or the in-process tidy extension
  6. * will be used depending on availability. Override the default
  7. * $wgTidyInternal setting to disable the internal if it's not working.
  8. *
  9. * @ingroup Parser
  10. */
  11. class MWTidy {
  12. /**
  13. * Interface with html tidy, used if $wgUseTidy = true.
  14. * If tidy isn't able to correct the markup, the original will be
  15. * returned in all its glory with a warning comment appended.
  16. *
  17. * @param string $text Hideous HTML input
  18. * @return string Corrected HTML output
  19. */
  20. public static function tidy( $text ) {
  21. global $wgTidyInternal;
  22. $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
  23. ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'.
  24. '<head><title>test</title></head><body>'.$text.'</body></html>';
  25. # Tidy is known to clobber tabs; convert them to entities
  26. $wrappedtext = str_replace( "\t", '&#9;', $wrappedtext );
  27. if( $wgTidyInternal ) {
  28. $correctedtext = self::execInternalTidy( $wrappedtext );
  29. } else {
  30. $correctedtext = self::execExternalTidy( $wrappedtext );
  31. }
  32. if( is_null( $correctedtext ) ) {
  33. wfDebug( "Tidy error detected!\n" );
  34. return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
  35. }
  36. # Convert the tabs back from entities
  37. $correctedtext = str_replace( '&#9;', "\t", $correctedtext );
  38. return $correctedtext;
  39. }
  40. /**
  41. * Check HTML for errors, used if $wgValidateAllHtml = true.
  42. *
  43. * @param $text String
  44. * @param &$errorStr String: return the error string
  45. * @return Boolean: whether the HTML is valid
  46. */
  47. public static function checkErrors( $text, &$errorStr = null ) {
  48. global $wgTidyInternal;
  49. $retval = 0;
  50. if( $wgTidyInternal ) {
  51. $errorStr = self::execInternalTidy( $text, true, $retval );
  52. } else {
  53. $errorStr = self::execExternalTidy( $text, true, $retval );
  54. }
  55. return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
  56. }
  57. /**
  58. * Spawn an external HTML tidy process and get corrected markup back from it.
  59. * Also called in OutputHandler.php for full page validation
  60. *
  61. * @param $text String: HTML to check
  62. * @param $stderr Boolean: Whether to read from STDERR rather than STDOUT
  63. * @param &$retval Exit code (-1 on internal error)
  64. * @retrun mixed String or null
  65. */
  66. private static function execExternalTidy( $text, $stderr = false, &$retval = null ) {
  67. global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
  68. wfProfileIn( __METHOD__ );
  69. $cleansource = '';
  70. $opts = ' -utf8';
  71. if( $stderr ) {
  72. $descriptorspec = array(
  73. 0 => array( 'pipe', 'r' ),
  74. 1 => array( 'file', wfGetNull(), 'a' ),
  75. 2 => array( 'pipe', 'w' )
  76. );
  77. } else {
  78. $descriptorspec = array(
  79. 0 => array( 'pipe', 'r' ),
  80. 1 => array( 'pipe', 'w' ),
  81. 2 => array( 'file', wfGetNull(), 'a' )
  82. );
  83. }
  84. $readpipe = $stderr ? 2 : 1;
  85. $pipes = array();
  86. if( function_exists( 'proc_open' ) ) {
  87. $process = proc_open( "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );
  88. if ( is_resource( $process ) ) {
  89. // Theoretically, this style of communication could cause a deadlock
  90. // here. If the stdout buffer fills up, then writes to stdin could
  91. // block. This doesn't appear to happen with tidy, because tidy only
  92. // writes to stdout after it's finished reading from stdin. Search
  93. // for tidyParseStdin and tidySaveStdout in console/tidy.c
  94. fwrite( $pipes[0], $text );
  95. fclose( $pipes[0] );
  96. while ( !feof( $pipes[$readpipe] ) ) {
  97. $cleansource .= fgets( $pipes[$readpipe], 1024 );
  98. }
  99. fclose( $pipes[$readpipe] );
  100. $retval = proc_close( $process );
  101. } else {
  102. $retval = -1;
  103. }
  104. } else {
  105. $retval = -1;
  106. }
  107. wfProfileOut( __METHOD__ );
  108. if( !$stderr && $cleansource == '' && $text != '' ) {
  109. // Some kind of error happened, so we couldn't get the corrected text.
  110. // Just give up; we'll use the source text and append a warning.
  111. return null;
  112. } else {
  113. return $cleansource;
  114. }
  115. }
  116. /**
  117. * Use the HTML tidy PECL extension to use the tidy library in-process,
  118. * saving the overhead of spawning a new process.
  119. *
  120. * 'pear install tidy' should be able to compile the extension module.
  121. */
  122. private static function execInternalTidy( $text, $stderr = false, &$retval = null ) {
  123. global $wgTidyConf, $IP, $wgDebugTidy;
  124. wfProfileIn( __METHOD__ );
  125. $tidy = new tidy;
  126. $tidy->parseString( $text, $wgTidyConf, 'utf8' );
  127. if( $stderr ) {
  128. $retval = $tidy->getStatus();
  129. return $tidy->errorBuffer;
  130. } else {
  131. $tidy->cleanRepair();
  132. $retval = $tidy->getStatus();
  133. if( $retval == 2 ) {
  134. // 2 is magic number for fatal error
  135. // http://www.php.net/manual/en/function.tidy-get-status.php
  136. $cleansource = null;
  137. } else {
  138. $cleansource = tidy_get_output( $tidy );
  139. }
  140. if ( $wgDebugTidy && $retval > 0 ) {
  141. $cleansource .= "<!--\nTidy reports:\n" .
  142. str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
  143. "\n-->";
  144. }
  145. wfProfileOut( __METHOD__ );
  146. return $cleansource;
  147. }
  148. }
  149. }