compareParsers.php 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. <?php
  2. /**
  3. * Take page text out of an XML dump file and render basic HTML out to files.
  4. * This is *NOT* suitable for publishing or offline use; it's intended for
  5. * running comparative tests of parsing behavior using real-world data.
  6. *
  7. * Templates etc are pulled from the local wiki database, not from the dump.
  8. *
  9. * Copyright © 2011 Platonides
  10. * https://www.mediawiki.org/
  11. *
  12. * This program is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License as published by
  14. * the Free Software Foundation; either version 2 of the License, or
  15. * (at your option) any later version.
  16. *
  17. * This program is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU General Public License along
  23. * with this program; if not, write to the Free Software Foundation, Inc.,
  24. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  25. * http://www.gnu.org/copyleft/gpl.html
  26. *
  27. * @file
  28. * @ingroup Maintenance
  29. */
  30. require_once __DIR__ . '/dumpIterator.php';
  31. /**
  32. * Maintenance script to take page text out of an XML dump file and render
  33. * basic HTML out to files.
  34. *
  35. * @ingroup Maintenance
  36. */
  37. class CompareParsers extends DumpIterator {
  38. private $count = 0;
  39. public function __construct() {
  40. parent::__construct();
  41. $this->saveFailed = false;
  42. $this->addDescription( 'Run a file or dump with several parsers' );
  43. $this->addOption( 'parser1', 'The first parser to compare.', true, true );
  44. $this->addOption( 'parser2', 'The second parser to compare.', true, true );
  45. $this->addOption( 'tidy', 'Run tidy on the articles.', false, false );
  46. $this->addOption(
  47. 'save-failed',
  48. 'Folder in which articles which differ will be stored.',
  49. false,
  50. true
  51. );
  52. $this->addOption( 'show-diff', 'Show a diff of the two renderings.', false, false );
  53. $this->addOption(
  54. 'diff-bin',
  55. 'Binary to use for diffing (can also be provided by DIFF env var).',
  56. false,
  57. false
  58. );
  59. $this->addOption(
  60. 'strip-parameters',
  61. 'Remove parameters of html tags to increase readability.',
  62. false,
  63. false
  64. );
  65. $this->addOption(
  66. 'show-parsed-output',
  67. 'Show the parsed html if both Parsers give the same output.',
  68. false,
  69. false
  70. );
  71. }
  72. public function checkOptions() {
  73. if ( $this->hasOption( 'save-failed' ) ) {
  74. $this->saveFailed = $this->getOption( 'save-failed' );
  75. }
  76. $this->stripParametersEnabled = $this->hasOption( 'strip-parameters' );
  77. $this->showParsedOutput = $this->hasOption( 'show-parsed-output' );
  78. $this->showDiff = $this->hasOption( 'show-diff' );
  79. if ( $this->showDiff ) {
  80. $bin = $this->getOption( 'diff-bin', getenv( 'DIFF' ) );
  81. if ( $bin != '' ) {
  82. global $wgDiff;
  83. $wgDiff = $bin;
  84. }
  85. }
  86. $user = new User();
  87. $this->options = ParserOptions::newFromUser( $user );
  88. if ( $this->hasOption( 'tidy' ) ) {
  89. global $wgUseTidy;
  90. if ( !$wgUseTidy ) {
  91. $this->error( 'Tidy was requested but $wgUseTidy is not set in LocalSettings.php', true );
  92. }
  93. $this->options->setTidy( true );
  94. }
  95. $this->failed = 0;
  96. }
  97. public function conclusions() {
  98. $this->error( "{$this->failed} failed revisions out of {$this->count}" );
  99. if ( $this->count > 0 ) {
  100. $this->output( " (" . ( $this->failed / $this->count ) . "%)\n" );
  101. }
  102. }
  103. function stripParameters( $text ) {
  104. if ( !$this->stripParametersEnabled ) {
  105. return $text;
  106. }
  107. return preg_replace( '/(<a) [^>]+>/', '$1>', $text );
  108. }
  109. /**
  110. * Callback function for each revision, parse with both parsers and compare
  111. * @param Revision $rev
  112. */
  113. public function processRevision( $rev ) {
  114. $title = $rev->getTitle();
  115. $parser1Name = $this->getOption( 'parser1' );
  116. $parser2Name = $this->getOption( 'parser2' );
  117. self::checkParserLocally( $parser1Name );
  118. self::checkParserLocally( $parser2Name );
  119. $parser1 = new $parser1Name();
  120. $parser2 = new $parser2Name();
  121. $content = $rev->getContent();
  122. if ( $content->getModel() !== CONTENT_MODEL_WIKITEXT ) {
  123. $this->error( "Page {$title->getPrefixedText()} does not contain wikitext "
  124. . "but {$content->getModel()}\n" );
  125. return;
  126. }
  127. $text = strval( $content->getNativeData() );
  128. $output1 = $parser1->parse( $text, $title, $this->options );
  129. $output2 = $parser2->parse( $text, $title, $this->options );
  130. if ( $output1->getText() != $output2->getText() ) {
  131. $this->failed++;
  132. $this->error( "Parsing for {$title->getPrefixedText()} differs\n" );
  133. if ( $this->saveFailed ) {
  134. file_put_contents(
  135. $this->saveFailed . '/' . rawurlencode( $title->getPrefixedText() ) . ".txt",
  136. $text
  137. );
  138. }
  139. if ( $this->showDiff ) {
  140. $this->output( wfDiff(
  141. $this->stripParameters( $output1->getText() ),
  142. $this->stripParameters( $output2->getText() ),
  143. ''
  144. ) );
  145. }
  146. } else {
  147. $this->output( $title->getPrefixedText() . "\tOK\n" );
  148. if ( $this->showParsedOutput ) {
  149. $this->output( $this->stripParameters( $output1->getText() ) );
  150. }
  151. }
  152. }
  153. private static function checkParserLocally( $parserName ) {
  154. /* Look for the parser in a file appropiately named in the current folder */
  155. if ( !class_exists( $parserName ) && file_exists( "$parserName.php" ) ) {
  156. global $wgAutoloadClasses;
  157. $wgAutoloadClasses[$parserName] = realpath( '.' ) . "/$parserName.php";
  158. }
  159. }
  160. }
  161. $maintClass = "CompareParsers";
  162. require_once RUN_MAINTENANCE_IF_MAIN;