compareParsers.php 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. <?php
  2. /**
  3. * Take page text out of an XML dump file and render basic HTML out to files.
  4. * This is *NOT* suitable for publishing or offline use; it's intended for
  5. * running comparative tests of parsing behavior using real-world data.
  6. *
  7. * Templates etc are pulled from the local wiki database, not from the dump.
  8. *
  9. * Copyright © 2011 Platonides
  10. * https://www.mediawiki.org/
  11. *
  12. * This program is free software; you can redistribute it and/or modify
  13. * it under the terms of the GNU General Public License as published by
  14. * the Free Software Foundation; either version 2 of the License, or
  15. * (at your option) any later version.
  16. *
  17. * This program is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU General Public License along
  23. * with this program; if not, write to the Free Software Foundation, Inc.,
  24. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  25. * http://www.gnu.org/copyleft/gpl.html
  26. *
  27. * @file
  28. * @ingroup Maintenance
  29. */
  30. require_once __DIR__ . '/dumpIterator.php';
  31. /**
  32. * Maintenance script to take page text out of an XML dump file and render
  33. * basic HTML out to files.
  34. *
  35. * @ingroup Maintenance
  36. */
  37. class CompareParsers extends DumpIterator {
  38. private $count = 0;
  39. public function __construct() {
  40. parent::__construct();
  41. $this->saveFailed = false;
  42. $this->addDescription( 'Run a file or dump with several parsers' );
  43. $this->addOption( 'parser1', 'The first parser to compare.', true, true );
  44. $this->addOption( 'parser2', 'The second parser to compare.', true, true );
  45. $this->addOption( 'tidy', 'Run tidy on the articles.', false, false );
  46. $this->addOption(
  47. 'save-failed',
  48. 'Folder in which articles which differ will be stored.',
  49. false,
  50. true
  51. );
  52. $this->addOption( 'show-diff', 'Show a diff of the two renderings.', false, false );
  53. $this->addOption(
  54. 'diff-bin',
  55. 'Binary to use for diffing (can also be provided by DIFF env var).',
  56. false,
  57. false
  58. );
  59. $this->addOption(
  60. 'strip-parameters',
  61. 'Remove parameters of html tags to increase readability.',
  62. false,
  63. false
  64. );
  65. $this->addOption(
  66. 'show-parsed-output',
  67. 'Show the parsed html if both Parsers give the same output.',
  68. false,
  69. false
  70. );
  71. }
  72. public function checkOptions() {
  73. if ( $this->hasOption( 'save-failed' ) ) {
  74. $this->saveFailed = $this->getOption( 'save-failed' );
  75. }
  76. $this->stripParametersEnabled = $this->hasOption( 'strip-parameters' );
  77. $this->showParsedOutput = $this->hasOption( 'show-parsed-output' );
  78. $this->showDiff = $this->hasOption( 'show-diff' );
  79. if ( $this->showDiff ) {
  80. $bin = $this->getOption( 'diff-bin', getenv( 'DIFF' ) );
  81. if ( $bin != '' ) {
  82. global $wgDiff;
  83. $wgDiff = $bin;
  84. }
  85. }
  86. $user = new User();
  87. $this->options = ParserOptions::newFromUser( $user );
  88. if ( $this->hasOption( 'tidy' ) ) {
  89. if ( !MWTidy::isEnabled() ) {
  90. $this->fatalError( 'Tidy was requested but $wgTidyConfig is not set in LocalSettings.php' );
  91. }
  92. $this->options->setTidy( true );
  93. }
  94. $this->failed = 0;
  95. }
  96. public function conclusions() {
  97. $this->error( "{$this->failed} failed revisions out of {$this->count}" );
  98. if ( $this->count > 0 ) {
  99. $this->output( " (" . ( $this->failed / $this->count ) . "%)\n" );
  100. }
  101. }
  102. function stripParameters( $text ) {
  103. if ( !$this->stripParametersEnabled ) {
  104. return $text;
  105. }
  106. return preg_replace( '/(<a) [^>]+>/', '$1>', $text );
  107. }
  108. /**
  109. * Callback function for each revision, parse with both parsers and compare
  110. * @param Revision $rev
  111. */
  112. public function processRevision( $rev ) {
  113. $title = $rev->getTitle();
  114. $parser1Name = $this->getOption( 'parser1' );
  115. $parser2Name = $this->getOption( 'parser2' );
  116. self::checkParserLocally( $parser1Name );
  117. self::checkParserLocally( $parser2Name );
  118. $parser1 = new $parser1Name();
  119. $parser2 = new $parser2Name();
  120. $content = $rev->getContent();
  121. if ( $content->getModel() !== CONTENT_MODEL_WIKITEXT ) {
  122. $this->error( "Page {$title->getPrefixedText()} does not contain wikitext "
  123. . "but {$content->getModel()}\n" );
  124. return;
  125. }
  126. $text = strval( $content->getNativeData() );
  127. $output1 = $parser1->parse( $text, $title, $this->options );
  128. $output2 = $parser2->parse( $text, $title, $this->options );
  129. if ( $output1->getText() != $output2->getText() ) {
  130. $this->failed++;
  131. $this->error( "Parsing for {$title->getPrefixedText()} differs\n" );
  132. if ( $this->saveFailed ) {
  133. file_put_contents(
  134. $this->saveFailed . '/' . rawurlencode( $title->getPrefixedText() ) . ".txt",
  135. $text
  136. );
  137. }
  138. if ( $this->showDiff ) {
  139. $this->output( wfDiff(
  140. $this->stripParameters( $output1->getText() ),
  141. $this->stripParameters( $output2->getText() ),
  142. ''
  143. ) );
  144. }
  145. } else {
  146. $this->output( $title->getPrefixedText() . "\tOK\n" );
  147. if ( $this->showParsedOutput ) {
  148. $this->output( $this->stripParameters( $output1->getText() ) );
  149. }
  150. }
  151. }
  152. private static function checkParserLocally( $parserName ) {
  153. /* Look for the parser in a file appropiately named in the current folder */
  154. if ( !class_exists( $parserName ) && file_exists( "$parserName.php" ) ) {
  155. global $wgAutoloadClasses;
  156. $wgAutoloadClasses[$parserName] = realpath( '.' ) . "/$parserName.php";
  157. }
  158. }
  159. }
  // Script bootstrap: $maintClass holds the name of the class defined in this file.
  // NOTE(review): RUN_MAINTENANCE_IF_MAIN is a constant defined outside this file;
  // presumably it points at the runner that instantiates $maintClass — confirm.
  160. $maintClass = CompareParsers::class;
  161. require_once RUN_MAINTENANCE_IF_MAIN;