populateRevisionSha1.php 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. <?php
  2. /**
  3. * Fills the rev_sha1 and ar_sha1 columns of revision
  4. * and archive tables for revisions created before MW 1.19.
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; either version 2 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU General Public License along
  17. * with this program; if not, write to the Free Software Foundation, Inc.,
  18. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. * http://www.gnu.org/copyleft/gpl.html
  20. *
  21. * @file
  22. * @ingroup Maintenance
  23. */
  24. require_once __DIR__ . '/Maintenance.php';
  /**
   * Maintenance script that fills the rev_sha1 and ar_sha1 columns of the
   * revision and archive tables for revisions created before MW 1.19.
   *
   * Only rows whose sha1 column is the empty string are selected, so rows that
   * already carry a hash are not recomputed.
   *
   * @ingroup Maintenance
   */
  class PopulateRevisionSha1 extends LoggedUpdateMaintenance {
      /**
       * Registers the script description and a default batch size of 200.
       */
      public function __construct() {
          parent::__construct();
          $this->addDescription( 'Populates the rev_sha1 and ar_sha1 fields' );
          $this->setBatchSize( 200 );
      }

      /**
       * NOTE(review): presumably the parent class uses this key to remember that
       * the update has already run - confirm against LoggedUpdateMaintenance.
       *
       * @return string The fixed key 'populate rev_sha1'
       */
      protected function getUpdateKey() {
          return 'populate rev_sha1';
      }

      /**
       * Checks that the required tables and column exist, then runs three passes:
       * revision rows by rev_id, archive rows by ar_rev_id, and finally archive
       * rows that have no ar_rev_id.
       *
       * @return bool False when the rev_sha1 column is missing, true after all
       *  three passes have run
       */
      protected function doDBUpdates() {
          $db = $this->getDB( DB_MASTER );

          if ( !$db->tableExists( 'revision', __METHOD__ ) ) {
              $this->fatalError( "revision table does not exist" );
          } elseif ( !$db->tableExists( 'archive', __METHOD__ ) ) {
              $this->fatalError( "archive table does not exist" );
          } elseif ( !$db->fieldExists( 'revision', 'rev_sha1', __METHOD__ ) ) {
              // NOTE(review): `true` is passed as output()'s second argument;
              // kept as-is, confirm that this is the intended value.
              $this->output( "rev_sha1 column does not exist\n\n", true );

              return false;
          }

          $this->output( "Populating rev_sha1 column\n" );
          $rc = $this->doSha1Updates( 'revision', 'rev_id', Revision::getQueryInfo(), 'rev' );

          $this->output( "Populating ar_sha1 column\n" );
          $ac = $this->doSha1Updates( 'archive', 'ar_rev_id', Revision::getArchiveQueryInfo(), 'ar' );

          $this->output( "Populating ar_sha1 column legacy rows\n" );
          $ac += $this->doSha1LegacyUpdates();

          $this->output( "rev_sha1 and ar_sha1 population complete "
              . "[$rc revision rows, $ac archive rows].\n" );

          return true;
      }

      /**
       * Walks the ID range of $table in blocks of the configured batch size and
       * hashes every row in the block whose sha1 column is still empty. Each
       * block is written inside its own transaction.
       *
       * @param string $table Table to update ('revision' or 'archive')
       * @param string $idCol Numeric ID column used to slice the table into blocks
       * @param array $queryInfo Array with 'tables', 'fields' and 'joins' keys
       * @param string $prefix Column prefix; the updated column is "{$prefix}_sha1"
       * @return int Rows changed
       */
      protected function doSha1Updates( $table, $idCol, $queryInfo, $prefix ) {
          $db = $this->getDB( DB_MASTER );
          // A batch size below 1 would never advance the block bounds and the
          // loop below would not terminate.
          $batchSize = max( 1, (int)$this->getBatchSize() );

          $start = $db->selectField( $table, "MIN($idCol)", '', __METHOD__ );
          $end = $db->selectField( $table, "MAX($idCol)", '', __METHOD__ );
          if ( !$start || !$end ) {
              $this->output( "...$table table seems to be empty.\n" );

              return 0;
          }

          // Normalise once so the arithmetic and the SQL below work on integers.
          $start = (int)$start;
          $end = (int)$end;

          $count = 0;
          // Pad the upper bound so the last, partially filled block is visited too.
          $end += $batchSize - 1;
          $blockStart = $start;
          $blockEnd = $start + $batchSize - 1;

          while ( $blockEnd <= $end ) {
              $this->output( "...doing $idCol from $blockStart to $blockEnd\n" );

              $conds = [
                  "$idCol BETWEEN $blockStart AND $blockEnd",
                  "$idCol IS NOT NULL",
                  "{$prefix}_sha1" => '',
              ];
              $res = $db->select(
                  $queryInfo['tables'],
                  $queryInfo['fields'],
                  $conds,
                  __METHOD__,
                  [],
                  $queryInfo['joins']
              );

              $this->beginTransaction( $db, __METHOD__ );
              foreach ( $res as $row ) {
                  if ( $this->upgradeRow( $row, $table, $idCol, $prefix ) ) {
                      $count++;
                  }
              }
              $this->commitTransaction( $db, __METHOD__ );

              $blockStart += $batchSize;
              $blockEnd += $batchSize;
          }

          return $count;
      }

      /**
       * Hashes archive rows that have no ar_rev_id and an empty ar_sha1. These
       * rows cannot be sliced by ID, so they are read with a single query and
       * committed after every 100 visited rows.
       *
       * @return int Rows changed
       */
      protected function doSha1LegacyUpdates() {
          $db = $this->getDB( DB_MASTER );
          $arQuery = Revision::getArchiveQueryInfo();
          $res = $db->select(
              $arQuery['tables'],
              $arQuery['fields'],
              [ 'ar_rev_id IS NULL', 'ar_sha1' => '' ],
              __METHOD__,
              [],
              $arQuery['joins']
          );

          $count = 0;
          $updateSize = 0;
          $this->beginTransaction( $db, __METHOD__ );
          foreach ( $res as $row ) {
              if ( $this->upgradeLegacyArchiveRow( $row ) ) {
                  ++$count;
              }
              // Counts visited rows, not only updated ones, so the commit cadence
              // does not depend on how many rows could be hashed.
              if ( ++$updateSize >= 100 ) {
                  $updateSize = 0;
                  $this->commitTransaction( $db, __METHOD__ );
                  $this->output( "Commited row with ar_timestamp={$row->ar_timestamp}\n" );
                  $this->beginTransaction( $db, __METHOD__ );
              }
          }
          $this->commitTransaction( $db, __METHOD__ );

          return $count;
      }

      /**
       * Computes the hash for one row and writes it to "{$prefix}_sha1",
       * matching the row by its ID column.
       *
       * @param stdClass $row Row selected by doSha1Updates()
       * @param string $table Table the row came from ('revision' or 'archive')
       * @param string $idCol ID column used in the UPDATE condition
       * @param string $prefix Column prefix; the updated column is "{$prefix}_sha1"
       * @return bool True if an UPDATE was issued, false if the data was unavailable
       */
      protected function upgradeRow( $row, $table, $idCol, $prefix ) {
          try {
              $rev = ( $table === 'archive' )
                  ? Revision::newFromArchiveRow( $row )
                  : new Revision( $row );
              $text = $rev->getSerializedData();
          } catch ( Exception $e ) {
              $this->output( "Data of revision with {$idCol}={$row->$idCol} unavailable!\n" );

              return false; // T24624?
          }

          if ( !is_string( $text ) ) {
              # This should not happen, but sometimes does (T22757)
              $this->output( "Data of revision with {$idCol}={$row->$idCol} unavailable!\n" );

              return false;
          }

          $this->getDB( DB_MASTER )->update(
              $table,
              [ "{$prefix}_sha1" => Revision::base36Sha1( $text ) ],
              [ $idCol => $row->$idCol ],
              __METHOD__
          );

          return true;
      }

      /**
       * Computes the hash for one archive row without ar_rev_id and writes it to
       * ar_sha1, matching the row by namespace, title, timestamp and length.
       *
       * @param stdClass $row Row selected by doSha1LegacyUpdates()
       * @return bool True if an UPDATE was issued, false if the data was unavailable
       */
      protected function upgradeLegacyArchiveRow( $row ) {
          try {
              $rev = Revision::newFromArchiveRow( $row );
              // Loaded inside the try, as in upgradeRow(), so that a failure while
              // fetching the data skips this row instead of aborting the pass.
              $text = $rev->getSerializedData();
          } catch ( Exception $e ) {
              $this->output( "Text of revision with timestamp {$row->ar_timestamp} unavailable!\n" );

              return false; // T24624?
          }

          if ( !is_string( $text ) ) {
              # This should not happen, but sometimes does (T22757)
              $this->output( "Data of revision with timestamp {$row->ar_timestamp} unavailable!\n" );

              return false;
          }

          # Archive table has no PK, but (NS,title,time) should be near unique.
          # Any duplicates on those should also have duplicated text anyway.
          $this->getDB( DB_MASTER )->update(
              'archive',
              [ 'ar_sha1' => Revision::base36Sha1( $text ) ],
              [
                  'ar_namespace' => $row->ar_namespace,
                  'ar_title' => $row->ar_title,
                  'ar_timestamp' => $row->ar_timestamp,
                  'ar_len' => $row->ar_len // extra sanity
              ],
              __METHOD__
          );

          return true;
      }
  }
  // Name of the maintenance class defined in this file.
  // NOTE(review): presumably read by the file pulled in through
  // RUN_MAINTENANCE_IF_MAIN to decide which class to run - confirm.
  $maintClass = PopulateRevisionSha1::class;
  require_once RUN_MAINTENANCE_IF_MAIN;