populateContentTables.php 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. <?php
  2. /**
  3. * This program is free software; you can redistribute it and/or modify
  4. * it under the terms of the GNU General Public License as published by
  5. * the Free Software Foundation; either version 2 of the License, or
  6. * (at your option) any later version.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public License along
  14. * with this program; if not, write to the Free Software Foundation, Inc.,
  15. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16. * http://www.gnu.org/copyleft/gpl.html
  17. *
  18. * @file
  19. * @ingroup Maintenance
  20. */
  21. use MediaWiki\MediaWikiServices;
  22. use MediaWiki\Storage\NameTableStore;
  23. use MediaWiki\Storage\SqlBlobStore;
  24. use Wikimedia\Assert\Assert;
  25. use Wikimedia\Rdbms\IDatabase;
  26. use Wikimedia\Rdbms\ResultWrapper;
  27. require_once __DIR__ . '/Maintenance.php';
  28. /**
  29. * Populate the content and slot tables.
  30. * @since 1.32
  31. */
  32. class PopulateContentTables extends Maintenance {
  33. /** @var IDatabase */
  34. private $dbw;
  35. /** @var NameTableStore */
  36. private $contentModelStore;
  37. /** @var int */
  38. private $mainRoleId;
  39. /** @var array|null Map "{$modelId}:{$address}" to content_id */
  40. private $contentRowMap = null;
  41. private $count = 0, $totalCount = 0;
  42. public function __construct() {
  43. parent::__construct();
  44. $this->addDescription( 'Populate content and slot tables' );
  45. $this->addOption( 'table', 'revision or archive table, or `all` to populate both', false,
  46. true );
  47. $this->addOption( 'reuse-content',
  48. 'Reuse content table rows when the address and model are the same. '
  49. . 'This will increase the script\'s time and memory usage, perhaps significantly.',
  50. false, false );
  51. $this->addOption( 'start-revision', 'The rev_id to start at', false, true );
  52. $this->addOption( 'start-archive', 'The ar_rev_id to start at', false, true );
  53. $this->setBatchSize( 500 );
  54. }
  55. private function initServices() {
  56. $this->dbw = $this->getDB( DB_MASTER );
  57. $this->contentModelStore = MediaWikiServices::getInstance()->getContentModelStore();
  58. $this->mainRoleId = MediaWikiServices::getInstance()->getSlotRoleStore()->acquireId( 'main' );
  59. }
  60. public function execute() {
  61. global $wgMultiContentRevisionSchemaMigrationStage;
  62. $t0 = microtime( true );
  63. if ( ( $wgMultiContentRevisionSchemaMigrationStage & SCHEMA_COMPAT_WRITE_NEW ) === 0 ) {
  64. $this->writeln(
  65. '...cannot update while \$wgMultiContentRevisionSchemaMigrationStage '
  66. . 'does not have the SCHEMA_COMPAT_WRITE_NEW bit set.'
  67. );
  68. return false;
  69. }
  70. $this->initServices();
  71. if ( $this->getOption( 'reuse-content', false ) ) {
  72. $this->loadContentMap();
  73. }
  74. foreach ( $this->getTables() as $table ) {
  75. $this->populateTable( $table );
  76. }
  77. $elapsed = microtime( true ) - $t0;
  78. $this->writeln( "Done. Processed $this->totalCount rows in $elapsed seconds" );
  79. return true;
  80. }
  81. /**
  82. * @return string[]
  83. */
  84. private function getTables() {
  85. $table = $this->getOption( 'table', 'all' );
  86. $validTableOptions = [ 'all', 'revision', 'archive' ];
  87. if ( !in_array( $table, $validTableOptions ) ) {
  88. $this->fatalError( 'Invalid table. Must be either `revision` or `archive` or `all`' );
  89. }
  90. if ( $table === 'all' ) {
  91. $tables = [ 'revision', 'archive' ];
  92. } else {
  93. $tables = [ $table ];
  94. }
  95. return $tables;
  96. }
  97. private function loadContentMap() {
  98. $t0 = microtime( true );
  99. $this->writeln( "Loading existing content table rows..." );
  100. $this->contentRowMap = [];
  101. $dbr = $this->getDB( DB_REPLICA );
  102. $from = false;
  103. while ( true ) {
  104. $res = $dbr->select(
  105. 'content',
  106. [ 'content_id', 'content_address', 'content_model' ],
  107. $from ? "content_id > $from" : '',
  108. __METHOD__,
  109. [ 'ORDER BY' => 'content_id', 'LIMIT' => $this->getBatchSize() ]
  110. );
  111. if ( !$res || !$res->numRows() ) {
  112. break;
  113. }
  114. foreach ( $res as $row ) {
  115. $from = $row->content_id;
  116. $this->contentRowMap["{$row->content_model}:{$row->content_address}"] = $row->content_id;
  117. }
  118. }
  119. $elapsed = microtime( true ) - $t0;
  120. $this->writeln( "Loaded " . count( $this->contentRowMap ) . " rows in $elapsed seconds" );
  121. }
  122. /**
  123. * @param string $table
  124. */
  125. private function populateTable( $table ) {
  126. $t0 = microtime( true );
  127. $this->count = 0;
  128. $this->writeln( "Populating $table..." );
  129. if ( $table === 'revision' ) {
  130. $idField = 'rev_id';
  131. $tables = [ 'revision', 'slots', 'page' ];
  132. $fields = [
  133. 'rev_id',
  134. 'len' => 'rev_len',
  135. 'sha1' => 'rev_sha1',
  136. 'text_id' => 'rev_text_id',
  137. 'content_model' => 'rev_content_model',
  138. 'namespace' => 'page_namespace',
  139. 'title' => 'page_title',
  140. ];
  141. $joins = [
  142. 'slots' => [ 'LEFT JOIN', 'rev_id=slot_revision_id' ],
  143. 'page' => [ 'LEFT JOIN', 'rev_page=page_id' ],
  144. ];
  145. $startOption = 'start-revision';
  146. } else {
  147. $idField = 'ar_rev_id';
  148. $tables = [ 'archive', 'slots' ];
  149. $fields = [
  150. 'rev_id' => 'ar_rev_id',
  151. 'len' => 'ar_len',
  152. 'sha1' => 'ar_sha1',
  153. 'text_id' => 'ar_text_id',
  154. 'content_model' => 'ar_content_model',
  155. 'namespace' => 'ar_namespace',
  156. 'title' => 'ar_title',
  157. ];
  158. $joins = [
  159. 'slots' => [ 'LEFT JOIN', 'ar_rev_id=slot_revision_id' ],
  160. ];
  161. $startOption = 'start-archive';
  162. }
  163. $minmax = $this->dbw->selectRow(
  164. $table,
  165. [ 'min' => "MIN( $idField )", 'max' => "MAX( $idField )" ],
  166. '',
  167. __METHOD__
  168. );
  169. if ( $this->hasOption( $startOption ) ) {
  170. $minmax->min = (int)$this->getOption( $startOption );
  171. }
  172. if ( !$minmax || !is_numeric( $minmax->min ) || !is_numeric( $minmax->max ) ) {
  173. // No rows?
  174. $minmax = (object)[ 'min' => 1, 'max' => 0 ];
  175. }
  176. $batchSize = $this->getBatchSize();
  177. for ( $startId = $minmax->min; $startId <= $minmax->max; $startId += $batchSize ) {
  178. $endId = min( $startId + $batchSize - 1, $minmax->max );
  179. $rows = $this->dbw->select(
  180. $tables,
  181. $fields,
  182. [
  183. "$idField >= $startId",
  184. "$idField <= $endId",
  185. 'slot_revision_id IS NULL',
  186. ],
  187. __METHOD__,
  188. [ 'ORDER BY' => 'rev_id' ],
  189. $joins
  190. );
  191. if ( $rows->numRows() !== 0 ) {
  192. $this->populateContentTablesForRowBatch( $rows, $startId, $table );
  193. }
  194. $elapsed = microtime( true ) - $t0;
  195. $this->writeln(
  196. "... $table processed up to revision id $endId of {$minmax->max}"
  197. . " ($this->count rows in $elapsed seconds)"
  198. );
  199. }
  200. $elapsed = microtime( true ) - $t0;
  201. $this->writeln( "Done populating $table table. Processed $this->count rows in $elapsed seconds" );
  202. }
  203. /**
  204. * @param ResultWrapper $rows
  205. * @param int $startId
  206. * @param string $table
  207. * @return int|null
  208. */
  209. private function populateContentTablesForRowBatch( ResultWrapper $rows, $startId, $table ) {
  210. $this->beginTransaction( $this->dbw, __METHOD__ );
  211. if ( $this->contentRowMap === null ) {
  212. $map = [];
  213. } else {
  214. $map = &$this->contentRowMap;
  215. }
  216. $contentKeys = [];
  217. try {
  218. // Step 1: Figure out content rows needing insertion.
  219. $contentRows = [];
  220. foreach ( $rows as $row ) {
  221. $revisionId = $row->rev_id;
  222. Assert::invariant( $revisionId !== null, 'rev_id must not be null' );
  223. $modelId = $this->contentModelStore->acquireId( $this->getContentModel( $row ) );
  224. $address = SqlBlobStore::makeAddressFromTextId( $row->text_id );
  225. $key = "{$modelId}:{$address}";
  226. $contentKeys[$revisionId] = $key;
  227. if ( !isset( $map[$key] ) ) {
  228. $map[$key] = false;
  229. $contentRows[] = [
  230. 'content_size' => (int)$row->len,
  231. 'content_sha1' => $row->sha1,
  232. 'content_model' => $modelId,
  233. 'content_address' => $address,
  234. ];
  235. }
  236. }
  237. // Step 2: Insert them, then read them back in for use in the next step.
  238. if ( $contentRows ) {
  239. $id = $this->dbw->selectField( 'content', 'MAX(content_id)', '', __METHOD__ );
  240. $this->dbw->insert( 'content', $contentRows, __METHOD__ );
  241. $res = $this->dbw->select(
  242. 'content',
  243. [ 'content_id', 'content_model', 'content_address' ],
  244. 'content_id > ' . (int)$id,
  245. __METHOD__
  246. );
  247. foreach ( $res as $row ) {
  248. $key = $row->content_model . ':' . $row->content_address;
  249. $map[$key] = $row->content_id;
  250. }
  251. }
  252. // Step 3: Insert the slot rows.
  253. $slotRows = [];
  254. foreach ( $rows as $row ) {
  255. $revisionId = $row->rev_id;
  256. $contentId = $map[$contentKeys[$revisionId]] ?? false;
  257. if ( $contentId === false ) {
  258. throw new \RuntimeException( "Content row for $revisionId not found after content insert" );
  259. }
  260. $slotRows[] = [
  261. 'slot_revision_id' => $revisionId,
  262. 'slot_role_id' => $this->mainRoleId,
  263. 'slot_content_id' => $contentId,
  264. // There's no way to really know the previous revision, so assume no inheriting.
  265. // rev_parent_id can get changed on undeletions, and deletions can screw up
  266. // rev_timestamp ordering.
  267. 'slot_origin' => $revisionId,
  268. ];
  269. }
  270. $this->dbw->insert( 'slots', $slotRows, __METHOD__ );
  271. $this->count += count( $slotRows );
  272. $this->totalCount += count( $slotRows );
  273. } catch ( \Exception $e ) {
  274. $this->rollbackTransaction( $this->dbw, __METHOD__ );
  275. $this->fatalError( "Failed to populate content table $table row batch starting at $startId "
  276. . "due to exception: " . $e->__toString() );
  277. }
  278. $this->commitTransaction( $this->dbw, __METHOD__ );
  279. }
  280. /**
  281. * @param \stdClass $row
  282. * @return string
  283. */
  284. private function getContentModel( $row ) {
  285. if ( isset( $row->content_model ) ) {
  286. return $row->content_model;
  287. }
  288. $title = Title::makeTitle( $row->namespace, $row->title );
  289. return ContentHandler::getDefaultModelFor( $title );
  290. }
  291. /**
  292. * @param string $msg
  293. */
  294. private function writeln( $msg ) {
  295. $this->output( "$msg\n" );
  296. }
  297. }
  298. $maintClass = 'PopulateContentTables';
  299. require_once RUN_MAINTENANCE_IF_MAIN;