refreshLinks.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. <?php
  2. /**
  3. * Refresh link tables.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License along
  16. * with this program; if not, write to the Free Software Foundation, Inc.,
  17. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18. * http://www.gnu.org/copyleft/gpl.html
  19. *
  20. * @file
  21. * @ingroup Maintenance
  22. */
  23. use MediaWiki\MediaWikiServices;
  24. use Wikimedia\Rdbms\IDatabase;
  25. require_once __DIR__ . '/Maintenance.php';
  26. /**
  27. * Maintenance script to refresh link tables.
  28. *
  29. * @ingroup Maintenance
  30. */
  31. class RefreshLinks extends Maintenance {
  32. const REPORTING_INTERVAL = 100;
  33. /** @var int|bool */
  34. protected $namespace = false;
  35. public function __construct() {
  36. parent::__construct();
  37. $this->addDescription( 'Refresh link tables' );
  38. $this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
  39. $this->addOption( 'new-only', 'Only affect articles with just a single edit' );
  40. $this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
  41. $this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
  42. $this->addOption( 'e', 'Last page id to refresh', false, true );
  43. $this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
  44. 'query, default 100000', false, true );
  45. $this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
  46. $this->addOption( 'category', 'Only fix pages in this category', false, true );
  47. $this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
  48. $this->addArg( 'start', 'Page_id to start from, default 1', false );
  49. $this->setBatchSize( 100 );
  50. }
  51. public function execute() {
  52. // Note that there is a difference between not specifying the start
  53. // and end IDs and using the minimum and maximum values from the page
  54. // table. In the latter case, deleteLinksFromNonexistent() will not
  55. // delete entries for nonexistent IDs that fall outside the range.
  56. $start = (int)$this->getArg( 0 ) ?: null;
  57. $end = (int)$this->getOption( 'e' ) ?: null;
  58. $dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );
  59. $ns = $this->getOption( 'namespace' );
  60. if ( $ns === null ) {
  61. $this->namespace = false;
  62. } else {
  63. $this->namespace = (int)$ns;
  64. }
  65. if ( ( $category = $this->getOption( 'category', false ) ) !== false ) {
  66. $title = Title::makeTitleSafe( NS_CATEGORY, $category );
  67. if ( !$title ) {
  68. $this->fatalError( "'$category' is an invalid category name!\n" );
  69. }
  70. $this->refreshCategory( $title );
  71. } elseif ( ( $category = $this->getOption( 'tracking-category', false ) ) !== false ) {
  72. $this->refreshTrackingCategory( $category );
  73. } elseif ( !$this->hasOption( 'dfn-only' ) ) {
  74. $new = $this->hasOption( 'new-only' );
  75. $redir = $this->hasOption( 'redirects-only' );
  76. $oldRedir = $this->hasOption( 'old-redirects-only' );
  77. $this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir );
  78. $this->deleteLinksFromNonexistent( null, null, $this->getBatchSize(), $dfnChunkSize );
  79. } else {
  80. $this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
  81. }
  82. }
  83. private function namespaceCond() {
  84. return $this->namespace !== false
  85. ? [ 'page_namespace' => $this->namespace ]
  86. : [];
  87. }
  88. /**
  89. * Do the actual link refreshing.
  90. * @param int|null $start Page_id to start from
  91. * @param bool $newOnly Only do pages with 1 edit
  92. * @param int|null $end Page_id to stop at
  93. * @param bool $redirectsOnly Only fix redirects
  94. * @param bool $oldRedirectsOnly Only fix redirects without redirect entries
  95. */
  96. private function doRefreshLinks( $start, $newOnly = false,
  97. $end = null, $redirectsOnly = false, $oldRedirectsOnly = false
  98. ) {
  99. $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
  100. if ( $start === null ) {
  101. $start = 1;
  102. }
  103. // Give extensions a chance to optimize settings
  104. Hooks::run( 'MaintenanceRefreshLinksInit', [ $this ] );
  105. $what = $redirectsOnly ? "redirects" : "links";
  106. if ( $oldRedirectsOnly ) {
  107. # This entire code path is cut-and-pasted from below. Hurrah.
  108. $conds = [
  109. "page_is_redirect=1",
  110. "rd_from IS NULL",
  111. self::intervalCond( $dbr, 'page_id', $start, $end ),
  112. ] + $this->namespaceCond();
  113. $res = $dbr->select(
  114. [ 'page', 'redirect' ],
  115. 'page_id',
  116. $conds,
  117. __METHOD__,
  118. [],
  119. [ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
  120. );
  121. $num = $res->numRows();
  122. $this->output( "Refreshing $num old redirects from $start...\n" );
  123. $i = 0;
  124. foreach ( $res as $row ) {
  125. if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
  126. $this->output( "$i\n" );
  127. wfWaitForSlaves();
  128. }
  129. $this->fixRedirect( $row->page_id );
  130. }
  131. } elseif ( $newOnly ) {
  132. $this->output( "Refreshing $what from " );
  133. $res = $dbr->select( 'page',
  134. [ 'page_id' ],
  135. [
  136. 'page_is_new' => 1,
  137. self::intervalCond( $dbr, 'page_id', $start, $end ),
  138. ] + $this->namespaceCond(),
  139. __METHOD__
  140. );
  141. $num = $res->numRows();
  142. $this->output( "$num new articles...\n" );
  143. $i = 0;
  144. foreach ( $res as $row ) {
  145. if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
  146. $this->output( "$i\n" );
  147. wfWaitForSlaves();
  148. }
  149. if ( $redirectsOnly ) {
  150. $this->fixRedirect( $row->page_id );
  151. } else {
  152. self::fixLinksFromArticle( $row->page_id, $this->namespace );
  153. }
  154. }
  155. } else {
  156. if ( !$end ) {
  157. $maxPage = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
  158. $maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', '', __METHOD__ );
  159. $end = max( $maxPage, $maxRD );
  160. }
  161. $this->output( "Refreshing redirects table.\n" );
  162. $this->output( "Starting from page_id $start of $end.\n" );
  163. for ( $id = $start; $id <= $end; $id++ ) {
  164. if ( !( $id % self::REPORTING_INTERVAL ) ) {
  165. $this->output( "$id\n" );
  166. wfWaitForSlaves();
  167. }
  168. $this->fixRedirect( $id );
  169. }
  170. if ( !$redirectsOnly ) {
  171. $this->output( "Refreshing links tables.\n" );
  172. $this->output( "Starting from page_id $start of $end.\n" );
  173. for ( $id = $start; $id <= $end; $id++ ) {
  174. if ( !( $id % self::REPORTING_INTERVAL ) ) {
  175. $this->output( "$id\n" );
  176. wfWaitForSlaves();
  177. }
  178. self::fixLinksFromArticle( $id, $this->namespace );
  179. }
  180. }
  181. }
  182. }
  183. /**
  184. * Update the redirect entry for a given page.
  185. *
  186. * This methods bypasses the "redirect" table to get the redirect target,
  187. * and parses the page's content to fetch it. This allows to be sure that
  188. * the redirect target is up to date and valid.
  189. * This is particularly useful when modifying namespaces to be sure the
  190. * entry in the "redirect" table points to the correct page and not to an
  191. * invalid one.
  192. *
  193. * @param int $id The page ID to check
  194. */
  195. private function fixRedirect( $id ) {
  196. $page = WikiPage::newFromID( $id );
  197. $dbw = $this->getDB( DB_MASTER );
  198. if ( $page === null ) {
  199. // This page doesn't exist (any more)
  200. // Delete any redirect table entry for it
  201. $dbw->delete( 'redirect', [ 'rd_from' => $id ],
  202. __METHOD__ );
  203. return;
  204. } elseif ( $this->namespace !== false
  205. && !$page->getTitle()->inNamespace( $this->namespace )
  206. ) {
  207. return;
  208. }
  209. $rt = null;
  210. $content = $page->getContent( Revision::RAW );
  211. if ( $content !== null ) {
  212. $rt = $content->getUltimateRedirectTarget();
  213. }
  214. if ( $rt === null ) {
  215. // The page is not a redirect
  216. // Delete any redirect table entry for it
  217. $dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
  218. $fieldValue = 0;
  219. } else {
  220. $page->insertRedirectEntry( $rt );
  221. $fieldValue = 1;
  222. }
  223. // Update the page table to be sure it is an a consistent state
  224. $dbw->update( 'page', [ 'page_is_redirect' => $fieldValue ],
  225. [ 'page_id' => $id ], __METHOD__ );
  226. }
  227. /**
  228. * Run LinksUpdate for all links on a given page_id
  229. * @param int $id The page_id
  230. * @param int|bool $ns Only fix links if it is in this namespace
  231. */
  232. public static function fixLinksFromArticle( $id, $ns = false ) {
  233. $page = WikiPage::newFromID( $id );
  234. MediaWikiServices::getInstance()->getLinkCache()->clear();
  235. if ( $page === null ) {
  236. return;
  237. } elseif ( $ns !== false
  238. && !$page->getTitle()->inNamespace( $ns ) ) {
  239. return;
  240. }
  241. // Defer updates to post-send but then immediately execute deferred updates;
  242. // this is the simplest way to run all updates immediately (including updates
  243. // scheduled by other updates).
  244. $page->doSecondaryDataUpdates( [
  245. 'defer' => DeferredUpdates::POSTSEND,
  246. 'recursive' => false,
  247. ] );
  248. DeferredUpdates::doUpdates();
  249. }
  250. /**
  251. * Removes non-existing links from pages from pagelinks, imagelinks,
  252. * categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables.
  253. *
  254. * @param int|null $start Page_id to start from
  255. * @param int|null $end Page_id to stop at
  256. * @param int $batchSize The size of deletion batches
  257. * @param int $chunkSize Maximum number of existent IDs to check per query
  258. *
  259. * @author Merlijn van Deen <valhallasw@arctus.nl>
  260. */
  261. private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
  262. $chunkSize = 100000
  263. ) {
  264. wfWaitForSlaves();
  265. $this->output( "Deleting illegal entries from the links tables...\n" );
  266. $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
  267. do {
  268. // Find the start of the next chunk. This is based only
  269. // on existent page_ids.
  270. $nextStart = $dbr->selectField(
  271. 'page',
  272. 'page_id',
  273. [ self::intervalCond( $dbr, 'page_id', $start, $end ) ]
  274. + $this->namespaceCond(),
  275. __METHOD__,
  276. [ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
  277. );
  278. if ( $nextStart !== false ) {
  279. // To find the end of the current chunk, subtract one.
  280. // This will serve to limit the number of rows scanned in
  281. // dfnCheckInterval(), per query, to at most the sum of
  282. // the chunk size and deletion batch size.
  283. $chunkEnd = $nextStart - 1;
  284. } else {
  285. // This is the last chunk. Check all page_ids up to $end.
  286. $chunkEnd = $end;
  287. }
  288. $fmtStart = $start !== null ? "[$start" : '(-INF';
  289. $fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
  290. $this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
  291. $this->dfnCheckInterval( $start, $chunkEnd, $batchSize );
  292. $start = $nextStart;
  293. } while ( $nextStart !== false );
  294. }
  295. /**
  296. * @see RefreshLinks::deleteLinksFromNonexistent()
  297. * @param int|null $start Page_id to start from
  298. * @param int|null $end Page_id to stop at
  299. * @param int $batchSize The size of deletion batches
  300. */
  301. private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
  302. $dbw = $this->getDB( DB_MASTER );
  303. $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
  304. $linksTables = [ // table name => page_id field
  305. 'pagelinks' => 'pl_from',
  306. 'imagelinks' => 'il_from',
  307. 'categorylinks' => 'cl_from',
  308. 'templatelinks' => 'tl_from',
  309. 'externallinks' => 'el_from',
  310. 'iwlinks' => 'iwl_from',
  311. 'langlinks' => 'll_from',
  312. 'redirect' => 'rd_from',
  313. 'page_props' => 'pp_page',
  314. ];
  315. foreach ( $linksTables as $table => $field ) {
  316. $this->output( " $table: 0" );
  317. $tableStart = $start;
  318. $counter = 0;
  319. do {
  320. $ids = $dbr->selectFieldValues(
  321. $table,
  322. $field,
  323. [
  324. self::intervalCond( $dbr, $field, $tableStart, $end ),
  325. "$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id' )})",
  326. ],
  327. __METHOD__,
  328. [ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
  329. );
  330. $numIds = count( $ids );
  331. if ( $numIds ) {
  332. $counter += $numIds;
  333. $dbw->delete( $table, [ $field => $ids ], __METHOD__ );
  334. $this->output( ", $counter" );
  335. $tableStart = $ids[$numIds - 1] + 1;
  336. wfWaitForSlaves();
  337. }
  338. } while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );
  339. $this->output( " deleted.\n" );
  340. }
  341. }
  342. /**
  343. * Build a SQL expression for a closed interval (i.e. BETWEEN).
  344. *
  345. * By specifying a null $start or $end, it is also possible to create
  346. * half-bounded or unbounded intervals using this function.
  347. *
  348. * @param IDatabase $db
  349. * @param string $var Field name
  350. * @param mixed $start First value to include or null
  351. * @param mixed $end Last value to include or null
  352. * @return string
  353. */
  354. private static function intervalCond( IDatabase $db, $var, $start, $end ) {
  355. if ( $start === null && $end === null ) {
  356. return "$var IS NOT NULL";
  357. } elseif ( $end === null ) {
  358. return "$var >= {$db->addQuotes( $start )}";
  359. } elseif ( $start === null ) {
  360. return "$var <= {$db->addQuotes( $end )}";
  361. } else {
  362. return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
  363. }
  364. }
  365. /**
  366. * Refershes links for pages in a tracking category
  367. *
  368. * @param string $category Category key
  369. */
  370. private function refreshTrackingCategory( $category ) {
  371. $cats = $this->getPossibleCategories( $category );
  372. if ( !$cats ) {
  373. $this->error( "Tracking category '$category' is disabled\n" );
  374. // Output to stderr but don't bail out,
  375. }
  376. foreach ( $cats as $cat ) {
  377. $this->refreshCategory( $cat );
  378. }
  379. }
  380. /**
  381. * Refreshes links to a category
  382. *
  383. * @param Title $category
  384. */
  385. private function refreshCategory( Title $category ) {
  386. $this->output( "Refreshing pages in category '{$category->getText()}'...\n" );
  387. $dbr = $this->getDB( DB_REPLICA );
  388. $conds = [
  389. 'page_id=cl_from',
  390. 'cl_to' => $category->getDBkey(),
  391. ];
  392. if ( $this->namespace !== false ) {
  393. $conds['page_namespace'] = $this->namespace;
  394. }
  395. $i = 0;
  396. $timestamp = '';
  397. $lastId = 0;
  398. do {
  399. $finalConds = $conds;
  400. $timestamp = $dbr->addQuotes( $timestamp );
  401. $finalConds [] =
  402. "(cl_timestamp > $timestamp OR (cl_timestamp = $timestamp AND cl_from > $lastId))";
  403. $res = $dbr->select( [ 'page', 'categorylinks' ],
  404. [ 'page_id', 'cl_timestamp' ],
  405. $finalConds,
  406. __METHOD__,
  407. [
  408. 'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
  409. 'LIMIT' => $this->getBatchSize(),
  410. ]
  411. );
  412. foreach ( $res as $row ) {
  413. if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
  414. $this->output( "$i\n" );
  415. wfWaitForSlaves();
  416. }
  417. $lastId = $row->page_id;
  418. $timestamp = $row->cl_timestamp;
  419. self::fixLinksFromArticle( $row->page_id );
  420. }
  421. } while ( $res->numRows() == $this->getBatchSize() );
  422. }
  423. /**
  424. * Returns a list of possible categories for a given tracking category key
  425. *
  426. * @param string $categoryKey
  427. * @return Title[]
  428. */
  429. private function getPossibleCategories( $categoryKey ) {
  430. $trackingCategories = new TrackingCategories( $this->getConfig() );
  431. $cats = $trackingCategories->getTrackingCategories();
  432. if ( isset( $cats[$categoryKey] ) ) {
  433. return $cats[$categoryKey]['cats'];
  434. }
  435. $this->fatalError( "Unknown tracking category {$categoryKey}\n" );
  436. }
  437. }
// Name the maintenance class defined in this file, then include the
// bootstrap file named by RUN_MAINTENANCE_IF_MAIN (defined outside this
// file; presumably by Maintenance.php, required at the top — TODO confirm).
$maintClass = RefreshLinks::class;
require_once RUN_MAINTENANCE_IF_MAIN;