dumpCategoriesAsRdf.php 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. <?php
  2. /**
  3. * This program is free software; you can redistribute it and/or modify
  4. * it under the terms of the GNU General Public License as published by
  5. * the Free Software Foundation; either version 2 of the License, or
  6. * (at your option) any later version.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. * GNU General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public License along
  14. * with this program; if not, write to the Free Software Foundation, Inc.,
  15. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  16. * http://www.gnu.org/copyleft/gpl.html
  17. *
  18. */
  19. use Wikimedia\Purtle\RdfWriter;
  20. use Wikimedia\Purtle\RdfWriterFactory;
  21. use Wikimedia\Rdbms\IDatabase;
  22. require_once __DIR__ . '/Maintenance.php';
  23. /**
  24. * Maintenance script to provide RDF representation of the category tree.
  25. *
  26. * @ingroup Maintenance
  27. * @since 1.30
  28. */
  29. class DumpCategoriesAsRdf extends Maintenance {
  30. /**
  31. * @var RdfWriter
  32. */
  33. private $rdfWriter;
  34. /**
  35. * Categories RDF helper.
  36. * @var CategoriesRdf
  37. */
  38. private $categoriesRdf;
  39. public function __construct() {
  40. parent::__construct();
  41. $this->addDescription( "Generate RDF dump of categories in a wiki." );
  42. $this->setBatchSize( 200 );
  43. $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
  44. false, true );
  45. $this->addOption( 'format', "Set the dump format.", false, true );
  46. }
  47. /**
  48. * Produce row iterator for categories.
  49. * @param IDatabase $dbr Database connection
  50. * @return RecursiveIterator
  51. */
  52. public function getCategoryIterator( IDatabase $dbr ) {
  53. $it = new BatchRowIterator(
  54. $dbr,
  55. [ 'page', 'page_props', 'category' ],
  56. [ 'page_title' ],
  57. $this->getBatchSize()
  58. );
  59. $it->addConditions( [
  60. 'page_namespace' => NS_CATEGORY,
  61. ] );
  62. $it->setFetchColumns( [
  63. 'page_title',
  64. 'page_id',
  65. 'pp_propname',
  66. 'cat_pages',
  67. 'cat_subcats',
  68. 'cat_files'
  69. ] );
  70. $it->addJoinConditions(
  71. [
  72. 'page_props' => [
  73. 'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ]
  74. ],
  75. 'category' => [
  76. 'LEFT JOIN', [ 'cat_title = page_title' ]
  77. ]
  78. ]
  79. );
  80. return $it;
  81. }
  82. /**
  83. * Get iterator for links for categories.
  84. * @param IDatabase $dbr
  85. * @param array $ids List of page IDs
  86. * @return Traversable
  87. */
  88. public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
  89. $it = new BatchRowIterator(
  90. $dbr,
  91. 'categorylinks',
  92. [ 'cl_from', 'cl_to' ],
  93. $this->getBatchSize()
  94. );
  95. $it->addConditions( [
  96. 'cl_type' => 'subcat',
  97. 'cl_from' => $ids
  98. ] );
  99. $it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
  100. return new RecursiveIteratorIterator( $it );
  101. }
  102. /**
  103. * @param int $timestamp
  104. */
  105. public function addDumpHeader( $timestamp ) {
  106. global $wgRightsUrl;
  107. $licenseUrl = $wgRightsUrl;
  108. if ( substr( $licenseUrl, 0, 2 ) == '//' ) {
  109. $licenseUrl = 'https:' . $licenseUrl;
  110. }
  111. $this->rdfWriter->about( $this->categoriesRdf->getDumpURI() )
  112. ->a( 'schema', 'Dataset' )
  113. ->a( 'owl', 'Ontology' )
  114. ->say( 'cc', 'license' )->is( $licenseUrl )
  115. ->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
  116. ->say( 'schema', 'dateModified' )
  117. ->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
  118. ->say( 'schema', 'isPartOf' )->is( wfExpandUrl( '/', PROTO_CANONICAL ) )
  119. ->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
  120. }
  121. public function execute() {
  122. $outFile = $this->getOption( 'output', 'php://stdout' );
  123. if ( $outFile === '-' ) {
  124. $outFile = 'php://stdout';
  125. }
  126. $output = fopen( $outFile, 'w' );
  127. $this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
  128. $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
  129. $this->categoriesRdf->setupPrefixes();
  130. $this->rdfWriter->start();
  131. $this->addDumpHeader( time() );
  132. fwrite( $output, $this->rdfWriter->drain() );
  133. $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
  134. foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
  135. $pages = [];
  136. foreach ( $batch as $row ) {
  137. $this->categoriesRdf->writeCategoryData(
  138. $row->page_title,
  139. $row->pp_propname === 'hiddencat',
  140. (int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
  141. (int)$row->cat_subcats
  142. );
  143. $pages[$row->page_id] = $row->page_title;
  144. }
  145. foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
  146. $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
  147. }
  148. fwrite( $output, $this->rdfWriter->drain() );
  149. }
  150. fflush( $output );
  151. if ( $outFile !== '-' ) {
  152. fclose( $output );
  153. }
  154. }
  155. /**
  156. * @param string $format Writer format
  157. * @return RdfWriter
  158. */
  159. private function createRdfWriter( $format ) {
  160. $factory = new RdfWriterFactory();
  161. return $factory->getWriter( $factory->getFormatName( $format ) );
  162. }
  163. }
  // Name of the maintenance class defined in this file.
  // NOTE(review): presumably read by the runner loaded below to decide which
  // class to instantiate - confirm against Maintenance.php.
  164. $maintClass = DumpCategoriesAsRdf::class;
  // Load the file named by the RUN_MAINTENANCE_IF_MAIN constant (defined
  // outside this file; presumably it runs the script when invoked directly).
  165. require_once RUN_MAINTENANCE_IF_MAIN;