rdf_dump.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. #!/usr/bin/env python3
  2. # This script generates an RDF graph from the list of packages and manpages
  3. import gzip
  4. import json
  5. import os
  6. from rdflib import BNode, Graph, Literal, Namespace, URIRef
  7. from rdflib.namespace import RDF
  8. DIST = 'testing'
  9. mangraph = Graph ()
  10. MANPAGE = Namespace ('urn:manpage:')
  11. MANPAGE_TERMS = Namespace ('urn:manpage:terms:')
  12. SCHEMA = Namespace ('http://schema.org/')
  13. mangraph.bind ('manpage', MANPAGE)
  14. mangraph.bind ('manpage-terms', MANPAGE_TERMS)
  15. mangraph.bind ('schema', SCHEMA)
  16. # Add sections to the graph
  17. def add_section_info (number, description):
  18. section_n = 'section' + number
  19. mangraph.add ((MANPAGE[section_n],
  20. RDF.type,
  21. MANPAGE_TERMS.Section))
  22. mangraph.add ((MANPAGE[section_n],
  23. SCHEMA.name,
  24. Literal ('Section ' + number)))
  25. mangraph.add ((MANPAGE[section_n],
  26. MANPAGE_TERMS.section_number,
  27. Literal (number)))
  28. mangraph.add ((MANPAGE[section_n],
  29. SCHEMA.disambiguatingDescription,
  30. Literal (description, lang='en')))
  31. add_section_info ('1', 'Executable programs or shell commands')
  32. add_section_info ('2', 'System calls (functions provided by the kernel)')
  33. add_section_info ('3', 'Library calls (functions within program libraries)')
  34. add_section_info ('4', 'Special files')
  35. add_section_info ('5', 'File formats and conventions')
  36. add_section_info ('6', 'Games')
  37. add_section_info ('7', 'Miscellaneous')
  38. add_section_info ('8', 'System administration commands')
  39. with open ('manpages.json', 'rt') as fp:
  40. packages = json.load (fp)
  41. # Loop packages
  42. for package in packages:
  43. rdfpackage = URIRef (MANPAGE + 'debian-testing/amd64/' + package['name'])
  44. mangraph.add ((rdfpackage,
  45. RDF.type,
  46. MANPAGE_TERMS.Package))
  47. mangraph.add ((rdfpackage,
  48. SCHEMA.name,
  49. Literal (package['name'])))
  50. mangraph.add ((rdfpackage,
  51. SCHEMA.version,
  52. Literal (package['version'])))
  53. mangraph.add ((rdfpackage,
  54. MANPAGE_TERMS.architecture,
  55. Literal (package['architecture'])))
  56. mangraph.add ((rdfpackage,
  57. MANPAGE_TERMS.filename,
  58. Literal (package['deb'])))
  59. # Loop manpages for this package
  60. for page in package['manpages']:
  61. subject = URIRef (MANPAGE + page['identifier'])
  62. # Is this page just a link to another page or not?
  63. if page['link_to'] is None:
  64. mangraph.add ((subject,
  65. RDF.type,
  66. SCHEMA.TextDigitalDocument))
  67. mangraph.add ((subject,
  68. SCHEMA.name,
  69. Literal (page['name'])))
  70. mangraph.add ((subject,
  71. SCHEMA.identifier,
  72. Literal (page['identifier'])))
  73. # Sections are 1..8 but can have variants such as "1p"
  74. mangraph.add ((subject,
  75. MANPAGE_TERMS.section,
  76. URIRef (MANPAGE.section + page['section'])))
  77. mangraph.add ((subject,
  78. MANPAGE_TERMS.section_variant,
  79. Literal (page['section_varinat'])))
  80. # Link to package
  81. mangraph.add ((subject, SCHEMA.isPartOf, rdfpackage))
  82. # Extract description from manpage
  83. # Example: ls - list directory contents
  84. page_txt = 'man/man' + page['section'] + '/' + page['identifier'] + '.txt'
  85. # Read TXT version, find NAME section, then read next line
  86. if os.path.isfile (page_txt):
  87. name_section_found = False
  88. description = ''
  89. with open (page_txt, 'rt') as f:
  90. for line in f:
  91. if name_section_found:
  92. # Multiline NAME section
  93. if line.startswith (' '):
  94. description += line
  95. # NAME section terminated
  96. else:
  97. # There can be different kind of separators
  98. # Usually it's '—' to split name/description, but others
  99. # possibilities are '--', '-', ',', others?
  100. if ' — ' in description:
  101. description = description.split (' — ', 1)[1].strip ()
  102. elif ' -- ' in description:
  103. description = description.split (' -- ', 1)[1].strip ()
  104. elif ' - ' in description:
  105. description = description.split (' - ', 1)[1].strip ()
  106. elif ',' in description:
  107. description = description.split (',', 1)[1].strip ()
  108. else:
  109. # Tough luck! No description for this page :(
  110. description = ''
  111. break
  112. if line == 'NAME\n':
  113. name_section_found = True
  114. mangraph.add ((subject,
  115. SCHEMA.disambiguatingDescription,
  116. Literal (description)))
  117. else:
  118. print ('Warning: file not found ', page_txt)
  119. else:
  120. mangraph.add ((URIRef (MANPAGE + page['link_to']),
  121. SCHEMA.alternateName,
  122. Literal (page['identifier'])))
  123. mangraph.serialize ('manpages.ttl', 'turtle')