nvb
/
graphs
forked from DOKK-legacy/data


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
							#!/usr/bin/env python3

# This is a small module to find relations across nodes (which node links to
# which). It's used in /explore to build the graph.

import json
import rdflib
import xml.etree.ElementTree as ET

# Load the Turtle dump into an in-memory RDF graph
g = rdflib.graph.Graph()
g.parse('dokk.ttl', format='turtle')

# Build a dictionary keyed by subject, holding the handful of schema.org
# properties that are picked out below
nodes = {}

for triple in g:
    # Work with plain strings rather than rdflib terms
    subject, predicate, value = (str(term) for term in triple)

    # Every subject gets an entry, even if none of its predicates match
    entry = nodes.setdefault(subject, {})

    if predicate == 'http://schema.org/headline':
        entry['headline'] = value
    elif predicate == 'http://schema.org/image':
        # Empty image values are ignored; others are prefixed with the
        # image archive base URL
        if value:
            entry['image'] = 'https://archive.dokk.org/images/' + value
    elif predicate == 'http://schema.org/disambiguatingDescription':
        entry['description'] = value
    elif predicate == 'http://schema.org/articleBody':
        # Stored under 'file'; the DocBook filename is derived from it later
        entry['file'] = value

# Now parse all DocBook files and find which nodes link to which
for key, node in nodes.items():
    # Every node gets a 'links' list, even if it stays empty
    node['links'] = []

    # A subject without a schema.org/articleBody triple has no 'file' entry,
    # so there is no document to scan. Leave its links empty instead of
    # aborting the whole run with a KeyError.
    source_name = node.get('file')
    if source_name is None:
        continue

    # Drop the last five characters of the stored name and use '.dbk'
    # instead (presumably a 5-character extension is being replaced --
    # TODO confirm against the values in dokk.ttl)
    tree = ET.parse('docbook/' + source_name[:-5] + '.dbk')
    root = tree.getroot()

    for link in root.iter('ulink'):
        # Skip <ulink> elements that carry no url attribute
        target = link.attrib.get('url')
        if target is None:
            continue

        url = 'https://dokk.org/' + target

        # Only keep links that point at another known node
        if url in nodes:
            node['links'].append(url)

# Serialize the finished node dictionary to dokk.json
with open('dokk.json', 'w', encoding='UTF-8') as json_file:
    json.dump(nodes, json_file)