# wikidata-GSoC/extract_TeachStud.py at main · Sharingan-Coder01/wikidata-GSoC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Code to extract Teacher and Student associated with each person
import rdflib
import os
from rdflib.namespace import RDF, RDFS, SKOS, OWL, Namespace, NamespaceManager, XSD, URIRef
import csv
import pyewts
import sys

# Namespaces referenced by the extraction code below.
BDR = Namespace("http://purl.bdrc.io/resource/")
BDO = Namespace("http://purl.bdrc.io/ontology/core/")
BDA = Namespace("http://purl.bdrc.io/admindata/")
BDG = Namespace("http://purl.bdrc.io/graph/")
ADM = Namespace("http://purl.bdrc.io/ontology/admin/")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
DILA = Namespace("http://purl.dila.edu.tw/resource/")
VIAF = Namespace("http://viaf.org/viaf/")

# Prefix -> namespace pairs, listed in the order they are registered on NSM.
_PREFIX_BINDINGS = (
    ("bdr", BDR),
    ("bdg", BDG),
    ("bdo", BDO),
    ("bda", BDA),
    ("adm", ADM),
    ("skos", SKOS),
    ("rdfs", RDFS),
    ("wd", WD),
    ("owl", OWL),
    ("wdt", WDT),
    ("dila", DILA),
    ("viaf", VIAF),
)

# Stand-alone namespace manager (backed by its own empty graph) used to turn
# full URIs into (prefix, namespace, local name) triples.
NSM = NamespaceManager(rdflib.Graph())
for _prefix, _namespace in _PREFIX_BINDINGS:
    NSM.bind(_prefix, _namespace)

# On rdflib 4.2.2 only, remove the xsd:gYear entry from rdflib's
# datatype-to-Python mapping.
# see https://github.com/RDFLib/rdflib/issues/806
if rdflib.__version__ == '4.2.2':
    x = rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])

# pyewts converter instance shared at module level.
converter = pyewts.pyewts()

# Function to extract values for teachers and students
def extract(g, id):
    """Collect the teacher and student IDs recorded for one person.

    Args:
        g: Graph exposing ``triples()``; queried with ``BDR[id]`` as subject.
        id: Local name of the person in the ``BDR`` namespace.

    Returns:
        A dict that may contain the keys ``"teachers"`` (objects of
        ``BDO.personStudentOf``) and ``"students"`` (objects of
        ``BDO.personTeacherOf``). Each value is a list of local names as
        returned by ``NSM.compute_qname_strict``. A key is present only when
        at least one matching triple exists.
    """
    person = BDR[id]
    # The person is a *student of* their teachers and a *teacher of* their
    # students, hence the crossed predicate names.
    relations = (
        ("teachers", BDO.personStudentOf),
        ("students", BDO.personTeacherOf),
    )

    found = {}
    for key, predicate in relations:
        local_names = [
            NSM.compute_qname_strict(related)[2]
            for _, _, related in g.triples((person, predicate, None))
        ]
        if local_names:
            found[key] = local_names

    return found

# Creates list with the extracted data
def createList(personID, vals, COUNTPROP):
    """Build one fixed-width CSV row for a person.

    The row starts with ``personID`` and is followed, for every entry of
    ``COUNTPROP`` in order, by exactly ``nbcols`` cells: the values found for
    that type, padded with empty strings. Every type always occupies its full
    column budget so that later types stay in the same columns on every row.

    Args:
        personID: Identifier written in the first column.
        vals: Mapping of type name (e.g. ``"teachers"``) to a list of values.
            Types absent from the mapping produce only empty cells.
        COUNTPROP: Mapping of type name to the number of columns reserved for
            it; its iteration order defines the column order.

    Returns:
        A list of length ``1 + sum(COUNTPROP.values())``.
    """
    row = [personID]

    for tp, nbcols in COUNTPROP.items():
        entries = vals.get(tp, [])
        if len(entries) > nbcols:
            # More values than reserved columns: report it and leave this
            # type's cells blank instead of writing a partial list.
            print("!!Error!! There should be at least %i columns for %s" % (len(entries), tp))
            print(personID)
            entries = []
        row.extend(entries)
        # Pad up to the reserved width so the next type starts in its own column.
        row.extend([""] * (nbcols - len(entries)))

    return row

# Wrapper that runs extraction and row building for a single person file
def run(file_path, id, entity_list):
    """Parse one TriG file and append the person's row to ``entity_list``.

    Args:
        file_path: Path of the file to parse with format ``"trig"``.
        id: Person identifier passed to ``extract`` and used as the first
            cell of the row.
        entity_list: List that receives the new row (mutated in place).
    """
    graph = rdflib.ConjunctiveGraph()
    graph.parse(file_path, format="trig")

    relations = extract(graph, id)

    # Number of columns reserved per relation type.
    COUNTPROP = {
        "teachers": 40,  # Maximum number of teachers associated with a Person
        "students": 80,  # Maximum number of students associated with a Person
    }

    entity_list.append(createList(id, relations, COUNTPROP))


# Function to create CSV using master list
def createCSV(all_list):
    """Append every row of ``all_list`` to ``ExtractProp1.csv``.

    The file is opened in append mode, so rows from earlier runs are kept.

    Args:
        all_list: Iterable of rows, each an iterable of cell values.
    """
    # newline="" lets the csv module control line endings itself; without it,
    # platforms that translate "\n" end up with "\r\r\n" and blank rows.
    with open('ExtractProp1.csv', "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(all_list)


def main():
    """Walk ``persons/`` and write one CSV row per person file.

    Entries of ``persons`` whose name contains ``".git"`` are ignored. Every
    file in the remaining sub-directories is processed with ``run``, except
    those whose ID contains ``"P0RK"``. The collected rows are handed to
    ``createCSV`` once at the end.
    """
    root = 'persons/'
    collected_rows = []

    sub_dirs = [
        root + entry
        for entry in os.listdir('persons')
        if ".git" not in entry
    ]

    for sub_dir in sub_dirs:
        paths = [sub_dir + "/" + name for name in os.listdir(sub_dir)]

        for path in paths:
            file_name = path.rsplit('/', 1)[-1]
            # Drop the last five characters of the file name to get the ID
            # (presumably the ".trig" extension -- confirm against the data).
            person_id = file_name[:-5]

            if "P0RK" in person_id:
                continue
            run(path, person_id, collected_rows)

    createCSV(collected_rows)


# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()