-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_Places.py
More file actions
259 lines (207 loc) · 7.73 KB
/
extract_Places.py
File metadata and controls
259 lines (207 loc) · 7.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# Code to extract Places Entities data which includes: Places label, aliases, place type, coordinate location, date of foundation & Tradition associated
import rdflib
import os
from rdflib.namespace import RDF, RDFS, SKOS, OWL, Namespace, NamespaceManager, XSD, URIRef
import csv
import pyewts
import sys
# RDF namespaces for the BDRC datasets (resources, ontology, admin data, graphs)
# plus the external vocabularies (Wikidata, DILA, VIAF) they link to.
BDR = Namespace("http://purl.bdrc.io/resource/")
BDO = Namespace("http://purl.bdrc.io/ontology/core/")
BDA = Namespace("http://purl.bdrc.io/admindata/")
BDG = Namespace("http://purl.bdrc.io/graph/")
ADM = Namespace("http://purl.bdrc.io/ontology/admin/")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
DILA = Namespace("http://purl.dila.edu.tw/resource/")
VIAF = Namespace("http://viaf.org/viaf/")
# Namespace manager used throughout to turn full URIs into short qnames
# (e.g. "bdr:PT0037") via compute_qname_strict.
NSM = NamespaceManager(rdflib.Graph())
NSM.bind("bdr", BDR)
NSM.bind("bdg", BDG)
NSM.bind("bdo", BDO)
NSM.bind("bda", BDA)
NSM.bind("adm", ADM)
NSM.bind("skos", SKOS)
NSM.bind("rdfs", RDFS)
NSM.bind("wd", WD)
NSM.bind("owl", OWL)
NSM.bind("wdt", WDT)
NSM.bind("dila", DILA)
NSM.bind("viaf", VIAF)
# Workaround for rdflib 4.2.2 gYear parsing; see https://github.com/RDFLib/rdflib/issues/806
if rdflib.__version__ == '4.2.2':
    x = rdflib.term._toPythonMapping.pop(rdflib.XSD['gYear'])
# EWTS transliteration -> Tibetan Unicode converter, shared by normalize().
converter = pyewts.pyewts()
def normalize(literal):
    """Return ``[text, language]`` for an RDF literal.

    EWTS-transliterated Tibetan (``bo-x-ewts``) is converted to Tibetan
    Unicode and reported under the plain ``bo`` language tag; any other
    literal passes through unchanged.
    """
    if literal.language != "bo-x-ewts":
        return [literal.value, literal.language]
    return [converter.toUnicode(literal.value), "bo"]
# Function to get labels
def get_prefLabels(g, id):
    """Collect the skos:prefLabel values of BDR resource *id* from graph *g*.

    Returns a dict mapping each language tag to the list of normalized
    label strings found for that language.
    """
    labels_by_lang = {}
    for _, _, pref_label in g.triples((BDR[id], SKOS.prefLabel, None)):
        text, lang = normalize(pref_label)
        labels_by_lang.setdefault(lang, []).append(text)
    return labels_by_lang
# Function to get alternate labels
def get_altLabels(g, id):
    """Collect the skos:altLabel (alias) values of BDR resource *id* from graph *g*.

    Returns a dict mapping each language tag to the list of normalized
    alias strings found for that language.
    """
    aliases_by_lang = {}
    for _, _, alt_label in g.triples((BDR[id], SKOS.altLabel, None)):
        text, lang = normalize(alt_label)
        aliases_by_lang.setdefault(lang, []).append(text)
    return aliases_by_lang
# Function to extract values
def extractValues(g, id, f):
PlaceType = ""
Plabels = {}
Paliases = {}
lat = ""
long = ""
foundation = ""
converted = ""
tradStr = ""
str1 = f
# Assigns type of the place
if(str1 == "bdr:PT0037"):
PlaceType = "monastery"
elif(str1 == "bdr:PT0059"):
PlaceType = "printery"
elif(str1 == "bdr:PT0084"):
PlaceType = "village"
elif(str1 == "bdr:PT0050"):
PlaceType = "nunnery"
elif(str1 == "bdr:PT0028"):
PlaceType = "lake"
elif(str1 == "bdr:PT0008"):
PlaceType = "city"
elif(str1 == "bdr:PT0020"):
PlaceType = "hamlet"
elif(str1 == "bdr:PT0074"):
PlaceType = "temple"
elif(str1 == "bdr:PT0011"):
PlaceType = "county"
Plabels = get_prefLabels(g, id) # Function call to get name labels
Paliases = get_altLabels(g, id) # Function call to get alias name
# Stores Place Latitude
for _, _, placeLat in g.triples((BDR[id], BDO.placeLat, None)):
lat = placeLat
# Stores Place Longitude
for _, _, placeLong in g.triples((BDR[id], BDO.placeLong, None)):
long = placeLong
# Stores Date of Foundation
for _, _, eveId in g.triples((BDR[id], BDO.placeEvent, None)):
for _, _, event in g.triples((eveId, RDF.type, None)):
_, _, check = NSM.compute_qname_strict(event)
if(check == "PlaceFounded"):
for _, _, date in g.triples((eveId, BDO.onYear, None)):
foundOn = date[:4]
foundation = foundOn
# Stores Date of Converion
for _, _, eveId in g.triples((BDR[id], BDO.placeEvent, None)):
for _, _, event in g.triples((eveId, RDF.type, None)):
_, _, check = NSM.compute_qname_strict(event)
if(check == "PlaceConverted"):
for _, _, date2 in g.triples((eveId, BDO.onYear, None)):
convertedOn = date2[:4]
converted = convertedOn
# Stores Associated Tradition
for _, _, peveId in g.triples((BDR[id], BDO.placeEvent, None)):
for _, _, extTrad in g.triples((peveId, BDO.associatedTradition, None)):
_, _, tradName = NSM.compute_qname_strict(extTrad)
tradStr = tradName[9:]
return PlaceType, Plabels, Paliases, lat, long, foundation, converted, tradStr
# Function to create list from all extracted data
def createList(LANGLABELS, BDRCid, PlaceType, Plabels, Paliases, lati, longi, foundationO, convertedO, AsTradition):
row = []
row.append(BDRCid)
row.append(PlaceType)
for lang in LANGLABELS.keys():
if lang in Plabels and len(Plabels[lang]):
if Plabels[lang][0] == "?" or Plabels[lang][0] == "??":
row.append("")
else:
row.append(Plabels[lang][0])
else:
row.append("")
for lang in LANGLABELS.keys():
if lang in Paliases and len(Paliases[lang]):
if Paliases[lang][0] == "?" or Paliases[lang][0] == "??":
row.append("")
else:
row.append(Paliases[lang][0])
else:
row.append("")
row.append(lati)
row.append(longi)
row.append(AsTradition)
row.append(foundationO)
row.append(convertedO)
return row
# Function to create CSV
def createCSV(all_list):
with open('Places_FinalExt.csv', "a") as f:
writer = csv.writer(f)
for r in all_list:
writer.writerow(r)
def run(file_path, id, entity_list):
    """Parse one TriG file and append the place's CSV row to *entity_list*.

    Returns early (appending nothing) when the record is not
    StatusReleased or its place type is not one of the retained types.
    """
    g = rdflib.ConjunctiveGraph()
    g.parse(file_path, format="trig")
    # Only released records are extracted.
    for _, _, status in g.triples((BDA[id], ADM.status, None)):
        if status.rsplit('/', 1)[-1] != "StatusReleased":
            return
    # Short qname of the place type, e.g. "bdr:PT0037"
    # (renamed from "filter", which shadowed the builtin).
    place_type = ""
    for _, _, type_ref in g.triples((BDR[id], BDO.placeType, None)):
        ns_short, _, local_name = NSM.compute_qname_strict(type_ref)
        place_type = ns_short + ':' + local_name
    # Filter type of places: only these are extracted.
    kept_types = {"bdr:PT0037", "bdr:PT0059", "bdr:PT0084", "bdr:PT0050",
                  "bdr:PT0028", "bdr:PT0008", "bdr:PT0020", "bdr:PT0074",
                  "bdr:PT0011"}
    if place_type not in kept_types:
        return
    (Ptype, labels, aliases, latitude, longitude,
     foundationOnD, convertedOnD, AsscTrad) = extractValues(g, id, place_type)
    # Languages emitted as label/alias columns, in CSV column order.
    LANGLABELS = {
        "bo": 1,
        "zh-hans": 1,
        "en": 1,
        "zh-latn-pinyin-x-ndia": 1,
        "mn-x-trans": 1
    }
    entity_list.append(createList(LANGLABELS, id, Ptype, labels, aliases,
                                  latitude, longitude, foundationOnD,
                                  convertedOnD, AsscTrad))
def main():
    """Walk every data file under places/ and write all extracted rows to CSV.

    Expects places/<subdir>/<ID>.trig; each file name (minus the 5-char
    ".trig" extension) is the BDRC resource id.
    """
    entity_list = []
    base = 'places'
    for dir_name in os.listdir(base):
        # Skip VCS metadata such as .git (original behavior: any name
        # containing ".git" is excluded).
        if ".git" in dir_name:
            continue
        directory = base + '/' + dir_name
        for file_name in os.listdir(directory):
            file_path = directory + "/" + file_name
            # Resource id is the file name minus its ".trig" suffix.
            resource_id = file_name[:-5]
            run(file_path, resource_id, entity_list)
    createCSV(entity_list)
if __name__ == "__main__":
    main()