-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess_sql_data.py
More file actions
183 lines (152 loc) · 6.54 KB
/
preprocess_sql_data.py
File metadata and controls
183 lines (152 loc) · 6.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""preprocess_sql_data.py
Chris Fisher, 2023
Script to process data from raw SQL output into an adjacency-list-based graph,
a dictionary to translate wiki page IDs into page names, and a dictionary to
resolve redirect pages to their targets.
Implements some additional processing such as filtering based on page
namespace, and lookup of redirect targets in order to improve the quality of
the resulting graph.
"""
import json
from datetime import datetime
from time import sleep
from get_redirect_target import parse
edges_file_name = 'osrs.txt'
titles_file_name = 'osrs_pages.txt'
# edges_file_name = 'rsc.txt'
# titles_file_name = 'rsc_pages.txt'
output_name = 'osrs_7dec2023_noredir.json'
# output_name = 'rsc_7dec2023_noredir.json'
# Required in order to prevent some odd behavior caused by inclusion of some technical namespaces
# OSRS
allowable_namespaces = {0, 14, 110, 112, 118, 122, 3002}
# RSC
# allowable_namespaces = {0, 14, 110, 122}
remove_redirects = True
# List of namespaces can be seen at
# {oldschool, classic}.runescape.wiki/api.php?action=query&meta=siteinfo&siprop=namespaces
data_path = './data'
edges_file = f'{data_path}/{edges_file_name}'
titles_file = f'{data_path}/{titles_file_name}'
start_time = datetime.now()
with open(edges_file, 'r', encoding='utf8') as infile:
# Skip the first 4 lines of the file as those contain the SQL query and table
# headers
for i in range(4):
infile.readline()
# Initialize dictionary to use as adjacency list
adj_dict = dict()
all_pages = set()
i = 0
for line in infile:
if line: # Ignore empty lines
if line.startswith('|'): # Ignores some SQL info at end of file
split_line = line.strip(' \n').split('|')
source = int(split_line[1])
destination = int(split_line[2])
all_pages.add(source)
all_pages.add(destination)
if source not in adj_dict.keys():
adj_dict[source] = {destination}
else:
adj_dict[source].add(destination)
i += 1
if i % 50000 == 0:
print(f'Parsed {i} edges.')
missing_keys = all_pages.difference(set(adj_dict.keys()))
# Add adjacency lists for vertices that are linked to but don't link to anything
for key in missing_keys:
adj_dict[key] = []
with open(titles_file, 'r', encoding='utf8') as infile:
# Skip the first 4 lines of the file as those contain the SQL query and table
# headers
redirects = dict()
id_map = dict()
namespaces = dict()
for i in range(4):
infile.readline()
i = 0
for line in infile:
if line: # Ignore empty lines
if line.startswith('|'): # Ignores some SQL info at end of file
split_line = line.strip(' \n').split('|')
page_id = int(split_line[1].strip())
namespace = int(split_line[2].strip())
title = split_line[3].strip().replace('_', ' ')
is_redirect = int(split_line[4].strip())
if page_id in all_pages:
if namespace in allowable_namespaces:
id_map[page_id] = title
namespaces[page_id] = namespace
if is_redirect:
try:
if len(adj_dict[page_id]) > 0:
redirects[page_id] = int(next(iter(adj_dict[page_id])))
except KeyError:
try:
print(f'KeyError: {page_id}, namespace = {namespace}')
adj_dict[page_id] = {int(parse(page_id))}
except TypeError:
print(f'Interwiki link from {page_id} in namespace {namespace}')
sleep(0.25)
i += 1
if i % 5000 == 0:
print(f'Parsed {i} pages.')
keys_to_remove = set()
# Sets are not JSON serializable, so we'll have to re-cast these lists to sets when running the graph alg itself
# While doing so, remove any nodes from the graph that aren't in an allowable namespace.
for key in adj_dict:
if namespaces[key] in allowable_namespaces:
try:
destinations = []
for element in adj_dict[key]:
if namespaces[element] in allowable_namespaces:
destinations.append(element)
adj_dict[key] = destinations
except TypeError:
print(f'Key {key} giving TypeError for value {adj_dict[key]}')
adj_dict[key] = []
else:
keys_to_remove.add(key)
for key in keys_to_remove:
adj_dict.pop(key, None)
if remove_redirects:
normalized_graph = dict()
for k in adj_dict.keys():
redir_set = set()
dest_set = set()
for value in adj_dict[k]:
if value in redirects.keys():
redir_set.add(value)
dest_set.add(redirects[value])
if k in redirects.keys():
if redirects[k] in normalized_graph.keys():
try:
adj = set(adj_dict[k]).union(dest_set)
normalized_graph[redirects[k]] = list(adj.difference(redir_set).union(set(adj_dict[redirects[k]])))
except KeyError:
adj = set(adj_dict[k]).union(dest_set)
normalized_graph[redirects[k]] = list(adj.difference(redir_set))
else:
normalized_graph[redirects[k]] = list(set(adj_dict[k]).union(dest_set).difference(redir_set))
else:
if k in normalized_graph.keys():
normalized_graph[k] = list(set(adj_dict[k]).union(dest_set,
set(normalized_graph[k])).difference(redir_set))
else:
normalized_graph[k] = list(set(adj_dict[k]).union(dest_set).difference(redir_set))
adj_dict = normalized_graph
graph_json = {
"graph": adj_dict,
"mapping": id_map,
"redirects": redirects
}
parsing_end_time = datetime.now()
with open(output_name, 'w') as outfile:
json.dump(graph_json, outfile)
finished_time = datetime.now()
print(
f'Started at {start_time}. Finished parsing at {parsing_end_time}. Finished writing at {finished_time}.\n'
f'Time spent parsing: {parsing_end_time - start_time}. Time spent '
f'writing: {finished_time - parsing_end_time}')
print(f'{len(adj_dict)} pages, {len(id_map)} titles, {len(redirects)} redirects')