-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmatch_sources_fuzzy.py
More file actions
executable file
·251 lines (219 loc) · 10.2 KB
/
match_sources_fuzzy.py
File metadata and controls
executable file
·251 lines (219 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env python
# -*- coding: utf8 -*-
#
# Script to match rivers across different sources:
#   OSM rivers with rivers screen-scraped from eauxvives.org
#   OSM rivers with rivers screen-scraped from rivermap.ch
# Options

# Encoding applied to river names before they are printed
# ('utf8' here; use 'latin1' on Windows).
console_encoding = 'utf8'

# Number of queued MongoDB operations that triggers a bulk write.
BULK_SIZE = 10

# True: a match requires the cleaned names to be equal.
# False: any pair whose fuzzy partial ratio is 100 counts as a match.
exact_match = False

# True: report every SRC2 river for which no SRC1 river was found.
# Set to False to silence those messages.
print_not_found = True
# Hard codes
# Bad fuzzy matches: (SRC2 name, SRC1 name) pairs that the matcher produces
# but that must be ignored.
exclude_list = (
    ("Garon", "La Garonne"),
    ("Volp", "Volpajola"),
    ("Rauma", "Le Raumartin"),
    ("Ostri", "Ostriconi"),
    ("Orb", "U Fium Orbu"),
    ("Dore", "Dorette"),
    ("Varrados (Rio)", "Le Var"),
    ("Saint Bonnette", "La Bonne"),
    ("Rhins", "Le Rhin"),
    ("Lignon de l'Ardèche", "L'Ardèche"),
    ("Alignon (du Haut Tarn)", "Le Tarn"),
    ("Chasse", "Le Chassezac"),
    ("Hérault de St Guilhem", "Le Guil"),
    ("Our", "L'Ourse"),
    ("Lez (d'Ariège)", "L'Ariège"),
    ("Archiane", "L'Arc"),
    ("Rivière des Marsouins", "Le Mars"),
    ("Petite Rhue", "La Rhue"),
    ("Guiers Mort", "Le Guiers"),
    ("Guiers Vif", "Le Guiers"),
    ("Portet sur Garonne", "La Garonne"),
    ("Sorgues d'Entraigues", "La Sorgue"),
    ("Dunière", "Le Dun"),
    ("Ese", "Rio Esera"),
    ("Saint-Bonnette", "La Bonne"),
    ("Sava Bohinjka", "Sava"),
    ("Salza", "Salzach"),
    ("Tara", "Le Taravo"),
    ("Vézère de Pérols", "La Vézère"),
    ("Valser Rhein", "Rhein"),
    ("Vézère de Saint-Merd", "La Vézère"),
    ("Neste d'Aure", "La Neste"),  # Neste du Lauron + Neste d'Aure = Neste
    ("Neste du Lauron", "La Neste"),
    ("Grande Nive", "Grande"),
    ("Grande Eyvia", "Grande"),
    ("Barossa (Rio)", "Rio"),
    ("Rushia", "Le Ru"),
    ("Ruetzbach", "Le Ru"),
    ("Rutor", "Le Ru"),
    ("Rußbach", "Le Ru"),
    ("Bras de la Plaine", "La Plaine"),
    ("Reuss", "Le Reu"),
    ("Göschener Reuss", "Le Reu"),
    # Problematic ones below (several rivers in evo -> one river in osm)
    ("Ouvèze de l'ardèche", "L'Ouvèze"),  # Ouvèze des Baronnies == Ouvèze, but duplicated in osm
    ("Ouvèze des Baronnies", "L'Ouvèze"),  # Ouvèze des Baronnies == Ouvèze, but duplicated in osm
    ("Ariège (Haute)", "L'Ariège"),
    ("Garonne - Les Roches", "La Garonne"),
    ("Ardèche (basse)", "L'Ardèche"),
    ("Rhône (entier)", "Le Rhône"),
    ("Ouvèze des Baronnies", "L'Ouvèze"),  # same pair as listed a few lines above
    # ckfiumi
    ("Cant", "La Cantache"),
    ("Varrone", "Le Var"),
    ("Varatella", "Le Var"),
    ("Varaita", "Le Var"),
    ("Vara", "Le Var"),
    ("Garibaldo", "Gari"),
    ("Savenca", "La Save"),
    ("Lima", "La Limagne"),
    ("Po", "Rivière de Porto"),
    ("Riu Leonai", "Leo"),
    ("Varaita", "Fiume Vara"),
    ("Varatella", "Fiume Vara"),
    ("Ogliolo di edolo", "Fiume Oglio"),
)
#TODO: check:
#SRC2: Ligne SRC1: Le Ligneron
#SRC2: Lieux SRC1: Le Lieux de Naucelle
#SRC2: Esca SRC1: Vieil Escaut
#SRC2: Ese SRC1: Rio Esera
#SRC2: Chasse SRC1: Le Chassezac
#SRC2: Loire SRC1: La Loire - Bras de Pirmil
#already_updated: L,rio,Le Garon,L,L,L,L'Orb,Le Lan,L'Isard,Le Lignon,Le Foron,L,L,rio,L,Le Lan,L'Arre,L'Hérault,Rio Lot,Rio Grande,Rio Rio,L,L'Ur,L,L,Le Sal,L,Rio Ri,Le Sal,L'Ur,La Dordogne,La Nogue,La Nogue,La Sioule,Le Dadou
#already_updated: L,Schwarzach,Rio Rin,Rotbach,La Vis,L,Bist,L,L,L'Orb,L'Arac,Le Lignon,La Weiss,L,Rio Re,L'Ur,Rio Men,Rio Ber,Bist,Le Talbach,L,L,L,L'Ur,La Weiss,Rio Val,L,Rio Re,L,Rio Rin,Rio Ser,L,Rettenbach,Le Dadou,L,L,Rio Sta,Rio Re,Po,Schwarzwasser
#already_updated: Fiume Calore,Gesso,Rio Noce,Taro,Gesso,L,Strona,Po,Rio Valle,Torrente,Rio Turrite,L,Rivière d'Argent,Verde,Bidente,Fiume Brembo,Gesso,Bormida,Orba,La Tave,Fiume Rabbi
# Alareks = Armenia
# End of Hard codes
import json
import sys
from fuzzywuzzy import fuzz
from text_utils import clean4fuzzy
from manual_matches import manual_matches
def open_source(src_input):
    """Open a river source and return a pair of accessors for it.

    Parameters:
        src_input: sequence whose first item is the source type ('file' or
            'mongo') and whose second item is the JSON file path or the name
            of a collection in the ``wwsupdb`` database. Extra items are
            ignored.

    Returns:
        (river_names_src, river_src) where ``river_names_src()`` yields every
        river key of the source and ``river_src(key)`` returns the river
        record for that key.

    Raises:
        Exception: if the source type is neither 'file' nor 'mongo'.
        AssertionError: if, after renaming duplicates, two rivers of a file
            source still share the same name.

    The 'mongo' type reads the module-level ``client``, which must have been
    set before this function is called.
    """
    kind = src_input[0]
    location = src_input[1]

    if kind == 'file':
        with open(location, 'r') as f:
            rivers = json.load(f)

        # Make names unique: the 2nd, 3rd, ... river called "X" is renamed
        # "X_1", "X_2", ... and the new name is printed.
        occurrences = {}
        for river in rivers:
            name = river['name']
            if name in occurrences:
                occurrences[name] += 1
                river['name'] = '%s_%d' % (name, occurrences[name])
                print(river['name'])
            else:
                occurrences[name] = 0

        rivers_by_name = dict((river['name'], river) for river in rivers)
        # A renamed duplicate can still collide with a river that is literally
        # called "X_1"; refuse to continue rather than silently drop a river.
        assert len(rivers_by_name) == len(rivers), \
            'Duplicate river names remain in %s' % location

        def river_names_src():
            for name in rivers_by_name.keys():
                yield name

        def river_src(river_id):
            # Hand out a (shallow) copy so that a caller updating the record
            # does not alter the cached one: every call then returns the
            # record as loaded, like the 'mongo' accessor which fetches a
            # fresh document each time.
            return dict(rivers_by_name[river_id])

    elif kind == 'mongo':
        collection = client.wwsupdb[location]
        # Only the ids are loaded up front; full documents are fetched on demand.
        river_ids = [river['_id'] for river in collection.find({}, {'_id': 1})]

        def river_names_src():
            for name in river_ids:
                yield name

        def river_src(river_id):
            return collection.find_one({'_id': river_id})

    else:
        raise Exception('Input type not handled')

    return river_names_src, river_src
def match(src1_input,src2_input,output):
    """Merge every SRC2 river into its best-matching SRC1 river and write the result.

    Parameters:
        src1_input: (type, location) of the reference source, passed to
            open_source().
        src2_input: (type, location, ...) of the source to merge in, passed to
            open_source(). The location is also used to select the manual
            matches and to name the added 'name_<location>' field.
        output: (type, location). Type 'file' dumps the merged rivers as JSON
            to the path; type 'mongo' upserts them into collection
            ``wwsupdb.<location>``.

    For each SRC2 river the candidates are the SRC1 rivers whose cleaned name
    is equal (exact_match) or has a fuzzy partial ratio of 100. A manual match,
    when one exists, replaces the candidates. With several candidates the one
    with the highest fuzzy ratio wins. A winning pair listed in exclude_list
    is skipped. The merged record is the SRC1 record updated with the SRC2
    record, keeping the SRC1 '_id'.

    Raises:
        Exception: if the output type is neither 'file' nor 'mongo'.

    Side effects: rebinds the module-level ``client`` (when MongoDB is used),
    ``bulk`` and ``updated`` (for 'mongo' output), and prints progress.

    NOTE(review): the .encode() calls before printing and before the
    exclude_list lookup assume Python 2 text semantics (unicode names,
    byte-string literals) -- confirm before running under Python 3.
    """
    global client, bulk, updated

    # Connect to MongoDB if needed
    if 'mongo' in (src2_input[0], src1_input[0], output[0]):
        import pymongo
        client = pymongo.MongoClient()

    # Open source 2 data, then source 1 data
    river_names_src2, river_src2 = open_source(src2_input)
    river_names_src1, river_src1 = open_source(src1_input)

    # Prepare output
    if output[0] == 'file':
        rivers_output = []

        def river_output(river):
            rivers_output.append(river)

    elif output[0] == 'mongo':
        collection = client.wwsupdb[output[1]]
        updated = set()
        already_updated = []
        bulk = []

        def flush():
            # Send the queued operations, then empty the queue in place so
            # the module-level list stays the one being filled.
            if bulk:
                collection.bulk_write(bulk)
                del bulk[:]

        def river_output(river):
            # Only the first merge into a given SRC1 river is written during
            # one run; later ones are recorded and reported at the end.
            if river['_id'] in updated:
                already_updated.append(river['_id'])
                return
            updated.add(river['_id'])
            bulk.append(pymongo.UpdateOne({'_id': river['_id']}, {"$set": river}, upsert=True))
            if len(bulk) >= BULK_SIZE:
                flush()

    else:
        raise Exception('Output type not handled')

    source2_key = src2_input[1]
    # A source without a manual-match table simply has no overrides.
    # (assumes manual_matches is a dict of dicts -- TODO confirm)
    overrides = manual_matches.get(source2_key, {})
    known_bad = frozenset(exclude_list)
    # Clean every SRC1 name once instead of once per SRC2 river.
    # (assumes clean4fuzzy depends only on its argument -- TODO confirm)
    cleaned_src1 = [(name, clean4fuzzy(name)) for name in river_names_src1()]

    # Try to match
    for name_src2 in river_names_src2():
        cleaned_src2 = clean4fuzzy(name_src2)

        # First try to match with partial_ratio (example: "The Big River" = "Big River")
        if exact_match:
            candidates = [pair for pair in cleaned_src1 if pair[1] == cleaned_src2]
        else:
            candidates = [pair for pair in cleaned_src1
                          if fuzz.partial_ratio(cleaned_src2, pair[1]) == 100]

        manual = overrides.get(name_src2)
        if manual is not None:
            candidates = [(manual, None)]

        if not candidates:
            if print_not_found:
                print('SRC2 river %s not found in SRC1' % (name_src2.encode(console_encoding)))
            continue

        if len(candidates) > 1:
            # Several matches: use ratio instead of partial_ratio and keep the
            # best; ties go to the greatest name.
            name_src1 = max(
                candidates,
                key=lambda pair: (fuzz.ratio(cleaned_src2, pair[1]), pair[0]))[0]
        else:
            name_src1 = candidates[0][0]

        if (name_src2.encode('utf8'), name_src1.encode('utf8')) in known_bad:
            print('Ignoring bad match SRC2: %s\tSRC1: %s' % (name_src2.encode(console_encoding),name_src1.encode(console_encoding)))
            continue
        print('("%s","%s"),' % (name_src2.encode(console_encoding), name_src1.encode(console_encoding)))

        new_river = river_src1(name_src1)
        assert new_river is not None, 'SRC1 has no river for the selected match'
        src1_id = new_river['_id']
        new_river.update(river_src2(name_src2))
        # After update() '_id' is the SRC2 one whenever the SRC2 record has an
        # '_id'; store it as the SRC2 name, then restore the SRC1 '_id'.
        new_river['name_%s' % source2_key] = new_river['_id']
        new_river['_id'] = src1_id
        river_output(new_river)

    # Finish the output
    if output[0] == 'file':
        with open(output[1], 'w') as f:
            json.dump(rivers_output, f)
    elif output[0] == 'mongo':
        flush()
        print('already_updated: ' + ','.join([r.encode(console_encoding) for r in already_updated]))
        updated.clear()
def main():
    """Command-line entry point.

    With six arguments (src1type src1 src2type src2 outtype output) run one
    match. With a single '-h' or '--help' argument print the usage. With no
    argument run the three default MongoDB merges (evo, rivermap, ckfiumi)
    into 'rivers_merged'. Any other command line prints the usage on stderr
    and exits with status 2 instead of starting the default merges.
    """
    usage_lines = (
        'Usage: %s src1type src1 src2type src2 outtype output' % sys.argv[0],
        'Example: %s mongo rivers mongo evo mongo rivers_merged' % sys.argv[0],
    )
    arg_count = len(sys.argv)

    if arg_count == 7:
        # The third item of the SRC2 tuple repeats the source name; nothing
        # visible in this file reads it.
        match((sys.argv[1],sys.argv[2]),(sys.argv[3],sys.argv[4],sys.argv[4]),(sys.argv[5],sys.argv[6]))
    elif arg_count == 2 and sys.argv[1] in ('-h','--help'):
        for line in usage_lines:
            print(line)
    elif arg_count == 1:
        match(('mongo','rivers'),('mongo','evo','evo'),('mongo','rivers_merged'))
        match(('mongo','rivers'),('mongo','rivermap','rivermap'),('mongo','rivers_merged'))
        match(('mongo','rivers'),('mongo','ckfiumi','ckfiumi'),('mongo','rivers_merged'))
    else:
        # A malformed command line must not silently trigger the default
        # merges, which write to the database.
        for line in usage_lines:
            sys.stderr.write(line + '\n')
        sys.exit(2)


if __name__=='__main__':
    main()