-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtext_utils.py
More file actions
20 lines (16 loc) · 843 Bytes
/
text_utils.py
File metadata and controls
20 lines (16 loc) · 843 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# -*- coding: utf8 -*-
# Tokens stripped from names before fuzzy comparison.
# - A token that ends on a whole word must keep its trailing space, otherwise
#   it also eats the start of a longer word ("Rivière de" would turn
#   "Rivière des Aigles" into "s Aigles").
# - remove_tokens() handles the tokens in the order listed, so a token must
#   appear before any shorter token it starts with.
unsignificant_tokens = (
    "Rivière d'",
    "Rivière de ",
    "Rivière ",
    " (Ruisseau d')",
    "Ruisseau d'",
    "Ruisseau de la ",
    "Ruisseau de l'",
    " (Ruisseau de)",
    "Ruisseau de ",
    "Ruisseau du ",
    "Ruisseau ",
    "Le ",
    "La ",
    "L'",
    "Rio del ",
    "Rio ",
    "Fiume ",
    "Torrent d'",
    "Torrent de ",
    "Torrent ",
    "Torrente Valle ",
    "Torrente ",
)
def clean4fuzzy(i):
    """Normalize a name for fuzzy comparison.

    Every token listed in ``unsignificant_tokens`` is removed, surrounding
    whitespace is stripped and the result is lower-cased.

    :param i: the name, either a ``unicode`` string or a UTF-8 encoded
        byte string.
    :return: the cleaned name as a ``unicode`` string.
    :raises UnicodeDecodeError: if a byte-string input is not valid UTF-8.
    """
    # The tokens are plain (byte) string literals in a UTF-8 encoded module,
    # so the removal has to run on UTF-8 bytes.  Only real unicode input is
    # encoded: calling .encode() on a byte string makes Python 2 decode it
    # as ASCII first, which fails on any accented character.
    raw = i.encode('utf8') if isinstance(i, unicode) else i
    stripped = remove_tokens(raw, unsignificant_tokens).strip()
    # Lower-case after decoding.  On a byte string lower() is ASCII-only and
    # locale-dependent, so accented capitals ("É") were left untouched.
    return unicode(stripped, encoding='utf8').lower()
def remove_tokens(s, tokens):
    """Return *s* with every occurrence of each token removed.

    Tokens are handled in the order given.  For each token the leftmost
    occurrence is cut out and the search starts again from the beginning,
    so an occurrence formed by joining the text on either side of a removed
    one is removed as well (``"aabb"`` minus ``"ab"`` gives ``""``).

    :param s: the string to clean.
    :param tokens: iterable of substrings to remove; empty tokens are
        ignored.
    :return: the string left once no token occurs in it any more.
    """
    for token in tokens:
        if not token:
            # find('') always returns 0 and removing it changes nothing,
            # so an empty token would loop forever.
            continue
        position = s.find(token)
        while position != -1:
            s = s[:position] + s[position + len(token):]
            position = s.find(token)
    return s
if __name__ == '__main__':
    # Manual smoke check: print the cleaned form of a sample name that
    # carries a parenthesised suffix token.
    sample = "Ourtrigous (Ruisseau d')"
    print(clean4fuzzy(sample))