-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsynsets.py
More file actions
93 lines (78 loc) · 3.06 KB
/
synsets.py
File metadata and controls
93 lines (78 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from collections import defaultdict
from typing import Iterable, Set, Dict, List
class Synsets:
    """Synset loader and expander for Moby-style delimited synonym files.

    Each line of an input file is one group of synonyms separated by a
    delimiter (a comma by default). Words are normalised by stripping
    surrounding whitespace and lower-casing them.

    A word is mapped to the first group it was seen in; later groups that
    mention the same word still list it as a member but do not re-map it.
    """

    def __init__(self):
        # word -> id of the first synset that contained the word
        self.word_to_synset: Dict[str, int] = {}
        # synset id -> every member word of that group
        self.synset_to_words: Dict[int, Set[str]] = defaultdict(set)
        self._next_id = 0

    def add_group(self, words: Iterable[str]) -> int:
        """Register one group of synonyms.

        Args:
            words: Raw words; blank or empty entries are ignored.

        Returns:
            The id assigned to the new synset, or ``-1`` when the group
            holds fewer than two distinct words after normalisation (in
            which case nothing is stored and no id is consumed).
        """
        # A set drops entries that only differ by case/whitespace, so a line
        # such as "Happy, happy" is not mistaken for a two-word group.
        clean = {w.strip().lower() for w in words if w and w.strip()}
        if len(clean) < 2:
            return -1

        syn_id = self._next_id
        self._next_id += 1
        for w in clean:
            # Keep the mapping of a word that already belongs to a synset.
            self.word_to_synset.setdefault(w, syn_id)
        self.synset_to_words[syn_id].update(clean)
        return syn_id

    def load_from_file(self, filepath: str, delimiter: str = ',') -> int:
        """Load synonym groups from a text file, one group per line.

        The file is read as UTF-8 and undecodable bytes are dropped.

        Args:
            filepath: Path of the file to read.
            delimiter: Separator between the words of a line.

        Returns:
            The number of groups actually added.
        """
        count = 0
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                # add_group strips each part and rejects blank lines and
                # groups with fewer than two distinct words by returning -1.
                if self.add_group(line.split(delimiter)) != -1:
                    count += 1
        return count

    def expand(self, word: str) -> Set[str]:
        """Return a copy of the synset of ``word``.

        Unknown words expand to a set containing only the normalised word.
        """
        w = word.strip().lower()
        sid = self.word_to_synset.get(w)
        if sid is None:
            return {w}
        # Copy so callers cannot mutate the stored synset.
        return set(self.synset_to_words[sid])

    def expand_words(self, words: Iterable[str]) -> Set[str]:
        """Return the union of ``expand(w)`` over every word in ``words``."""
        out: Set[str] = set()
        for w in words:
            out.update(self.expand(w))
        return out
from typing import Dict, Set, Iterable
class SynsetMap:
    """Synonym table mapping each word to the set of its synonyms.

    Words are normalised by stripping surrounding whitespace and
    lower-casing them. Synonyms from every group a word appears in are
    accumulated under that word.
    """

    def __init__(self):
        # word -> synonyms of that word (the word itself is never included)
        self.map: Dict[str, Set[str]] = {}

    @staticmethod
    def _normalize(words: Iterable[str]) -> Set[str]:
        """Return the distinct normalised words, ignoring blank entries."""
        return {w.strip().lower() for w in words if w and w.strip()}

    def add_group(self, words: Iterable[str]):
        """Record every word of the group as a synonym of the others.

        Groups with fewer than two distinct words after normalisation are
        ignored, so entries differing only by case or whitespace do not
        create an empty entry in ``map``.
        """
        group = self._normalize(words)
        if len(group) < 2:
            return
        for w in group:
            self.map.setdefault(w, set()).update(group - {w})

    def expand_word(self, word: str) -> Set[str]:
        """Return the normalised word together with its known synonyms."""
        w = word.strip().lower()
        # The union builds a new set, so the stored synonyms stay untouched.
        return {w} | self.map.get(w, set())

    def load_from_file(self, filepath: str, delimiter: str = ',') -> int:
        """Load delimiter-separated synset groups from a file.

        Each line is a group whose items are separated by ``delimiter``.
        The file is read as UTF-8 and undecodable bytes are dropped.

        Returns the number of groups added, i.e. lines holding at least two
        distinct words.
        """
        added = 0
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                # Normalising here lets the count reflect only the lines
                # that add_group will really store.
                group = self._normalize(line.split(delimiter))
                if len(group) >= 2:
                    self.add_group(group)
                    added += 1
        return added