forked from babylonhealth/fastText_multilingual
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfasttext.py
More file actions
66 lines (55 loc) · 2.07 KB
/
fasttext.py
File metadata and controls
66 lines (55 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#
# Copyright (c) 2017-present, babylon health
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
import numpy as np
class FastVector:
    """
    Minimal wrapper for fastText word embeddings.
    ```
    Usage:
    $ model = FastVector(vector_file='/path/to/wiki.en.vec')
    $ 'apple' in model
    > True
    $ model['apple'].shape
    > (300,)
    ```
    """
    def __init__(self, vector_file='', transform=None):
        """Read in word vectors in fasttext format.

        Args:
            vector_file: path to a .vec file whose first line is
                "<n_words> <n_dim>", followed by one
                "<word> <v1> ... <vd>" line per word.
            transform: optional alignment matrix (ndarray, or filename
                loadable via np.loadtxt) right-multiplied into the
                embedding after loading.
        """
        self.word2id = {}
        print('reading word vectors from %s' % vector_file)
        # fastText .vec files are UTF-8; be explicit so loading does not
        # break on platforms whose default locale encoding differs.
        with open(vector_file, 'r', encoding='utf-8') as f:
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # Slice to n_dim fields so a trailing space (which some
                # .vec files have) cannot introduce an empty element.
                self.embed[i] = elems[1:self.n_dim+1]
        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Apply the given transformation to the vector space

        Right-multiplies given transform with embeddings E:
            E = E * transform

        Transform can either be a string with a filename to a
        text file containing a ndarray (compat. with np.loadtxt)
        or a numpy ndarray.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute cosine similarity between vec_a and vec_b.

        Note: raises ZeroDivisionError semantics (inf/nan) for
        zero-norm inputs, as the original implementation did.
        """
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        """Return True if a vector was loaded for `key`."""
        return key in self.word2id

    def __getitem__(self, key):
        """Return the embedding row for `key`; raises KeyError if absent."""
        return self.embed[self.word2id[key]]