From 8cdd640dff9cdbc04112c147fd316910e91c4ed8 Mon Sep 17 00:00:00 2001 From: sjyan Date: Thu, 19 Oct 2017 22:42:28 +0800 Subject: [PATCH] Fix a bug caused by commas in the embeddings.w2v.txt file --- nea/w2vEmbReader.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/nea/w2vEmbReader.py b/nea/w2vEmbReader.py index 04f0d52..daac508 100644 --- a/nea/w2vEmbReader.py +++ b/nea/w2vEmbReader.py @@ -28,9 +28,10 @@ def __init__(self, emb_path, emb_dim=None): counter = 0 for line in emb_file: tokens = line.split() - assert len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info' word = tokens[0] - vec = tokens[1:] + str = tokens[1] + vec = str.split(',') + assert len(vec) == self.emb_dim, 'The number of dimensions does not match the header info' self.embeddings[word] = vec counter += 1 assert counter == self.vocab_size, 'Vocab size does not match the header info' @@ -41,13 +42,16 @@ def __init__(self, emb_path, emb_dim=None): self.embeddings = {} for line in emb_file: tokens = line.split() + word = tokens[0] + str = tokens[1] + vec = str.split(',') + if self.emb_dim == -1: - self.emb_dim = len(tokens) - 1 + self.emb_dim = len(vec) assert self.emb_dim == emb_dim, 'The embeddings dimension does not match with the requested dimension' else: - assert len(tokens) == self.emb_dim + 1, 'The number of dimensions does not match the header info' - word = tokens[0] - vec = tokens[1:] + assert len(vec) == self.emb_dim, 'The number of dimensions does not match the header info' + self.embeddings[word] = vec self.vocab_size += 1