From ea5e3f69814ba78182d8b1809fb2df28a1c0018a Mon Sep 17 00:00:00 2001 From: heejun Date: Sun, 21 Nov 2021 11:22:28 +0900 Subject: [PATCH 1/4] tokens added to context_tokens and response --- crslab/data/dataset/redial/redial.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crslab/data/dataset/redial/redial.py b/crslab/data/dataset/redial/redial.py index c84565b..f713c02 100644 --- a/crslab/data/dataset/redial/redial.py +++ b/crslab/data/dataset/redial/redial.py @@ -185,6 +185,12 @@ def _augment_and_add(self, raw_conv_dict): entity_set, word_set = set(), set() for i, conv in enumerate(raw_conv_dict): text_tokens, entities, movies, words = conv["text"], conv["entity"], conv["movie"], conv["word"] + + if conv['role'] == 'Seeker': + text_tokens.insert(0,23096) + else: + text_tokens.insert(0,23097) + if len(context_tokens) > 0: conv_dict = { "role": conv['role'], From 098379e799b9cc09f4ce48f6956bd0d31c717964 Mon Sep 17 00:00:00 2001 From: krta2 Date: Tue, 23 Nov 2021 14:56:41 +0900 Subject: [PATCH 2/4] Add role tokens --- crslab/data/dataset/redial/redial.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crslab/data/dataset/redial/redial.py b/crslab/data/dataset/redial/redial.py index f713c02..83ca774 100644 --- a/crslab/data/dataset/redial/redial.py +++ b/crslab/data/dataset/redial/redial.py @@ -108,6 +108,16 @@ def _load_raw_data(self): def _load_vocab(self): self.tok2ind = json.load(open(os.path.join(self.dpath, 'token2id.json'), 'r', encoding='utf-8')) + + # Add role tokens + last_index = len(self.tok2ind) + self.role_seeker_token_idx = last_index + self.role_recommender_token_idx = last_index + 1 + self.tok2ind.update({ + "__Seeker__": self.role_seeker_token_idx, + "__Recommender__": self.role_recommender_token_idx, + }) + self.ind2tok = {idx: word for word, idx in self.tok2ind.items()} logger.debug(f"[Load vocab from {os.path.join(self.dpath, 'token2id.json')}]") @@ -187,9 +197,9 @@ def _augment_and_add(self, raw_conv_dict): text_tokens, entities, movies, words = conv["text"], conv["entity"], conv["movie"], conv["word"] if conv['role'] == 'Seeker': - text_tokens.insert(0,23096) + text_tokens.insert(0, self.role_seeker_token_idx) else: - text_tokens.insert(0,23097) + text_tokens.insert(0, self.role_recommender_token_idx) if len(context_tokens) > 0: conv_dict = { From 41ffc492ab90908edd66947b66314475350f1067 Mon Sep 17 00:00:00 2001 From: krta2 Date: Tue, 23 Nov 2021 15:08:54 +0900 Subject: [PATCH 3/4] Refactor adding items to dictionary - just assign rather than update because of large dict size --- crslab/data/dataset/redial/redial.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/crslab/data/dataset/redial/redial.py b/crslab/data/dataset/redial/redial.py index 83ca774..50f308c 100644 --- a/crslab/data/dataset/redial/redial.py +++ b/crslab/data/dataset/redial/redial.py @@ -113,10 +113,8 @@ def _load_vocab(self): last_index = len(self.tok2ind) self.role_seeker_token_idx = last_index self.role_recommender_token_idx = last_index + 1 - self.tok2ind.update({ - "__Seeker__": self.role_seeker_token_idx, - "__Recommender__": self.role_recommender_token_idx, - }) + self.tok2ind["__Seeker__"] = self.role_seeker_token_idx + self.tok2ind["__Recommender__"] = self.role_recommender_token_idx self.ind2tok = {idx: word for word, idx in self.tok2ind.items()} From 7f6b5ac613b9c7dafe0f8521e386c19c9571435a Mon Sep 17 00:00:00 2001 From: krta2 Date: Tue, 23 Nov 2021 16:35:39 +0900 Subject: [PATCH 4/4] Update role token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 이미 있는 토큰 사용 --- crslab/data/dataset/redial/redial.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/crslab/data/dataset/redial/redial.py b/crslab/data/dataset/redial/redial.py index 50f308c..9089a95 100644 --- a/crslab/data/dataset/redial/redial.py +++ b/crslab/data/dataset/redial/redial.py @@ -110,11 +110,11 @@ def _load_vocab(self): self.tok2ind = json.load(open(os.path.join(self.dpath, 'token2id.json'), 'r', encoding='utf-8')) # Add role tokens - last_index = len(self.tok2ind) - self.role_seeker_token_idx = last_index - self.role_recommender_token_idx = last_index + 1 - self.tok2ind["__Seeker__"] = self.role_seeker_token_idx - self.tok2ind["__Recommender__"] = self.role_recommender_token_idx + # last_index = len(self.tok2ind) + # self.role_seeker_token_idx = last_index + # self.role_recommender_token_idx = last_index + 1 + # self.tok2ind["__Seeker__"] = self.role_seeker_token_idx + # self.tok2ind["__Recommender__"] = self.role_recommender_token_idx self.ind2tok = {idx: word for word, idx in self.tok2ind.items()} @@ -194,6 +194,9 @@ def _augment_and_add(self, raw_conv_dict): for i, conv in enumerate(raw_conv_dict): text_tokens, entities, movies, words = conv["text"], conv["entity"], conv["movie"], conv["word"] + # Add role token in front of text_tokens + self.role_seeker_token_idx = 2459 # "seeker" + self.role_recommender_token_idx = 1755 # "recommender" if conv['role'] == 'Seeker': text_tokens.insert(0, self.role_seeker_token_idx) else: