From 9bf62d780dfc4147639c3471abe2040a12a79006 Mon Sep 17 00:00:00 2001 From: nittolese <25707285+nittolese@users.noreply.github.com> Date: Fri, 27 Mar 2020 00:40:26 +0100 Subject: [PATCH 1/6] Update apriori.py added tqdm --- lib/apriori.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/apriori.py b/lib/apriori.py index 0909d1d..5ad734b 100644 --- a/lib/apriori.py +++ b/lib/apriori.py @@ -4,6 +4,7 @@ a simple implementation of Apriori algorithm by Python. """ +from tqdm import tqdm import sys import csv import argparse @@ -285,7 +286,7 @@ def apriori(transactions, **kwargs): transaction_manager, min_support, max_length=max_length) # Calculate ordered stats. - for support_record in support_records: + for support_record in tqdm(support_records): ordered_statistics = list( _filter_ordered_statistics( _gen_ordered_statistics(transaction_manager, support_record), From 2002adcdb1a1b58a71a1d2ab4936c9284c274508 Mon Sep 17 00:00:00 2001 From: nittolese <25707285+nittolese@users.noreply.github.com> Date: Fri, 27 Mar 2020 00:41:45 +0100 Subject: [PATCH 2/6] Update bert.py added tqdm and multiprocessing --- lib/bert.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/bert.py b/lib/bert.py index c2e1077..58b0f77 100644 --- a/lib/bert.py +++ b/lib/bert.py @@ -1,3 +1,5 @@ +import multiprocessing +from tqdm import tqdm import torch import pandas as pd from torch import nn @@ -16,7 +18,8 @@ def __init__(self, transformer_model, random_seed): random.seed(random_seed) np.random.seed(random_seed) - torch.manual_seed(random_seed) + torch.manual_seed(random_seed) + torch.set_num_threads(multiprocessing.cpu_count()) self.random_seed = random_seed self.model = AutoModelWithLMHead.from_pretrained(transformer_model) @@ -35,7 +38,7 @@ def read_df(self,df, term_col = 'terms', diff_col = 'diffs'): def add_terms(self, texts): - for t in texts: + for t in tqdm(texts): if t not in self.terms: emb = self.get_embedding(t) self.terms.append(t) From e57ed253aac7c98b51f128d6ac35fab5bc914d12 Mon Sep 17 00:00:00 2001 From: nittolese <25707285+nittolese@users.noreply.github.com> Date: Fri, 27 Mar 2020 00:42:16 +0100 Subject: [PATCH 3/6] Update requirements_colab.txt added tqdm --- requirements_colab.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_colab.txt b/requirements_colab.txt index a988935..3515934 100644 --- a/requirements_colab.txt +++ b/requirements_colab.txt @@ -1,3 +1,4 @@ transformers pyfpgrowth unidecode +tqdm From 13553a0c77183ad75e4a6f2f95bb3a9c91c4e12a Mon Sep 17 00:00:00 2001 From: nittolese <25707285+nittolese@users.noreply.github.com> Date: Fri, 27 Mar 2020 00:49:22 +0100 Subject: [PATCH 4/6] Update normalization.py added tqdm in loop cycle --- lib/normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/normalization.py b/lib/normalization.py index 55bb5b0..c1924ea 100644 --- a/lib/normalization.py +++ b/lib/normalization.py @@ -170,7 +170,7 @@ def normalize_corpus(corpus, lemmatize=True, normalized_corpus = [] - for text in corpus: + for text in tqdm(corpus): text = html_parser.unescape(text) text = expand_contractions(text, CONTRACTION_MAP) if lemmatize: From 52d8f66013980ff118f5421ad376702ecedbfdee Mon Sep 17 00:00:00 2001 From: nittolese <25707285+nittolese@users.noreply.github.com> Date: Fri, 27 Mar 2020 16:02:52 +0100 Subject: [PATCH 5/6] Update bert.py optimized bert --- lib/bert.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/bert.py b/lib/bert.py index 58b0f77..6621c22 100644 --- a/lib/bert.py +++ b/lib/bert.py @@ -22,7 +22,8 @@ def __init__(self, transformer_model, random_seed): torch.set_num_threads(multiprocessing.cpu_count()) self.random_seed = random_seed - self.model = AutoModelWithLMHead.from_pretrained(transformer_model) + # self.model = AutoModelWithLMHead.from_pretrained(transformer_model) + self.model = AutoModel.from_pretrained(transformer_model) self.tokenizer = AutoTokenizer.from_pretrained(transformer_model) self.terms = [] self.embeddings = torch.FloatTensor([]) @@ -37,22 +38,25 @@ def read_df(self,df, term_col = 'terms', diff_col = 'diffs'): self.diffs = df[diff_col].tolist() - def add_terms(self, texts): + def add_terms(self, texts, method="sum"): for t in tqdm(texts): if t not in self.terms: - emb = self.get_embedding(t) + emb = self.get_embedding(t, method=method) self.terms.append(t) self.embeddings = torch.cat((self.embeddings, emb), dim=0) - def get_embedding(self, text): + def get_embedding(self, text, method="sum"): with torch.no_grad(): input_ids = torch.LongTensor(self.tokenizer.encode(text, add_special_tokens=False)).unsqueeze(0) outputs = self.model(input_ids) lh = outputs[0] if self.embed is not None: lh = self.embed(lh) - emb = torch.sum(lh, dim=1) + if method== "sum": + emb = torch.sum(lh, dim=1) + elif method== "mean": + emb = torch.mean(lh, dim=1) return emb From 317c6bbc98c7319b8a895f821b6fd7fc85deebf2 Mon Sep 17 00:00:00 2001 From: nittolese <25707285+nittolese@users.noreply.github.com> Date: Fri, 27 Mar 2020 16:16:40 +0100 Subject: [PATCH 6/6] Update bert.py edited BERT --- lib/bert.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/bert.py b/lib/bert.py index 6621c22..063e2e0 100644 --- a/lib/bert.py +++ b/lib/bert.py @@ -26,7 +26,7 @@ def __init__(self, transformer_model, random_seed): self.model = AutoModel.from_pretrained(transformer_model) self.tokenizer = AutoTokenizer.from_pretrained(transformer_model) self.terms = [] - self.embeddings = torch.FloatTensor([]) + self.embeddings = [] self.embeddings_2d = None self.diffs = [] self.embed = None @@ -43,7 +43,9 @@ def add_terms(self, texts, method="sum"): if t not in self.terms: emb = self.get_embedding(t, method=method) self.terms.append(t) - self.embeddings = torch.cat((self.embeddings, emb), dim=0) + self.embeddings.append(emb) + + self.embeddings = torch.cat(self.embeddings, dim=0) def get_embedding(self, text, method="sum"):