diff --git a/lib/apriori.py b/lib/apriori.py
index 0909d1d..5ad734b 100644
--- a/lib/apriori.py
+++ b/lib/apriori.py
@@ -4,6 +4,7 @@
 a simple implementation of Apriori algorithm by Python.
 """
 
+from tqdm import tqdm
 import sys
 import csv
 import argparse
@@ -285,7 +286,7 @@ def apriori(transactions, **kwargs):
         transaction_manager, min_support, max_length=max_length)
 
     # Calculate ordered stats.
-    for support_record in support_records:
+    for support_record in tqdm(support_records):
         ordered_statistics = list(
             _filter_ordered_statistics(
                 _gen_ordered_statistics(transaction_manager, support_record),
diff --git a/lib/bert.py b/lib/bert.py
index c2e1077..063e2e0 100644
--- a/lib/bert.py
+++ b/lib/bert.py
@@ -1,3 +1,5 @@
+import multiprocessing
+from tqdm import tqdm
 import torch
 import pandas as pd
 from torch import nn
@@ -16,13 +18,15 @@
     def __init__(self, transformer_model, random_seed):
         random.seed(random_seed)
         np.random.seed(random_seed)
-        torch.manual_seed(random_seed)
+        torch.manual_seed(random_seed)
+        torch.set_num_threads(multiprocessing.cpu_count())
         self.random_seed = random_seed
 
-        self.model = AutoModelWithLMHead.from_pretrained(transformer_model)
+        # self.model = AutoModelWithLMHead.from_pretrained(transformer_model)
+        self.model = AutoModel.from_pretrained(transformer_model)
         self.tokenizer = AutoTokenizer.from_pretrained(transformer_model)
         self.terms = []
-        self.embeddings = torch.FloatTensor([])
+        self.embeddings = []
         self.embeddings_2d = None
         self.diffs = []
         self.embed = None
@@ -34,22 +38,27 @@ def read_df(self,df, term_col = 'terms', diff_col = 'diffs'):
         self.diffs = df[diff_col].tolist()
 
-    def add_terms(self, texts):
-        for t in texts:
+    def add_terms(self, texts, method="sum"):
+        for t in tqdm(texts):
             if t not in self.terms:
-                emb = self.get_embedding(t)
+                emb = self.get_embedding(t, method=method)
                 self.terms.append(t)
-                self.embeddings = torch.cat((self.embeddings, emb), dim=0)
+                self.embeddings.append(emb)
+
+        self.embeddings = torch.cat(self.embeddings, dim=0)
 
-    def get_embedding(self, text):
+    def get_embedding(self, text, method="sum"):
         with torch.no_grad():
             input_ids = torch.LongTensor(self.tokenizer.encode(text, add_special_tokens=False)).unsqueeze(0)
             outputs = self.model(input_ids)
             lh = outputs[0]
             if self.embed is not None:
                 lh = self.embed(lh)
-            emb = torch.sum(lh, dim=1)
+            if method == "sum":
+                emb = torch.sum(lh, dim=1)
+            elif method == "mean":
+                emb = torch.mean(lh, dim=1)
         return emb
diff --git a/lib/normalization.py b/lib/normalization.py
index 55bb5b0..c1924ea 100644
--- a/lib/normalization.py
+++ b/lib/normalization.py
@@ -170,7 +170,7 @@ def normalize_corpus(corpus, lemmatize=True,
 
     normalized_corpus = []
 
-    for text in corpus:
+    for text in tqdm(corpus):
         text = html_parser.unescape(text)
         text = expand_contractions(text, CONTRACTION_MAP)
         if lemmatize:
diff --git a/requirements_colab.txt b/requirements_colab.txt
index a988935..3515934 100644
--- a/requirements_colab.txt
+++ b/requirements_colab.txt
@@ -1,3 +1,4 @@
 transformers
 pyfpgrowth
 unidecode
+tqdm
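
For reference, the lib/bert.py changes do three things: they swap AutoModelWithLMHead for AutoModel, so the forward pass returns hidden states instead of vocabulary logits; they buffer per-term embeddings in a Python list and concatenate once after the loop, avoiding the quadratic cost of calling torch.cat on every iteration; and they add a "method" switch between sum- and mean-pooling of token vectors. A minimal standalone sketch of the new pooling path follows (the checkpoint name "bert-base-uncased" is only an illustrative choice, not something the diff pins down):

import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize one term and add a batch dimension, as get_embedding does.
input_ids = torch.LongTensor(
    tokenizer.encode("association rule mining", add_special_tokens=False)
).unsqueeze(0)

with torch.no_grad():
    last_hidden = model(input_ids)[0]      # shape: (1, num_tokens, hidden_size)

emb_sum = torch.sum(last_hidden, dim=1)    # original behaviour; magnitude grows with term length
emb_mean = torch.mean(last_hidden, dim=1)  # new "mean" option; length-invariant
print(emb_sum.shape, emb_mean.shape)       # torch.Size([1, 768]) each for bert-base

Mean-pooling tends to be the safer default when terms vary widely in token count, since summing makes longer terms look larger in every dimension.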