From 590eac5efd936216cbd99501cb3182c8e83278ba Mon Sep 17 00:00:00 2001
From: FrozenWolf-Cyber
Date: Mon, 15 Aug 2022 10:29:12 +0530
Subject: [PATCH 1/2] Optimized garbage collection, Added tqdm progressbars

---
 build_graph.py | 217 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 174 insertions(+), 43 deletions(-)

diff --git a/build_graph.py b/build_graph.py
index e2b75f6..88e4987 100644
--- a/build_graph.py
+++ b/build_graph.py
@@ -1,16 +1,12 @@
 import os
+import gc
 import random
 import numpy as np
 import pickle as pkl
-import networkx as nx
 import scipy.sparse as sp
-from utils import loadWord2Vec, clean_str
 from math import log
-from sklearn import svm
-from nltk.corpus import wordnet as wn
-from sklearn.feature_extraction.text import TfidfVectorizer
 import sys
-from scipy.spatial.distance import cosine
+from tqdm import tqdm
 
 if len(sys.argv) != 2:
     sys.exit("Use: python build_graph.py <dataset>")
@@ -61,7 +57,9 @@
 for train_name in doc_train_list:
     train_id = doc_name_list.index(train_name)
     train_ids.append(train_id)
-print(train_ids)
+    del train_id
+
+gc.collect()
 random.shuffle(train_ids)
 
 # partial labeled data
@@ -71,18 +69,22 @@
 f = open('data/' + dataset + '.train.index', 'w')
 f.write(train_ids_str)
 f.close()
+del train_ids_str, doc_train_list
 
 test_ids = []
 for test_name in doc_test_list:
     test_id = doc_name_list.index(test_name)
     test_ids.append(test_id)
-print(test_ids)
+    del test_id
+
+gc.collect()
 random.shuffle(test_ids)
 
 test_ids_str = '\n'.join(str(index) for index in test_ids)
 f = open('data/' + dataset + '.test.index', 'w')
 f.write(test_ids_str)
 f.close()
+del test_ids_str
 
 ids = train_ids + test_ids
 print(ids)
@@ -99,26 +101,33 @@
 f = open('data/' + dataset + '_shuffle.txt', 'w')
 f.write(shuffle_doc_name_str)
 f.close()
+del shuffle_doc_name_str, doc_name_list, doc_test_list, doc_content_list
 
 f = open('data/corpus/' + dataset + '_shuffle.txt', 'w')
 f.write(shuffle_doc_words_str)
 f.close()
+del shuffle_doc_words_str
 
 # build vocab
-word_freq = {}
+# word_freq = {}
 word_set = set()
 for doc_words in shuffle_doc_words_list:
     words = doc_words.split()
     for word in words:
         word_set.add(word)
-        if word in word_freq:
-            word_freq[word] += 1
-        else:
-            word_freq[word] = 1
+        # if word in word_freq:
+        #     word_freq[word] += 1
+        # else:
+        #     word_freq[word] = 1
+
+    del words
 
 vocab = list(word_set)
 vocab_size = len(vocab)
+del word_set
+gc.collect
+
 
 word_doc_list = {}
 
 for i in range(len(shuffle_doc_words_list)):
@@ -134,11 +143,20 @@
             word_doc_list[word] = doc_list
         else:
             word_doc_list[word] = [i]
+        appeared.add(word)
+        del word
+
+del appeared
+gc.collect()
 
 word_doc_freq = {}
 for word, doc_list in word_doc_list.items():
     word_doc_freq[word] = len(doc_list)
+    del word, doc_list
+
+del word_doc_list
+gc.collect()
 
 word_id_map = {}
 for i in range(vocab_size):
@@ -149,6 +167,8 @@
 f = open('data/corpus/' + dataset + '_vocab.txt', 'w')
 f.write(vocab_str)
 f.close()
+del vocab_str
+gc.collect()
 
 '''
 Word definitions begin
@@ -212,12 +232,18 @@
 for doc_meta in shuffle_doc_name_list:
     temp = doc_meta.split('\t')
     label_set.add(temp[2])
+    del temp
+
 label_list = list(label_set)
+del label_set
+gc.collect()
 
 label_list_str = '\n'.join(label_list)
 f = open('data/corpus/' + dataset + '_labels.txt', 'w')
 f.write(label_list_str)
 f.close()
+del label_list_str
+gc.collect()
 
 # x: feature vectors of training docs, no initial features
 # slect 90% training set
@@ -232,6 +258,8 @@
 f = open('data/' + dataset + '.real_train.name', 'w')
 f.write(real_train_doc_names_str)
 f.close()
+del real_train_doc_names_str, real_train_doc_names
+gc.collect()
 
 row_x = []
 col_x = []
@@ -254,10 +282,17 @@
         # np.random.uniform(-0.25, 0.25)
         data_x.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len
+    del doc_vec, doc_words, words
+
+gc.collect()
+
 
 # x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
 x = sp.csr_matrix((data_x, (row_x, col_x)), shape=(
     real_train_size, word_embeddings_dim))
+del row_x, col_x, data_x
+gc.collect()
+
 y = []
 for i in range(real_train_size):
     doc_meta = shuffle_doc_name_list[i]
@@ -267,6 +302,11 @@
     label_index = label_list.index(label)
     one_hot[label_index] = 1
     y.append(one_hot)
+
+    del doc_meta, label, one_hot, temp
+
+gc.collect()
+
 y = np.array(y)
 print(y)
 
@@ -285,6 +325,8 @@
         if word in word_vector_map:
             word_vector = word_vector_map[word]
             doc_vec = doc_vec + np.array(word_vector)
+
+            del word_vector
 
     for j in range(word_embeddings_dim):
         row_tx.append(i)
@@ -292,10 +334,17 @@
         # np.random.uniform(-0.25, 0.25)
         data_tx.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len
+    del doc_vec, doc_words, words
+
+gc.collect()
+
 
 # tx = sp.csr_matrix((test_size, word_embeddings_dim), dtype=np.float32)
 tx = sp.csr_matrix((data_tx, (row_tx, col_tx)),
                    shape=(test_size, word_embeddings_dim))
+del row_tx, col_tx, data_tx
+gc.collect()
+
 ty = []
 for i in range(test_size):
     doc_meta = shuffle_doc_name_list[i + train_size]
@@ -305,6 +354,9 @@
     label_index = label_list.index(label)
     one_hot[label_index] = 1
     ty.append(one_hot)
+
+    del doc_meta, label, one_hot, temp
+
 ty = np.array(ty)
 print(ty)
 
@@ -321,6 +373,8 @@
         vector = word_vector_map[word]
         word_vectors[i] = vector
+        del vector
+
 
 row_allx = []
 col_allx = []
 data_allx = []
@@ -335,11 +389,19 @@
             word_vector = word_vector_map[word]
             doc_vec = doc_vec + np.array(word_vector)
 
+            del word_vector
+
     for j in range(word_embeddings_dim):
         row_allx.append(int(i))
         col_allx.append(j)
         # np.random.uniform(-0.25, 0.25)
         data_allx.append(doc_vec[j] / doc_len)  # doc_vec[j]/doc_len
+
+    del doc_vec, doc_words, words
+
+del word_vector_map
+gc.collect()
+
 for i in range(vocab_size):
     for j in range(word_embeddings_dim):
         row_allx.append(int(i + train_size))
@@ -354,6 +416,9 @@
 allx = sp.csr_matrix(
     (data_allx, (row_allx, col_allx)), shape=(train_size + vocab_size, word_embeddings_dim))
+del row_allx, col_allx, data_allx, word_vectors
+gc.collect()
+
 
 ally = []
 for i in range(train_size):
     doc_meta = shuffle_doc_name_list[i]
@@ -364,23 +429,68 @@
     one_hot[label_index] = 1
     ally.append(one_hot)
 
+    del doc_meta, temp, one_hot
+
 for i in range(vocab_size):
     one_hot = [0 for l in range(len(label_list))]
     ally.append(one_hot)
 
+    del one_hot
+
+del shuffle_doc_name_list
+gc.collect()
+
 ally = np.array(ally)
 
 print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)
 
+print("Pickling data...")
+f = open("data/ind.{}.x".format(dataset), 'wb')
+pkl.dump(x, f)
+f.close()
+del x
+gc.collect()
+
+f = open("data/ind.{}.y".format(dataset), 'wb')
+pkl.dump(y, f)
+f.close()
+del y
+gc.collect()
+
+f = open("data/ind.{}.tx".format(dataset), 'wb')
+pkl.dump(tx, f)
+f.close()
+del tx
+gc.collect()
+
+f = open("data/ind.{}.ty".format(dataset), 'wb')
+pkl.dump(ty, f)
+f.close()
+del ty
+gc.collect()
+
+f = open("data/ind.{}.allx".format(dataset), 'wb')
+pkl.dump(allx, f)
+f.close()
+del allx
+gc.collect()
+
+f = open("data/ind.{}.ally".format(dataset), 'wb')
+pkl.dump(ally, f)
+f.close()
+del ally, f
+gc.collect()
+
+
 '''
 Doc word heterogeneous graph
 '''
-
+print("word co-occurence with context windows")
 # word co-occurence with context windows
 window_size = 20
 windows = []
 
-for doc_words in shuffle_doc_words_list:
+for doc_words in tqdm(shuffle_doc_words_list):
     words = doc_words.split()
     length = len(words)
     if length <= window_size:
@@ -391,10 +501,15 @@
             window = words[j: j + window_size]
             windows.append(window)
             # print(window)
+            del window
+
+    del words
+gc.collect()
+print("1")
 
 
 word_window_freq = {}
-for window in windows:
+for window in tqdm(windows):
     appeared = set()
     for i in range(len(window)):
         if window[i] in appeared:
@@ -405,8 +520,14 @@
             word_window_freq[window[i]] = 1
         appeared.add(window[i])
 
+    del window
+
+del appeared
+gc.collect()
+print("2")
+
 word_pair_count = {}
-for window in windows:
+for window in tqdm(windows):
     for i in range(1, len(window)):
         for j in range(0, i):
             word_i = window[i]
@@ -427,15 +548,22 @@
             else:
                 word_pair_count[word_pair_str] = 1
 
+            del word_i, word_i_id, word_j, word_j_id, word_pair_str
+
+    del window
+
+gc.collect()
+print("3")
+
 row = []
 col = []
 weight = []
 
 # pmi as weights
-
+print("pmi as weights")
 num_window = len(windows)
 
-for key in word_pair_count:
+for key in tqdm(word_pair_count):
     temp = key.split(',')
     i = int(temp[0])
     j = int(temp[1])
@@ -450,6 +578,11 @@
     col.append(train_size + j)
     weight.append(pmi)
 
+    del pmi, word_freq_i, word_freq_j, temp
+
+del word_pair_count, word_window_freq, windows
+gc.collect()
+
 # word vector cosine similarity as weights
 
 '''
@@ -466,9 +599,10 @@
             weight.append(similarity)
 '''
 # doc word frequency
+print("doc word frequency")
 doc_word_freq = {}
 
-for doc_id in range(len(shuffle_doc_words_list)):
+for doc_id in tqdm(range(len(shuffle_doc_words_list))):
     doc_words = shuffle_doc_words_list[doc_id]
     words = doc_words.split()
     for word in words:
@@ -479,7 +613,12 @@
         else:
             doc_word_freq[doc_word_str] = 1
 
-for i in range(len(shuffle_doc_words_list)):
+    del doc_words, words, doc_word_str
+
+gc.collect()
+print("1")
+
+for i in tqdm(range(len(shuffle_doc_words_list))):
     doc_words = shuffle_doc_words_list[i]
     words = doc_words.split()
     doc_word_set = set()
@@ -499,35 +638,27 @@
         weight.append(freq * idf)
         doc_word_set.add(word)
 
-node_size = train_size + vocab_size + test_size
-adj = sp.csr_matrix(
-    (weight, (row, col)), shape=(node_size, node_size))
+    del doc_words, words, idf, doc_word_set
 
-# dump objects
-f = open("data/ind.{}.x".format(dataset), 'wb')
-pkl.dump(x, f)
-f.close()
 
-f = open("data/ind.{}.y".format(dataset), 'wb')
-pkl.dump(y, f)
-f.close()
+del shuffle_doc_words_list, word_id_map
+gc.collect()
 
-f = open("data/ind.{}.tx".format(dataset), 'wb')
-pkl.dump(tx, f)
-f.close()
+print("2")
 
-f = open("data/ind.{}.ty".format(dataset), 'wb')
-pkl.dump(ty, f)
-f.close()
+node_size = train_size + vocab_size + test_size
+adj = sp.csr_matrix(
+    (weight, (row, col)), shape=(node_size, node_size))
 
-f = open("data/ind.{}.allx".format(dataset), 'wb')
-pkl.dump(allx, f)
-f.close()
+del weight, row, col
+gc.collect()
+print("3")
+
+# dump objects
 
-f = open("data/ind.{}.ally".format(dataset), 'wb')
-pkl.dump(ally, f)
-f.close()
 
 f = open("data/ind.{}.adj".format(dataset), 'wb')
 pkl.dump(adj, f)
 f.close()
+del adj
+gc.collect()

From 6ea489eeb532b349fa8af11d41cc194a37acb103 Mon Sep 17 00:00:00 2001
From: FrozenWolf-Cyber
Date: Mon, 15 Aug 2022 10:45:30 +0530
Subject: [PATCH 2/2] Added ignite progressbar

---
 train_bert_gcn.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/train_bert_gcn.py b/train_bert_gcn.py
index 178b051..f15872b 100644
--- a/train_bert_gcn.py
+++ b/train_bert_gcn.py
@@ -5,6 +5,7 @@
 import dgl
 import torch.utils.data as Data
 from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer, Engine
+from ignite.contrib.handlers.tqdm_logger import ProgressBar
 from ignite.metrics import Accuracy, Loss
 from sklearn.metrics import accuracy_score
 import numpy as np
@@ -214,6 +215,8 @@ def train_step(engine, batch):
 
 trainer = Engine(train_step)
 
+pbar = ProgressBar()
+pbar.attach(trainer)
 
 @trainer.on(Events.EPOCH_COMPLETED)
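
The pattern these two patches rely on can be sketched in isolation as follows. This is an illustrative sketch only, not code from the repository: the `chunks` list and the dummy `train_step` below are made-up placeholders, while `tqdm`, `gc`, and ignite's `Engine`/`ProgressBar` are the same APIs the patches import.

import gc
from tqdm import tqdm
from ignite.engine import Engine
from ignite.contrib.handlers.tqdm_logger import ProgressBar

# 1) tqdm progress bar around a long preprocessing loop, with explicit
#    cleanup of large intermediates (the build_graph.py pattern).
chunks = [list(range(1000)) for _ in range(100)]  # stand-in for `windows`
totals = []
for chunk in tqdm(chunks, desc="processing"):
    totals.append(sum(chunk))
    del chunk            # drop the loop-local reference
del chunks               # drop the large container once it is consumed
gc.collect()             # ask the collector to reclaim the freed objects now

# 2) ignite ProgressBar attached to a trainer (the train_bert_gcn.py pattern).
def train_step(engine, batch):
    return sum(batch)    # dummy step; a real step would run the model

trainer = Engine(train_step)
pbar = ProgressBar()
pbar.attach(trainer)     # shows a per-epoch tqdm bar over the batches

if __name__ == "__main__":
    trainer.run([[1, 2], [3, 4], [5, 6]], max_epochs=2)

The explicit del plus gc.collect() calls trade a little extra runtime for a lower peak memory footprint, and the progress bars only change what is printed, not what is computed.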