From a6d24aa66c5ac6998e4b7d4293c41cd3ea18b619 Mon Sep 17 00:00:00 2001
From: steven-mindswire
Date: Sun, 21 Feb 2021 22:51:13 -0500
Subject: [PATCH 1/2] production and testing model

---
 nlp/production_models/.DS_Store               | Bin 0 -> 8196 bytes
 nlp/production_models/bert_script             |  12 ++
 nlp/production_models/embedded_valid.py       | 111 ++++++++++++
 nlp/production_models/embedded_valid2.py      | 140 +++++++++++++++
 nlp/production_models/embedding_train_data.py |  25 +++
 nlp/production_models/embedding_wrap.py       |  50 ++++++
 nlp/production_models/embedding_wrap2.py      |  53 ++++++
 nlp/production_models/embedding_wrap_lstm.py  |  49 +++++
 nlp/production_models/fasttext_kfold.py       | 135 ++++++++++++++
 nlp/production_models/kfold_data_gen.py       |  57 ++++++
 nlp/production_models/kfold_regen.py          |  28 +++
 nlp/production_models/kfold_train_data.py     |  20 +++
 nlp/production_models/prod_model_lstm.py      | 111 ++++++++++++
 .../prod_model_transformer.py                 | 113 ++++++++++++
 .../prod_model_transformer2.py                | 141 +++++++++++++++
 nlp/production_models/t_tests.py              |  83 +++++++++
 nlp/test_models/embedded_valid_test.py        | 106 +++++++++++
 nlp/test_models/embedding_wrap_lstm2.py       |  49 +++++
 nlp/test_models/input_filter.py               |  47 +++++
 nlp/test_models/model_char_a.py               |  27 +++
 nlp/test_models/model_cnn_b.py                | 128 ++++++++++++++
 nlp/test_models/model_file_comb.py            |   7 +
 nlp/test_models/model_lstm.py                 | 127 +++++++++++++
 nlp/test_models/model_lstm_a_filtered.py      | 167 ++++++++++++++++++
 nlp/test_models/model_lstm_b.py               | 122 +++++++++++++
 nlp/test_models/model_lstm_c.py               | 139 +++++++++++++++
 nlp/test_models/model_lstm_c_filtered.py      | 159 +++++++++++++++++
 nlp/test_models/model_lstm_d.py               | 137 ++++++++++++++
 nlp/test_models/model_lstm_e.py               | 148 ++++++++++++++++
 nlp/test_models/prod_model_lstm2.py           | 110 ++++++++++++
 30 files changed, 2601 insertions(+)
 create mode 100644 nlp/production_models/.DS_Store
 create mode 100644 nlp/production_models/bert_script
 create mode 100644 nlp/production_models/embedded_valid.py
 create mode 100644 nlp/production_models/embedded_valid2.py
 create mode 100644 nlp/production_models/embedding_train_data.py
 create mode 100644 nlp/production_models/embedding_wrap.py
 create mode 100644 nlp/production_models/embedding_wrap2.py
 create mode 100644 nlp/production_models/embedding_wrap_lstm.py
 create mode 100644 nlp/production_models/fasttext_kfold.py
 create mode 100644 nlp/production_models/kfold_data_gen.py
 create mode 100644 nlp/production_models/kfold_regen.py
 create mode 100644 nlp/production_models/kfold_train_data.py
 create mode 100644 nlp/production_models/prod_model_lstm.py
 create mode 100644 nlp/production_models/prod_model_transformer.py
 create mode 100644 nlp/production_models/prod_model_transformer2.py
 create mode 100644 nlp/production_models/t_tests.py
 create mode 100644 nlp/test_models/embedded_valid_test.py
 create mode 100644 nlp/test_models/embedding_wrap_lstm2.py
 create mode 100644 nlp/test_models/input_filter.py
 create mode 100644 nlp/test_models/model_char_a.py
 create mode 100644 nlp/test_models/model_cnn_b.py
 create mode 100644 nlp/test_models/model_file_comb.py
 create mode 100644 nlp/test_models/model_lstm.py
 create mode 100644 nlp/test_models/model_lstm_a_filtered.py
 create mode 100644 nlp/test_models/model_lstm_b.py
 create mode 100644 nlp/test_models/model_lstm_c.py
 create mode 100644 nlp/test_models/model_lstm_c_filtered.py
 create mode 100644 nlp/test_models/model_lstm_d.py
 create mode 100644 nlp/test_models/model_lstm_e.py
 create mode 100644 nlp/test_models/prod_model_lstm2.py

diff --git
a/nlp/production_models/.DS_Store b/nlp/production_models/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8460dc1f37023646b3cccae0e8c7788f2493973e GIT binary patch literal 8196 zcmeHMU2GIp6h5adFt<>83lxT8b+uB8th!tNN)gz$+e)bjvi;*nwzE4!J8^bq*_qu^ zYOzrt1rz@gdC(Z11fowQ>Vpp&jYboVsX-omG3tXa#uwv*M$er)C8cx|p46mslY7sd zd(N4A&v(DMw`U6g*jd!$04e||RE6Zrsac>1y{Na9ieQA1MDho)AWhW;4|2CzhXWx3 zAp#)+Ap#)+Ap-vg1nABdMJaRdOKsSO2!seMN(99D5TPn$B9PNk`bP&fehNUciiZ6{ zeaZuTpGY7Rft;4occpjA>;VH;3{nh~?xc?hbCQWbPD?4>0i`=&Ff#@j3i`8?U&J>D zOh_5FAp#)+w<92O_iD=49nis;o4;qq-F(J!GNk`JlebLQi^Z;EMP=2JrOOnhTB)w7 zelj)YjHf)!E4rOp;kbA@ta)RZpx^H}*3G__88nUYG+)!@*q&zEM!|PBb%RV!44Jm> zjCVRkTX+5Uq@+Son8MOrot$jl8I81Uor*>#<6GNe(MUWN-!?VHlyw{9dwNH56K5x% zo;vrsj1Pk|0&3^Qd$C-wx|_8!v|Qo1AR5+aJe75F`8h9K*v>6=Nz^JPW>NcY7B$9} zt38kI?@e=Fbd4j1rFmw_NNS$prg^PP@bzXL`*6W15mZB2$13J*($r-&D_bOEqrp|>S2@ULYHC;Vwd+)M zxR^0d7fo-1Hz|t$t_^Ae4%6o3C={2~VN?$9 zQTM4Q#mtXsu6HPJ*kV*I@)V zVl%ejW^Bcs*oN)68jzgHn5zJr~Ei7UQC-5vjhcDsF_$t1JZ{pi{8L!|~ zyoMj($8v2Ivn)R_hm_RCPpBzJ&ft@-IJpox8;G2n{u6TESm85r)#^3%cQ>^}<2!aI zZtm#3J-mfxbD8DSN%F<#(JW8=9i1WGu5Vy>HO|s{KHUe1dBzz<(1$OR)(!XGrx0Vf z;y<#6=1q)f%`wn6Zy{|(pnEt*I*nIK9ox2DA~g@BSdpRU`K~r{V@|B-U*r-aUX(+>Seld%Jw* zbz=|i$0zUzrf`7xH-Z}RPsdZ}U>;ASOFTS>&)_sZi_hbEynrtfA73FpzJZtKF|cO1z+yuHtCFc?Cq-TEOcc3xgz5-YQH0&Jl)ei!emYLlPsd5#{D&d+ gBV;Ok5`mnSl18Zf>puj@KWoGLKfM2a{w>z#A5h;n+yDRo literal 0 HcmV?d00001 diff --git a/nlp/production_models/bert_script b/nlp/production_models/bert_script new file mode 100644 index 0000000..c5af485 --- /dev/null +++ b/nlp/production_models/bert_script @@ -0,0 +1,12 @@ +export BERT_BASE_DIR=/Users/stevenlu/Downloads/bert-master/uncased_L-12_H-768_A-12 + + +python3 extract_features.py \ + --input_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content/4th_fold_content.txt \ + --output_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded/4th_fold_base_-3_-2_-1.jsonl \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --layers=-10,-11,-12 \ + --max_seq_length=128 \ + --batch_size=8 diff --git a/nlp/production_models/embedded_valid.py b/nlp/production_models/embedded_valid.py new file mode 100644 index 0000000..3fade06 --- /dev/null +++ b/nlp/production_models/embedded_valid.py @@ -0,0 +1,111 @@ +from tensorflow import keras,nn +from tensorflow.keras import layers +import json +import numpy as np +import tensorflow as tf +import os +import sys + +#sudo python3 -m pip install h5py==2.10.0 + + +def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter): + date_files = {} + for file_d in train_files: + file_name,file_len = file_d.split(':') + date_files[file_name] = int(file_len) + total_size = 0 + for train_f in date_files: + total_size+=date_files[train_f] + input_sample = np.zeros((total_size, *(sentence_len,embedd_dim))) + out_sample = [] + i = 0 + for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + row = json.loads(row) + features = row['features'] + if token_len_filter: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i max_score[1]: + max_score = score + max_model = f + elif score[1] == max_score[1] and max_model is not None: + epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-') + epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', 
'').split('-') + if score[0]epoch_n_m: + max_score = score + max_model = f + print('max_score: ',max_score) + print ('max_model: ',max_model) + valid_input,valid_output= data_gen(valid_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter) + model = keras.models.load_model('{}/{}'.format(model_folder,max_model)) + score = model.evaluate(valid_input, valid_output) + print('valid_score: ',score) + return max_model,score,max_score,len(valid_input) + +if __name__=='__main__': + #sentence_len = 50 + #embedd_dim = 768 + #test_date_files = '2th_fold.json:1684' + #valid_date_files = '3th_fold.json:1684' + #model_folder = 'model_lstm_3_small' + #token_len_filter = True + #token_len_range = ['0', '50'] + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + test_date_files = sys.argv[3].split(',') + valid_date_files = sys.argv[4].split(',') + model_folder = sys.argv[5] + token_len_filter = sys.argv[6].lower() == 'true' + token_len_range = sys.argv[7].split(',') + #python3 embedded_valid.py 768 50 3th_fold.json:1684 4th_fold.json:1691 trans_model1_filtered true 0,50 + mdoel_valid(embedd_dim, sentence_len, test_date_files, valid_date_files, model_folder, token_len_range, + token_len_filter) \ No newline at end of file diff --git a/nlp/production_models/embedded_valid2.py b/nlp/production_models/embedded_valid2.py new file mode 100644 index 0000000..9109943 --- /dev/null +++ b/nlp/production_models/embedded_valid2.py @@ -0,0 +1,140 @@ +from tensorflow import keras,nn +from tensorflow.keras import layers +import json +import numpy as np +import tensorflow as tf +import os +import sys + +#sudo python3 -m pip install h5py==2.10.0 + + +def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter): + date_files = {} + for file_d in train_files: + file_name,file_len = file_d.split(':') + date_files[file_name] = int(file_len) + total_size = 0 + for train_f in date_files: + total_size+=date_files[train_f] + input_sample = np.zeros((total_size, *(sentence_len,embedd_dim))) + out_sample = [] + i = 0 + for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + row = json.loads(row) + features = row['features'] + if token_len_filter: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i max_score[1]: + max_score = score + max_model = f + elif score[1] == max_score[1] and max_model is not None: + epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-') + epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', '').split('-') + if score[0]epoch_n_m: + max_score = score + max_model = f + print('max_score: ',max_score) + print ('max_model: ',max_model) + valid_input,valid_output= data_gen(valid_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter) + valid_sample_ind = np.where(valid_input != 0, 1, 0) + valid_pos_ind = np.multiply(position_embb,valid_sample_ind) + valid_pos =valid_input+(valid_pos_ind*pos_encode_scale) + model = keras.models.load_model('{}/{}'.format(model_folder,max_model)) + score = model.evaluate(valid_pos, valid_output) + print('valid_score: ',score) + print (score_l) + return max_model,score,max_score,len(valid_input) + +if __name__=='__main__': + #sentence_len = 50 + #embedd_dim = 768 + #test_date_files = '2th_fold.json:1684' + #valid_date_files = '3th_fold.json:1684' + #model_folder = 'model_lstm_3_small' + #token_len_filter = True + 
#token_len_range = ['0', '50'] + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + test_date_files = sys.argv[3].split(',') + valid_date_files = sys.argv[4].split(',') + model_folder = sys.argv[5] + token_len_filter = sys.argv[6].lower() == 'true' + token_len_range = sys.argv[7].split(',') + #python3 embedded_valid2.py 768 50 3th_fold.json:1684 4th_fold.json:1691 transB_1_filtered true 0,50 + #inputs1: 768 50 ['3th_fold.json:1684'] ['4th_fold.json:1691'] transB_1_filtered ['0', '50'] True + mdoel_valid(embedd_dim, sentence_len, test_date_files, valid_date_files, model_folder, token_len_range, + token_len_filter) \ No newline at end of file diff --git a/nlp/production_models/embedding_train_data.py b/nlp/production_models/embedding_train_data.py new file mode 100644 index 0000000..016e233 --- /dev/null +++ b/nlp/production_models/embedding_train_data.py @@ -0,0 +1,25 @@ +import sys + +base_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/labels' + + +label_names = ['Added', 'Changed', 'Deprecated', 'Fixed', 'Removed', 'Security'] + +#label_names = ['Added'] + + + +for label_n in label_names: + label_file_path = '{}/{}.txt'.format(base_path, label_n.lower()) + label = '__label__{}'.format(label_n) + label_len = len(label) + label_content_file = open('{}/{}_content.txt'.format(base_path, label_n.lower()),'w') + label_fiile = open(label_file_path, 'r') + for row in label_fiile: + row = row.strip() + content = row[label_len + 1:] + label_content_file.write(content+'\n') + label_content_file.close() + label_fiile.close() + + diff --git a/nlp/production_models/embedding_wrap.py b/nlp/production_models/embedding_wrap.py new file mode 100644 index 0000000..83d8919 --- /dev/null +++ b/nlp/production_models/embedding_wrap.py @@ -0,0 +1,50 @@ +from prod_model_transformer import mdoel_build +from embedded_valid import mdoel_valid +import os +import sys + +def model_write(embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr): + out_dict = {} + out_dict['filtered'] = {} + out_dict['restricted'] = {} + out_dict['model_name'] = model_name + #res models + res_model = '{}_restricted'.format(model_name) + os.mkdir(res_model) + res_model_build_time,res_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,res_model, res_model,seq_range,False,lr) + res_model_name1, res_valid_score1, res_test_score1,res_valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, res_model, seq_range,False) + out_dict['restricted']['mold_build_time'] = res_model_build_time + out_dict['restricted']['sample_length'] = res_sample_l + out_dict['restricted']['validiation'] = [] + out_dict['restricted']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':res_model_name1,'valid_score':res_valid_score1,'test_score':res_test_score1,'valid_sample_size':res_valid_l1}) + res_model_name2, res_valid_score2, res_test_score2,res_valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, res_model, seq_range,False) + out_dict['restricted']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':res_model_name2,'valid_score':res_valid_score2,'test_score':res_test_score2,'valid_sample_size':res_valid_l2}) + #filtered_model + filtered_model = '{}_filtered'.format(model_name) + os.mkdir(filtered_model) + filtered_model_build_time,filter_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,filtered_model, 
filtered_model,seq_range,True,lr) + filtered_model_name1, filter_valid_score1, filter_test_score1,valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, filtered_model, seq_range,True) + out_dict['filtered']['mold_build_time'] = filtered_model_build_time + out_dict['filtered']['sample_length'] = filter_sample_l + out_dict['filtered']['validiation'] = [] + out_dict['filtered']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':filtered_model_name1,'valid_score':filter_valid_score1,'test_score':filter_test_score1,'valid_sample_size':valid_l1}) + filtered_model_name2, filter_valid_score2, filter_test_score2,valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, filtered_model, seq_range,True) + out_dict['filtered']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':filtered_model_name2,'valid_score':filter_valid_score2,'test_score':filter_test_score2,'valid_sample_size':valid_l2}) + + print(out_dict) + return out_dict + +if __name__=='__main__': + #embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + neuron_size = int(sys.argv[3]) + train_files = sys.argv[4].split(',') + test_files = sys.argv[5].split(',') + model_name = sys.argv[6] + seq_range = sys.argv[7].split(',') + valid_date_files = sys.argv[8].split(',') + lr = float(sys.argv[9]) + #python3 embedding_wrap_lstm.py 768 50 100 1th_fold.json:1684,2th_fold.json:1684,3th_fold.json:1684 4th_fold.json:1691 model7 0,50 0th_fold.json:1684 0.0001 50 + model_write(embedd_dim, sentence_len, neuron_size, train_files, test_files, model_name, seq_range,valid_date_files, lr) + diff --git a/nlp/production_models/embedding_wrap2.py b/nlp/production_models/embedding_wrap2.py new file mode 100644 index 0000000..a772748 --- /dev/null +++ b/nlp/production_models/embedding_wrap2.py @@ -0,0 +1,53 @@ +from prod_model_transformer2 import mdoel_build +from embedded_valid2 import mdoel_valid +import os +import sys + +def model_write(embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr,pos_encode_scale): + out_dict = {} + out_dict['filtered'] = {} + out_dict['restricted'] = {} + out_dict['model_name'] = model_name + #res models + res_model = '{}_restricted'.format(model_name) + os.mkdir(res_model) + res_model_build_time,res_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,res_model, res_model,seq_range,False,lr,pos_encode_scale) + res_model_name1, res_valid_score1, res_test_score1,res_valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, res_model, seq_range,False,pos_encode_scale) + out_dict['restricted']['mold_build_time'] = res_model_build_time + out_dict['restricted']['sample_length'] = res_sample_l + out_dict['restricted']['validiation'] = [] + out_dict['restricted']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':res_model_name1,'valid_score':res_valid_score1,'test_score':res_test_score1,'valid_sample_size':res_valid_l1}) + res_model_name2, res_valid_score2, res_test_score2,res_valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, res_model, seq_range,False,pos_encode_scale) + 
out_dict['restricted']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':res_model_name2,'valid_score':res_valid_score2,'test_score':res_test_score2,'valid_sample_size':res_valid_l2}) + #filtered_model + filtered_model = '{}_filtered'.format(model_name) + os.mkdir(filtered_model) + filtered_model_build_time,filter_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,filtered_model, filtered_model,seq_range,True,lr,pos_encode_scale) + print('inputs1: ',embedd_dim, sentence_len, test_files, valid_date_files, filtered_model, seq_range,True,pos_encode_scale) + filtered_model_name1, filter_valid_score1, filter_test_score1,valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, filtered_model, seq_range,True,pos_encode_scale) + print('filter1: ', filtered_model_name1, filter_valid_score1, filter_test_score1, valid_l1) + out_dict['filtered']['mold_build_time'] = filtered_model_build_time + out_dict['filtered']['sample_length'] = filter_sample_l + out_dict['filtered']['validiation'] = [] + out_dict['filtered']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':filtered_model_name1,'valid_score':filter_valid_score1,'test_score':filter_test_score1,'valid_sample_size':valid_l1}) + filtered_model_name2, filter_valid_score2, filter_test_score2,valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, filtered_model, seq_range,True,pos_encode_scale) + print ('filter2: ',filtered_model_name2, filter_valid_score2, filter_test_score2,valid_l2) + out_dict['filtered']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':filtered_model_name2,'valid_score':filter_valid_score2,'test_score':filter_test_score2,'valid_sample_size':valid_l2}) + print(out_dict) + return out_dict + +if __name__=='__main__': + #embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + neuron_size = int(sys.argv[3]) + train_files = sys.argv[4].split(',') + test_files = sys.argv[5].split(',') + model_name = sys.argv[6] + seq_range = sys.argv[7].split(',') + valid_date_files = sys.argv[8].split(',') + lr = float(sys.argv[9]) + pos_encode_scale = float(sys.argv[10]) + ##python3 embedding_wrap2.py 768 50 100 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 transB_1 0,50 4th_fold.json:1691 0.00005 0.00001 + model_write(embedd_dim, sentence_len, neuron_size, train_files, test_files, model_name, seq_range,valid_date_files, lr,pos_encode_scale) + diff --git a/nlp/production_models/embedding_wrap_lstm.py b/nlp/production_models/embedding_wrap_lstm.py new file mode 100644 index 0000000..2b6fc40 --- /dev/null +++ b/nlp/production_models/embedding_wrap_lstm.py @@ -0,0 +1,49 @@ +from prod_model_lstm import mdoel_build +from embedded_valid import mdoel_valid +import os +import sys + +def model_write(embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr,skip_iter): + out_dict = {} + out_dict['filtered'] = {} + out_dict['restricted'] = {} + out_dict['model_name'] = 'lstm_{}'.format(model_name) + res_model = '{}_restricted'.format(model_name) + os.mkdir(res_model) + res_model_build_time,res_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,res_model, res_model,seq_range,False,lr,skip_iter) + res_model_name1, res_valid_score1, res_test_score1,res_valid_l1 = 
mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, res_model, seq_range,False) + out_dict['restricted']['mold_build_time'] = res_model_build_time + out_dict['restricted']['sample_length'] = res_sample_l + out_dict['restricted']['validiation'] = [] + out_dict['restricted']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':res_model_name1,'valid_score':res_valid_score1,'test_score':res_test_score1,'valid_sample_size':res_valid_l1}) + res_model_name2, res_valid_score2, res_test_score2,res_valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, res_model, seq_range,False) + out_dict['restricted']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':res_model_name2,'valid_score':res_valid_score2,'test_score':res_test_score2,'valid_sample_size':res_valid_l2}) + filtered_model = '{}_filtered'.format(model_name) + os.mkdir(filtered_model) + filtered_model_build_time,filter_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,filtered_model, filtered_model,seq_range,True,lr,skip_iter) + filtered_model_name1, filter_valid_score1, filter_test_score1,valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, filtered_model, seq_range,True) + out_dict['filtered']['mold_build_time'] = filtered_model_build_time + out_dict['filtered']['sample_length'] = filter_sample_l + out_dict['filtered']['validiation'] = [] + out_dict['filtered']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':filtered_model_name1,'valid_score':filter_valid_score1,'test_score':filter_test_score1,'valid_sample_size':valid_l1}) + filtered_model_name2, filter_valid_score2, filter_test_score2,valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, filtered_model, seq_range,True) + out_dict['filtered']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':filtered_model_name2,'valid_score':filter_valid_score2,'test_score':filter_test_score2,'valid_sample_size':valid_l2}) + print(out_dict) + return out_dict + +if __name__=='__main__': + #embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + neuron_size = int(sys.argv[3]) + train_files = sys.argv[4].split(',') + test_files = sys.argv[5].split(',') + model_name = sys.argv[6] + seq_range = sys.argv[7].split(',') + valid_date_files = sys.argv[8].split(',') + lr = float(sys.argv[9]) + skip_iter = int(sys.argv[10]) + #python3 embedding_wrap_lstm.py 768 50 100 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 model1 0,50 4th_fold.json:1691 0.00008 50 + model_write(embedd_dim, sentence_len, neuron_size, train_files, test_files, model_name, seq_range,valid_date_files, lr,skip_iter) + + diff --git a/nlp/production_models/fasttext_kfold.py b/nlp/production_models/fasttext_kfold.py new file mode 100644 index 0000000..f8a4f21 --- /dev/null +++ b/nlp/production_models/fasttext_kfold.py @@ -0,0 +1,135 @@ +import os +import sys +from datetime import datetime +import random +import json +import fasttext +import time + + +#model_id = '6' + + +#train_files = ['0th_fold.json', '3th_fold.json', '4th_fold.json'] +#test_files = ['1th_fold.json'] +#valid_files = ['2th_fold.json'] + +def mdoel_build(train_files,test_files,valid_files, model_id,seq_range): + out_dict = {} + out_dict['filtered'] = {} + out_dict['full'] = {} 
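    # The dict assembled here is the run summary returned by this function: the
    # 'filtered' variant keeps only samples whose BERT token count falls inside
    # seq_range, while 'full' keeps every sample.  Each variant collects a
    # 'validiation' list with two entries, one per direction of the select/valid
    # fold swap performed below.  A single entry has roughly the shape sketched
    # in this illustrative (not recorded) example; 'valid_score' is the
    # (n_samples, precision@1, recall@1) tuple returned by fastText's model.test():
    #
    #   {'test_file': 'fast_select_6.txt',
    #    'valid_file': 'fast_valid_6.txt',
    #    'model_name': 'model_token_full/fasttext_model6.ftz',
    #    'valid_score': (1684, 0.80, 0.80),
    #    'valid_sample_size': 1684,
    #    'model_build_time': 512.3}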
+ out_dict['model_name'] = 'fasttext_{}'.format(model_id) + out_dict['full']['validiation'] = [] + out_dict['filtered']['validiation'] = [] + file_size = {} + size = '1M' + for isfiltered in [True,False]: + if isfiltered: + tf = 'fast_train_{}_token_len_50.txt'.format(model_id) + sf = 'fast_select_{}_token_len_50.txt'.format(model_id) + vf = 'fast_valid_{}_token_len_50.txt'.format(model_id) + else: + tf = 'fast_train_{}.txt'.format(model_id) + sf = 'fast_select_{}.txt'.format(model_id) + vf = 'fast_valid_{}.txt'.format(model_id) + print ('iffiltered: ',isfiltered) + print(tf,sf,vf) + out_file = open(tf, 'w') + select_file = open(sf, 'w') + valid_file = open(vf, 'w') + data_c = 0 + for fold_n in train_files: + infile = open(fold_n,'r') + for row in infile: + row = json.loads(row.strip()) + label = row['label'] + content = row['content'] + features = row['features'] + if isfiltered: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + output = ' '.join([label,content])+'\n' + a=out_file.write(output) + data_c+=1 + else: + output = ' '.join([label, content]) + '\n' + a = out_file.write(output) + data_c+=1 + infile.close() + out_file.close() + file_size[tf] = data_c + data_c = 0 + for fold_n in test_files: + infile = open(fold_n,'r') + for row in infile: + row = json.loads(row.strip()) + label = row['label'] + content = row['content'] + features = row['features'] + if isfiltered: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + output = ' '.join([label,content])+'\n' + a=select_file.write(output) + data_c+=1 + else: + output = ' '.join([label, content]) + '\n' + a = select_file.write(output) + data_c+=1 + infile.close() + select_file.close() + file_size[sf] = data_c + data_c = 0 + for fold_n in valid_files: + infile = open(fold_n,'r') + for row in infile: + row = json.loads(row.strip()) + label = row['label'] + content = row['content'] + features = row['features'] + if isfiltered: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + output = ' '.join([label,content])+'\n' + a=valid_file.write(output) + data_c += 1 + else: + output = ' '.join([label, content]) + '\n' + a = valid_file.write(output) + data_c += 1 + infile.close() + valid_file.close() + file_size[vf] = data_c + for model_i in [0,1]: + params = dict(input=tf) + initial_time = time.time() + if model_i ==0: + out_model_id = model_id + selecting = sf + validating = vf + else: + out_model_id = '-'.join([model_id, '1']) + selecting = vf + validating = sf + params.update(dict( + autotuneValidationFile=selecting, + autotuneModelSize=size, + )) + model = fasttext.train_supervised(**params) + score = model.test(validating) + time_consum = time.time() - initial_time + if isfiltered: + save_model_name = 'model_token_filtered/fasttext_model{}.ftz'.format(out_model_id) + out_dict['filtered']['validiation'].append({'test_file':selecting,'valid_file':validating,'model_name':save_model_name,'valid_score':score,'valid_sample_size':file_size[validating],'model_build_time':time_consum}) + else: + save_model_name = 'model_token_full/fasttext_model{}.ftz'.format(out_model_id) + out_dict['full']['validiation'].append({'test_file':selecting,'valid_file':validating,'model_name':save_model_name,'valid_score':score,'valid_sample_size':file_size[validating],'model_build_time':time_consum}) + model.save_model(save_model_name) + print(out_dict) + return out_dict + +if __name__=='__main__': + #train_files,test_files,valid_files, model_id,seq_range + train_files = 
sys.argv[1].split(',') + test_files = sys.argv[2].split(',') + valid_files = sys.argv[3].split(',') + model_id = sys.argv[4] + seq_range = sys.argv[5].split(',') + mdoel_build(train_files, test_files, valid_files, model_id, seq_range) diff --git a/nlp/production_models/kfold_data_gen.py b/nlp/production_models/kfold_data_gen.py new file mode 100644 index 0000000..3b08066 --- /dev/null +++ b/nlp/production_models/kfold_data_gen.py @@ -0,0 +1,57 @@ +import json +import random + +content_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/labels' +embedded_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/embedded' +k_fold_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold' + +label_names = ['Added', 'Changed', 'Deprecated', 'Fixed', 'Removed', 'Security'] + +k = 5 + +k_fold_l = {} + +for k_i in range(0,k): + k_fold_l[k_i] = [] + +#label_names = ['Security','Changed'] + + +for label_n in label_names: + input_l = [] + label = '__label__{}'.format(label_n) + embedded_file = open('{}/{}.jsonl'.format(embedded_path, label_n.lower()),'r') + for row in embedded_file: + row = json.loads(row.strip()) + row['label'] = label + input_l.append(row) + embedded_file.close() + file_i = 0 + content_file = open('{}/{}_content.txt'.format(content_path, label_n.lower()), 'r') + for row in content_file: + input_l[file_i]['content'] = row.strip() + file_i+=1 + content_file.close() + random.shuffle(input_l) + chunk_size = int(file_i/k) + for k_i in range(0, k): + inital_i = k_i * chunk_size + if k_i == k-1: + k_fold_l[k_i] += input_l[inital_i:] + else: + end_i = (k_i+1)*chunk_size + k_fold_l[k_i] += input_l[inital_i:end_i] + + + + +for k_i in range(0,k): + k_file = open('{}/{}th_fold.json'.format(k_fold_path,k_i),'w') + out_l = k_fold_l[k_i] + random.shuffle(out_l) + for record in out_l: + record = json.dumps(record) + k_file.write(record+'\n') + k_file.close() + + diff --git a/nlp/production_models/kfold_regen.py b/nlp/production_models/kfold_regen.py new file mode 100644 index 0000000..2ab7277 --- /dev/null +++ b/nlp/production_models/kfold_regen.py @@ -0,0 +1,28 @@ +import json +base_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold' +embedded_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded' +out_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_regen3' + +fold_names = ['0th_fold', '1th_fold', '2th_fold', '3th_fold', '4th_fold'] + +embedded_type = '_base_-3_-2_-1' + +for label_n in fold_names: + k_file = open('{}/{}.json'.format(base_path, label_n.lower()),'r') + fold_data = [] + for row in k_file: + row= json.loads(row.strip()) + a= row.pop('features', None) + fold_data.append(row) + k_file.close() + embedded_file = open('{}/{}{}.jsonl'.format(embedded_path, label_n.lower(),embedded_type),'r') + out_file = open('{}/{}.json'.format(out_path, label_n.lower()),'w') + i = 0 + for row in embedded_file: + row = json.loads(row.strip()) + fold_data[i]['features']= row['features'] + out_file.write(json.dumps(fold_data[i])+'\n') + i+=1 + out_file.close() + embedded_file.close() + diff --git a/nlp/production_models/kfold_train_data.py b/nlp/production_models/kfold_train_data.py new file mode 100644 index 0000000..60d06ea --- /dev/null +++ b/nlp/production_models/kfold_train_data.py @@ -0,0 +1,20 @@ +import json +base_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold' +content_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content' + +fold_names = ['0th_fold', '1th_fold', '2th_fold', '3th_fold', '4th_fold'] 
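# The loop below flattens each fold back to plain text: it reads
# k_fold/<fold>.json line by line and writes just the raw 'content' string,
# one sample per line, to k_fold_content/<fold>_content.txt.  Those text files
# are the --input_file fed to BERT's extract_features.py in bert_script, and
# the resulting k_fold_embedded/<fold>_base_-3_-2_-1.jsonl files are merged
# back onto the folds by kfold_regen.py.  (The label/label_len variables
# computed in the loop appear to be left over from embedding_train_data.py and
# are not used.)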
+ +for label_n in fold_names: + k_file_path = '{}/{}.json'.format(base_path, label_n.lower()) + label = '__label__{}'.format(label_n) + label_len = len(label) + k_content_file = open('{}/{}_content.txt'.format(content_path, label_n.lower()),'w') + k_fiile = open(k_file_path, 'r') + for row in k_fiile: + row = json.loads(row.strip()) + content = row['content'] + k_content_file.write(content+'\n') + k_content_file.close() + k_fiile.close() + + diff --git a/nlp/production_models/prod_model_lstm.py b/nlp/production_models/prod_model_lstm.py new file mode 100644 index 0000000..89beffc --- /dev/null +++ b/nlp/production_models/prod_model_lstm.py @@ -0,0 +1,111 @@ +from tensorflow import keras,nn +from tensorflow.keras import layers +import json +import numpy as np +import time +from contextlib import redirect_stdout +import sys + + +def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter): + date_files = {} + for file_d in train_files: + file_name,file_len = file_d.split(':') + date_files[file_name] = int(file_len) + total_size = 0 + for train_f in date_files: + total_size+=date_files[train_f] + input_sample = np.zeros((total_size, *(sentence_len,embedd_dim))) + out_sample = [] + i = 0 + for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + row = json.loads(row) + features = row['features'] + if token_len_filter: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i 0: + model.fit(input_sample, out_sample_array, epochs=skip_iter, batch_size=200, validation_data=(test_sample, test_label)) + #model.fit(input_sample, out_sample_array,epochs=20,batch_size = 200,validation_data = (test_sample, test_label)) + model.fit(input_sample, out_sample_array,epochs=150,batch_size = 200,validation_data = (test_sample, test_label),callbacks=[checkpoint1]) + consumed_time = time.time()-initial_time + print('consumed_time',consumed_time) + return consumed_time,len(input_sample) + + +if __name__=='__main__': + #embedd_dim,sentence_len,neuron_size,train_files,outpath, model_name + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + neuron_size = int(sys.argv[3]) + train_files = sys.argv[4].split(',') + test_files = sys.argv[5].split(',') + outpath = sys.argv[6] + model_name = sys.argv[7] + seq_range = sys.argv[8].split(',') + token_len_filter = sys.argv[9].lower() == 'true' + lr = float(sys.argv[10]) + skip_iter = int(sys.argv[11]) + #python3 embedding_wrap_lstm.py 768 50 100 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 model1 0,50 4th_fold.json:1691 0.0001 50 + mdoel_build(embedd_dim, sentence_len, neuron_size, train_files, test_files, outpath, model_name,seq_range,token_len_filter,lr,skip_iter) \ No newline at end of file diff --git a/nlp/production_models/prod_model_transformer.py b/nlp/production_models/prod_model_transformer.py new file mode 100644 index 0000000..8307312 --- /dev/null +++ b/nlp/production_models/prod_model_transformer.py @@ -0,0 +1,113 @@ +from tensorflow import keras,nn +from tensorflow.keras import layers +import json +import numpy as np +import time +from contextlib import redirect_stdout +import sys + +#https://keras.io/examples/nlp/text_classification_with_transformer/ + +def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter): + date_files = {} + for file_d in train_files: + file_name,file_len = file_d.split(':') + 
date_files[file_name] = int(file_len) + total_size = 0 + for train_f in date_files: + total_size+=date_files[train_f] + input_sample = np.zeros((total_size, *(sentence_len,embedd_dim))) + out_sample = [] + i = 0 + for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + row = json.loads(row) + features = row['features'] + if token_len_filter: + if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i max_score[1]: + max_score = score + max_model = f + elif score[1] == max_score[1] and max_model is not None: + epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-') + epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', '').split('-') + if score[0]epoch_n_m: + max_score = score + max_model = f + +print('max_score: ',max_score) +print ('max_model: ',max_model) diff --git a/nlp/test_models/embedding_wrap_lstm2.py b/nlp/test_models/embedding_wrap_lstm2.py new file mode 100644 index 0000000..967be1d --- /dev/null +++ b/nlp/test_models/embedding_wrap_lstm2.py @@ -0,0 +1,49 @@ +from prod_model_lstm2 import mdoel_build +from embedded_valid import mdoel_valid +import os +import sys + +def model_write(embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr,skip_iter): + out_dict = {} + out_dict['filtered'] = {} + out_dict['restricted'] = {} + out_dict['model_name'] = 'lstm_{}'.format(model_name) + res_model = '{}_restricted'.format(model_name) + os.mkdir(res_model) + res_model_build_time,res_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,res_model, res_model,seq_range,False,lr,skip_iter) + res_model_name1, res_valid_score1, res_test_score1,res_valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, res_model, seq_range,False) + out_dict['restricted']['mold_build_time'] = res_model_build_time + out_dict['restricted']['sample_length'] = res_sample_l + out_dict['restricted']['validiation'] = [] + out_dict['restricted']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':res_model_name1,'valid_score':res_valid_score1,'test_score':res_test_score1,'valid_sample_size':res_valid_l1}) + res_model_name2, res_valid_score2, res_test_score2,res_valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, res_model, seq_range,False) + out_dict['restricted']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':res_model_name2,'valid_score':res_valid_score2,'test_score':res_test_score2,'valid_sample_size':res_valid_l2}) + filtered_model = '{}_filtered'.format(model_name) + os.mkdir(filtered_model) + filtered_model_build_time,filter_sample_l= mdoel_build(embedd_dim,sentence_len,neuron_size,train_files,test_files,filtered_model, filtered_model,seq_range,True,lr,skip_iter) + filtered_model_name1, filter_valid_score1, filter_test_score1,valid_l1 = mdoel_valid(embedd_dim, sentence_len, test_files, valid_date_files, filtered_model, seq_range,True) + out_dict['filtered']['mold_build_time'] = filtered_model_build_time + out_dict['filtered']['sample_length'] = 
filter_sample_l + out_dict['filtered']['validiation'] = [] + out_dict['filtered']['validiation'].append({'test_file':test_files,'valid_file':valid_date_files,'model_name':filtered_model_name1,'valid_score':filter_valid_score1,'test_score':filter_test_score1,'valid_sample_size':valid_l1}) + filtered_model_name2, filter_valid_score2, filter_test_score2,valid_l2 = mdoel_valid(embedd_dim, sentence_len, valid_date_files,test_files, filtered_model, seq_range,True) + out_dict['filtered']['validiation'].append({'test_file':valid_date_files,'valid_file':test_files,'model_name':filtered_model_name2,'valid_score':filter_valid_score2,'test_score':filter_test_score2,'valid_sample_size':valid_l2}) + print(out_dict) + return out_dict + +if __name__=='__main__': + #embedd_dim,sentence_len,neuron_size,train_files,test_files, model_name,seq_range,valid_date_files,lr + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + neuron_size = int(sys.argv[3]) + train_files = sys.argv[4].split(',') + test_files = sys.argv[5].split(',') + model_name = sys.argv[6] + seq_range = sys.argv[7].split(',') + valid_date_files = sys.argv[8].split(',') + lr = float(sys.argv[9]) + skip_iter = int(sys.argv[10]) + #python3 embedding_wrap_lstm2.py 768 50 300 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 lstemB_1 0,50 4th_fold.json:1691 0.00005 0 + model_write(embedd_dim, sentence_len, neuron_size, train_files, test_files, model_name, seq_range,valid_date_files, lr,skip_iter) + + diff --git a/nlp/test_models/input_filter.py b/nlp/test_models/input_filter.py new file mode 100644 index 0000000..05881bf --- /dev/null +++ b/nlp/test_models/input_filter.py @@ -0,0 +1,47 @@ +import json + +infile = open('0th_fold.json','r') + +long_c = 0 + +short_c = 0 + +long_sampe = [] + +for row in infile: + row = json.loads(row.strip()) + content = row['content'] + feature = row['features'] + if len(feature) > 20: + long_c+=1 + long_sampe.append(row) + else: + short_c+=1 + + + +sample_i = 5 + +remain_w = 0 + +total_len = len(long_sampe[sample_i]['features']) + +full_token = [] + +remain_tokens = [] + +#for sample in long_sampe: + +for token_d in long_sampe[sample_i]['features']: + full_token.append(token_d['token']) + if token_d['token'] in ['.',',',';','?','!']: + remain_tokens.append(token_d['token']) + elif len(token_d['token']) > 1 and '##' not in token_d['token'] and not token_d['token'].isnumeric(): + remain_tokens.append(token_d['token']) + elif '##' in token_d['token'] and len(token_d['token']) >=4 and not token_d['token'].replace('##','').isnumeric(): + remain_tokens.append(token_d['token']) + + +print (len(full_token), full_token) + +print (len(remain_tokens), remain_tokens) diff --git a/nlp/test_models/model_char_a.py b/nlp/test_models/model_char_a.py new file mode 100644 index 0000000..3d96f1a --- /dev/null +++ b/nlp/test_models/model_char_a.py @@ -0,0 +1,27 @@ +from tensorflow import keras,nn +from tensorflow.keras import layers +import json +import numpy as np +import time +from contextlib import redirect_stdout +import sys + +maxlen = 50 + +chars = set([]) + +char_size = 100 + +word_embedding = keras.Input(shape=(maxlen,char_size),name='word_input') + +con1 = keras.layers.Conv1D( 100, 6, activation='relu',strides = 3, input_shape=(None,maxlen,char_size))(word_embedding) + +pool1 = keras.layers.GlobalMaxPooling1D()(con1) + +output = keras.layers.Dense(6, activation='softmax')(pool1) + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +opt = 
keras.optimizers.Adam(learning_rate=0.0005) + +model.compile(loss='categorical_crossentropy', optimizer=opt,metrics=['categorical_accuracy']) diff --git a/nlp/test_models/model_cnn_b.py b/nlp/test_models/model_cnn_b.py new file mode 100644 index 0000000..2519ea7 --- /dev/null +++ b/nlp/test_models/model_cnn_b.py @@ -0,0 +1,128 @@ +from tensorflow import keras +from tensorflow.keras import layers +import json +import numpy as np +import tensorflow as tf + +word_embedding = keras.Input(shape=(None,512),name='word_input') + + +convs = [] +for kernel_size in [2]: + conv_output= layers.Conv1D(kernel_size=kernel_size, filters=200, padding='causal',activation='relu', strides=1)(word_embedding) + maxpool_output = layers.MaxPooling1D(2)(conv_output) + convs.append(maxpool_output) + +if len(convs) > 1: + con_convs = layers.Concatenate()(convs) +else: + con_convs = convs[0] + + + +#norm_lay = layers.BatchNormalization()(word_embedding) + + + +lstm_forward1 = layers.LSTM(100, activation='relu',return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(con_convs) +#lstm_forward1 = layers.LSTM(100, activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +lstm_forward2 = layers.LSTM(100, activation='relu',dropout=0.1, recurrent_dropout=0.1)(lstm_forward1) +lstm_backward1 = layers.LSTM(100,activation='relu', return_sequences=True, dropout=0.1, recurrent_dropout=0.1,go_backwards = True)(con_convs) +#lstm_backward1 = layers.LSTM(100,activation='relu', dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +lstm_backward2 = layers.LSTM(100,activation='relu', dropout=0.1, recurrent_dropout=0.1,go_backwards = True)(lstm_backward1) + + +#con_layer1 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) + +con_layer2 = layers.Concatenate(axis=1)([lstm_forward2, lstm_backward2]) + + +#con_layer3 = layers.Concatenate(axis=1)([con_layer1, con_layer2]) +#bidir_lay = layers.Add()([con_layer1,con_layer2]) + + +#output = layers.TimeDistributed(layers.Dense(48))(bidir_lay) + +#droup_out1 = layers.Dropout(.5)(con_layer1) + + +dense_layer1 = layers.Dense(100,activation='relu')(con_layer2) + +droup_out2 = layers.Dropout(.5)(dense_layer1) + +dense_layer2 = layers.Dense(100,activation='relu')(droup_out2) + +#droup_out3 = layers.Dropout(.2)(dense_layer2) + +output = layers.Dense(6, activation='softmax')(dense_layer2) + + + + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['categorical_accuracy']) + + + + +date_files = {'0th_fold.json':1684,'1th_fold.json':1684,'2th_fold.json':1684,'3th_fold.json':1684} + +total_size = 0 +for train_f in date_files: + total_size+=date_files[train_f] + + +cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5} + +#input_sample = [] + +content_sample = [] + + + +input_sample = np.empty((total_size, *(25,512))) +#out_sample = np.empty((total_size, *(6))) +out_sample = [] + +out_sample_array = np.empty((total_size, 6)) + +i = 0 +for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + embedded_sentence = [] + row = json.loads(row) + features = row['features'] + content_sample.append(row['content']) + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i<25: + #embedded_sentence.append([word['layers'][0]['values']]) + input_sample[i,word_i] = np.array(word['layers'][3]['values']) + word_i+=1 + else: + break + while 
word_i <25: + #embedded_sentence.append([[0]*128]) + input_sample[i, word_i] = np.array([0]*512) + word_i+=1 + i+=1 + + + +out_sample=keras.utils.to_categorical(out_sample) + +i =0 +for sample in out_sample: + out_sample_array[i,] = np.array(sample) + i+=1 + +checkpoint_filepath = 'model_testb_2cnn.h5' + +checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max') + +callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2000) + +model.fit(input_sample, out_sample_array,epochs=2000,batch_size = 200,validation_split = 0.1,callbacks=[checkpoint,callback]) diff --git a/nlp/test_models/model_file_comb.py b/nlp/test_models/model_file_comb.py new file mode 100644 index 0000000..746fa3a --- /dev/null +++ b/nlp/test_models/model_file_comb.py @@ -0,0 +1,7 @@ +from itertools import combinations +file_l = ['0th_fold','1th_fold','2th_fold','3th_fold','4th_fold'] +out_list = list(combinations(file_l, 3)) + + +for fold_c in out_list: + print(','.join(fold_c)) \ No newline at end of file diff --git a/nlp/test_models/model_lstm.py b/nlp/test_models/model_lstm.py new file mode 100644 index 0000000..c8fe0c5 --- /dev/null +++ b/nlp/test_models/model_lstm.py @@ -0,0 +1,127 @@ +from tensorflow import keras +from tensorflow.keras import layers +import json +import numpy as np +import tensorflow as tf + +word_embedding = keras.Input(shape=(None,256),name='word_input') + +''' +convs = [] +for kernel_size in [2]: + conv_output= layers.TimeDistributed(layers.Conv1D(kernel_size=kernel_size, filters=200, padding='causal',activation='relu', strides=1))(activity_input) + maxpool_output = layers.TimeDistributed(layers.MaxPooling1D(2))(conv_output) + convs.append(maxpool_output) + +if len(convs) > 1: + con_convs = layers.Concatenate()(convs) +else: + con_convs = convs[0] + +char_conv = layers.TimeDistributed(layers.Flatten())(con_convs) +''' + +#norm_lay = layers.BatchNormalization()(word_embedding) + + + +#lstm_forward1 = layers.LSTM(100, activation='relu',return_sequences=True, dropout=0.1, recurrent_dropout=0.1)(word_embedding) +lstm_forward1 = layers.LSTM(20, activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +#lstm_forward2 = layers.LSTM(100, activation='relu',dropout=0.1, recurrent_dropout=0.1)(lstm_forward1) +#lstm_backward1 = layers.LSTM(100,activation='relu', return_sequences=True, dropout=0.1, recurrent_dropout=0.1,go_backwards = True)(word_embedding) +lstm_backward1 = layers.LSTM(20,activation='relu', dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +#lstm_backward2 = layers.LSTM(100,activation='relu', dropout=0.1, recurrent_dropout=0.1,go_backwards = True)(lstm_backward1) + +con_layer1 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) + +#con_layer2 = layers.Concatenate(axis=1)([lstm_forward2, lstm_backward2]) + +#bidir_lay = layers.Add()([con_layer1,con_layer2]) + + +#output = layers.TimeDistributed(layers.Dense(48))(bidir_lay) + +#droup_out1 = layers.Dropout(.5)(con_layer1) + + +dense_layer1 = layers.Dense(20,activation='relu')(con_layer1) + +droup_out2 = layers.Dropout(.5)(dense_layer1) + +dense_layer2 = layers.Dense(20,activation='relu')(droup_out2) + +#droup_out3 = layers.Dropout(.2)(dense_layer2) + +output = layers.Dense(6, activation='softmax')(dense_layer2) + + + + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['categorical_accuracy']) + + + + +date_files = 
{'0th_fold.json':1684,'1th_fold.json':1684,'2th_fold.json':1684,'3th_fold.json':1684} + +total_size = 0 +for train_f in date_files: + total_size+=date_files[train_f] + + +cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5} + +#input_sample = [] + +content_sample = [] + + + +input_sample = np.empty((total_size, *(25,256))) +#out_sample = np.empty((total_size, *(6))) +out_sample = [] + +out_sample_array = np.empty((total_size, 6)) + +i = 0 +for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + embedded_sentence = [] + row = json.loads(row) + features = row['features'] + content_sample.append(row['content']) + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i<25: + #embedded_sentence.append([word['layers'][0]['values']]) + input_sample[i,word_i] = np.array(word['layers'][1]['values']+word['layers'][0]['values']) + word_i+=1 + else: + break + while word_i <25: + #embedded_sentence.append([[0]*128]) + input_sample[i, word_i] = np.array([0]*256) + word_i+=1 + i+=1 + + + +out_sample=keras.utils.to_categorical(out_sample) + +i =0 +for sample in out_sample: + out_sample_array[i,] = np.array(sample) + i+=1 + +checkpoint_filepath = 'model_test1_256.h5' + +checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max') + +callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20) + +model.fit(input_sample, out_sample_array,epochs=2000,batch_size = 200,validation_split = 0.1,callbacks=[checkpoint,callback]) + diff --git a/nlp/test_models/model_lstm_a_filtered.py b/nlp/test_models/model_lstm_a_filtered.py new file mode 100644 index 0000000..3723198 --- /dev/null +++ b/nlp/test_models/model_lstm_a_filtered.py @@ -0,0 +1,167 @@ +from tensorflow import keras +from tensorflow.keras import layers +import json +import numpy as np +import tensorflow as tf +import time + +if_filter = False + +embedd_dim = 768 + +sentence_len = 30 + +initial_time = time.time() + +word_embedding = keras.Input(shape=(None,embedd_dim),name='word_input') + +''' +convs = [] +for kernel_size in [2]: + conv_output= layers.TimeDistributed(layers.Conv1D(kernel_size=kernel_size, filters=200, padding='causal',activation='relu', strides=1))(activity_input) + maxpool_output = layers.TimeDistributed(layers.MaxPooling1D(2))(conv_output) + convs.append(maxpool_output) + +if len(convs) > 1: + con_convs = layers.Concatenate()(convs) +else: + con_convs = convs[0] + +char_conv = layers.TimeDistributed(layers.Flatten())(con_convs) +''' + +#norm_lay = layers.BatchNormalization()(word_embedding) + + + + +lstm_forward1 = layers.LSTM(200, activation='relu',dropout=0.2, recurrent_dropout=0.2)(word_embedding) +#lstm_backward1 = layers.LSTM(200,activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +#con_layer1 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) +#con_layer2 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) + + +#con_layer3 = layers.Concatenate(axis=1)([con_layer1, con_layer2]) +#bidir_lay = layers.Add()([con_layer1,con_layer2]) + + +#output = layers.TimeDistributed(layers.Dense(48))(bidir_lay) + +#droup_out1 = layers.Dropout(.5)(con_layer1) + + +#dense_layer1 = layers.Dense(500,activation='relu')(con_layer2) + +droup_out1 = layers.Dropout(.2)(lstm_forward1) + +dense_layer2 = layers.Dense(200,activation='relu')(droup_out1) + +droup_out2 = 
layers.Dropout(.2)(dense_layer2) + +dense_layer3 = layers.Dense(200,activation='relu')(droup_out2) + +droup_out3 = layers.Dropout(.2)(dense_layer3) + +dense_layer4 = layers.Dense(200,activation='relu')(droup_out3) + +droup_out4 = layers.Dropout(.2)(dense_layer4) + +con_layer1 = layers.Concatenate(axis=1)([droup_out2, droup_out3,droup_out4]) + +#droup_out3 = layers.Dropout(.2)(dense_layer2) + +output = layers.Dense(6, activation='softmax')(con_layer1) + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['categorical_accuracy']) + + + +date_files = {'1th_fold.json':1684,'2th_fold.json':1684,'3th_fold.json':1684,'4th_fold.json':1691} + +total_size = 0 +for train_f in date_files: + total_size+=date_files[train_f] + + +cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5} + +#input_sample = [] + +content_sample = [] + + + +input_sample = np.empty((total_size, *(sentence_len,embedd_dim))) +#out_sample = np.empty((total_size, *(6))) +out_sample = [] + + + + +total_sampe_c = 0 + +i = 0 +for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + embedded_sentence = [] + row = json.loads(row) + features = row['features'] + content_sample.append(row['content']) + sample_tokens = [] + for token_d in features: + if if_filter: + if token_d['token'] in ['.', ',', ';', '?']: + sample_tokens.append(token_d['layers'][-1]['values']) + elif len(token_d['token']) > 1 and '##' not in token_d['token'] and token_d['token'].isalpha() : + sample_tokens.append(token_d['layers'][-1]['values']) + elif '##' in token_d['token'] and len(token_d['token']) >= 4 and not token_d['token'].replace('##','').isnumeric(): + sample_tokens.append(token_d['layers'][-1]['values']) + else: + sample_tokens.append(token_d['layers'][-1]['values']) + if sample_tokens: + out_sample.append(cat_map[row['label']]) + total_sampe_c+=1 + word_i = 0 + for word in sample_tokens: + if word_i 1: + con_convs = layers.Concatenate()(convs) +else: + con_convs = convs[0] + +char_conv = layers.TimeDistributed(layers.Flatten())(con_convs) +''' + +#norm_lay = layers.BatchNormalization()(word_embedding) + + + +lstm_forward1 = layers.LSTM(700, activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +#lstm_forward1 = layers.LSTM(100, activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +#lstm_forward2 = layers.LSTM(300, activation='relu',dropout=0.2, recurrent_dropout=0.2)(lstm_forward1) +lstm_backward1 = layers.LSTM(700,activation='relu', dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +#lstm_backward1 = layers.LSTM(100,activation='relu', dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +#lstm_backward2 = layers.LSTM(300,activation='relu', dropout=0.2, recurrent_dropout=0.2)(lstm_backward1) + +#con_layer1 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) + +con_layer2 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) + + +droup_out2 = layers.Dropout(.2)(con_layer2) + +dense_layer2 = layers.Dense(700,activation='relu')(droup_out2) + +#droup_out3 = layers.Dropout(.2)(dense_layer2) + +output = layers.Dense(6, activation='softmax')(dense_layer2) + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +opt = keras.optimizers.Adam(learning_rate=0.0005) + + +model.compile(loss='categorical_crossentropy', 
optimizer=opt,metrics=['categorical_accuracy']) + +date_files = {'0th_fold.json':1684,'2th_fold.json':1684,'3th_fold.json':1684,'4th_fold.json':1691} + +total_size = 0 +for train_f in date_files: + total_size+=date_files[train_f] + + +cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5} + +#input_sample = [] + +content_sample = [] + + + +input_sample = np.empty((total_size, *(30,768))) +#out_sample = np.empty((total_size, *(6))) +out_sample = [] + +out_sample_array = np.empty((total_size, 6)) + +i = 0 +for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + embedded_sentence = [] + row = json.loads(row) + features = row['features'] + content_sample.append(row['content']) + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i<30: + #embedded_sentence.append([word['layers'][0]['values']]) + input_sample[i,word_i] = np.array([sum(x)/len(x) for x in zip(word['layers'][-1]['values'],word['layers'][-2]['values'])]) + word_i+=1 + else: + break + while word_i <30: + #embedded_sentence.append([[0]*128]) + input_sample[i, word_i] = np.array([0]*768) + word_i+=1 + i+=1 + + + +out_sample=keras.utils.to_categorical(out_sample) + +i =0 +for sample in out_sample: + out_sample_array[i,] = np.array(sample) + i+=1 + +checkpoint_filepath = 'model_ds/model_b-{epoch:02d}-{val_categorical_accuracy:.3f}-{val_loss:.3f}.hdf5' + +checkpoint1 = keras.callbacks.ModelCheckpoint(checkpoint_filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=False, mode='max') + +model.fit(input_sample, out_sample_array,epochs=10,batch_size = 1200,validation_split = 0.25) + +model.fit(input_sample, out_sample_array,epochs=50,batch_size = 1200,validation_split = 0.25,callbacks=[checkpoint1]) + + +consumed_time = time.time-initial_time + +print('consumed_time',consumed_time) \ No newline at end of file diff --git a/nlp/test_models/model_lstm_c.py b/nlp/test_models/model_lstm_c.py new file mode 100644 index 0000000..f5a47e8 --- /dev/null +++ b/nlp/test_models/model_lstm_c.py @@ -0,0 +1,139 @@ +from tensorflow import keras +from tensorflow.keras import layers +import json +import numpy as np +import tensorflow as tf +import time + +initial_time = time.time() + + +embedd_dim = 768 + +sentence_len = 30 + +initial_time = time.time() + + +word_embedding = keras.Input(shape=(None,embedd_dim),name='word_input') + +''' +convs = [] +for kernel_size in [2]: + conv_output= layers.TimeDistributed(layers.Conv1D(kernel_size=kernel_size, filters=200, padding='causal',activation='relu', strides=1))(activity_input) + maxpool_output = layers.TimeDistributed(layers.MaxPooling1D(2))(conv_output) + convs.append(maxpool_output) + +if len(convs) > 1: + con_convs = layers.Concatenate()(convs) +else: + con_convs = convs[0] + +char_conv = layers.TimeDistributed(layers.Flatten())(con_convs) +''' + +#norm_lay = layers.BatchNormalization()(word_embedding) + + +lstm_forward1 = layers.LSTM(300, activation='relu',return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(word_embedding) +#lstm_forward1 = layers.LSTM(100, activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +lstm_forward2 = layers.LSTM(300, activation='relu',dropout=0.2, recurrent_dropout=0.2)(lstm_forward1) +lstm_backward1 = layers.LSTM(300,activation='relu', return_sequences=True, dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +#lstm_backward1 = layers.LSTM(100,activation='relu', 
dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +lstm_backward2 = layers.LSTM(300,activation='relu', dropout=0.2, recurrent_dropout=0.2)(lstm_backward1) +#con_layer1 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) +con_layer2 = layers.Concatenate(axis=1)([lstm_forward2, lstm_backward2]) + + +#con_layer3 = layers.Concatenate(axis=1)([con_layer1, con_layer2]) +#bidir_lay = layers.Add()([con_layer1,con_layer2]) + + +#output = layers.TimeDistributed(layers.Dense(48))(bidir_lay) + +#droup_out1 = layers.Dropout(.5)(con_layer1) + + +dense_layer1 = layers.Dense(300,activation='relu')(con_layer2) + +droup_out2 = layers.Dropout(.5)(dense_layer1) + +dense_layer2 = layers.Dense(300,activation='relu')(droup_out2) + + +#droup_out3 = layers.Dropout(.2)(dense_layer2) + +output = layers.Dense(6, activation='softmax')(dense_layer2) + + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['categorical_accuracy']) + + +date_files = {'1th_fold.json':1684,'2th_fold.json':1684,'3th_fold.json':1684,'4th_fold.json':1691} + +total_size = 0 +for train_f in date_files: + total_size+=date_files[train_f] + + +cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5} + +#input_sample = [] + +content_sample = [] + + +input_sample = np.empty((total_size, *(sentence_len,embedd_dim))) +#out_sample = np.empty((total_size, *(6))) +out_sample = [] + +out_sample_array = np.empty((total_size, 6)) + +i = 0 +for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + embedded_sentence = [] + row = json.loads(row) + features = row['features'] + content_sample.append(row['content']) + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i 1: + con_convs = layers.Concatenate()(convs) +else: + con_convs = convs[0] + +char_conv = layers.TimeDistributed(layers.Flatten())(con_convs) +''' + +#norm_lay = layers.BatchNormalization()(word_embedding) + + + +lstm_forward1 = layers.LSTM(300, activation='relu',return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(word_embedding) +lstm_forward2 = layers.LSTM(300, activation='relu',return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(lstm_forward1) +#lstm_forward1 = layers.LSTM(100, activation='relu', dropout=0.2, recurrent_dropout=0.2)(word_embedding) +lstm_forward3 = layers.LSTM(300, activation='relu',dropout=0.2, recurrent_dropout=0.2)(lstm_forward2) +lstm_backward1 = layers.LSTM(300,activation='relu', return_sequences=True, dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +#lstm_backward1 = layers.LSTM(100,activation='relu', dropout=0.2, recurrent_dropout=0.2,go_backwards = True)(word_embedding) +lstm_backward2 = layers.LSTM(300,activation='relu', return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(lstm_backward1) +lstm_backward3 = layers.LSTM(300,activation='relu', dropout=0.2, recurrent_dropout=0.2)(lstm_backward2) +#con_layer1 = layers.Concatenate(axis=1)([lstm_forward1, lstm_backward1]) +con_layer2 = layers.Concatenate(axis=1)([lstm_forward3, lstm_backward3]) + + +#con_layer3 = layers.Concatenate(axis=1)([con_layer1, con_layer2]) +#bidir_lay = layers.Add()([con_layer1,con_layer2]) + + +#output = layers.TimeDistributed(layers.Dense(48))(bidir_lay) + +#droup_out1 = layers.Dropout(.5)(con_layer1) + + +#dense_layer1 = layers.Dense(500,activation='relu')(con_layer2) + +droup_out2 = 
layers.Dropout(.5)(con_layer2) + +dense_layer2 = layers.Dense(300,activation='relu')(droup_out2) + +#droup_out3 = layers.Dropout(.2)(dense_layer2) + +output = layers.Dense(6, activation='softmax')(dense_layer2) + + + + +model = keras.Model(inputs=[word_embedding], outputs=[output]) + +model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['categorical_accuracy']) + + + +date_files = {'1th_fold.json':1684,'2th_fold.json':1684,'3th_fold.json':1684,'4th_fold.json':1691} + +total_size = 0 +for train_f in date_files: + total_size+=date_files[train_f] + + +cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5} + +#input_sample = [] + +content_sample = [] + + + +input_sample = np.empty((total_size, *(sentence_len,embedd_dim))) +#out_sample = np.empty((total_size, *(6))) +out_sample = [] + + + + +total_sampe_c = 0 + +i = 0 +for file_n in date_files: + data_file = open('{}'.format(file_n), 'r') + for row in data_file: + embedded_sentence = [] + row = json.loads(row) + features = row['features'] + content_sample.append(row['content']) + sample_tokens = [] + for token_d in features: + if token_d['token'] in ['.', ',', ';', '?', '!']: + sample_tokens.append(token_d['layers'][-1]['values']) + elif len(token_d['token']) > 1 and '##' not in token_d['token'] and not token_d['token'].isnumeric(): + sample_tokens.append(token_d['layers'][-1]['values']) + elif '##' in token_d['token'] and len(token_d['token']) >= 4 and not token_d['token'].replace('##','').isnumeric(): + sample_tokens.append(token_d['layers'][-1]['values']) + if sample_tokens: + out_sample.append(cat_map[row['label']]) + total_sampe_c+=1 + word_i = 0 + for word in sample_tokens: + if word_i int(seq_range[0]) and len(features) <= int(seq_range[1]): + out_sample.append(cat_map[row['label']]) + word_i = 0 + for word in features: + if word_i 0: + model.fit(input_sample, out_sample_array, epochs=skip_iter, batch_size=200, validation_data=(test_sample, test_label)) + #model.fit(input_sample, out_sample_array,epochs=20,batch_size = 200,validation_data = (test_sample, test_label)) + model.fit(input_sample, out_sample_array,epochs=50,batch_size = 200,validation_data = (test_sample, test_label),callbacks=[checkpoint1]) + consumed_time = time.time()-initial_time + print('consumed_time',consumed_time) + return consumed_time,len(input_sample) + + +if __name__=='__main__': + #embedd_dim,sentence_len,neuron_size,train_files,outpath, model_name + embedd_dim = int(sys.argv[1]) + sentence_len = int(sys.argv[2]) + neuron_size = int(sys.argv[3]) + train_files = sys.argv[4].split(',') + test_files = sys.argv[5].split(',') + outpath = sys.argv[6] + model_name = sys.argv[7] + seq_range = sys.argv[8].split(',') + token_len_filter = sys.argv[9].lower() == 'true' + lr = float(sys.argv[10]) + skip_iter = int(sys.argv[11]) + mdoel_build(embedd_dim, sentence_len, neuron_size, train_files, test_files, outpath, model_name,seq_range,token_len_filter,lr,skip_iter) \ No newline at end of file From 1aa9f8decd2fbe6c5747d041d2ad91ac29de27b7 Mon Sep 17 00:00:00 2001 From: steven-mindswire Date: Mon, 22 Feb 2021 12:56:36 -0500 Subject: [PATCH 2/2] Create README.md --- nlp/production_models/README.md | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 nlp/production_models/README.md diff --git a/nlp/production_models/README.md b/nlp/production_models/README.md new file mode 100644 index 0000000..f6ccb00 --- /dev/null +++ 
b/nlp/production_models/README.md @@ -0,0 +1,55 @@
+## Data preparation
+1. Raw data: https://github.com/EQWorks/release/tree/master/nlp/data/labels
+2. Execute kfold_data_gen.py
+   * This randomly separates each label's content into 5 folds.
+3. Execute kfold_train_data.py
+   * Strips the label from each fold file and prepares the content for BERT embedding.
+4. If you switch to another BERT pre-trained model and want the k-fold split to remain the same, execute kfold_regen.py.
+
+## BERT embedding generation
+1. Clone BERT from GitHub: https://github.com/google-research/bert
+2. Download one of the pre-trained models
+3. Set up the environment path to the model
+   * Example: export BERT_BASE_DIR=/Users/stevenlu/Downloads/bert-master/uncased_L-12_H-768_A-12
+4. Execute the script
+   * Example:
+python3 extract_features.py \
+  --input_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content/4th_fold_content.txt \
+  --output_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded/4th_fold_base_-3_-2_-1.jsonl \
+  --vocab_file=$BERT_BASE_DIR/vocab.txt \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --layers=-10,-11,-12 \
+  --max_seq_length=128 \
+  --batch_size=8
+
+## Model execution on EC2
+1. Initialize the cluster and prepare it for model training
+   * ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20201026 - ami-0885b1f6bd170450c
+   * sudo apt-get update
+   * sudo apt-get -y install python3-pip
+   * sudo apt install python3-testresources
+   * sudo pip3 install --upgrade tensorflow
+   * sudo apt install awscli
+2. Configure your AWS credentials
+3. Copy the scripts and k-fold files from S3: s3://eq-miner/test2/release_label/
+4. Create a model output folder
+5. sudo pip3 install fasttext (only if you are building the fasttext model)
+6. Execute the script
+   * Execute embedding_wrap.py / embedding_wrap2.py / embedding_wrap_lstm.py
+   * This will train the model and generate the testing results
+   * embedding_wrap: multi-head self-attention without positional encoding
+   * embedding_wrap2: multi-head self-attention with positional encoding
+   * embedding_wrap_lstm: bi-directional LSTM
+   * Input examples:
+     * embedding_wrap/embedding_wrap2: python3 embedding_wrap_lstm.py 768 50 100 1th_fold.json:1684,2th_fold.json:1684,3th_fold.json:1684 4th_fold.json:1691 model7 0,50 0th_fold.json:1684 0.0001 50
+     * embedding_wrap_lstm: python3 embedding_wrap_lstm.py 768 50 100 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 model1 0,50 4th_fold.json:1691 0.00008 50
+   * Example of output:
+     * {'filtered': {'mold_build_time': 5105.467138528824, 'sample_length': 4595, 'validiation': [{'test_file': ['0th_fold.json:1691'], 'valid_file': ['3th_fold.json:1684'], 'model_name': '47-0.9978-0.0171.hdf5', 'valid_score': [0.7473069429397583, 0.8219354748725891], 'test_score': [0.7553659081459045, 0.8098001480102539], 'valid_sample_size': 1550}, {'test_file': ['3th_fold.json:1684'], 'valid_file': ['0th_fold.json:1691'], 'model_name': '46-0.9965-0.0207.hdf5', 'valid_score': [0.7761548161506653, 0.8052868843078613], 'test_score': [0.7688340544700623, 0.8225806355476379], 'valid_sample_size': 1551}]}, 'restricted': {'mold_build_time': 3495.59513258934, 'sample_length': 5059, 'validiation': [{'test_file': ['0th_fold.json:1691'], 'valid_file': ['3th_fold.json:1684'], 'model_name': '46-0.9937-0.0317.hdf5', 'valid_score': [0.8246729373931885, 0.8105700612068176], 'test_score': [0.813213050365448, 0.801068902015686], 'valid_sample_size': 1684}, {'test_file': ['3th_fold.json:1684'], 'valid_file': ['0th_fold.json:1691'], 'model_name': '16-0.9302-0.2405.hdf5', 'valid_score': [0.5546706318855286, 0.7897862195968628], 'test_score': [0.5412384867668152, 0.8141329884529114], 'valid_sample_size': 1684}]}, 'model_name': 'transB_8'}
+7. Execute fasttext
+   * Install fasttext
+   * Execute fasttext_kfold.py
+
+## T-test
+1. Gather the per-fold result accuracies into a list
+2. Execute t_tests.py (a minimal sketch of this comparison appears below)
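For reference, here is a minimal sketch of how one fold's embedded JSONL file can be loaded into the fixed-size arrays the training scripts expect. It mirrors the loading loops in those scripts; the function name `load_fold`, the default `sentence_len`/`embedd_dim` values, and the choice to average the last two extracted layers are illustrative assumptions rather than a fixed API (some scripts use only the last layer).

```python
import json
import numpy as np

def load_fold(path, cat_map, sentence_len=30, embedd_dim=768):
    """Read one embedded fold file and return (inputs, labels) as numpy arrays."""
    inputs, labels = [], []
    with open(path) as fh:
        for line in fh:
            row = json.loads(line)
            # Truncate to sentence_len tokens; positions past the sentence stay zero-padded.
            sentence = np.zeros((sentence_len, embedd_dim))
            for i, token in enumerate(row['features'][:sentence_len]):
                # Average the last two extracted BERT layers element-wise for each token.
                last, second_last = token['layers'][-1]['values'], token['layers'][-2]['values']
                sentence[i] = (np.array(last) + np.array(second_last)) / 2
            inputs.append(sentence)
            labels.append(cat_map[row['label']])  # integer class id; one-hot encode later
    return np.stack(inputs), np.array(labels)
```

The integer labels can then be one-hot encoded (e.g. with keras.utils.to_categorical) before being passed to model.fit, as the training scripts do.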
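The bi-directional LSTM variant runs one LSTM forwards and one backwards (`go_backwards=True`) over the padded BERT embeddings, concatenates the two final states, and finishes with a dense softmax head over the six labels. The sketch below illustrates that shape; the layer sizes, dropout rates, and learning rate are placeholder defaults, since the scripts take `neuron_size` and the learning rate from the command line.

```python
from tensorflow import keras
from tensorflow.keras import layers

def build_bilstm_model(embedd_dim=768, neuron_size=100, n_classes=6, lr=0.0005):
    word_embedding = keras.Input(shape=(None, embedd_dim), name='word_input')
    # Forward and backward passes over the same embedded sentence.
    forward = layers.LSTM(neuron_size, activation='relu',
                          dropout=0.2, recurrent_dropout=0.2)(word_embedding)
    backward = layers.LSTM(neuron_size, activation='relu',
                           dropout=0.2, recurrent_dropout=0.2,
                           go_backwards=True)(word_embedding)
    merged = layers.Concatenate(axis=1)([forward, backward])
    dense = layers.Dense(neuron_size, activation='relu')(layers.Dropout(0.2)(merged))
    output = layers.Dense(n_classes, activation='softmax')(dense)
    model = keras.Model(inputs=[word_embedding], outputs=[output])
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=lr),
                  metrics=['categorical_accuracy'])
    return model
```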
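For the T-test step, the following is a minimal sketch assuming the comparison is a paired t-test over matching per-fold accuracies of two models; the accuracy values are hypothetical placeholders, and the exact test performed by t_tests.py is not shown in this patch.

```python
from scipy import stats

# Hypothetical per-fold validation accuracies for two model variants (same fold order).
model_a_acc = [0.822, 0.810, 0.805, 0.823]
model_b_acc = [0.811, 0.801, 0.790, 0.814]

# Paired t-test: are the per-fold differences significantly different from zero?
t_stat, p_value = stats.ttest_rel(model_a_acc, model_b_acc)
print('t = {:.3f}, p = {:.3f}'.format(t_stat, p_value))
```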