Binary file added nlp/production_models/.DS_Store
Binary file not shown.
55 changes: 55 additions & 0 deletions nlp/production_models/README.md
@@ -0,0 +1,55 @@
## Data preparation
1. Raw data: https://github.com/EQWorks/release/tree/master/nlp/data/labels
2. Execute kfold_data_gen.py
* This will randomly separate each label’s content into 5 folds (a minimal sketch of this split follows this list)
3. Execute kfold_train_data.py
* Strips the label from each fold file and prepares the content for BERT embedding
4. To switch to another BERT pre-trained model while keeping the same k-fold split, execute kfild_regen.py
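
kfold_data_gen.py itself is not part of this diff, so the following is only a rough sketch of the 5-fold split described in step 2. It assumes fastText-style label files (one `__label__<Label> text` line per example) and a hypothetical `Nth_fold.txt` output naming; the real script may differ.

```python
import random

# Per-label input files, as kept under nlp/data/labels
label_files = ['added.txt', 'changed.txt', 'deprecated.txt', 'fixed.txt', 'removed.txt', 'security.txt']
folds = [[] for _ in range(5)]

for path in label_files:
    with open(path) as f:
        rows = [line.rstrip('\n') for line in f if line.strip()]
    random.shuffle(rows)
    # Deal this label's examples round-robin so every fold gets a similar share of each label
    for i, row in enumerate(rows):
        folds[i % 5].append(row)

for i, rows in enumerate(folds):
    with open('{}th_fold.txt'.format(i), 'w') as out:  # hypothetical output name
        out.write('\n'.join(rows) + '\n')
```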

## BERT embedding generation
1. Clone BERT from GitHub: https://github.com/google-research/bert
2. Download one of the pre-trained models
3. Set an environment variable pointing to the model directory
* Example: export BERT_BASE_DIR=/Users/stevenlu/Downloads/bert-master/uncased_L-12_H-768_A-12
4. Execute the feature-extraction script (a sketch for reading its JSONL output follows the example below)
* Example:
python3 extract_features.py \
--input_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content/4th_fold_content.txt \
--output_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded/4th_fold_base_-3_-2_-1.jsonl \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--layers=-10,-11,-12 \
--max_seq_length=128 \
--batch_size=8
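
Each line of the JSONL written by extract_features.py holds one input sentence, with a `features` list (one entry per token) and, per token, a `layers` list of extracted values. The training and validation scripts in this folder average the values of the last two extracted layers per token; a minimal reader in that spirit (the file name is just the one from the example above) could look like:

```python
import json
import numpy as np

embeddings = []  # one (num_tokens, 768) array per input sentence
with open('4th_fold_base_-3_-2_-1.jsonl') as f:
    for line in f:
        row = json.loads(line)
        token_vectors = []
        for token in row['features']:
            # Average the last two extracted layers, as data_gen() in the validation scripts does
            last = token['layers'][-1]['values']
            second_last = token['layers'][-2]['values']
            token_vectors.append([(a + b) / 2 for a, b in zip(last, second_last)])
        embeddings.append(np.array(token_vectors))
```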

## Model execution on EC2
1. Initialize the cluster and prepare for model training
* ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20201026 - ami-0885b1f6bd170450c
* sudo apt-get update
* sudo apt-get -y install python3-pip
* sudo apt install python3-testresources
* sudo pip3 install --upgrade tensorflow
* sudo apt install awscli
2. Configure your AWS credentials (aws configure)
3. Copy the scripts and k-fold files from S3: s3://eq-miner/test2/release_label/
4. Create a model output folder
5. sudo pip3 install fasttext (if you are building the fastText model)
6. Execute the script
* Execute embedding_wrap.py/embedding_wrap2.py/embedding_wrap_lstm.py
* This will train the model and generate the testing results
* embedding_wrap: Multi-head self-attention without location encoding
* embedding_wrap2: Multi-head self-attention with location encoding
* embedding_wrap_lstm: Bi-directional LSTM
* Input example:
* embedding_wrap/embedding_wrap2: python3 embedding_wrap.py 768 50 100 1th_fold.json:1684,2th_fold.json:1684,3th_fold.json:1684 4th_fold.json:1691 model7 0,50 0th_fold.json:1684 0.0001 50
* embedding_wrap_lstm: python3 embedding_wrap_lstm.py 768 50 100 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 model1 0,50 4th_fold.json:1691 0.00008 50
* Example of output:
* {'filtered': {'mold_build_time': 5105.467138528824, 'sample_length': 4595, 'validiation': [{'test_file': ['0th_fold.json:1691'], 'valid_file': ['3th_fold.json:1684'], 'model_name': '47-0.9978-0.0171.hdf5', 'valid_score': [0.7473069429397583, 0.8219354748725891], 'test_score': [0.7553659081459045, 0.8098001480102539], 'valid_sample_size': 1550}, {'test_file': ['3th_fold.json:1684'], 'valid_file': ['0th_fold.json:1691'], 'model_name': '46-0.9965-0.0207.hdf5', 'valid_score': [0.7761548161506653, 0.8052868843078613], 'test_score': [0.7688340544700623, 0.8225806355476379], 'valid_sample_size': 1551}]}, 'restricted': {'mold_build_time': 3495.59513258934, 'sample_length': 5059, 'validiation': [{'test_file': ['0th_fold.json:1691'], 'valid_file': ['3th_fold.json:1684'], 'model_name': '46-0.9937-0.0317.hdf5', 'valid_score': [0.8246729373931885, 0.8105700612068176], 'test_score': [0.813213050365448, 0.801068902015686], 'valid_sample_size': 1684}, {'test_file': ['3th_fold.json:1684'], 'valid_file': ['0th_fold.json:1691'], 'model_name': '16-0.9302-0.2405.hdf5', 'valid_score': [0.5546706318855286, 0.7897862195968628], 'test_score': [0.5412384867668152, 0.8141329884529114], 'valid_sample_size': 1684}]}, 'model_name': 'transB_8'}
7. Execute fasttext
* Install fasttext
* Execute fasttext_kfold.py

## T-test
1. Gather the result accuracies into a list (a minimal sketch follows this list)
2. Execute t_tests.py
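
t_tests.py is not included in this diff. The sketch below only illustrates the idea: it assumes the per-fold accuracies are read back from saved result dictionaries shaped like the output example in step 6 of the previous section (model.evaluate returns [loss, accuracy], so the second element of each test_score/valid_score is the accuracy), that result_a.json and result_b.json are hypothetical file names for the two models being compared, and that scipy's independent two-sample t-test is the intended test.

```python
import json
from scipy import stats

def gather_accuracies(result):
    # 'validiation' is the (misspelled) key used in the result dictionaries above
    return [fold['test_score'][1] for fold in result['filtered']['validiation']]

# result_a.json / result_b.json are hypothetical saved outputs of the two models being compared
with open('result_a.json') as fa, open('result_b.json') as fb:
    acc_a = gather_accuracies(json.load(fa))
    acc_b = gather_accuracies(json.load(fb))

t_stat, p_value = stats.ttest_ind(acc_a, acc_b)
print('t = {:.3f}, p = {:.4f}'.format(t_stat, p_value))
```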
12 changes: 12 additions & 0 deletions nlp/production_models/bert_script
@@ -0,0 +1,12 @@
export BERT_BASE_DIR=/Users/stevenlu/Downloads/bert-master/uncased_L-12_H-768_A-12


python3 extract_features.py \
--input_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content/4th_fold_content.txt \
--output_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded/4th_fold_base_-3_-2_-1.jsonl \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--layers=-10,-11,-12 \
--max_seq_length=128 \
--batch_size=8
111 changes: 111 additions & 0 deletions nlp/production_models/embedded_valid.py
@@ -0,0 +1,111 @@
from tensorflow import keras,nn
from tensorflow.keras import layers
import json
import numpy as np
import tensorflow as tf
import os
import sys

#sudo python3 -m pip install h5py==2.10.0


def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter):
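    # Reads BERT feature files (JSONL: one example per line with a '__label__*' label and a per-token
    # 'features' list), averages the values of the last two extracted layers for each token, and returns
    # a (num_samples, sentence_len, embedd_dim) input array plus one-hot labels. Each entry of
    # train_files is 'file_name:sample_count'; when token_len_filter is set, only examples whose token
    # count falls inside seq_range are kept.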
date_files = {}
for file_d in train_files:
file_name,file_len = file_d.split(':')
date_files[file_name] = int(file_len)
total_size = 0
for train_f in date_files:
total_size+=date_files[train_f]
input_sample = np.zeros((total_size, *(sentence_len,embedd_dim)))
out_sample = []
i = 0
for file_n in date_files:
data_file = open('{}'.format(file_n), 'r')
for row in data_file:
row = json.loads(row)
features = row['features']
if token_len_filter:
if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]):
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i<sentence_len:
input_sample[i,word_i] = np.array([sum(x)/len(x) for x in zip(word['layers'][-1]['values'],word['layers'][-2]['values'])])
word_i+=1
else:
break
i+=1
else:
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i < sentence_len:
input_sample[i, word_i] = np.array(
[sum(x) / len(x) for x in zip(word['layers'][-1]['values'], word['layers'][-2]['values'])])
word_i += 1
else:
break
i += 1
input_sample = input_sample[0:i]
out_sample = out_sample[0:i]
out_sample=keras.utils.to_categorical(out_sample)
out_sample_array = np.empty((len(out_sample), 6))
i =0
for sample in out_sample:
out_sample_array[i,] = np.array(sample)
i+=1
return (input_sample,out_sample_array)


def mdoel_valid(embedd_dim,sentence_len,test_date_files,valid_date_files,model_folder,token_len_range,token_len_filter):
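    # Evaluates every checkpoint in model_folder on the test folds and keeps the one with the highest
    # accuracy (ties broken by lower loss, then by later epoch, parsed from the
    # '<epoch>-<train_acc>-<train_loss>.hdf5' checkpoint name), then reports that checkpoint's score
    # on the validation folds.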
cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5}
input_sample,out_sample_array= data_gen(test_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
max_score = [200000000,0]
max_model = None
file_l = os.listdir(model_folder)
score_l = []
for f in file_l:
model = keras.models.load_model('{}/{}'.format(model_folder,f))
score = model.evaluate(input_sample, out_sample_array)
#print(f,score)
score_l.append((f,score))
if score[1] > max_score[1]:
max_score = score
max_model = f
elif score[1] == max_score[1] and max_model is not None:
epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-')
epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', '').split('-')
if score[0]<max_score[0]:
max_score = score
max_model = f
elif score[0]==max_score[0]:
if epoch_n>epoch_n_m:
max_score = score
max_model = f
print('max_score: ',max_score)
print ('max_model: ',max_model)
valid_input,valid_output= data_gen(valid_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
model = keras.models.load_model('{}/{}'.format(model_folder,max_model))
score = model.evaluate(valid_input, valid_output)
print('valid_score: ',score)
return max_model,score,max_score,len(valid_input)

if __name__=='__main__':
#sentence_len = 50
#embedd_dim = 768
#test_date_files = '2th_fold.json:1684'
#valid_date_files = '3th_fold.json:1684'
#model_folder = 'model_lstm_3_small'
#token_len_filter = True
#token_len_range = ['0', '50']
embedd_dim = int(sys.argv[1])
sentence_len = int(sys.argv[2])
test_date_files = sys.argv[3].split(',')
valid_date_files = sys.argv[4].split(',')
model_folder = sys.argv[5]
token_len_filter = sys.argv[6].lower() == 'true'
token_len_range = sys.argv[7].split(',')
#python3 embedded_valid.py 768 50 3th_fold.json:1684 4th_fold.json:1691 trans_model1_filtered true 0,50
mdoel_valid(embedd_dim, sentence_len, test_date_files, valid_date_files, model_folder, token_len_range,
token_len_filter)
140 changes: 140 additions & 0 deletions nlp/production_models/embedded_valid2.py
@@ -0,0 +1,140 @@
from tensorflow import keras,nn
from tensorflow.keras import layers
import json
import numpy as np
import tensorflow as tf
import os
import sys

#sudo python3 -m pip install h5py==2.10.0


def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter):
date_files = {}
for file_d in train_files:
file_name,file_len = file_d.split(':')
date_files[file_name] = int(file_len)
total_size = 0
for train_f in date_files:
total_size+=date_files[train_f]
input_sample = np.zeros((total_size, *(sentence_len,embedd_dim)))
out_sample = []
i = 0
for file_n in date_files:
data_file = open('{}'.format(file_n), 'r')
for row in data_file:
row = json.loads(row)
features = row['features']
if token_len_filter:
if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]):
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i<sentence_len:
input_sample[i,word_i] = np.array([sum(x)/len(x) for x in zip(word['layers'][-1]['values'],word['layers'][-2]['values'])])
word_i+=1
else:
break
i+=1
else:
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i < sentence_len:
input_sample[i, word_i] = np.array(
[sum(x) / len(x) for x in zip(word['layers'][-1]['values'], word['layers'][-2]['values'])])
word_i += 1
else:
break
i += 1
input_sample = input_sample[0:i]
out_sample = out_sample[0:i]
out_sample=keras.utils.to_categorical(out_sample)
out_sample_array = np.empty((len(out_sample), 6))
i =0
for sample in out_sample:
out_sample_array[i,] = np.array(sample)
i+=1
return (input_sample,out_sample_array)


def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
return pos * angle_rates


def positional_encoding(position, d_model):
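    # Standard Transformer sinusoidal positional encoding: sine on even embedding dimensions, cosine
    # on odd ones, returned with shape (1, position, d_model).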
angle_rads = get_angles(np.arange(position)[:, np.newaxis],
np.arange(d_model)[np.newaxis, :],
d_model)
# apply sin to even indices in the array; 2i
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
# apply cos to odd indices in the array; 2i+1
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)


def mdoel_valid(embedd_dim,sentence_len,test_date_files,valid_date_files,model_folder,token_len_range,token_len_filter,pos_encode_scale):
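    # Same checkpoint-selection procedure as embedded_valid.py, except that the sinusoidal positional
    # encoding (masked to non-zero token positions and scaled by pos_encode_scale) is added to the BERT
    # embeddings before evaluation, matching models trained with location encoding.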
cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5}
input_sample,out_sample_array= data_gen(test_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
print ('input_sample: ',input_sample[0])
position_embb = positional_encoding(sentence_len, embedd_dim)
input_sample_ind = np.where(input_sample!=0,1,0)
pos_ind = np.multiply(position_embb,input_sample_ind)
word_pos = input_sample + (pos_ind*pos_encode_scale)
print('word_pos: ',word_pos[0])
max_score = [200000000,0]
max_model = None
file_l = os.listdir(model_folder)
score_l = []
for f in file_l:
print('model: ','{}/{}'.format(model_folder,f))
model = keras.models.load_model('{}/{}'.format(model_folder,f))
score = model.evaluate(word_pos, out_sample_array)
print(f,score)
score_l.append((f,score))
if score[1] > max_score[1]:
max_score = score
max_model = f
elif score[1] == max_score[1] and max_model is not None:
epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-')
epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', '').split('-')
if score[0]<max_score[0]:
max_score = score
max_model = f
elif score[0]==max_score[0]:
if epoch_n>epoch_n_m:
max_score = score
max_model = f
print('max_score: ',max_score)
print ('max_model: ',max_model)
valid_input,valid_output= data_gen(valid_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
valid_sample_ind = np.where(valid_input != 0, 1, 0)
valid_pos_ind = np.multiply(position_embb,valid_sample_ind)
valid_pos =valid_input+(valid_pos_ind*pos_encode_scale)
model = keras.models.load_model('{}/{}'.format(model_folder,max_model))
score = model.evaluate(valid_pos, valid_output)
print('valid_score: ',score)
print (score_l)
return max_model,score,max_score,len(valid_input)

if __name__=='__main__':
#sentence_len = 50
#embedd_dim = 768
#test_date_files = '2th_fold.json:1684'
#valid_date_files = '3th_fold.json:1684'
#model_folder = 'model_lstm_3_small'
#token_len_filter = True
#token_len_range = ['0', '50']
embedd_dim = int(sys.argv[1])
sentence_len = int(sys.argv[2])
test_date_files = sys.argv[3].split(',')
valid_date_files = sys.argv[4].split(',')
model_folder = sys.argv[5]
token_len_filter = sys.argv[6].lower() == 'true'
token_len_range = sys.argv[7].split(',')
    #python3 embedded_valid2.py 768 50 3th_fold.json:1684 4th_fold.json:1691 transB_1_filtered true 0,50
    #inputs1: 768 50 ['3th_fold.json:1684'] ['4th_fold.json:1691'] transB_1_filtered ['0', '50'] True
    # mdoel_valid also requires pos_encode_scale, which the original call omitted; take it from an
    # optional 8th argument and default to 1.0 (assumption: this should match the scale used in training)
    pos_encode_scale = float(sys.argv[8]) if len(sys.argv) > 8 else 1.0
    mdoel_valid(embedd_dim, sentence_len, test_date_files, valid_date_files, model_folder, token_len_range,
                token_len_filter, pos_encode_scale)
25 changes: 25 additions & 0 deletions nlp/production_models/embedding_train_data.py
@@ -0,0 +1,25 @@
import sys

base_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/labels'


label_names = ['Added', 'Changed', 'Deprecated', 'Fixed', 'Removed', 'Security']

#label_names = ['Added']
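# For each label, read labels/<label>.txt, strip the leading '__label__<Label> ' prefix from every
# line, and write the bare text to <label>_content.txt so it can be fed to BERT feature extraction.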



for label_n in label_names:
label_file_path = '{}/{}.txt'.format(base_path, label_n.lower())
label = '__label__{}'.format(label_n)
label_len = len(label)
label_content_file = open('{}/{}_content.txt'.format(base_path, label_n.lower()),'w')
label_fiile = open(label_file_path, 'r')
for row in label_fiile:
row = row.strip()
content = row[label_len + 1:]
label_content_file.write(content+'\n')
label_content_file.close()
label_fiile.close()

