Binary file added nlp/production_models/.DS_Store
Binary file not shown.
55 changes: 55 additions & 0 deletions nlp/production_models/README.md
@@ -0,0 +1,55 @@
## Data preparation
1. Raw data: https://github.com/EQWorks/release/tree/master/nlp/data/labels
2. Execute kfold_data_gen.py
* This will randomly separate each label’s content into 5 folds (a minimal sketch of this split follows this list)
3. Execute kfold_train_data.py
* Strips the label from each fold file and prepares the content for BERT embedding
4. To switch to another BERT pre-trained model while keeping the same k-fold split, execute kfild_regen.py
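
kfold_data_gen.py itself is not part of this diff, so the following is only a rough sketch of the 5-fold split described in step 2. It assumes fastText-style label files (one `__label__<Label> text` line per example) and a hypothetical `Nth_fold.txt` output naming; the real script may differ.

```python
import random

# Per-label input files, as kept under nlp/data/labels
label_files = ['added.txt', 'changed.txt', 'deprecated.txt', 'fixed.txt', 'removed.txt', 'security.txt']
folds = [[] for _ in range(5)]

for path in label_files:
    with open(path) as f:
        rows = [line.rstrip('\n') for line in f if line.strip()]
    random.shuffle(rows)
    # Deal this label's examples round-robin so every fold gets a similar share of each label
    for i, row in enumerate(rows):
        folds[i % 5].append(row)

for i, rows in enumerate(folds):
    with open('{}th_fold.txt'.format(i), 'w') as out:  # hypothetical output name
        out.write('\n'.join(rows) + '\n')
```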

## BERT embedding generation
1. Clone BERT from GitHub: https://github.com/google-research/bert
2. Download one of the pre-trained models
3. Set an environment variable pointing to the model directory
* Example: export BERT_BASE_DIR=/Users/stevenlu/Downloads/bert-master/uncased_L-12_H-768_A-12
4. Execute the feature-extraction script (a sketch for reading its JSONL output follows the example below)
* Example:
python3 extract_features.py \
--input_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content/4th_fold_content.txt \
--output_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded/4th_fold_base_-3_-2_-1.jsonl \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--layers=-10,-11,-12 \
--max_seq_length=128 \
--batch_size=8
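
Each line of the JSONL written by extract_features.py holds one input sentence, with a `features` list (one entry per token) and, per token, a `layers` list of extracted values. The training and validation scripts in this folder average the values of the last two extracted layers per token; a minimal reader in that spirit (the file name is just the one from the example above) could look like:

```python
import json
import numpy as np

embeddings = []  # one (num_tokens, 768) array per input sentence
with open('4th_fold_base_-3_-2_-1.jsonl') as f:
    for line in f:
        row = json.loads(line)
        token_vectors = []
        for token in row['features']:
            # Average the last two extracted layers, as data_gen() in the validation scripts does
            last = token['layers'][-1]['values']
            second_last = token['layers'][-2]['values']
            token_vectors.append([(a + b) / 2 for a, b in zip(last, second_last)])
        embeddings.append(np.array(token_vectors))
```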

## Model execution on EC2
1. Initialize the cluster and prepare for model training
* ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20201026 - ami-0885b1f6bd170450c
* sudo apt-get update
* sudo apt-get -y install python3-pip
* sudo apt install python3-testresources
* sudo pip3 install --upgrade tensorflow
* sudo apt install awscli
2. Configure your AWS credentials (aws configure)
3. Copy the scripts and k-fold files from S3: s3://eq-miner/test2/release_label/
4. Create a model output folder
5. sudo pip3 install fasttext (if you are building the fastText model)
6. Execute the script
* Execute embedding_wrap.py/embedding_wrap2.py/embedding_wrap_lstm.py
* This will train the model and generate the testing results
* embedding_wrap: Multi-head self-attention without location encoding
* embedding_wrap2: Multi-head self-attention with location encoding
* embedding_wrap_lstm: Bi-directional LSTM
* Input example:
* embedding_wrap/embedding_wrap2: python3 embedding_wrap.py 768 50 100 1th_fold.json:1684,2th_fold.json:1684,3th_fold.json:1684 4th_fold.json:1691 model7 0,50 0th_fold.json:1684 0.0001 50
* embedding_wrap_lstm: python3 embedding_wrap_lstm.py 768 50 100 0th_fold.json:1684,1th_fold.json:1684,2th_fold.json:1684 3th_fold.json:1684 model1 0,50 4th_fold.json:1691 0.00008 50
* Example of output:
* {'filtered': {'mold_build_time': 5105.467138528824, 'sample_length': 4595, 'validiation': [{'test_file': ['0th_fold.json:1691'], 'valid_file': ['3th_fold.json:1684'], 'model_name': '47-0.9978-0.0171.hdf5', 'valid_score': [0.7473069429397583, 0.8219354748725891], 'test_score': [0.7553659081459045, 0.8098001480102539], 'valid_sample_size': 1550}, {'test_file': ['3th_fold.json:1684'], 'valid_file': ['0th_fold.json:1691'], 'model_name': '46-0.9965-0.0207.hdf5', 'valid_score': [0.7761548161506653, 0.8052868843078613], 'test_score': [0.7688340544700623, 0.8225806355476379], 'valid_sample_size': 1551}]}, 'restricted': {'mold_build_time': 3495.59513258934, 'sample_length': 5059, 'validiation': [{'test_file': ['0th_fold.json:1691'], 'valid_file': ['3th_fold.json:1684'], 'model_name': '46-0.9937-0.0317.hdf5', 'valid_score': [0.8246729373931885, 0.8105700612068176], 'test_score': [0.813213050365448, 0.801068902015686], 'valid_sample_size': 1684}, {'test_file': ['3th_fold.json:1684'], 'valid_file': ['0th_fold.json:1691'], 'model_name': '16-0.9302-0.2405.hdf5', 'valid_score': [0.5546706318855286, 0.7897862195968628], 'test_score': [0.5412384867668152, 0.8141329884529114], 'valid_sample_size': 1684}]}, 'model_name': 'transB_8'}
7. Execute fasttext
* Install fasttext
* Execute fasttext_kfold.py

## T-test
1. Gather the result accuracies into a list (a minimal sketch follows this list)
2. Execute t_tests.py
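
t_tests.py is not included in this diff. The sketch below only illustrates the idea: it assumes the per-fold accuracies are read back from saved result dictionaries shaped like the output example in step 6 of the previous section (model.evaluate returns [loss, accuracy], so the second element of each test_score/valid_score is the accuracy), that result_a.json and result_b.json are hypothetical file names for the two models being compared, and that scipy's independent two-sample t-test is the intended test.

```python
import json
from scipy import stats

def gather_accuracies(result):
    # 'validiation' is the (misspelled) key used in the result dictionaries above
    return [fold['test_score'][1] for fold in result['filtered']['validiation']]

# result_a.json / result_b.json are hypothetical saved outputs of the two models being compared
with open('result_a.json') as fa, open('result_b.json') as fb:
    acc_a = gather_accuracies(json.load(fa))
    acc_b = gather_accuracies(json.load(fb))

t_stat, p_value = stats.ttest_ind(acc_a, acc_b)
print('t = {:.3f}, p = {:.4f}'.format(t_stat, p_value))
```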
12 changes: 12 additions & 0 deletions nlp/production_models/bert_script
@@ -0,0 +1,12 @@
export BERT_BASE_DIR=/Users/stevenlu/Downloads/bert-master/uncased_L-12_H-768_A-12


python3 extract_features.py \
--input_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_content/4th_fold_content.txt \
--output_file=/Users/stevenlu/Documents/GitHub/release/nlp/data/k_fold_embedded/4th_fold_base_-3_-2_-1.jsonl \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
--layers=-10,-11,-12 \
--max_seq_length=128 \
--batch_size=8
111 changes: 111 additions & 0 deletions nlp/production_models/embedded_valid.py
@@ -0,0 +1,111 @@
from tensorflow import keras,nn
from tensorflow.keras import layers
import json
import numpy as np
import tensorflow as tf
import os
import sys

#sudo python3 -m pip install h5py==2.10.0


def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter):
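    # Reads BERT feature files (JSONL: one example per line with a '__label__*' label and a per-token
    # 'features' list), averages the values of the last two extracted layers for each token, and returns
    # a (num_samples, sentence_len, embedd_dim) input array plus one-hot labels. Each entry of
    # train_files is 'file_name:sample_count'; when token_len_filter is set, only examples whose token
    # count falls inside seq_range are kept.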
date_files = {}
for file_d in train_files:
file_name,file_len = file_d.split(':')
date_files[file_name] = int(file_len)
total_size = 0
for train_f in date_files:
total_size+=date_files[train_f]
input_sample = np.zeros((total_size, *(sentence_len,embedd_dim)))
out_sample = []
i = 0
for file_n in date_files:
data_file = open('{}'.format(file_n), 'r')
for row in data_file:
row = json.loads(row)
features = row['features']
if token_len_filter:
if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]):
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i<sentence_len:
input_sample[i,word_i] = np.array([sum(x)/len(x) for x in zip(word['layers'][-1]['values'],word['layers'][-2]['values'])])
word_i+=1
else:
break
i+=1
else:
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i < sentence_len:
input_sample[i, word_i] = np.array(
[sum(x) / len(x) for x in zip(word['layers'][-1]['values'], word['layers'][-2]['values'])])
word_i += 1
else:
break
i += 1
input_sample = input_sample[0:i]
out_sample = out_sample[0:i]
out_sample=keras.utils.to_categorical(out_sample)
out_sample_array = np.empty((len(out_sample), 6))
i =0
for sample in out_sample:
out_sample_array[i,] = np.array(sample)
i+=1
return (input_sample,out_sample_array)


def mdoel_valid(embedd_dim,sentence_len,test_date_files,valid_date_files,model_folder,token_len_range,token_len_filter):
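    # Evaluates every checkpoint in model_folder on the test folds and keeps the one with the highest
    # accuracy (ties broken by lower loss, then by later epoch, parsed from the
    # '<epoch>-<train_acc>-<train_loss>.hdf5' checkpoint name), then reports that checkpoint's score
    # on the validation folds.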
cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5}
input_sample,out_sample_array= data_gen(test_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
max_score = [200000000,0]
max_model = None
file_l = os.listdir(model_folder)
score_l = []
for f in file_l:
model = keras.models.load_model('{}/{}'.format(model_folder,f))
score = model.evaluate(input_sample, out_sample_array)
#print(f,score)
score_l.append((f,score))
if score[1] > max_score[1]:
max_score = score
max_model = f
elif score[1] == max_score[1] and max_model is not None:
epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-')
epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', '').split('-')
if score[0]<max_score[0]:
max_score = score
max_model = f
elif score[0]==max_score[0]:
if epoch_n>epoch_n_m:
max_score = score
max_model = f
print('max_score: ',max_score)
print ('max_model: ',max_model)
valid_input,valid_output= data_gen(valid_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
model = keras.models.load_model('{}/{}'.format(model_folder,max_model))
score = model.evaluate(valid_input, valid_output)
print('valid_score: ',score)
return max_model,score,max_score,len(valid_input)

if __name__=='__main__':
#sentence_len = 50
#embedd_dim = 768
#test_date_files = '2th_fold.json:1684'
#valid_date_files = '3th_fold.json:1684'
#model_folder = 'model_lstm_3_small'
#token_len_filter = True
#token_len_range = ['0', '50']
embedd_dim = int(sys.argv[1])
sentence_len = int(sys.argv[2])
test_date_files = sys.argv[3].split(',')
valid_date_files = sys.argv[4].split(',')
model_folder = sys.argv[5]
token_len_filter = sys.argv[6].lower() == 'true'
token_len_range = sys.argv[7].split(',')
#python3 embedded_valid.py 768 50 3th_fold.json:1684 4th_fold.json:1691 trans_model1_filtered true 0,50
mdoel_valid(embedd_dim, sentence_len, test_date_files, valid_date_files, model_folder, token_len_range,
token_len_filter)
140 changes: 140 additions & 0 deletions nlp/production_models/embedded_valid2.py
@@ -0,0 +1,140 @@
from tensorflow import keras,nn
from tensorflow.keras import layers
import json
import numpy as np
import tensorflow as tf
import os
import sys

#sudo python3 -m pip install h5py==2.10.0


def data_gen(train_files,cat_map,sentence_len,embedd_dim,seq_range,token_len_filter):
date_files = {}
for file_d in train_files:
file_name,file_len = file_d.split(':')
date_files[file_name] = int(file_len)
total_size = 0
for train_f in date_files:
total_size+=date_files[train_f]
input_sample = np.zeros((total_size, *(sentence_len,embedd_dim)))
out_sample = []
i = 0
for file_n in date_files:
data_file = open('{}'.format(file_n), 'r')
for row in data_file:
row = json.loads(row)
features = row['features']
if token_len_filter:
if len(features) > int(seq_range[0]) and len(features) <= int(seq_range[1]):
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i<sentence_len:
input_sample[i,word_i] = np.array([sum(x)/len(x) for x in zip(word['layers'][-1]['values'],word['layers'][-2]['values'])])
word_i+=1
else:
break
i+=1
else:
out_sample.append(cat_map[row['label']])
word_i = 0
for word in features:
if word_i < sentence_len:
input_sample[i, word_i] = np.array(
[sum(x) / len(x) for x in zip(word['layers'][-1]['values'], word['layers'][-2]['values'])])
word_i += 1
else:
break
i += 1
input_sample = input_sample[0:i]
out_sample = out_sample[0:i]
out_sample=keras.utils.to_categorical(out_sample)
out_sample_array = np.empty((len(out_sample), 6))
i =0
for sample in out_sample:
out_sample_array[i,] = np.array(sample)
i+=1
return (input_sample,out_sample_array)


def get_angles(pos, i, d_model):
angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
return pos * angle_rates


def positional_encoding(position, d_model):
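    # Standard Transformer sinusoidal positional encoding: sine on even embedding dimensions, cosine
    # on odd ones, returned with shape (1, position, d_model).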
angle_rads = get_angles(np.arange(position)[:, np.newaxis],
np.arange(d_model)[np.newaxis, :],
d_model)
# apply sin to even indices in the array; 2i
angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
# apply cos to odd indices in the array; 2i+1
angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
pos_encoding = angle_rads[np.newaxis, ...]
return tf.cast(pos_encoding, dtype=tf.float32)


def mdoel_valid(embedd_dim,sentence_len,test_date_files,valid_date_files,model_folder,token_len_range,token_len_filter,pos_encode_scale):
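    # Same checkpoint-selection procedure as embedded_valid.py, except that the sinusoidal positional
    # encoding (masked to non-zero token positions and scaled by pos_encode_scale) is added to the BERT
    # embeddings before evaluation, matching models trained with location encoding.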
cat_map = {'__label__Added':0,'__label__Changed':1,'__label__Deprecated':2,'__label__Fixed':3,'__label__Removed':4,'__label__Security':5}
input_sample,out_sample_array= data_gen(test_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
print ('input_sample: ',input_sample[0])
position_embb = positional_encoding(sentence_len, embedd_dim)
input_sample_ind = np.where(input_sample!=0,1,0)
pos_ind = np.multiply(position_embb,input_sample_ind)
word_pos = input_sample + (pos_ind*pos_encode_scale)
print('word_pos: ',word_pos[0])
max_score = [200000000,0]
max_model = None
file_l = os.listdir(model_folder)
score_l = []
for f in file_l:
print('model: ','{}/{}'.format(model_folder,f))
model = keras.models.load_model('{}/{}'.format(model_folder,f))
score = model.evaluate(word_pos, out_sample_array)
print(f,score)
score_l.append((f,score))
if score[1] > max_score[1]:
max_score = score
max_model = f
elif score[1] == max_score[1] and max_model is not None:
epoch_n,train_acc,train_loss = f.replace('.hdf5','').split('-')
epoch_n_m, train_acc_m, train_loss_m = max_model.replace('.hdf5', '').split('-')
if score[0]<max_score[0]:
max_score = score
max_model = f
elif score[0]==max_score[0]:
if epoch_n>epoch_n_m:
max_score = score
max_model = f
print('max_score: ',max_score)
print ('max_model: ',max_model)
valid_input,valid_output= data_gen(valid_date_files,cat_map,sentence_len,embedd_dim,token_len_range,token_len_filter)
valid_sample_ind = np.where(valid_input != 0, 1, 0)
valid_pos_ind = np.multiply(position_embb,valid_sample_ind)
valid_pos =valid_input+(valid_pos_ind*pos_encode_scale)
model = keras.models.load_model('{}/{}'.format(model_folder,max_model))
score = model.evaluate(valid_pos, valid_output)
print('valid_score: ',score)
print (score_l)
return max_model,score,max_score,len(valid_input)

if __name__=='__main__':
#sentence_len = 50
#embedd_dim = 768
#test_date_files = '2th_fold.json:1684'
#valid_date_files = '3th_fold.json:1684'
#model_folder = 'model_lstm_3_small'
#token_len_filter = True
#token_len_range = ['0', '50']
embedd_dim = int(sys.argv[1])
sentence_len = int(sys.argv[2])
test_date_files = sys.argv[3].split(',')
valid_date_files = sys.argv[4].split(',')
model_folder = sys.argv[5]
token_len_filter = sys.argv[6].lower() == 'true'
token_len_range = sys.argv[7].split(',')
    #python3 embedded_valid2.py 768 50 3th_fold.json:1684 4th_fold.json:1691 transB_1_filtered true 0,50
    #inputs1: 768 50 ['3th_fold.json:1684'] ['4th_fold.json:1691'] transB_1_filtered ['0', '50'] True
    # mdoel_valid also requires pos_encode_scale, which the original call omitted; take it from an
    # optional 8th argument and default to 1.0 (assumption: this should match the scale used in training)
    pos_encode_scale = float(sys.argv[8]) if len(sys.argv) > 8 else 1.0
    mdoel_valid(embedd_dim, sentence_len, test_date_files, valid_date_files, model_folder, token_len_range,
                token_len_filter, pos_encode_scale)
25 changes: 25 additions & 0 deletions nlp/production_models/embedding_train_data.py
@@ -0,0 +1,25 @@
import sys

base_path = '/Users/stevenlu/Documents/GitHub/release/nlp/data/labels'


label_names = ['Added', 'Changed', 'Deprecated', 'Fixed', 'Removed', 'Security']

#label_names = ['Added']
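# For each label, read labels/<label>.txt, strip the leading '__label__<Label> ' prefix from every
# line, and write the bare text to <label>_content.txt so it can be fed to BERT feature extraction.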



for label_n in label_names:
label_file_path = '{}/{}.txt'.format(base_path, label_n.lower())
label = '__label__{}'.format(label_n)
label_len = len(label)
label_content_file = open('{}/{}_content.txt'.format(base_path, label_n.lower()),'w')
label_fiile = open(label_file_path, 'r')
for row in label_fiile:
row = row.strip()
content = row[label_len + 1:]
label_content_file.write(content+'\n')
label_content_file.close()
label_fiile.close()

