-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathmake_class_features.py
More file actions
151 lines (122 loc) · 5.86 KB
/
make_class_features.py
File metadata and controls
151 lines (122 loc) · 5.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python2
import sys
import os
from os.path import join, getsize, splitext
import re
from gensim.models import Word2Vec, KeyedVectors
from verbose_tools import get_classes
from verbose_tools import get_classes_properties
import logging as log
from tqdm import tqdm
NDIM = 100 ## currently set to 100 may change it to other value.
verbose_root = sys.argv[1]
log.basicConfig(level=log.DEBUG, filename='make_class_features.log', filemode='a', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = log.getLogger(__name__)
logger.info("parsing projects")
sentences = []
patterns = dict()
#data_file = "output-refined.csv"
#data_file = "input-1300.csv"
data_file = "p-mart-output-final.csv"
with open(data_file) as f:
for line in f:
# project,class_name,pattern,url = line.strip().split(",")
project, class_name, pattern = line.strip().split(",")
patterns[(project, class_name)] = pattern
file_data_store = dict()
# Iterate through all files in the input folder (verbose_root)
# for root, dirs, files in tqdm(os.walk(verbose_root)):
for root, dirs, files in os.walk(verbose_root):
# If the are no files in the verbose root folder
if files is None:
logger.error("No files found in input directory")
exit()
for f in files:
if ".verbose" in f:
file_data_store[f]=dict()
proj_name = splitext(f)[0]
file_data_store[f]["project_name"] = proj_name
# Groups the ngrams by class
class_dict = get_classes(os.path.join(root, f))
class_properties = get_classes_properties(os.path.join(root, f))
if class_dict is None:
logger.warning("No classes or ngrams extracted from project file {0}".format(f))
continue
file_data_store[f]["class_dict"] = class_dict
file_data_store[f]["class_properties"] = class_properties
# Appends the list of ngrams (in lower case) to a list of sentences
for class_name, ngrams in class_dict.iteritems():
ngram_for_class = [ngram.lower() for ngram in ngrams]
sentences.append(ngram_for_class)
logger.info("Building Word2Vec model")
## May be able to revise these paramters for better performance
ngram_model = Word2Vec(sentences, size=NDIM, window=20, min_count=2, workers=4)
saved_items_list = set()
saved_items_dicts = list()
for f,verbose_data in file_data_store.iteritems():
proj_name = verbose_data["project_name"] # Name of the current project
# Retrieve a dictionary constaining class names and corresponding ngrams
class_dict = verbose_data["class_dict"]
class_properties = verbose_data["class_properties"]
# Iterate over class names and ngrams from the verbose file
for class_name, ngrams in class_dict.iteritems():
# The if-block below makes sure that we only keep the labelled datasets in java_class_features.txt
# This reduces size of java_class_features.txt, before this java_class_features.txt was almost 100 MB
if (proj_name,class_name) not in patterns:
continue
vector_ngram = [0.0 for i in range(NDIM)]
ngram_count = 0
for ngram in ngrams:
try:
# TODO: Check if this line works as expected
vector_ngram += ngram_model.wv[ngram.lower()]
ngram_count += 1
except Exception as e:
# log.warning("Loading Word2Vec: {0}".format(e))
pass
# if any ngrams were present in the trained Word2Vec embedding model
if ngram_count > 0:
# Normalise the vector
vector_ngram /= float(ngram_count)
saved_items_list.add((proj_name,class_name))
feature_dict = dict(project_name=proj_name,class_name=class_name,)
class_properties[class_name].pop('method_return', None)
class_properties[class_name].pop('class_name_words', None)
feature_dict.update(class_properties[class_name])
feature_dict.update({"w2v_"+str(i):x for i,x in enumerate(vector_ngram)})
saved_items_dicts.append(feature_dict)
# Printing Total number of examples identified from output-refined.csv
print "Examples identified from output-refined.csv = "+str(len(saved_items_list))
# Determining which examples are in output-refined.csv (Labelled) but missed in `.verbose` files.
patterns_keys = set(patterns.keys())
print "-"*80
print "Examples in Output-refined.csv but not in `.verbose` files = " + str(len(patterns_keys-saved_items_list))
print "-"*80
print "\nMissing Project,Class,Pattern"
for i,(proj,class_name) in enumerate(sorted(patterns_keys-saved_items_list)):
print i,",",proj,",",class_name,",",patterns[(proj,class_name)]
print "-"*80
print "Missing Projects = "
print "-"*80
missing_projects = set([proj for proj,class_name in sorted(patterns_keys-saved_items_list)])
for proj in missing_projects:
print proj
import pandas as pd
df = pd.DataFrame.from_records(saved_items_dicts)
#data_file = "output-refined.csv"
#data_file = "input-1300.csv"
data_file = "p-mart-output-final.csv"
# patterns = pd.read_csv(data_file,header=None,names=["project_name","class_name","pattern","url"])
patterns = pd.read_csv(data_file,header=None,names=["project_name","class_name","pattern"])
print(patterns.shape)
# pattern_repeats = patterns.groupby(["project_name","class_name"])["pattern"].count()
#patterns.drop_duplicates(["project_name","class_name"],inplace=True)
print(patterns.shape)
dataset = df.merge(patterns,on=["project_name","class_name"],how="inner")
print(dataset.shape)
# dataset_repeats = dataset.groupby(["project_name","class_name"])["pattern"].count()
# dataset_repeats = dataset_repeats[dataset_repeats>1]
#dataset.drop_duplicates(["project_name","class_name"],inplace=True)
#print(dataset.shape)
#dataset.to_csv("dataset.csv",index=False)
dataset.to_csv("P-MARt-dataset.csv",index=False)