forked from Ashkelso/VSChatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLPClassifier.py
More file actions
199 lines (169 loc) · 8.29 KB
/
NLPClassifier.py
File metadata and controls
199 lines (169 loc) · 8.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# NLP classification engine
import pandas as pd
import pickle
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import NuSVC, SVC
from nltk.corpus import stopwords
# Shared NLTK normalisers, created once at import time and reused by
# preprocessTextAdvanced for every message it analyses.
# WordNet lemmatiser: preprocessTextAdvanced calls lemma.lemmatize(word) on each token.
lemma = nltk.wordnet.WordNetLemmatizer()
# English Snowball stemmer: preprocessTextAdvanced calls sno.stem(word) on each lemmatised token.
sno = nltk.stem.SnowballStemmer('english')
def prepareData(data_url, testSize, labels=('assault', 'sexual abuse'), randomState=None):
    """Load the labelled spreadsheet and split it into train and test sets.

    Args:
        data_url: Location handed to ``pandas.read_excel``. The sheet must
            provide a 'data' column (the text) and a 'labels' column.
        testSize: Forwarded to ``train_test_split`` as ``test_size``.
        labels: Label values to keep; rows with any other label are dropped.
            Defaults to the two classes the original script was limited to.
        randomState: Forwarded to ``train_test_split`` as ``random_state``.
            The default ``None`` keeps the original unseeded shuffle; pass an
            int to make the split reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X, y)`` where ``X``/``y``
        are the full filtered text and label columns.
    """
    data = pd.read_excel(data_url)
    # Keep only the rows that belong to the requested classes
    data = data.loc[data['labels'].isin(list(labels))]
    X = data['data']
    y = data['labels']
    # Since data is small and labels are imbalanced, shuffle before splitting
    # and stratify on y so both splits keep the same class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=testSize,
        stratify=y,
        shuffle=True,
        random_state=randomState,
    )
    return X_train, X_test, y_train, y_test, X, y
def preprocessText(message):
    """Strip punctuation and English stop words from a message.

    Args:
        message: The raw text to clean (expected to be a ``str``).

    Returns:
        List of the remaining whitespace-separated words, in their original
        order and original letter case.
    """
    # The stop-word list does not change while filtering, so fetch it once
    # per call and keep it in a set for constant-time membership tests
    # (previously it was re-fetched and scanned as a list for every word).
    stop_words = set(stopwords.words('english'))
    # Remove every punctuation character in a single pass
    nopunc = message.translate(str.maketrans('', '', string.punctuation))
    # Stop words are matched case-insensitively; kept words retain their case
    return [word for word in nopunc.split() if word.lower() not in stop_words]
def preprocessTextAdvanced(message):
    """Clean a message, then lemmatise and stem it into analysis tokens.

    The message is first passed through ``preprocessText`` (punctuation and
    stop-word removal); each surviving word is lemmatised with ``lemma`` and
    stemmed with ``sno``; finally stop words are filtered a second time.

    Args:
        message: The raw text to tokenise.

    Returns:
        List of stemmed tokens with stop words removed.
    """
    # Removes punctuation and stop words
    nopunc_stop = preprocessText(message)
    # Lemmatise first, then stem the lemma
    stemmed = [sno.stem(lemma.lemmatize(word)) for word in nopunc_stop]
    # Fetch the stop-word list once instead of once per token, and use a set
    # so each membership test is constant-time
    stop_words = set(stopwords.words('english'))
    # Filter again: stemming may have exposed stop words that were
    # previously hidden inside contracted/inflected forms
    return [word for word in stemmed if word.lower() not in stop_words]
def getPipeline(classifier):
    """Assemble the baseline pipeline that feeds raw text to ``classifier``.

    Args:
        classifier: Estimator instance placed in the final 'classifier' step.

    Returns:
        A ``Pipeline`` with steps 'vec' (CountVectorizer with default
        settings), 'tfidf' (TfidfTransformer) and 'classifier'.
    """
    print('\nUsing pipeline without text preprocessing')
    steps = [
        # strings -> token integer counts
        ('vec', CountVectorizer()),
        # integer counts -> weighted TF-IDF scores
        ('tfidf', TfidfTransformer()),
        # classifier trained on the TF-IDF vectors
        ('classifier', classifier),
    ]
    return Pipeline(steps)
def getPipelineAdvanced(classifier):
    """Assemble the pipeline that tokenises text with ``preprocessTextAdvanced``.

    Args:
        classifier: Estimator instance placed in the final 'classifier' step.

    Returns:
        A ``Pipeline`` with steps 'vec' (CountVectorizer using
        ``preprocessTextAdvanced`` as its analyzer), 'tfidf'
        (TfidfTransformer) and 'classifier'.
    """
    print('\nUsing pipeline with text preprocessing')
    # strings -> token integer counts, tokens produced by the custom analyzer
    vectoriser = CountVectorizer(analyzer=preprocessTextAdvanced)
    # integer counts -> weighted TF-IDF scores
    weighting = TfidfTransformer()
    return Pipeline(steps=[
        ('vec', vectoriser),
        ('tfidf', weighting),
        ('classifier', classifier),
    ])
def trainModel(pipeline, name, X_train, y_train, params, numkfold):
    """Grid-search ``pipeline`` on the training data and report the winner.

    Args:
        pipeline: Estimator/pipeline to tune.
        name: Label used in the progress output.
        X_train: Training inputs.
        y_train: Training labels.
        params: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        numkfold: Passed to ``GridSearchCV`` as ``cv``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print('\nTraining with', name)
    search = GridSearchCV(
        pipeline,
        param_grid=params,
        cv=numkfold,
        n_jobs=-1,
        verbose=1,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)

    # best_index_ selects the winning entry in cv_results_; it is what the
    # output below labels as the "best k-fold index"
    winner = search.best_index_
    cv_summary = search.cv_results_
    mean_accuracy = cv_summary['mean_test_score'][winner]
    accuracy_std = cv_summary['std_test_score'][winner]
    print('best k-fold index:', winner,
          '\tmean accuracy:', str(mean_accuracy),
          '\tstd:', str(accuracy_std))
    print('hyperparameters:', search.best_params_)
    print('best estimator:', search.best_estimator_)
    return search
def refitModel(pipeline, name, X, y, params, numkfold):
    """Re-run the grid search on ``X``/``y`` and report its cross-validated score.

    Args:
        pipeline: Estimator/pipeline to fit.
        name: Label used in the progress output.
        X: Inputs to fit on.
        y: Labels to fit on.
        params: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        numkfold: Passed to ``GridSearchCV`` as ``cv``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print('\nRefitting', name)
    search = GridSearchCV(
        pipeline,
        param_grid=params,
        cv=numkfold,
        n_jobs=-1,
        verbose=0,
        scoring='accuracy',
    )
    search.fit(X, y)

    winner = search.best_index_
    mean_accuracy = search.cv_results_['mean_test_score'][winner]
    accuracy_std = search.cv_results_['std_test_score'][winner]
    print('mean accuracy:', str(mean_accuracy), '\tstd:', str(accuracy_std))
    return search
def testPerformance(model, name, X_test, y_test):
    """Evaluate a fitted model on held-out data and print a report.

    Prints the value of ``model.score``, the confusion matrix and the
    classification report for the predictions on ``X_test``.

    Args:
        model: Fitted object exposing ``predict`` and ``score``.
        name: Label used in the progress output.
        X_test: Held-out inputs.
        y_test: Held-out true labels.

    Returns:
        The value returned by ``model.score(X_test, y_test)``.
    """
    print('\nTesting with', name)
    predictions = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    print('mean accuracy:', str(accuracy))
    matrix = confusion_matrix(y_test, predictions)
    print('confusion matrix:\n', matrix)
    report = classification_report(y_test, predictions)
    print('classification report:\n', report)
    print('..............................')
    return accuracy
def _buildModels():
    """Return the classifiers to compare together with their search grids.

    The dictionary key is the model name used in the printed reports and as
    the stem of the pickle filename. Each value is ``(estimator, grid)``;
    grid keys carry the 'classifier__' prefix so they address the
    'classifier' step of the pipelines built by getPipeline*.
    """
    return {
        'RandomForest': (
            RandomForestClassifier(),
            {
                # 'classifier__max_depth': [10, 50, 100],
                'classifier__n_estimators': [100, 500, 1000],
                'classifier__oob_score': [True, False],
            },
        ),
        'NuSVC': (
            NuSVC(),
            {
                'classifier__gamma': [1e-4, 1e-3, 1e-2],
                'classifier__nu': [0.1, 0.3, 0.5, 0.7],
            },
        ),
        'SVC': (
            SVC(),
            {
                'classifier__C': [1, 10, 100],
                # gamma=0 is rejected by current scikit-learn; 'auto'
                # (1 / n_features) is the supported spelling of what 0
                # used to mean
                'classifier__gamma': [1e-2, 1e-1, 'auto'],
            },
        ),
        'MultiNB': (
            MultinomialNB(),
            {
                'classifier__alpha': [1e-2, 1e-1, 1],
                'classifier__fit_prior': [True, False],
            },
        ),
    }


def main():
    """Train, compare and persist every configured classifier.

    For each model the script grid-searches a baseline pipeline and a
    text-preprocessing pipeline on the training split, scores both on the
    held-out split, refits the preprocessing pipeline on all data with the
    winning hyperparameters and pickles it to '<name>.p'. Summary tables of
    cross-validation and test accuracy (in percent) are printed at the end.
    """
    # Prepare training and testing data
    data_url = 'training data.xlsx'
    X_train, X_test, y_train, y_test, X, y = prepareData(data_url, testSize=0.2)

    # Exercise all models
    numkfold = 10
    index_names = ['Without', 'With']
    df_train_results = pd.DataFrame(index=index_names)
    df_test_results = pd.DataFrame(index=index_names)

    for name, (classifier, params) in _buildModels().items():
        # Baseline: the built-in analyzer honours ngram_range, so search it
        # together with the classifier's own parameters
        baseline_params = {'vec__ngram_range': [(1, 1), (1, 2)]}
        baseline_params.update(params)

        # Training and testing the model without text preprocessing for comparison
        pipeline = getPipeline(classifier)
        hyper_model = trainModel(pipeline, name, X_train, y_train, baseline_params, numkfold)
        test_accuracy = testPerformance(hyper_model, name, X_test, y_test)  # untouched data
        best_cv_score = hyper_model.cv_results_['mean_test_score'][hyper_model.best_index_]
        df_train_results.loc[index_names[0], name] = best_cv_score * 100
        df_test_results.loc[index_names[0], name] = test_accuracy * 100

        # With a callable analyzer CountVectorizer ignores ngram_range, so
        # searching it would only duplicate every fit; tune the classifier
        # parameters alone for the preprocessing pipeline
        advanced_params = dict(params)

        # Training and testing the model again with text preprocessing for comparison
        pipeline = getPipelineAdvanced(classifier)
        hyper_model = trainModel(pipeline, name, X_train, y_train, advanced_params, numkfold)
        test_accuracy = testPerformance(hyper_model, name, X_test, y_test)  # untouched data
        best_cv_score = hyper_model.cv_results_['mean_test_score'][hyper_model.best_index_]
        df_train_results.loc[index_names[1], name] = best_cv_score * 100
        df_test_results.loc[index_names[1], name] = test_accuracy * 100

        # Wrap each winning hyperparameter in a one-element list so the same
        # grid-search machinery can be reused for the final refit
        hyperparameter = {key: [value] for key, value in hyper_model.best_params_.items()}

        # Refitting with 100% of the data for production
        hyper_model = refitModel(pipeline, name, X, y, hyperparameter, numkfold)

        # Saving the model for each classifier; the context manager makes
        # sure the file is flushed and closed even if pickling fails
        filename = name + '.p'
        with open(filename, 'wb') as model_file:
            pickle.dump(hyper_model, model_file)
        print('\nSaved as', filename)
        print('\n==============================')

    print('\nTraining Summary (mean accuracy %):\n', df_train_results)
    print('\nTest Summary (mean accuracy %):\n', df_test_results)
# Run the experiment only when this file is executed as a script. The guard
# keeps main() from being called again when the module is imported, e.g. by
# the worker processes of the parallel grid-search jobs.
if __name__ == '__main__':
    main()