VSChatbot/NLPClassifier.py at master · vhaung/VSChatbot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# NLP classification engine

import pandas as pd
import pickle
import string
import nltk

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import NuSVC, SVC
from nltk.corpus import stopwords

lemma = nltk.wordnet.WordNetLemmatizer()
sno = nltk.stem.SnowballStemmer('english')

def prepareData (data_url, testSize):
    data = pd.read_excel(data_url)
    data = data.loc[data['labels'].isin(['assault', 'sexual abuse'])]
    X = data['data']
    y = data['labels']

    # since data is small and labels are imbalanced
    # shuffle before splitting and use y label to stratify the split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, stratify=y, shuffle=True)
    return X_train, X_test, y_train, y_test, X, y

def preprocessText (message):
    # removess any punctuation
    nopunc = [char for char in message if char not in string.punctuation]

    # forms a string without punctuation
    nopunc = ''.join(nopunc)

    # removes any stopwords and returns the rest as list of words
    nostop = [word for word in nopunc.split()  if word.lower() not in stopwords.words('english')]
    return nostop

def preprocessTextAdvanced (message):
    # removes punctation and stop words
    nopunc_stop = preprocessText(message)

    # lemmatising
    lemmatised = [lemma.lemmatize(word) for word in nopunc_stop]

    # stemming
    stemmed = [sno.stem(word) for word in lemmatised]

    # removes any stopwords again after stemming which may have exposed stopwords which were contracted
    stemmed_nostop = [word for word in stemmed if word.lower() not in stopwords.words('english')]
    return stemmed_nostop

def getPipeline (classifier):
    print('\nUsing pipeline without text preprocessing')
    pipeline = Pipeline([
        ('vec', CountVectorizer()),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier', classifier),  # train on TF-IDF vectors with  classifier
    ])
    return pipeline

def getPipelineAdvanced (classifier):
    print('\nUsing pipeline with text preprocessing')
    pipeline = Pipeline([
        ('vec', CountVectorizer(analyzer = preprocessTextAdvanced)),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
        ('classifier', classifier),  # train on TF-IDF vectors with  classifier
    ])
    return pipeline

def trainModel (pipeline, name, X_train, y_train, params, numkfold):
    print('\nTraining with', name)
    best_model = GridSearchCV(pipeline, param_grid=params, cv=numkfold, n_jobs=-1, verbose=1, scoring='accuracy')
    best_model.fit(X_train, y_train)
    best_accuray = best_model.cv_results_['mean_test_score'][best_model.best_index_]
    best_std = best_model.cv_results_['std_test_score'][best_model.best_index_]
    print('best k-fold index:', best_model.best_index_,
          '\tmean accuracy:', str(best_accuray),
          '\tstd:', str(best_std))
    #if name == 'randomForest':
    #    print('oob score:', best_model.oob_score_)
    print('hyperparameters:', best_model.best_params_)
    print('best estimator:', best_model.best_estimator_)
    return best_model

def refitModel (pipeline, name, X, y, params, numkfold):
    print('\nRefitting', name)
    refitted_model = GridSearchCV(pipeline, param_grid=params, cv=numkfold, n_jobs=-1, verbose=0, scoring='accuracy')
    refitted_model.fit(X, y)
    refitted_accuray = refitted_model.cv_results_['mean_test_score'][refitted_model.best_index_]
    refitted_std = refitted_model.cv_results_['std_test_score'][refitted_model.best_index_]
    print('mean accuracy:', str(refitted_accuray), '\tstd:', str(refitted_std))
    return refitted_model

def testPerformance (model, name, X_test, y_test):
    print('\nTesting with', name)
    y_predicted = model.predict(X_test)
    #print('test label\tpredicted label')
    #for i in range(len(y_predicted)):
    #    print(y_test.values[i], '\t', y_predicted[i])
    test_accuracy = model.score(X_test, y_test)
    print('mean accuracy:', str(test_accuracy))
    print('confusion matrix:\n', confusion_matrix(y_test, y_predicted))
    print('classification report:\n', classification_report(y_test, y_predicted))
    print('..............................')
    return test_accuracy

def main():
    #prepare training and testing data
    data_url = 'training data.xlsx'
    X_train, X_test, y_train, y_test, X, y = prepareData(data_url, testSize=0.2)

    # define models and their parameters for hyperparameter grid search
    # dictionary key name is also used as a filename to save the model
    models = {
        'RandomForest': (
            RandomForestClassifier(),
            {
#                'classifier__max_depth': [10, 50, 100],
                'classifier__n_estimators': [100, 500, 1000],
               'classifier__oob_score': [True, False]
            }
        ),
        'NuSVC': (
            NuSVC(),
            {
                'classifier__gamma': [1e-4, 1e-3, 1e-2],
                'classifier__nu': [0.1, 0.3, 0.5, 0.7],
            }
        ),
        'SVC': (
            SVC(),
            {
                'classifier__C': [1, 10, 100],
                'classifier__gamma': [1e-2, 1e-1, 0],
            }
        ),
        'MultiNB': (
            MultinomialNB(),
            {
                'classifier__alpha': [1e-2, 1e-1, 1],
                'classifier__fit_prior': [True, False],
            }
        ),
    }

    # exersise all models
    numkfold = 10
    index_names = ['Without','With']
    df_train_results = pd.DataFrame(index=index_names)
    df_test_results = pd.DataFrame(index=index_names)

    for name, (classifier, params) in models.items():
        pipeline_params = {}

        # add parameters for pre-processing
        pipeline_params['vec__ngram_range'] = [(1, 1), (1, 2)]

        # Adding model params to the pipeline for hyperparameter grid search
        pipeline_params.update(params)

        # Training and testing the model without text preprocessing for comparison
        pipeline = getPipeline(classifier)
        hyper_model = trainModel(pipeline, name, X_train, y_train, pipeline_params, numkfold)
        test_accuracy = testPerformance(hyper_model, name, X_test, y_test) #testing with untouch data
        df_train_results.loc[index_names[0], name] = hyper_model.cv_results_['mean_test_score'][hyper_model.best_index_] * 100
        df_test_results.loc[index_names[0], name] = test_accuracy * 100

        # Training and testing the model again with text preprocessig for comparison
        pipeline = getPipelineAdvanced(classifier)
        hyper_model = trainModel(pipeline, name, X_train, y_train, pipeline_params, numkfold)
        test_accuracy = testPerformance(hyper_model, name, X_test, y_test) #testing with untouch data
        df_train_results.loc[index_names[1], name] = hyper_model.cv_results_['mean_test_score'][hyper_model.best_index_] * 100
        df_test_results.loc[index_names[1], name] = test_accuracy * 100

        # Wrapping hyperparameters returned from the last training as an array
        # so it can be used to refit the model through a grid search
        hyperparameter = {}
        for key, value in hyper_model.best_params_.items():
            hyperparameter[key] = [value]

        # Refitting with 100% of the data for production
        hyper_model = refitModel(pipeline, name, X, y, hyperparameter, numkfold)

        # Saving the model for each classifier
        filename = name + '.p'
        pickle.dump(hyper_model, open(filename,'wb'))
        print('\nSaved as', filename)
        print('\n==============================')

    print('\nTraining Summary (mean accuracy %):\n', df_train_results)
    print('\nTest Summary (mean accuracy %):\n', df_test_results)

if __name__ == '__main__':
    # main function will not be called when grid search jobs are run in parallel
    main()