Book_Index_Building/Classifier.py at master · RiddhiRex/Book_Index_Building · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

# coding: utf-8

# In[ ]:

import nltk, collections
import numpy as np
from nltk.collocations import *
from nltk.corpus import stopwords
import subprocess
import pandas as pd
import re
from sklearn.externals import joblib
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

def classifier(train_csv,test_csv):
    """Train a random forest on ``train_csv`` and evaluate it on ``test_csv``.

    Both CSV files must contain the columns ``indices`` (the target),
    ``word``, ``pos`` and ``filename``; every remaining column is used as a
    feature. The fitted model is persisted through ``saveModel`` and the
    predictions for the test set are reported through ``Evaluate_accuracy``.

    Args:
        train_csv: Path (or anything ``pd.read_csv`` accepts) of the
            training data.
        test_csv: Path (or anything ``pd.read_csv`` accepts) of the
            test data.

    Returns:
        None.

    Raises:
        KeyError: If one of the expected columns is missing from a file.
    """
    # Columns that are either the target or non-feature metadata.
    non_feature_columns = ['indices', 'word', 'pos', 'filename']

    data_train = pd.read_csv(train_csv)
    data_test = pd.read_csv(test_csv)
    # Missing cells are replaced by the string '0' (kept from the original
    # behaviour) before the target/feature split.
    data_train.fillna('0', inplace=True)
    data_test.fillna('0', inplace=True)

    # Split the target variable from the features.
    y_train = data_train["indices"]
    y_test = data_test["indices"]
    # Use the ``columns=`` keyword: passing ``axis`` positionally
    # (``drop('indices', 1)``) is rejected by pandas >= 2.0.
    X_train = data_train.drop(columns=non_feature_columns)
    X_test = data_test.drop(columns=non_feature_columns)

    # Initialise and fit the model with scikit-learn's default settings.
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)

    # Persist the fitted model, then score it on the held-out data.
    saveModel(rfc)
    pred = rfc.predict(X_test)
    Evaluate_accuracy(pred, y_test)
    return

def predict(model):
    """Run ``model`` on ``datasets/dataframe.csv`` and report its accuracy.

    The CSV must contain the columns ``index`` (the target), ``word`` and
    ``pos``; every remaining column is passed to ``model.predict`` as a
    feature. Results are reported through ``Evaluate_accuracy``.

    Args:
        model: A fitted estimator exposing ``predict(X)``.

    Returns:
        None.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
    """
    import os

    # Build the path portably. The original literal "datasets\dataframe.csv"
    # contained the invalid escape sequence ``\d`` and only named a file
    # inside ``datasets`` on Windows. The path is relative to the current
    # working directory.
    data = pd.read_csv(os.path.join("datasets", "dataframe.csv"))
    # Missing cells are replaced by the string '0' (kept from the original
    # behaviour).
    data.fillna('0', inplace=True)

    # Split the target variable from the features.
    # NOTE(review): the target column here is 'index', whereas classifier()
    # reads 'indices', and 'filename' is not dropped here - confirm that this
    # matches the schema of dataframe.csv.
    y = data['index']
    # Use the ``columns=`` keyword: passing ``axis`` positionally
    # (``drop('index', 1)``) is rejected by pandas >= 2.0.
    X = data.drop(columns=['index', 'word', 'pos'])

    pred = model.predict(X)
    Evaluate_accuracy(pred, y)
    return


def saveModel(model):
    """Serialize ``model`` with joblib to the file ``newrfc.model``.

    The file name is relative, so the file is written to the current
    working directory. Returns None.
    """
    destination = 'newrfc.model'
    joblib.dump(model, destination)

def loadModel():
    """Load and return the model stored in ``newrfc.model``.

    The file name is relative, so it is looked up in the current working
    directory (the same name ``saveModel`` writes to).

    Returns:
        The object deserialized by ``joblib.load``. The previous
        implementation loaded the model but discarded it and always
        returned None.
    """
    # NOTE: joblib files are pickle-based; only load model files that come
    # from a trusted source.
    model = joblib.load('newrfc.model')
    return model

def Evaluate_accuracy(pred, true_value):
    """Print evaluation metrics for ``pred`` and plot them as heat maps.

    Printed output, in order: the TP/FP/TN/FN counts (treating 1 as the
    positive and 0 as the negative class), the raw predictions and true
    values, accuracy in percent, RMSE, MAE, micro- and macro-averaged
    precision/recall/F-score/support, and scikit-learn's classification
    report. Three figures are then shown one after another: the per-class
    report with an extra ``avg`` row, the raw confusion matrix, and the
    row-normalized confusion matrix.

    Args:
        pred: Sequence of predicted labels.
        true_value: Sequence of true labels, aligned position by position
            with ``pred``. A pandas Series with any index is accepted.

    Returns:
        None.
    """
    # Work on plain arrays so that elements are paired by position. Indexing
    # a pandas Series with ``series[i]`` is a label lookup and fails (or
    # pairs the wrong rows) when the index is not 0..n-1.
    pred_arr = np.asarray(pred)
    true_arr = np.asarray(true_value)

    # Single pass over the pairs; the four cases are mutually exclusive.
    TP = FP = TN = FN = 0
    for predicted, actual in zip(pred_arr, true_arr):
        if actual == 1 and predicted == 1:
            TP += 1
        elif predicted == 1 and actual == 0:
            FP += 1
        elif actual == 0 and predicted == 0:
            TN += 1
        elif predicted == 0 and actual == 1:
            FN += 1

    print("TP, FP, TN, FN   :", TP, FP, TN, FN)
    print("The index words generated by the model are")
    print(pred, true_value)
    print("Accuracy score is ", accuracy_score(true_arr, pred_arr)*100)
    rmse = np.sqrt(mean_squared_error(true_arr, pred_arr))
    print("Root Mean Squared Error: {}".format(rmse))
    print("Mean absolute error:", mean_absolute_error(true_arr, pred_arr))
    print("Micro stats:")
    print(precision_recall_fscore_support(true_arr, pred_arr, average='micro'))
    print("Macro stats:")
    print(precision_recall_fscore_support(true_arr, pred_arr, average='macro'))

    cr = classification_report(true_arr, pred_arr)
    print(cr)

    # The per-class metrics cover every label present in either input, so
    # the row labels must be built from the union as well; using only the
    # true labels leaves the heat map with fewer tick labels than rows
    # whenever the model predicts a class absent from ``true_value``.
    labels = np.union1d(true_arr, pred_arr)
    xticks = ['precision', 'recall', 'f1-score', 'support']
    yticks = list(labels) + ['avg']

    # One row per label, one column per metric.
    rep = np.array(
        precision_recall_fscore_support(true_arr, pred_arr, labels=labels)
    ).T
    # Extra summary row: unweighted mean of each metric, total support.
    avg = np.mean(rep, axis=0)
    avg[-1] = np.sum(rep[:, -1])
    rep = np.insert(rep, rep.shape[0], avg, axis=0)
    plt.title('Classification Report')
    sns.heatmap(rep, annot=True, xticklabels=xticks, yticklabels=yticks)
    plt.show()

    cm = confusion_matrix(true_arr, pred_arr, labels=labels)

    plt.title('Confusion matrix: Not Normalized')
    sns.heatmap(cm, annot=True, linewidths=.5)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    plt.title('Confusion matrix: Normalized')
    # Normalize each row by its number of true samples. Rows for classes
    # with no true samples are left at zero instead of dividing by zero
    # (which would fill the row with NaN and emit a RuntimeWarning).
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )
    sns.heatmap(cm, annot=True, linewidths=.5)
    plt.xlabel('Predicted value')
    plt.ylabel('True value')
    plt.show()
    return