# -*- coding: utf-8 -*-
"""Copy of Mini Proj 2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1WfpMy-LZTesHU4cq5jLN9wCv2r8nCSy8

The two approaches used are Random Forest and Naive Bayes; both can be found below.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# scikit-learn's random forest and Gaussian Naive Bayes classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# Set random seed
np.random.seed(0)
# Load the dataset and inspect pairwise Pearson correlations
df = pd.read_csv('/content/MP2_Data.csv')
print(df.head(2))
df.corr(method='pearson')
# Aggregate the nine weekly values of each Stat into one per-student total
for j in range(4):
    df[f'Week_Stat{j}'] = df[[f'Week{i}_Stat{j}' for i in range(1, 10)]].sum(axis=1)
df['Week_Stat0']
"""Feature selection"""
df1 = df[['Week8_Total','Week_Stat0','Week_Stat1','Week_Stat2','Week_Stat3','Grade']]
corela = df1.corr(method='pearson')
round(corela, 2)
sns.heatmap(corela);
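# A small sketch (not in the original notebook) of picking features by their
# absolute Pearson correlation with Grade; the 0.3 cutoff is an illustrative
# assumption, not a tuned threshold.
grade_corr = corela['Grade'].drop('Grade').abs()
print(grade_corr[grade_corr > 0.3].index.tolist())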
# Keep the features retained after the correlation check; .copy() avoids
# SettingWithCopyWarning when columns are added below
df1 = df1[['Week8_Total','Week_Stat0','Week_Stat1','Week_Stat3','Grade']].copy()
#df1 = df[['Week_Stat0','Week_Stat3','Grade']]
#print(df1.head(5))
# Randomly flag roughly 75% of the rows for training
df1['is_train'] = np.random.uniform(0, 1, len(df1)) <= .75
print(df1.head(5))
train, test = df1[df1['is_train']==True].copy(), df1[df1['is_train']==False].copy()
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))
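# Alternative (sketch): train_test_split is already imported and yields a
# reproducible 75/25 split in one call; uncommenting it would replace the
# random-flag split above.
# train, test = train_test_split(df1, test_size=0.25, random_state=0)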
# The first four columns are the predictor features (the rest are Grade and is_train)
features = df1.columns[:4]
features
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)
# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[features], train['Grade'])
preds = clf.predict(test[features])
# Predicted class probabilities for the first ten test observations
clf.predict_proba(test[features])[0:10]
preds[0:10]
test['Grade'].head(30)
# Confusion matrix as a crosstab of actual vs. predicted grades
eg = pd.crosstab(test['Grade'], preds, rownames=['Actual Grade'], colnames=['Predicted Grade'])
eg
sns.heatmap(eg)
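# Equivalent matrix via sklearn's confusion_matrix (imported above but
# otherwise unused); rows and columns follow the label order in clf.classes_.
print(confusion_matrix(test['Grade'], preds, labels=clf.classes_))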
length= len(preds)
length
preds
pred_clf_df = pd.DataFrame(preds.reshape(length,1))
pred_clf_df.rename(columns={0:'Prediction'}, inplace=True)
pred_clf_df
test[features]
#pred_outcome0 = pd.concat([test[features], pred_clf_df], axis=1)
#pred_outcome0 = pred_outcome0.reindex(test[features].index)
#pred_outcome0.rename(columns = {0:'Week2_Quiz1', 1:'Week4_Quiz2'}, inplace=True)
test['prediction'] = preds.tolist()
#test = test.drop(columns=['Grade'])
test
# Scatter plot of actual vs. predicted grades
test.plot.scatter(x="Grade",y="prediction")
plt.show()
#merging the prediction with original dataset
#pred_comp0 = pd.merge(df,pred_outcome0, on=['Week8_Total','Week_Stat0','Week_Stat3'])
pred_comp0 = pd.merge(df,test, on=['Week8_Total','Week_Stat0','Week_Stat1','Week_Stat3'])
#print top 10 lines of the final predictions
print((pred_comp0).head(10))
print ("\n")
#Save the file to csv
pred_comp0.to_csv('RF_Predictions.csv', sep=',')
accuracy_scorerf=accuracy_score(test['Grade'], preds)
accuracy_scorerf
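# Per-class precision and recall for the random forest (classification_report
# is imported above but otherwise unused); zero_division=0 silences warnings
# for grades that never appear in the predictions.
print(classification_report(test['Grade'], preds, zero_division=0))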
# Pair each feature name with its importance score
list(zip(features, clf.feature_importances_))
# Set the figure size
plt.rcParams["figure.figsize"] = [7.00, 3.50]
plt.rcParams["figure.autolayout"] = True
# Plot bar chart with data points
plt.bar(features, clf.feature_importances_)
# Display the plot
plt.show()
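# Optional sketch: the same importances as a sorted Series, which makes the
# ranking easier to read than the bar chart alone.
importances = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)
print(importances)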
"""Naive Bayes"""
df.head(2)
#Initialize Gaussian Naive Bayes
clf1 = GaussianNB()
clf1.fit(train[features], train['Grade'])
#Predicting for the Test Set
pred_clf1 = clf1.predict(test[features])
# Prediction probability: column 4 is the fifth class in clf1.classes_ order
# (assumes at least five distinct grades; adjust the index for other label sets)
prob_pos_clf1 = clf1.predict_proba(test[features])[:, 4]
pred_clf1
#prob_pos_clf1
# Confusion matrix for Naive Bayes as a crosstab of actual vs. predicted grades
htmpN = pd.crosstab(test['Grade'], pred_clf1, rownames=['Actual Grade'], colnames=['Predicted Grade'])
htmpN
sns.heatmap(htmpN)
pred_clf1
length1= len(pred_clf1)
length1
#Create the prediction file by concatenation of the original data and predictions
#Reshaping needed to perform the concatenation
#pred_clf1_df = pd.DataFrame(pred_clf1.reshape(length1,1))
#Column renaming to indicate the predictions
#pred_clf1_df.rename(columns={0:'Prediction'}, inplace=True)
#pred_clf1_df
test[features]
# Copy so the NB predictions do not overwrite the RF predictions stored in `test`
test1 = test.copy()
test1[features]
# Attach the NB predictions as a new column on the test copy
test1['prediction'] = pred_clf1.tolist()
#pred_outcome = pd.concat([test[features], pred_clf1_df], axis=1)
#pred_outcome = pred_outcome.reindex(test[features].index)
#pred_outcome.rename(columns = {0:'Week2_Quiz1', 1:'Week4_Quiz2'}, inplace=True)
test1
# Scatter plot of actual vs. NB-predicted grades
test1.plot.scatter(x="Grade",y="prediction")
plt.show()
#merging the prediction with original dataset
#pred_comp0 = pd.merge(df,pred_outcome0, on=['Week8_Total','Week_Stat0','Week_Stat3'])
pred_comp00 = pd.merge(df,test1, on=['Week8_Total','Week_Stat0','Week_Stat1','Week_Stat3'])
#print top 10 lines of the final predictions
print((pred_comp00).head(10))
print ("\n")
#Save the file to csv
pred_comp00.to_csv('NB_Predictions.csv', sep=',')
#Model Performance
#setting performance parameters
seed=7
kfold = model_selection.KFold(n_splits=10,shuffle=True, random_state=seed)
#calling the cross validation function
scoring='accuracy'
cv_results = model_selection.cross_val_score(GaussianNB(), train[features], train['Grade'], cv=kfold, scoring=scoring)
#displaying the mean and standard deviation of the prediction
msg = "%s: %f (%f)" % ('NB accuracy', cv_results.mean(), cv_results.std())
#msg1= " %f " % ( cv_results.mean())
print(msg)
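# Sketch: score the random forest with the same 10-fold CV protocol so the two
# models can be compared like-for-like (the bar chart below still uses the
# hold-out RF score computed earlier).
cv_results_rf = model_selection.cross_val_score(
    RandomForestClassifier(n_jobs=2, random_state=0),
    train[features], train['Grade'], cv=kfold, scoring=scoring)
print("%s: %f (%f)" % ('RF accuracy', cv_results_rf.mean(), cv_results_rf.std()))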
# Collect both accuracy scores for a side-by-side comparison. Note: the RF
# value is hold-out accuracy while the NB value is a 10-fold CV mean, so the
# chart is indicative rather than strictly like-for-like.
list2 = [accuracy_scorerf, cv_results.mean()]
listname = ["Random Forest", "Naive Bayes"]
list2
#listname
# Set the figure size
plt.rcParams["figure.figsize"] = [4.00, 5.50]
plt.rcParams["figure.autolayout"] = True
# Plot bar chart with data points
plt.bar(listname, list2)
# Label the y-axis
plt.ylabel("Accuracy Score")
# Display the plot
plt.show()