-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path04_NaiveBayes_Classifier.py
More file actions
82 lines (65 loc) · 2.24 KB
/
04_NaiveBayes_Classifier.py
File metadata and controls
82 lines (65 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import sys
import math
import pandas as pd
import numpy as np
# Command-line parameters: the first argument is the path of the tab-separated
# input file, the second is the path the results are written to.
input_file, output_file = sys.argv[1], sys.argv[2]
def calculateProbability(x, mean, variance):
    """Return the Gaussian (normal) density at ``x``.

    Evaluates ``1 / sqrt(2 * pi * variance) * exp(-(x - mean)**2 / (2 * variance))``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        variance: Variance of the distribution (not the standard deviation).

    Returns:
        The density value as a float.
    """
    squared_deviation = math.pow(x - mean, 2)
    # The exponential term is evaluated before the normalising constant.
    exp_term = math.exp(-(squared_deviation / (2 * variance)))
    normalizer = 1 / (math.sqrt(2 * math.pi * variance))
    return normalizer * exp_term
# Gaussian Naive Bayes over a tab-separated file with a class label column
# ('A' or 'B') followed by two numeric feature columns.
#
# The output file contains three lines:
#   1. class A: X1 mean, X1 variance, X2 mean, X2 variance, prior (tab-separated)
#   2. class B: the same five values
#   3. the number of rows of the input that the fitted model misclassifies

# Read the file into a dataframe; the file has no header row.
dfdata = pd.read_csv(input_file, sep='\t', header=None)

# Example.tsv is read as 4 columns, so the extra fourth column is dropped
# when it is present.
if dfdata.shape[1] == 4:
    dfdata.columns = ['Y', 'X1', 'X2', 'X3']
    df = dfdata.drop(columns=['X3'])
elif dfdata.shape[1] == 3:
    dfdata.columns = ['Y', 'X1', 'X2']
    df = dfdata
else:
    # Without this branch `df` stays undefined and the script fails further
    # down with an unrelated NameError.
    raise ValueError(
        "expected 3 or 4 tab-separated columns, got " + str(dfdata.shape[1])
    )


def _class_parameters(label):
    """Return (X1 mean, X1 var, X2 mean, X2 var, prior) for rows labelled ``label``.

    The prior is the fraction of all rows that carry the label.
    """
    rows = df.loc[df['Y'] == label]
    return (
        rows['X1'].mean(),
        rows['X1'].var(),
        rows['X2'].mean(),
        rows['X2'].var(),
        rows.shape[0] / df.shape[0],
    )


# Fit the per-class parameters once and reuse them for both the report
# lines and the classification below.
ax1mean, ax1var, ax2mean, ax2var, aprob = _class_parameters('A')
bx1mean, bx1var, bx2mean, bx2var, bprob = _class_parameters('B')

# The first two lines of the output.
str1 = '\t'.join(str(v) for v in (ax1mean, ax1var, ax2mean, ax2var, aprob))
str2 = '\t'.join(str(v) for v in (bx1mean, bx1var, bx2mean, bx2var, bprob))

# Expected class as a number: A -> 1, B -> 0.
df['E'] = df['Y'].map({'A': 1, 'B': 0})

# Per-feature likelihood of every row under each class.
ax1 = [calculateProbability(v, ax1mean, ax1var) for v in df['X1']]
bx1 = [calculateProbability(v, bx1mean, bx1var) for v in df['X1']]
ax2 = [calculateProbability(v, ax2mean, ax2var) for v in df['X2']]
bx2 = [calculateProbability(v, bx2mean, bx2var) for v in df['X2']]

# Unnormalised posterior = prior * product of the feature likelihoods.
# The prior must be included, otherwise the comparison below ignores class
# imbalance and is not the Naive Bayes decision rule.
a_prob = np.multiply(ax1, ax2) * aprob
b_prob = np.multiply(bx1, bx2) * bprob
df['A'] = a_prob
df['B'] = b_prob

# Predicted class: 1 (A) when A's score is strictly larger, otherwise 0 (B).
df['O'] = np.where((df['A'] > df['B']), 1, 0)

# Any non-zero difference between expected and predicted is a misclassification.
df['miscls'] = df['E'] - df['O']
misclassification = str(df[df['miscls'] != 0].shape[0])

with open(output_file, 'w') as outputfile:
    outputfile.write(str1 + '\n' + str2 + '\n' + misclassification)