-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcustomclassifier.py
More file actions
74 lines (57 loc) · 2.33 KB
/
customclassifier.py
File metadata and controls
74 lines (57 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# This script has a custom written classifier
# First we will import sklearn datasets
# Then we will split the data into test and train
# We will use the training data to train (fit) the classifier
# And use the test data to check how accurate the classifier's predictions are
#
import random
# Implementing a custom classifier based on k-NN (k nearest neighbours).
# K is the number of neighbours we consider: for example, if K is 3 we take the 3 training
# points closest to the test point, and the label held by the majority of them (at least 2 of the 3)
# is predicted as the label for the test point.
# First we need to find the nearest neighbour
# We use the straight-line distance formula, called the Euclidean distance
# Two-dimensional space
# d(p, q) = sqrt((q1 - p1)^2 + (q2 - p2)^2)
#
# Three-dimensional space
# d(p, q) = sqrt((q1 - p1)^2 + (q2 - p2)^2 + (q3 - p3)^2)
#
# As the number of features increases we just add more terms to the equation; for n dimensions:
# d(p, q) = sqrt((q1 - p1)^2 + (q2 - p2)^2 + ... + (qn - pn)^2)
# Import a library called scipy
from scipy.spatial import distance
# Define a distance method
def euc(a, b):
    """Return the Euclidean (straight-line) distance between points a and b.

    Thin wrapper around scipy.spatial.distance.euclidean. Both arguments are
    sequences of coordinates of the same length. The distance is symmetric,
    so argument order does not affect the result; ScrappyKNN.closest passes
    the row being classified as ``a`` and a training point as ``b``.
    """
    straight_line = distance.euclidean(a, b)
    return straight_line
# Define classifier class ScrappyKNN
class ScrappyKNN(object):
    """A bare-bones k-nearest-neighbours classifier.

    The classifier memorises the training data in fit() and, for every row
    passed to predict(), returns the label that wins a majority vote among
    the ``k`` training points closest to that row (Euclidean distance, as
    computed by euc()).

    With the default ``k=1`` the label of the single nearest training point
    is returned, so ``ScrappyKNN()`` behaves as a plain nearest-neighbour
    classifier.
    """

    def __init__(self, k=1):
        """Create a classifier that votes among the k nearest neighbours.

        Raises ValueError if k is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training rows and their labels; no computation is done.

        X_train[i] is the feature row whose label is y_train[i].
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return a list holding one predicted label per row of X_test."""
        return [self.closest(row) for row in X_test]

    def closest(self, row):
        """Return the predicted label for a single feature row.

        The distance from ``row`` to every training point is measured with
        euc(). For k == 1 the label of the nearest point is returned; for
        larger k the most common label among the k nearest points wins.
        If k exceeds the number of training points, all of them vote.

        Raises ValueError if the classifier was fitted with no training rows.
        """
        # len() rather than truthiness: a numpy array with more than one
        # element cannot be used directly in a boolean context.
        if len(self.X_train) == 0:
            raise ValueError("cannot predict: there is no training data")

        distances = [euc(row, point) for point in self.X_train]
        indices = range(len(distances))

        if self.k == 1:
            # min() keeps the first of several equally small distances,
            # i.e. the training point with the lowest index wins a tie.
            return self.y_train[min(indices, key=distances.__getitem__)]

        # sorted() is stable, so equally distant points keep training order.
        nearest = sorted(indices, key=distances.__getitem__)[:self.k]

        votes = {}
        for index in nearest:
            label = self.y_train[index]
            votes[label] = votes.get(label, 0) + 1

        # Walk the neighbours nearest-first so that a tie in the vote is
        # resolved in favour of the label of the closer neighbour.
        winner = self.y_train[nearest[0]]
        for index in nearest:
            label = self.y_train[index]
            if votes[label] > votes[winner]:
                winner = label
        return winner
# Load the iris data set bundled with scikit-learn:
# X receives iris.data (the feature rows) and y receives iris.target (the labels)
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target

# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated and later removed from
# scikit-learn, so it is only used as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Keep half of the rows for training and hold the other half back for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Train the custom classifier and predict a label for every held-back row
my_classifier = ScrappyKNN()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)

# Report the fraction of held-back rows whose label was predicted correctly.
# print(...) with one argument works as a statement on Python 2 and as a
# function call on Python 3.
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))