-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkNN.py
More file actions
171 lines (143 loc) · 5.93 KB
/
kNN.py
File metadata and controls
171 lines (143 loc) · 5.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
'''
Core computation for k-Nearest-Neighbors (kNN) classification. Also defines
the class that represents a single training example and the Euclidean
distance function.
'''
#math is necessary for this class
import math
#we will need to import sys
import sys
class knn:
    '''
    k-Nearest-Neighbors classifier: stores training examples and predicts a
    label for a query point by averaging the labels of its k closest
    training examples.
    '''

    def __init__(self, k=5):
        '''
        Initialize a kNN classifier.

        Parameters:
            k: number of nearest neighbors consulted per prediction.
               A small k yields noisy predictions; a very large k lets
               distant, uncorrelated examples influence the result.
        Result:
            Instance of the kNN class.
        '''
        super().__init__()
        # number of nearest neighbors used by predict()
        self.nn = k

    def loadData(self, X, Y):
        '''
        Load training data into the classifier, replacing any previous data.
        TODO: make data appendable.

        Parameters:
            X: matrix (list of feature vectors) of the training examples
            Y: list of labels, parallel to X
        Result:
            Stores the examples on this instance. Returns None.
        '''
        # wrap each (features, label) pair in an _item so every example
        # carries its own distance function
        self.examples = [_item(X[i], Y[i]) for i in range(len(X))]

    def setK(self, k):
        '''
        Change the number of nearest neighbors used for prediction.
        Note: an odd k helps prevent ties between the two classes.

        Parameters:
            k: the new neighbor count
        Result:
            Updates self.nn. Returns None.
        '''
        self.nn = k

    def predict(self, X, positiveValue=1, negativeValue=-1):
        '''
        Predict the label of a query point from its k nearest training
        examples.

        Parameters:
            X: feature vector to classify
            positiveValue: value returned for a positive guess, usually 1
            negativeValue: value returned for a negative guess, usually 0 or -1
        Result:
            (avg, guess): avg is the mean label of the neighbors actually
            used — a confidence-like score that need not match the label
            values; guess is avg thresholded and mapped onto
            positiveValue/negativeValue.
            NOTE(review): the 0.5 threshold assumes training labels in
            {0, 1}; with -1/+1 labels the natural midpoint would be 0 —
            kept as-is for backward compatibility.
        '''
        # wrap the query point so we can reuse _item.dist()
        toPredict = _item(X, -1)
        # Sort all examples by distance to the query and keep the k closest.
        # (The previous implementation kept neighbors in a dict keyed by
        # distance, which silently dropped any example whose distance tied
        # an existing key, and it always divided by self.nn even when fewer
        # than k training examples existed.)
        nearest = sorted(self.examples, key=toPredict.dist)[:self.nn]
        # mean label of the neighbors actually found; 0.0 on an empty
        # training set, matching the original's 0/self.nn behavior
        avg = sum(ex.label for ex in nearest) / len(nearest) if nearest else 0.0
        guess = negativeValue if avg < .5 else positiveValue
        return avg, guess
def eucdist(X1, X2):
    '''
    Euclidean distance between two points:
    sqrt((x1_0 - x2_0)^2 + ... + (x1_n - x2_n)^2), where n is the number of
    parameters/dimensions. The two sequences are assumed to have the same
    length.

    Parameters:
        X1: coordinates of the first point, as a sequence of numbers
        X2: coordinates of the second point, as a sequence of numbers
    Result:
        The Euclidean distance between the two points.
    '''
    # pair coordinates with zip instead of indexing by range(len(...)),
    # sum the squared per-dimension differences, then take the square root
    return math.sqrt(sum((b - a) ** 2 for a, b in zip(X1, X2)))
class _item:
    '''
    One dataset entry: a feature vector, its label, and the distance
    function used to compare it with other items.
    '''

    def __init__(self, X, Y=-1, distFunc=eucdist):
        '''
        Build an item from a single example's features.

        A label should be supplied for true training examples; query points
        may leave it at the default. A custom distance function may be
        supplied; Euclidean distance is used otherwise.

        Parameters:
            X: parameters (feature vector) of the example
            Y: label of the example, default -1
            distFunc: callable(X1, X2) -> distance, default eucdist
        Result:
            Instance of the item class.
        '''
        super().__init__()
        # store the data first, then the comparison function alongside it
        self.X = X
        self.label = Y
        self.distFunc = distFunc

    def dist(self, other):
        '''
        Distance from this item to another, computed with this instance's
        distance function.

        Parameters:
            other: the other item to compare against
        Result:
            The distance between the two items' feature vectors.
        '''
        return self.distFunc(self.X, other.X)