-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
140 lines (108 loc) · 6.19 KB
/
main.py
File metadata and controls
140 lines (108 loc) · 6.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import numpy as np
from src.entropy import information_gain
from src.eval import get_confusion_matrix, get_metrics, store_trees
from src.node import BinaryTreeNode
from src.pruning import prune_tree
def find_split(dataset: np.ndarray) -> tuple[np.ndarray, np.ndarray, tuple[float, float]]:
    """Find the attribute/value split of `dataset` with the highest information gain.

    Assumes the last column of `dataset` is the class (room) label; every
    other column is a candidate split attribute.

    Returns:
        (left_best, right_best, best_split) where best_split is
        (attribute_index, split_value). If no valid split exists (every row
        identical on every attribute), returns two empty arrays and (-1, -1).
    """
    best_split = (-1, -1)
    maximum = -1  # information gain is non-negative, so any real split beats this
    left_best = np.empty((0, 0))   # np.empty, not np.ndarray(...): same empty shape, clearer intent
    right_best = np.empty((0, 0))
    # np.shape(dataset)[1] - 1 is the number of attributes (last column is the label)
    for attribute in range(dataset[0].size - 1):
        ordered = dataset[np.argsort(dataset[:, attribute])]  # sort rows by this attribute
        for i in range(1, np.shape(ordered)[0]):  # consider a split before each row
            # Only split between two *different* attribute values, so equal
            # measurements (emitter strengths) never land on opposite sides.
            if ordered[i, attribute] != ordered[i - 1, attribute]:
                split = (attribute, ordered[i, attribute])  # (attribute index, threshold value)
                left_dataset, right_dataset = ordered[:i], ordered[i:]  # partition at row i
                gain = information_gain(ordered, left_dataset, right_dataset)
                if gain > maximum:  # keep the split with the highest gain seen so far
                    maximum = gain
                    best_split = split
                    left_best = left_dataset
                    right_best = right_dataset
    return left_best, right_best, best_split
# Checks if the dataset is all relating to the same room
def all_same_room(dataset: np.ndarray) -> bool:
    """Return True when every sample in `dataset` carries the same room label.

    The room label is read from column 7 (the last column of the 8-column
    wifi dataset, matching the leaf labels used by decision_tree_learning).
    """
    # Vectorised comparison replaces the original Python-level row loop;
    # bool(...) pins the declared return type (np.all yields a NumPy scalar).
    return bool(np.all(dataset[:, 7] == dataset[0, 7]))
def decision_tree_learning(training_dataset: np.ndarray, depth: int) -> tuple[BinaryTreeNode, int]:
    """Recursively build a decision tree from `training_dataset`.

    Args:
        training_dataset: rows of samples; column 7 holds the room label.
        depth: depth of the current node (caller passes its own depth + 1).

    Returns:
        (node, max_depth) — the subtree root and the deepest level reached.
    """
    if all_same_room(training_dataset):
        # Pure subset: return a leaf node holding the (common) room label.
        return BinaryTreeNode(training_dataset[0][7]), depth
    l_dataset, r_dataset, split_value = find_split(training_dataset)
    l_branch, l_depth = decision_tree_learning(l_dataset, depth + 1)
    r_branch, r_depth = decision_tree_learning(r_dataset, depth + 1)
    node = BinaryTreeNode(split_value)  # internal node storing (attribute, threshold)
    node.leftChild = l_branch
    node.rightChild = r_branch
    # max() instead of np.maximum: depths are plain ints, so keep the
    # declared int return type rather than producing a NumPy scalar.
    return node, max(l_depth, r_depth)
def k_folds(dataset: np.ndarray, k: int, pruning: bool, limit: int):
    """Run k-fold cross-validation over `dataset`.

    Without pruning each fold uses a 90/10 train/test split; with pruning an
    inner loop over the remaining folds gives an 80/10/10
    train/validation/test split and each trained tree is pruned against the
    validation set (bounded by `limit`, -1 for no depth limit).

    Returns:
        (matrices, trees, depths) — confusion matrices, trained trees and
        their depths, one entry per evaluated tree.
    """
    segment = dataset.shape[0] // k  # fold size; assumes the dataset divides evenly by k
    matrices = []  # Store confusion matrices.
    trees = []
    depths = []
    for i in range(k):
        depth = 1  # Reset outputs each time.
        testing = dataset[i*segment:(i+1)*segment]  # fold i is the held-out test set
        if pruning:  # 80/10/10 split: train / validation / test
            # Everything except the test fold; invariant in j, so hoisted
            # out of the inner loop (was recomputed every iteration).
            remainder = np.delete(dataset, range(i*segment, (i+1)*segment), 0)
            for j in range(k-1):  # each remaining fold takes a turn as validation
                depth = 1
                validation = remainder[j*segment:(j+1)*segment]  # the validation set
                training = np.delete(remainder, range(j*segment, (j+1)*segment), 0)  # the training set
                binary_tree, depth = decision_tree_learning(training, depth)  # train the tree
                pruned_tree = prune_tree(binary_tree, validation, limit)  # prune against validation
                trees.append(pruned_tree)  # store the trained trees
                depths.append(pruned_tree.depth())
                matrices.append(get_confusion_matrix(pruned_tree, testing))
        else:  # 90/10 split: train / test
            training = np.delete(dataset, range(i*segment, (i+1)*segment), 0)
            binary_tree, depth = decision_tree_learning(training, depth)
            trees.append(binary_tree)
            matrices.append(get_confusion_matrix(binary_tree, testing))
            depths.append(depth)
    return matrices, trees, depths
def process_data(title: str, dataset: np.ndarray, location: str, pruning: bool, folds: int, limit: int):
    """Cross-validate `dataset`, print its metrics, and store the trained trees.

    `title` labels the printed report, `location` is the directory the trees
    are written to, and `limit` of -1 means no depth limit.
    """
    mode = "Pruned" if pruning else "Not Pruned"
    header = f"Metrics for {title} database ({mode}):"
    if limit != -1:
        header += f" [Depth limit = {limit}]"
    print(header)
    print("//////////////////////////////")
    confusion_matrices, trees, depths = k_folds(dataset, folds, pruning, limit)
    best, worst = get_metrics(confusion_matrices, depths)
    print("//////////////////////////////")
    store_trees(trees, best, worst, location)
def main():
    """Run every clean/noisy × pruned/non-pruned experiment configuration."""
    shuffled_clean = np.loadtxt("./wifi_db/shuffled_clean_dataset.txt")
    shuffled_noisy = np.loadtxt("./wifi_db/shuffled_noisy_dataset.txt")
    k = 10
    clean_limit = 10
    noisy_limit = 6
    no_limit = -1
    # One row per experiment: (title, dataset, output location, pruning?, depth limit).
    experiments = [
        ("CLEAN", shuffled_clean, "./TREES/CLEAN/NON-PRUNED", False, no_limit),    # 90/10 split
        ("CLEAN", shuffled_clean, "./TREES/CLEAN/PRUNED", True, no_limit),         # 80/10/10 split
        ("CLEAN", shuffled_clean, "./TREES/CLEAN/PRUNED-LIM", True, clean_limit),  # 80/10/10, limit 10
        ("NOISY", shuffled_noisy, "./TREES/NOISY/NON-PRUNED", False, no_limit),    # 90/10 split
        ("NOISY", shuffled_noisy, "./TREES/NOISY/PRUNED", True, no_limit),         # 80/10/10 split
        ("NOISY", shuffled_noisy, "./TREES/NOISY/PRUNED-LIM", True, noisy_limit),  # 80/10/10, limit 6
    ]
    for title, data, location, pruning, limit in experiments:
        process_data(title, data, location, pruning, k, limit)
    return 0
if __name__ == "__main__":
    main()