-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
140 lines (108 loc) · 6.19 KB
/
main.py
File metadata and controls
140 lines (108 loc) · 6.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import numpy as np
from src.entropy import information_gain
from src.eval import get_confusion_matrix, get_metrics, store_trees
from src.node import BinaryTreeNode
from src.pruning import prune_tree
def find_split(dataset: np.ndarray) -> tuple[np.ndarray, np.ndarray, tuple[float, float]]:
    """Find the attribute/value split of `dataset` with the highest information gain.

    Assumes the last column of `dataset` is the class (room) label; every
    other column is a candidate split attribute.

    Returns:
        (left_best, right_best, best_split) where best_split is
        (attribute_index, split_value). If no valid split exists (every row
        identical on every attribute), returns two empty arrays and (-1, -1).
    """
    best_split = (-1, -1)
    maximum = -1  # information gain is non-negative, so any real split beats this
    left_best = np.empty((0, 0))   # np.empty, not np.ndarray(...): same empty shape, clearer intent
    right_best = np.empty((0, 0))
    # np.shape(dataset)[1] - 1 is the number of attributes (last column is the label)
    for attribute in range(dataset[0].size - 1):
        ordered = dataset[np.argsort(dataset[:, attribute])]  # sort rows by this attribute
        for i in range(1, np.shape(ordered)[0]):  # consider a split before each row
            # Only split between two *different* attribute values, so equal
            # measurements (emitter strengths) never land on opposite sides.
            if ordered[i, attribute] != ordered[i - 1, attribute]:
                split = (attribute, ordered[i, attribute])  # (attribute index, threshold value)
                left_dataset, right_dataset = ordered[:i], ordered[i:]  # partition at row i
                gain = information_gain(ordered, left_dataset, right_dataset)
                if gain > maximum:  # keep the split with the highest gain seen so far
                    maximum = gain
                    best_split = split
                    left_best = left_dataset
                    right_best = right_dataset
    return left_best, right_best, best_split
# Checks if the dataset is all relating to the same room
def all_same_room(dataset: np.ndarray) -> bool:
    """Return True when every sample in `dataset` carries the same room label.

    The room label is read from column 7 (the last column of the 8-column
    wifi dataset, matching the leaf labels used by decision_tree_learning).
    """
    # Vectorised comparison replaces the original Python-level row loop;
    # bool(...) pins the declared return type (np.all yields a NumPy scalar).
    return bool(np.all(dataset[:, 7] == dataset[0, 7]))
def decision_tree_learning(training_dataset: np.ndarray, depth: int) -> tuple[BinaryTreeNode, int]:
    """Recursively build a decision tree from `training_dataset`.

    Args:
        training_dataset: rows of samples; column 7 holds the room label.
        depth: depth of the current node (caller passes its own depth + 1).

    Returns:
        (node, max_depth) — the subtree root and the deepest level reached.
    """
    if all_same_room(training_dataset):
        # Pure subset: return a leaf node holding the (common) room label.
        return BinaryTreeNode(training_dataset[0][7]), depth
    l_dataset, r_dataset, split_value = find_split(training_dataset)
    l_branch, l_depth = decision_tree_learning(l_dataset, depth + 1)
    r_branch, r_depth = decision_tree_learning(r_dataset, depth + 1)
    node = BinaryTreeNode(split_value)  # internal node storing (attribute, threshold)
    node.leftChild = l_branch
    node.rightChild = r_branch
    # max() instead of np.maximum: depths are plain ints, so keep the
    # declared int return type rather than producing a NumPy scalar.
    return node, max(l_depth, r_depth)
def k_folds(dataset: np.ndarray, k: int, pruning: bool, limit: int):
    """Run k-fold cross-validation over `dataset`.

    Without pruning each fold uses a 90/10 train/test split; with pruning an
    inner loop over the remaining folds gives an 80/10/10
    train/validation/test split and each trained tree is pruned against the
    validation set (bounded by `limit`, -1 for no depth limit).

    Returns:
        (matrices, trees, depths) — confusion matrices, trained trees and
        their depths, one entry per evaluated tree.
    """
    segment = dataset.shape[0] // k  # fold size; assumes the dataset divides evenly by k
    matrices = []  # Store confusion matrices.
    trees = []
    depths = []
    for i in range(k):
        depth = 1  # Reset outputs each time.
        testing = dataset[i*segment:(i+1)*segment]  # fold i is the held-out test set
        if pruning:  # 80/10/10 split: train / validation / test
            # Everything except the test fold; invariant in j, so hoisted
            # out of the inner loop (was recomputed every iteration).
            remainder = np.delete(dataset, range(i*segment, (i+1)*segment), 0)
            for j in range(k-1):  # each remaining fold takes a turn as validation
                depth = 1
                validation = remainder[j*segment:(j+1)*segment]  # the validation set
                training = np.delete(remainder, range(j*segment, (j+1)*segment), 0)  # the training set
                binary_tree, depth = decision_tree_learning(training, depth)  # train the tree
                pruned_tree = prune_tree(binary_tree, validation, limit)  # prune against validation
                trees.append(pruned_tree)  # store the trained trees
                depths.append(pruned_tree.depth())
                matrices.append(get_confusion_matrix(pruned_tree, testing))
        else:  # 90/10 split: train / test
            training = np.delete(dataset, range(i*segment, (i+1)*segment), 0)
            binary_tree, depth = decision_tree_learning(training, depth)
            trees.append(binary_tree)
            matrices.append(get_confusion_matrix(binary_tree, testing))
            depths.append(depth)
    return matrices, trees, depths
def process_data(title: str, dataset: np.ndarray, location: str, pruning: bool, folds: int, limit: int):
    """Cross-validate `dataset`, print its metrics, and store the trained trees.

    `title` labels the printed report, `location` is the directory the trees
    are written to, and `limit` of -1 means no depth limit.
    """
    mode = "Pruned" if pruning else "Not Pruned"
    header = f"Metrics for {title} database ({mode}):"
    if limit != -1:
        header += f" [Depth limit = {limit}]"
    print(header)
    print("//////////////////////////////")
    confusion_matrices, trees, depths = k_folds(dataset, folds, pruning, limit)
    best, worst = get_metrics(confusion_matrices, depths)
    print("//////////////////////////////")
    store_trees(trees, best, worst, location)
def main():
    """Run every clean/noisy × pruned/non-pruned experiment configuration."""
    shuffled_clean = np.loadtxt("./wifi_db/shuffled_clean_dataset.txt")
    shuffled_noisy = np.loadtxt("./wifi_db/shuffled_noisy_dataset.txt")
    k = 10
    clean_limit = 10
    noisy_limit = 6
    no_limit = -1
    # One row per experiment: (title, dataset, output location, pruning?, depth limit).
    experiments = [
        ("CLEAN", shuffled_clean, "./TREES/CLEAN/NON-PRUNED", False, no_limit),    # 90/10 split
        ("CLEAN", shuffled_clean, "./TREES/CLEAN/PRUNED", True, no_limit),         # 80/10/10 split
        ("CLEAN", shuffled_clean, "./TREES/CLEAN/PRUNED-LIM", True, clean_limit),  # 80/10/10, limit 10
        ("NOISY", shuffled_noisy, "./TREES/NOISY/NON-PRUNED", False, no_limit),    # 90/10 split
        ("NOISY", shuffled_noisy, "./TREES/NOISY/PRUNED", True, no_limit),         # 80/10/10 split
        ("NOISY", shuffled_noisy, "./TREES/NOISY/PRUNED-LIM", True, noisy_limit),  # 80/10/10, limit 6
    ]
    for title, data, location, pruning, limit in experiments:
        process_data(title, data, location, pruning, k, limit)
    return 0
if __name__ == "__main__":
    main()