-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTreeMaker.py
More file actions
129 lines (112 loc) · 5.15 KB
/
TreeMaker.py
File metadata and controls
129 lines (112 loc) · 5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from collections import defaultdict
from treelib import Node, Tree
from math import log2
class NodeData:
def __init__(self, entropy, decision_value=None):
self.entropy = entropy
self.decision_value = decision_value
self.majority_class = None
def isMajority(self, classification):
return classification == self.majority_class
def makeTree(set, dims, starting_entropy):
tree = Tree()
root_node = tree.create_node('Root', 'root', data=NodeData(starting_entropy))
makeBranch(set, dims, tree, root_node)
return tree
def printTree(tree):
root_node = tree.get_node(tree.root)
for node in tree.children(tree.root):
print(_printTree(tree, 0, node))
def _printTree(tree, current_level, current_node):
parent_node = tree.parent(current_node.identifier)
string_to_build = '%s%s=%s:' % ('|' * current_level, parent_node.tag, current_node.data.decision_value)
if current_node.is_leaf():
string_to_build += ' %s' % (current_node.data.majority_class)
else:
for node in tree.children(current_node.identifier):
string_to_build += '\n%s' % (_printTree(tree, current_level + 1, node))
return string_to_build
def makeBranch(set, dims, tree, parent_node):
#base cases: out of dimensions OR labels are pure
if checkIfPure(set, dims) or len(dims) == 1: #is 1 instead of 0 because "Class" will be in there
parent_node.data.majority_class = calcMajorityClass(set, dims)
return True
status = True
chosen_dim, info_gain, value_label_counts, value_entropies = chooseDecisionDim(set, dims, parent_node.data.entropy)
#mutate original headers (dimensions) data structure as we will be passing references to children
dims.remove(chosen_dim)
#set the decision made in tag on parent node
parent_node.tag = chosen_dim
#add children
for value, label_counts in value_label_counts.items():
subset = set.copy()
#remove rows from subset which don't match current value for newest decision
subset.drop(subset[subset[chosen_dim] != value].index, inplace=True)
value_entropy = value_entropies[value]
identifier = ''.join([parent_node.identifier, '^', chosen_dim, '=', str(value)])
print('recursing to node %s with an entropy of %f' % (identifier, value_entropy))
new_node = tree.create_node(None,
identifier,
parent=parent_node.identifier,
data=NodeData(entropy=value_entropy, decision_value=value)
)
#RECURSE
status = status and makeBranch(subset, dims, tree, new_node)
return status
def calcMajorityClass(set, dims):
classifications = defaultdict(lambda: 0)
majority_class = None
max_instances = -1
for classification in set[dims[-1]]:
classifications[classification] += 1
if classifications[classification] > max_instances:
max_instances = classifications[classification]
majority_class = classification
return majority_class
def checkIfPure(set, dims):
return len(set[dims[-1]].unique()) < 2
def chooseDecisionDim(set, dims, previous_entropy):
max_info_gain = -1
max_info_gain_dim = None
max_info_value_label_counts = None
max_info_value_entropies = None
for dim in dims[0:-1]:
value_label_counts = getValueLabelCounts(set, dim)
info_gain, value_entropies = calcInfoGain(set, dim, previous_entropy, value_label_counts)
if info_gain > max_info_gain:
max_info_gain = info_gain
max_info_gain_dim = dim
max_info_value_label_counts = value_label_counts
max_info_value_entropies = value_entropies
print('max info gain is from dimension: "%s" and is: "%f"' % (max_info_gain_dim, max_info_gain))
return max_info_gain_dim, max_info_gain, max_info_value_label_counts, max_info_value_entropies
def calcInfoGain(set, dims, previous_entropy, value_label_counts):
dim_entropy = 0
value_entropies = {}
num_rows_total = set.shape[0]
for value, label_counts in value_label_counts.items():
num_rows_for_value = calcNumInstances(label_counts)
weight_factor = num_rows_for_value / num_rows_total
value_entropies[value] = calcEntropy(label_counts, num_rows_for_value)
dim_entropy += value_entropies[value] * weight_factor
return previous_entropy - dim_entropy, value_entropies
def calcNumInstances(label_counts):
total_instances = 0
for label, count in label_counts.items():
total_instances += count
return total_instances
def calcEntropy(label_count, total_instances):
entropy = 0
for count in label_count.values():
entropy += -1 * count / total_instances * log2(count / total_instances)
return entropy
def getValueLabelCounts(set, header):
value_label_counts = defaultdict(lambda: defaultdict(lambda: 0))
#get count of instances for each label
for row in set.itertuples():
value = getattr(row, header)
label = getLabel(row)
value_label_counts[value][label] += 1
return value_label_counts
def getLabel(row):
return row[-1]