From 1e9f6b40145da8dc1f264f91c7b5386eaa02dfda Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 17 Jan 2025 08:30:29 +0100 Subject: [PATCH 01/33] Testpypi install in makefile --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 404559e8..e61ab036 100644 --- a/Makefile +++ b/Makefile @@ -13,3 +13,6 @@ clean: lint: cython-lint src/* --max-line-length=127 + +test_pypi: + pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple adaXT From 086d5cc71dba2892b4d887d974c418cc636027c4 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Sat, 8 Feb 2025 21:12:41 +0100 Subject: [PATCH 02/33] Small documentation updates --- docs/user_guide/creatingCriteria.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/creatingCriteria.md b/docs/user_guide/creatingCriteria.md index 66bf7f6d..58bf69f4 100644 --- a/docs/user_guide/creatingCriteria.md +++ b/docs/user_guide/creatingCriteria.md @@ -47,11 +47,15 @@ should be computed. To access the feature and response you can make use of `self.y[indices]` are the feature and response samples for which the impurity needs to be computed. With this in place you should be able to implement almost any criteria function you can imagine. Keep in mind that the `impurity` method -is extremely often (approximately $n\log(n)$ times). Therefore you should invest -a bit of time in optimizing the function in order to avoid long fitting times. +is used often (approximately $n\log(n)$ times). Therefore you should invest a +bit of time in optimizing the function in order to avoid long fitting times. Further computational speed-ups can be achieved by implementing `proxy_improvement` and `update_proxy` methods in the criteria class. If these are not explicitly defined the code defaults to using the `impurity` method. 
+Although we do not provide in depth examples of those functionalities here, feel +free to look at +[criteria.pyx](https://github.com/NiklasPfister/adaXT/blob/main/src/adaXT/criteria/criteria.pyx) +where the default criteria make use of both. Once you have finished defining your critera class and saved the .pyx file, you can compile the Cython code and use it as part of adaXT. From eef958ec78bb8548e38291b6885a83a5c0c5f7fc Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Sun, 9 Feb 2025 16:31:05 +0100 Subject: [PATCH 03/33] Move from def nodes to cdef nodes With this able to gain 90% increase in prediction speed. --- setup.py | 18 +++++++++++++----- src/adaXT/decision_tree/nodes.pxd | 30 ++++++++++++++++++++++++++++++ src/adaXT/decision_tree/nodes.pyx | 11 +++++++---- src/adaXT/predictor/predictor.pyx | 28 ++++++++++++++++------------ 4 files changed, 66 insertions(+), 21 deletions(-) create mode 100644 src/adaXT/decision_tree/nodes.pxd diff --git a/setup.py b/setup.py index b82b0959..c55b3965 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ DEBUG = False PROFILE = False +ANNOTATE = True # Make all pyx files for the decision_tree ext = ".pyx" if USE_CYTHON else ".cpp" @@ -109,18 +110,25 @@ def run_build(): } ) + if PROFILE: compiler_directives["profile"] = True compiler_directives["linetrace"] = True compiler_directives["binding"] = True + arg_dir = { + "gdb_debug": False, + "language_level": "3", + "compiler_directives": compiler_directives, + "verbose": True, + } + + if ANNOTATE: + arg_dir["annotate"] = True + extensions = cythonize( extensions, - gdb_debug=False, - annotate=True, - language_level="3", - compiler_directives=compiler_directives, - verbose=True, + **arg_dir ) setup( name=NAME, diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd new file mode 100644 index 00000000..f2e8be88 --- /dev/null +++ b/src/adaXT/decision_tree/nodes.pxd @@ -0,0 +1,30 @@ +cdef class Node: + cdef public: + int[:] indices + int 
depth + double impurity + object parent + bint visited + bint is_leaf + + +cdef class DecisionNode(Node): + cdef public: + double threshold + int split_idx + object left_child + object right_child + +cdef class LeafNode(Node): + cdef public: + double weighted_samples + int id + double[:] value + +cdef class LocalPolynomialLeafNode(LeafNode): + cdef public: + double theta0 + double theta1 + double theta2 + + diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index a064e688..93bdae59 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ b/src/adaXT/decision_tree/nodes.pyx @@ -1,19 +1,20 @@ import numpy as np -class Node: +cdef class Node: def __init__( self, indices: np.ndarray, depth: int, impurity: float) -> None: + self.is_leaf = 0 self.indices = np.asarray(indices) self.depth = depth self.impurity = impurity self.visited = 0 -class DecisionNode(Node): +cdef class DecisionNode(Node): def __init__( self, indices: np.ndarray, @@ -30,9 +31,10 @@ class DecisionNode(Node): self.left_child = left_child self.right_child = right_child self.parent = parent + self.is_leaf = 0 -class LeafNode(Node): +cdef class LeafNode(Node): def __init__( self, id: int, @@ -47,9 +49,10 @@ class LeafNode(Node): self.parent = parent self.id = id self.value = np.asarray(value) + self.is_leaf = 1 -class LocalPolynomialLeafNode(LeafNode): +cdef class LocalPolynomialLeafNode(LeafNode): def __init__( self, id: int, diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 7e9c9e27..01165e1e 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -1,6 +1,7 @@ import numpy as np from numpy import float64 as DOUBLE from ..decision_tree.nodes import DecisionNode +from ..decision_tree.nodes cimport Node, DecisionNode from collections.abc import Sequence from statistics import mode cimport numpy as cnp @@ -135,31 +136,34 @@ cdef class PredictorClassification(Predictor): cur_max = i return cur_max - cdef 
cnp.ndarray __predict(self, cnp.ndarray X): + cdef inline cnp.ndarray __predict(self, cnp.ndarray X): cdef: int i, cur_split_idx, idx, n_obs double cur_threshold - object cur_node - cnp.ndarray[DOUBLE_t, ndim=1] prediction + Node cur_node + DecisionNode dNode + const double[:, ::1] X_ndarray + double[:] prediction # Make sure that x fits the dimensions. n_obs = X.shape[0] + X_ndarray = X prediction = np.empty(n_obs, dtype=DOUBLE) for i in range(n_obs): cur_node = self.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X_ndarray[i, cur_split_idx] <= cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child + cur_node = dNode.right_child idx = self.__find_max_index(cur_node.value) - if self.classes is not None: - prediction[i] = self.classes[idx] - return prediction + prediction[i] = self.classes[idx] + return np.array(prediction) cdef cnp.ndarray __predict_proba(self, cnp.ndarray X): cdef: From 331bbe8aba7ba97b3aeab8db8a1bc8bf1780fa66 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 11 Feb 2025 13:21:00 +0100 Subject: [PATCH 04/33] Work on Node optimisations --- src/adaXT/decision_tree/_decision_tree.pyx | 1 + src/adaXT/decision_tree/nodes.pxd | 12 ----- src/adaXT/decision_tree/nodes.pyx | 28 ++++++++--- src/adaXT/decision_tree/splitter.pxd | 4 +- src/adaXT/predictor/predictor.pxd | 4 +- time_predict_tree.py | 55 ++++++++++++++++++++++ 6 files changed, 81 insertions(+), 23 deletions(-) create mode 100644 time_predict_tree.py diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index 76b3a239..d698ce97 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ 
-72,6 +72,7 @@ cdef class _DecisionTree(): self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_improvement = min_improvement + self.leaf_nodes = None self.predictor_instance = None self.root = None diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index f2e8be88..3b65108b 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -15,16 +15,4 @@ cdef class DecisionNode(Node): object left_child object right_child -cdef class LeafNode(Node): - cdef public: - double weighted_samples - int id - double[:] value - -cdef class LocalPolynomialLeafNode(LeafNode): - cdef public: - double theta0 - double theta1 - double theta2 - diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index 93bdae59..c01c52ce 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ b/src/adaXT/decision_tree/nodes.pyx @@ -2,17 +2,33 @@ import numpy as np cdef class Node: + def __cinit__(self): + self.is_leaf = 0 + self.visited = 0 + def __init__( self, indices: np.ndarray, depth: int, impurity: float) -> None: - self.is_leaf = 0 - self.indices = np.asarray(indices) + self.indices = indices self.depth = depth self.impurity = impurity - self.visited = 0 + def __reduce__(self): + return ( + self.__class__, # Callable object that will be called ot create + # initial state upon pickle + (self.indices, self.depth, self.impurity), # Input to Callable + { + "is_leaf": self.is_leaf, + "visited": self.visited + } # Current state of variables that can not be passed to init + ) + # This function is passed the state provided above + def __setstate__(self, d: dict): + self.is_leaf = d["is_leaf"] + self.visited = d["visited"] cdef class DecisionNode(Node): def __init__( @@ -31,10 +47,8 @@ cdef class DecisionNode(Node): self.left_child = left_child self.right_child = right_child self.parent = parent - self.is_leaf = 0 - -cdef class LeafNode(Node): +class LeafNode(Node): def __init__( 
self, id: int, @@ -52,7 +66,7 @@ cdef class LeafNode(Node): self.is_leaf = 1 -cdef class LocalPolynomialLeafNode(LeafNode): +class LocalPolynomialLeafNode(LeafNode): def __init__( self, id: int, diff --git a/src/adaXT/decision_tree/splitter.pxd b/src/adaXT/decision_tree/splitter.pxd index f6eee451..c99cd109 100644 --- a/src/adaXT/decision_tree/splitter.pxd +++ b/src/adaXT/decision_tree/splitter.pxd @@ -4,8 +4,8 @@ cnp.import_array() cdef class Splitter: cdef: - double[:, ::1] X - double[:, ::1] Y + const double[:, ::1] X + const double[:, ::1] Y int n_features int[:] indices int n_indices diff --git a/src/adaXT/predictor/predictor.pxd b/src/adaXT/predictor/predictor.pxd index 78655bf0..59450a4c 100644 --- a/src/adaXT/predictor/predictor.pxd +++ b/src/adaXT/predictor/predictor.pxd @@ -2,8 +2,8 @@ cimport numpy as cnp cdef class Predictor(): cdef: - double[:, ::1] X - double[:, ::1] Y + const double[:, ::1] X + const double[:, ::1] Y int n_features object root diff --git a/time_predict_tree.py b/time_predict_tree.py new file mode 100644 index 00000000..0232fadc --- /dev/null +++ b/time_predict_tree.py @@ -0,0 +1,55 @@ +import numpy as np +import adaXT.criteria as crit +from adaXT.decision_tree import DecisionTree +from adaXT.random_forest import RandomForest +from sklearn.tree import DecisionTreeClassifier +import time + +from memory_profiler import profile + +low_X = 0 +high_X = 10_000 +N_TRAIN = 100_000 +M = 5 +N_PREDICT = N_TRAIN +NUM_TREES = 1 +NUM_PREDICT = 1 + + +def predict_n_times(tree): + """ + predicts NUM_PREDICT number of times using the tree on randomly generated X + data. + + Returns the mean predict time of NUM_PREDICT randomly generated values. 
+ """ + times = [] + for _ in range(NUM_PREDICT): + X = np.random.uniform(low_X, high_X, (N_PREDICT, M)) + st = time.time() + x = tree.predict(X) + et = time.time() + times.append(et - st) + return np.mean(times) + + +def main(): + X = np.random.uniform(low_X, high_X, (N_TRAIN, M)) + Y = np.random.randint(0, M, N_TRAIN) + + times = [] + for _ in range(NUM_TREES): + tree = RandomForest(forest_type="Classification", criteria=crit.Gini_index) + # tree = DecisionTreeClassifier(criterion="gini") + tree.fit(X, Y) + times.append(predict_n_times(tree)) + + mean_predict_time = np.mean(times) + print( + f"Mean predict times for {NUM_TREES} predicting {NUM_PREDICT} times: ", + mean_predict_time, + ) + + +if __name__ == "__main__": + main() From 52384a7965f84b9f3f23b1288165fd7755e2ef34 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 11 Feb 2025 13:39:34 +0100 Subject: [PATCH 05/33] changes to pickle --- src/adaXT/decision_tree/nodes.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index c01c52ce..c3a2703c 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ b/src/adaXT/decision_tree/nodes.pyx @@ -22,13 +22,15 @@ cdef class Node: (self.indices, self.depth, self.impurity), # Input to Callable { "is_leaf": self.is_leaf, - "visited": self.visited + "visited": self.visited, + "indices": self.indices.base } # Current state of variables that can not be passed to init ) # This function is passed the state provided above def __setstate__(self, d: dict): self.is_leaf = d["is_leaf"] self.visited = d["visited"] + self.indices = d["indices"] cdef class DecisionNode(Node): def __init__( From 76e319c3461e773230885c61966083d36e2e80e6 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Sun, 23 Feb 2025 20:22:51 +0100 Subject: [PATCH 06/33] Work on prediction --- docs/user_guide/creatingPredictor.md | 10 +- setup.py | 8 +- src/adaXT/decision_tree/_decision_tree.pyx | 40 +++- 
src/adaXT/decision_tree/decision_tree.py | 41 ++-- src/adaXT/decision_tree/nodes.pxd | 22 +- src/adaXT/decision_tree/nodes.pyx | 48 ++-- src/adaXT/parallel.py | 36 +-- src/adaXT/predictor/predictor.pxd | 6 +- src/adaXT/predictor/predictor.pyi | 14 +- src/adaXT/predictor/predictor.pyx | 266 ++++++++++++--------- src/adaXT/random_forest/random_forest.py | 77 +++--- time_predict_tree.py | 55 ----- 12 files changed, 296 insertions(+), 327 deletions(-) delete mode 100644 time_predict_tree.py diff --git a/docs/user_guide/creatingPredictor.md b/docs/user_guide/creatingPredictor.md index 0d2491a5..14cfbe84 100644 --- a/docs/user_guide/creatingPredictor.md +++ b/docs/user_guide/creatingPredictor.md @@ -29,7 +29,7 @@ cdef class MyPredictorClass(Predictor): # Define your own custom predict function @staticmethod - def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, cnp.ndarray X_new, + def forest_predict(cnp.ndarray X_train, cnp.ndarray Y_train, cnp.ndarray X_pred, trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: # Define special handling for the RandomForest predict. 
@@ -151,7 +151,7 @@ def predict_quantile( cdef class PredictorQuantile(Predictor): @staticmethod - def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, cnp.ndarray X_new, + def forest_predict(cnp.ndarray X_train, cnp.ndarray Y_train, cnp.ndarray X_pred, trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: cdef: @@ -162,9 +162,9 @@ cdef class PredictorQuantile(Predictor): "quantile called without quantile passed as argument" ) quantile = kwargs['quantile'] - n_obs = X_new.shape[0] + n_obs = X_pred.shape[0] prediction_indices = parallel.async_map(predict_quantile, - map_input=trees, X=X_new, + map_input=trees, X=X_pred, n_obs=n_obs) # In case the leaf nodes have multiple elements and not just one, we # have to combine them together @@ -175,7 +175,7 @@ cdef class PredictorQuantile(Predictor): for j in range(n_trees): indices_combined.extend(prediction_indices[j][i]) pred_indices_combined.append(indices_combined) - ret = np.quantile(Y_old[pred_indices_combined], quantile) + ret = np.quantile(Y_train[pred_indices_combined], quantile) return np.array(ret, dtype=DOUBLE) ``` diff --git a/setup.py b/setup.py index c55b3965..b536292b 100644 --- a/setup.py +++ b/setup.py @@ -110,7 +110,6 @@ def run_build(): } ) - if PROFILE: compiler_directives["profile"] = True compiler_directives["linetrace"] = True @@ -123,13 +122,10 @@ def run_build(): "verbose": True, } - if ANNOTATE: + if ANNOTATE: arg_dir["annotate"] = True - extensions = cythonize( - extensions, - **arg_dir - ) + extensions = cythonize(extensions, **arg_dir) setup( name=NAME, version=VERSION, diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index d698ce97..1b83a0cc 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -1,3 +1,4 @@ +# cython: auto_pickle=False import numpy as np import sys @@ -11,9 +12,13 @@ from libcpp cimport bool from .splitter import Splitter from ..predictor import 
Predictor from ..criteria import Criteria -from .nodes import DecisionNode from ..leaf_builder import LeafBuilder +# for c level definitions + +cimport cython +from .nodes import DecisionNode + from ..utils cimport dsum cdef double EPSILON = np.finfo('double').eps @@ -35,13 +40,13 @@ class refit_object(): def add_idx(self, idx: int) -> None: self.indices.append(idx) - +@cython.auto_pickle(True) cdef class _DecisionTree(): cdef public: object criteria + object splitter object predictor object leaf_builder - object splitter object leaf_nodes, predictor_instance, root long max_depth, min_samples_leaf, max_features long min_samples_split, n_nodes, n_features @@ -79,7 +84,7 @@ cdef class _DecisionTree(): self.n_nodes = -1 self.n_features = -1 - def predict(self, cnp.ndarray[DOUBLE_t, ndim=2] X, **kwargs) -> np.ndarray: + def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: return self.predictor_instance.predict(X, **kwargs) cdef dict __get_leaf(self, bool scale = False): @@ -112,6 +117,13 @@ cdef class _DecisionTree(): size_0, self.n_rows_predict, scaling=scaling) + def _forest_predict_leaf(self, double[:, ::1] X_train, double[:, ::1] + Y_train, double[:, ::1] X_pred, **kwargs): + if X_pred is None: + return self.__get_leaf() + predictor_instance = self.predictor(X_train, Y_train, self.root) + return predictor_instance.predict_leaf(X_pred, **kwargs) + def predict_leaf(self, X: np.ndarray | None = None) -> dict: if X is None: return self.__get_leaf() @@ -219,7 +231,7 @@ cdef class _DecisionTree(): cur_node = cur_node.right_child depth += 1 - leaf_builder = self.leaf_builder(X, Y, all_idx) + leaf_builder_instance = self.leaf_builder(X, Y, all_idx) criteria_instance = self.criteria(X, Y, sample_weight) # Make refit objects into leaf_nodes # Two cases: @@ -227,7 +239,7 @@ cdef class _DecisionTree(): # (2) At least one split (n_objs > 0) if self.root is None: weighted_samples = dsum(sample_weight, all_idx) - self.root = leaf_builder.build_leaf( + self.root = 
leaf_builder_instance.build_leaf( leaf_id=0, indices=all_idx, depth=0, @@ -242,7 +254,7 @@ cdef class _DecisionTree(): obj = refit_objs[i] leaf_indices = np.array(obj.indices, dtype=np.int32) weighted_samples = dsum(sample_weight, leaf_indices) - new_node = leaf_builder.build_leaf( + new_node = leaf_builder_instance.build_leaf( leaf_id=i, indices=leaf_indices, depth=obj.depth, @@ -386,6 +398,7 @@ class DepthTreeBuilder: splitter: Splitter, leaf_builder: LeafBuilder, predictor: Predictor, + ensemble: bool = False, ) -> None: """ Parameters @@ -420,6 +433,8 @@ class DepthTreeBuilder: self.predictor = predictor self.leaf_builder = leaf_builder + self.ensemble = ensemble + def __get_feature_indices(self) -> np.ndarray: if self.max_features == -1: return self.feature_indices @@ -443,9 +458,7 @@ class DepthTreeBuilder: returns 0 on succes """ # initialization - X = self.X - Y = self.Y - _, col = X.shape + _, col = self.X.shape self.max_features = tree.max_features self.feature_indices = np.arange(col, dtype=np.int32) @@ -471,7 +484,7 @@ class DepthTreeBuilder: ], dtype=np.int32) # Update the tree now that we have the correct samples - leaf_builder = self.leaf_builder(X, Y, all_idx) + leaf_builder_instance = self.leaf_builder(self.X, self.Y, all_idx) weighted_total = dsum(self.sample_weight, all_idx) queue.append(queue_obj(all_idx, 0, criteria_instance.impurity(all_idx))) @@ -557,7 +570,7 @@ class DepthTreeBuilder: child_imp[1], new_node, 0)) else: - new_node = leaf_builder.build_leaf( + new_node = leaf_builder_instance.build_leaf( leaf_id=leaf_count, indices=indices, depth=depth, @@ -579,4 +592,5 @@ class DepthTreeBuilder: tree.max_depth = max_depth_seen tree.root = root tree.leaf_nodes = leaf_node_list - tree.predictor_instance = self.predictor(self.X, self.Y, root) + if not self.ensemble: + tree.predictor_instance = self.predictor(self.X, self.Y, root) diff --git a/src/adaXT/decision_tree/decision_tree.py b/src/adaXT/decision_tree/decision_tree.py index 
a3b95fdb..8f5963f7 100644 --- a/src/adaXT/decision_tree/decision_tree.py +++ b/src/adaXT/decision_tree/decision_tree.py @@ -1,3 +1,4 @@ +from multiprocessing.dummy import Value from typing import Type, Literal from numpy.typing import ArrayLike import numpy as np @@ -56,6 +57,7 @@ def __init__( predictor: Type[Predictor] | None = None, splitter: Type[Splitter] | None = None, skip_check_input: bool = False, + ensemble: bool = False, ) -> None: """ Parameters @@ -95,6 +97,7 @@ def __init__( """ self.skip_check_input = skip_check_input + self.ensemble = ensemble # Input only checked on fitting. self.criteria = criteria @@ -155,8 +158,7 @@ def fit( self.leaf_builder, self.predictor, ) - self.max_features = self._check_max_features( - self.max_features, X.shape[0]) + self.max_features = self._check_max_features(self.max_features, X.shape[0]) self._tree = _DecisionTree( max_depth=self.max_depth, @@ -177,10 +179,8 @@ def fit( self._tree.n_features = X.shape[1] if not self.skip_check_input: - sample_weight = self._check_sample_weight( - sample_weight=sample_weight) - sample_indices = self._check_sample_indices( - sample_indices=sample_indices) + sample_weight = self._check_sample_weight(sample_weight=sample_weight) + sample_indices = self._check_sample_indices(sample_indices=sample_indices) builder = DepthTreeBuilder( X=X, @@ -192,6 +192,7 @@ def fit( leaf_builder=self.leaf_builder, predictor=self.predictor, splitter=self.splitter, + ensemble=self.ensemble, ) builder.build_tree(self._tree) @@ -237,7 +238,9 @@ def predict(self, X: ArrayLike, **kwargs) -> np.ndarray: (N, K) numpy array with the prediction, where K depends on the Prediction class and is generally 1 """ - if self.predictor_instance is None: + try: + self._tree + except AttributeError: raise AttributeError( "The tree has not been fitted before trying to call predict" ) @@ -275,6 +278,15 @@ def predict_weights( self._check_dimensions(X) return self._tree.predict_weights(X=X, scale=scale) + def 
_forest_predict_leaf( + self, X_pred: ArrayLike, X_train: ArrayLike, Y_train: ArrayLike + ): + if not self.skip_check_input: + raise ValueError("_forest_predict can only be called with skip_check_input") + return self._tree._forest_predict_leaf( + X_train=X_train, Y_train=Y_train, X_pred=X_pred + ) + def predict_leaf(self, X: ArrayLike | None) -> dict: """ Computes a hash table indexing in which LeafNodes the rows of the provided @@ -298,18 +310,11 @@ def predict_leaf(self, X: ArrayLike | None) -> dict: return self._tree.predict_leaf(X=X) def _tree_based_weights( - self, - hash0: dict, - hash1: dict, - size_X0: int, - size_X1: int, - scaling: str) -> np.ndarray: + self, hash0: dict, hash1: dict, size_X0: int, size_X1: int, scaling: str + ) -> np.ndarray: return self._tree._tree_based_weights( - hash0=hash0, - hash1=hash1, - size_X0=size_X0, - size_X1=size_X1, - scaling=scaling) + hash0=hash0, hash1=hash1, size_X0=size_X0, size_X1=size_X1, scaling=scaling + ) def similarity(self, X0: ArrayLike, X1: ArrayLike) -> np.ndarray: """ diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index 3b65108b..2e67b9d5 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -1,12 +1,12 @@ +cimport numpy as cnp cdef class Node: - cdef public: - int[:] indices - int depth - double impurity + cdef public: + cnp.ndarray indices + int depth + double impurity object parent - bint visited - bint is_leaf - + bint visited + bint is_leaf cdef class DecisionNode(Node): cdef public: @@ -15,4 +15,12 @@ cdef class DecisionNode(Node): object left_child object right_child +cdef class LeafNode(Node): + cdef public: + double weighted_samples + int id + cnp.ndarray value +cdef class LocalPolynomialLeafNode(LeafNode): + cdef public: + double theta0, theta1, theta2 diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index c3a2703c..da2e2e9b 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ 
b/src/adaXT/decision_tree/nodes.pyx @@ -1,37 +1,22 @@ +# cython: embedsignature=True import numpy as np - +cimport numpy as cnp cdef class Node: - def __cinit__(self): - self.is_leaf = 0 - self.visited = 0 - def __init__( self, indices: np.ndarray, depth: int, - impurity: float) -> None: - self.indices = indices + impurity: float, + parent=None) -> None: + + self.is_leaf = 0 + self.visited = 0 + self.parent = None + self.indices = np.asarray(indices, np.int32) self.depth = depth self.impurity = impurity - def __reduce__(self): - return ( - self.__class__, # Callable object that will be called ot create - # initial state upon pickle - (self.indices, self.depth, self.impurity), # Input to Callable - { - "is_leaf": self.is_leaf, - "visited": self.visited, - "indices": self.indices.base - } # Current state of variables that can not be passed to init - ) - # This function is passed the state provided above - def __setstate__(self, d: dict): - self.is_leaf = d["is_leaf"] - self.visited = d["visited"] - self.indices = d["indices"] - cdef class DecisionNode(Node): def __init__( self, @@ -42,7 +27,9 @@ cdef class DecisionNode(Node): split_idx: int, left_child: "DecisionNode|LeafNode|None" = None, right_child: "DecisionNode|LeafNode|None" = None, - parent: "DecisionNode|None" = None) -> None: + parent: "DecisionNode|None" = None, + is_leaf: int=0, + visited: int=0) -> None: super().__init__(indices, depth, impurity) self.threshold = threshold self.split_idx = split_idx @@ -50,7 +37,7 @@ cdef class DecisionNode(Node): self.right_child = right_child self.parent = parent -class LeafNode(Node): +cdef class LeafNode(Node): def __init__( self, id: int, @@ -60,15 +47,15 @@ class LeafNode(Node): weighted_samples: float, value: np.ndarray, parent: object) -> None: - super().__init__(indices, depth, impurity) + super().__init__(indices, depth, impurity, parent) self.weighted_samples = weighted_samples self.parent = parent self.id = id - self.value = np.asarray(value) + self.value = 
np.asarray(value, dtype=np.float64) self.is_leaf = 1 -class LocalPolynomialLeafNode(LeafNode): +cdef class LocalPolynomialLeafNode(LeafNode): def __init__( self, id: int, @@ -81,7 +68,8 @@ class LocalPolynomialLeafNode(LeafNode): theta0: float, theta1: float, theta2: float) -> None: - super().__init__(id, indices, depth, impurity, weighted_samples, value, parent) + super().__init__(id, indices, depth, impurity, weighted_samples, value, + parent) self.theta0 = theta0 self.theta1 = theta1 self.theta2 = theta2 diff --git a/src/adaXT/parallel.py b/src/adaXT/parallel.py index 1ad85434..da1704b2 100644 --- a/src/adaXT/parallel.py +++ b/src/adaXT/parallel.py @@ -19,8 +19,7 @@ def shared_numpy_array(array) -> np.ndarray: elif array.ndim == 1: row = array.shape[0] shared_array = RawArray(ctypes.c_double, row) - shared_array_np = np.ndarray( - shape=row, dtype=np.double, buffer=shared_array) + shared_array_np = np.ndarray(shape=row, dtype=np.double, buffer=shared_array) else: raise ValueError("Array is neither 1 dimensional nor 2 dimensional") np.copyto(shared_array_np, array) @@ -46,11 +45,7 @@ def __init__( self.ctx = multiprocessing.get_context("fork") self.n_jobs = n_jobs if n_jobs != -1 else cpu_count() - def async_map( - self, - function: Callable, - map_input: Iterable, - **kwargs) -> Iterable: + def async_map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterable: """ Asynchronously applies the function to the map_input passing along any kwargs given to the function. @@ -76,11 +71,7 @@ def async_map( ret = promise.get() return ret - def map( - self, - function: Callable, - map_input: Iterable, - **kwargs) -> Iterable: + def map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterable: """ Maps the function with map_input. 
Similair to async_map, but instead guarantees that the first element returned is the result of the first @@ -139,11 +130,7 @@ def async_starmap( ret = promise.get() return ret - def starmap( - self, - function: Callable, - map_input: Iterable, - **kwargs) -> Any: + def starmap(self, function: Callable, map_input: Iterable, **kwargs) -> Any: """ Applies function to each elemetn of map_input but guarantees that element i of return value is the result of function applied to element i @@ -174,11 +161,7 @@ def starmap( ret = p.starmap(partial_func, map_input) return ret - def async_apply( - self, - function: Callable, - n_iterations: int, - **kwargs) -> Iterable: + def async_apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterable: """ Applies the function n_iterations number of times and returns the result of the n_iterations in an unknown order. @@ -201,16 +184,11 @@ def async_apply( ret = [partial_func() for _ in range(n_iterations)] else: with self.ctx.Pool(self.n_jobs) as p: - promise = [p.apply_async(partial_func) - for _ in range(n_iterations)] + promise = [p.apply_async(partial_func) for _ in range(n_iterations)] ret = [res.get() for res in promise] return ret - def apply( - self, - function: Callable, - n_iterations: int, - **kwargs) -> Iterable: + def apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterable: """ Applies the function n_iterations number of times and returns the result of the n_iterations where element i corresponds to the i'th return value diff --git a/src/adaXT/predictor/predictor.pxd b/src/adaXT/predictor/predictor.pxd index 59450a4c..df818be4 100644 --- a/src/adaXT/predictor/predictor.pxd +++ b/src/adaXT/predictor/predictor.pxd @@ -7,7 +7,7 @@ cdef class Predictor(): int n_features object root - cpdef dict predict_leaf(self, cnp.ndarray X) + cpdef dict predict_leaf(self, double[:, ::1] X) cdef class PredictorClassification(Predictor): @@ -16,9 +16,9 @@ cdef class PredictorClassification(Predictor): cdef int 
__find_max_index(self, double[::1] lst) - cdef cnp.ndarray __predict_proba(self, cnp.ndarray X) + cdef cnp.ndarray __predict_proba(self, double[:, ::1] X) - cdef cnp.ndarray __predict(self, cnp.ndarray X) + cdef cnp.ndarray __predict(self, double[:, ::1] X) cdef class PredictorRegression(Predictor): diff --git a/src/adaXT/predictor/predictor.pyi b/src/adaXT/predictor/predictor.pyi index 3e6d6d22..41b5e8d5 100644 --- a/src/adaXT/predictor/predictor.pyi +++ b/src/adaXT/predictor/predictor.pyi @@ -52,9 +52,9 @@ class Predictor: @staticmethod def forest_predict( - X_old: np.ndarray, - Y_old: np.ndarray, - X_new: np.ndarray, + X_train: np.ndarray, + Y_train: np.ndarray, + X_pred: np.ndarray, trees: list[DecisionTree], parallel: ParallelModel, **kwargs, @@ -68,11 +68,11 @@ class Predictor: Parameters ---------- - X_old: np.ndarray + X_train: np.ndarray Array of feature values used during training. - Y_old: np.ndarray + Y_train: np.ndarray Array of response values used during training. - X_new: np.ndarray + X_pred: np.ndarray Array of new feature values at which to predict. trees: list[DecisionTree] List of fitted DecisionTrees fitted within the random forest. @@ -82,7 +82,7 @@ class Predictor: Returns ------- np.ndarray - An array with predictions for each row of X_new. + An array with predictions for each row of X_pred. """ pass diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 01165e1e..0db24c99 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -1,3 +1,4 @@ +import sys import numpy as np from numpy import float64 as DOUBLE from ..decision_tree.nodes import DecisionNode @@ -5,6 +6,7 @@ from ..decision_tree.nodes cimport Node, DecisionNode from collections.abc import Sequence from statistics import mode cimport numpy as cnp + from ..parallel import ParallelModel # Circular import. Since only used for typing, this fixes the issue. 
@@ -17,90 +19,94 @@ if TYPE_CHECKING: ctypedef cnp.float64_t DOUBLE_t -def predict_default(tree, X: np.ndarray, **kwargs) -> np.ndarray: - return np.array(tree.predict(X, **kwargs)) +def predict_default( + tree, + double[:, ::1] X_pred, + double[:, ::1] X_train, + double[:, ::1] Y_train, + predictor, + **kwargs) -> np.ndarray: + predictor_instance = predictor(X_train, Y_train, tree.root) + res = predictor_instance.predict(X_pred) + return res def predict_proba( - tree: DecisionTree, X: np.ndarray, Y: np.ndarray, unique_classes: int -) -> np.ndarray: - cdef: - int i, cur_split_idx - double cur_threshold - object cur_node - list ret_val + tree, + double[:, ::1] X_pred, + double[:, ::1] X_train, + double[:, ::1] Y_train, + predictor, + **kwargs) -> np.ndarray: - # Make sure that x fits the dimensions. - n_obs = X.shape[0] - ret_val = [] - for i in range(n_obs): - cur_node = tree.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child - else: - cur_node = cur_node.right_child - ret_val.append(cur_node.value) - return np.array(ret_val) + predictor_instance = predictor(X_train, Y_train, tree.root) + res = predictor_instance.predict(X_pred, predict_proba=True) + return res def predict_quantile( - tree: DecisionTree, X: np.ndarray, n_obs: int + tree, + X_pred: double[:, ::1], ) -> list: + cdef: + int i, cur_split_idx + double cur_threshold + int n_obs = X_pred.shape[0] + Node cur_node + DecisionNode dNode # Check if quantile is an array indices = [] for i in range(n_obs): cur_node = tree.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X_pred[i, cur_split_idx] <= 
cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child + cur_node = dNode.right_child indices.append(cur_node.indices) + return indices cdef class Predictor(): - def __init__(self, double[:, ::1] X, double[:, ::1] Y, object root, **kwargs): + def __init__(self, const double[:, ::1] X, const double[:, ::1] Y, object root, **kwargs): self.X = X self.Y = Y - self.n_features = X.shape[1] self.root = root + self.n_features = X.shape[1] - def __reduce__(self) -> None: - return (self.__class__, (self.X.base, self.Y.base, self.root)) - - def predict(self, cnp.ndarray X, **kwargs) -> np.ndarray: + def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: raise NotImplementedError("Function predict is not implemented for this Predictor class") - cpdef dict predict_leaf(self, cnp.ndarray X): + cpdef dict predict_leaf(self, double[:, ::1] X): cdef: - int i - int row - dict ht - int cur_split_idx + int i, cur_split_idx, n_obs double cur_threshold + Node cur_node + DecisionNode dNode + dict ht # Make sure that x fits the dimensions. 
- row = X.shape[0] - ht = {} - for i in range(row): + n_obs = X.shape[0] + prediction = np.empty(n_obs, dtype=DOUBLE) + + for i in range(n_obs): cur_node = self.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X[i, cur_split_idx] <= cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child + cur_node = dNode.right_child if cur_node.id not in ht.keys(): ht[cur_node.id] = [i] @@ -108,21 +114,28 @@ cdef class Predictor(): ht[cur_node.id] += [i] return ht - @staticmethod - def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, cnp.ndarray X_new, - trees: list[DecisionTree], parallel: ParallelModel, + @classmethod + def forest_predict(cls, + cnp.ndarray[DOUBLE_t, ndim=2] X_train, + cnp.ndarray[DOUBLE_t, ndim=2] Y_train, + cnp.ndarray[DOUBLE_t, ndim=2] X_pred, + trees:list[DecisionTree], + parallel: ParallelModel, **kwargs) -> np.ndarray: predictions = parallel.async_map(predict_default, trees, - X=X_new, + X_train = X_train, + Y_train = Y_train, + X_pred=X_pred, + predictor=cls, **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) cdef class PredictorClassification(Predictor): def __init__(self, - double[:, ::1] X, - double[:, ::1] Y, + const double[:, ::1] X, + const double[:, ::1] Y, object root, **kwargs) -> None: super().__init__(X, Y, root, **kwargs) self.classes = np.unique(Y) @@ -136,18 +149,16 @@ cdef class PredictorClassification(Predictor): cur_max = i return cur_max - cdef inline cnp.ndarray __predict(self, cnp.ndarray X): + cdef inline cnp.ndarray __predict(self, double[:,::1] X): cdef: - int i, cur_split_idx, idx, n_obs + int i, cur_split_idx, n_obs double cur_threshold Node cur_node DecisionNode dNode - const double[:, ::1] X_ndarray double[:] prediction 
# Make sure that x fits the dimensions. n_obs = X.shape[0] - X_ndarray = X prediction = np.empty(n_obs, dtype=DOUBLE) for i in range(n_obs): @@ -156,7 +167,7 @@ cdef class PredictorClassification(Predictor): dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold - if X_ndarray[i, cur_split_idx] <= cur_threshold: + if X[i, cur_split_idx] <= cur_threshold: cur_node = dNode.left_child else: cur_node = dNode.right_child @@ -165,30 +176,34 @@ cdef class PredictorClassification(Predictor): prediction[i] = self.classes[idx] return np.array(prediction) - cdef cnp.ndarray __predict_proba(self, cnp.ndarray X): + cdef inline cnp.ndarray __predict_proba(self, double[:, ::1] X): cdef: - int i, cur_split_idx + int i, cur_split_idx, n_obs double cur_threshold - object cur_node + Node cur_node + DecisionNode dNode + double[:] prediction list ret_val # Make sure that x fits the dimensions. n_obs = X.shape[0] ret_val = [] + for i in range(n_obs): cur_node = self.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X[i, cur_split_idx] <= cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child - if self.classes is not None: - ret_val.append(cur_node.value) + cur_node = dNode.right_child + + ret_val.append(cur_node.value) return np.array(ret_val) - def predict(self, cnp.ndarray X, **kwargs) -> np.ndarray: + def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: if "predict_proba" in kwargs: if kwargs["predict_proba"]: return self.__predict_proba(X) @@ -196,28 +211,43 @@ cdef class PredictorClassification(Predictor): # if predict_proba = False this return is hit return self.__predict(X) - @staticmethod - def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, 
cnp.ndarray X_new, - trees: list[DecisionTree], parallel: ParallelModel, + @classmethod + def forest_predict(cls, + cnp.ndarray[DOUBLE_t, ndim=2] X_train, + cnp.ndarray[DOUBLE_t, ndim=2] Y_train, + cnp.ndarray[DOUBLE_t, ndim=2] X_pred, + trees:list[DecisionTree], + parallel: ParallelModel, **kwargs) -> np.ndarray: # Forest_predict_proba if "predict_proba" in kwargs: if kwargs["predict_proba"]: - predictions = parallel.async_map(predict_proba, trees, - X=X_new) + predictions = parallel.async_map(predict_proba, + map_input=trees, + X_train = X_train, + Y_train = Y_train, + X_pred=X_pred, + predictor=cls, + **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) - predictions = parallel.async_map(predict_default, trees, X=X_new, + predictions = parallel.async_map(predict_default, + map_input=trees, + X_train = X_train, + Y_train = Y_train, + X_pred=X_pred, + predictor=cls, **kwargs) return np.array(np.apply_along_axis(mode, 0, predictions), dtype=int) cdef class PredictorRegression(Predictor): - def predict(self, cnp.ndarray X, **kwargs) -> np.ndarray: + def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: cdef: int i, cur_split_idx, n_obs, n_col double cur_threshold - object cur_node + Node cur_node + DecisionNode dNode cnp.ndarray prediction # Make sure that x fits the dimensions. 
@@ -230,13 +260,15 @@ cdef class PredictorRegression(Predictor): for i in range(n_obs): cur_node = self.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X[i, cur_split_idx] <= cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child + cur_node = dNode.right_child + if cur_node.value.ndim == 1: prediction[i] = cur_node.value[0] else: @@ -245,12 +277,12 @@ cdef class PredictorRegression(Predictor): cdef class PredictorLocalPolynomial(PredictorRegression): - - def predict(self, cnp.ndarray X, **kwargs) -> np.ndarray: + def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: cdef: int i, cur_split_idx, n_obs, ind, oo double cur_threshold - object cur_node + Node cur_node + DecisionNode dNode cnp.ndarray[DOUBLE_t, ndim=2] deriv_mat if "order" not in kwargs.keys(): @@ -265,13 +297,15 @@ cdef class PredictorLocalPolynomial(PredictorRegression): for i in range(n_obs): cur_node = self.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X[i, cur_split_idx] <= cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child + cur_node = dNode.right_child + ind = 0 for oo in order: if oo == 0: @@ -285,11 +319,12 @@ cdef class PredictorLocalPolynomial(PredictorRegression): cdef class PredictorQuantile(Predictor): - def predict(self, cnp.ndarray X, **kwargs) -> np.ndarray: + def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: cdef: int i, cur_split_idx, n_obs double cur_threshold - object 
cur_node + Node cur_node + DecisionNode dNode cnp.ndarray prediction if "quantile" not in kwargs.keys(): raise ValueError( @@ -306,33 +341,39 @@ cdef class PredictorQuantile(Predictor): for i in range(n_obs): cur_node = self.root - while isinstance(cur_node, DecisionNode): - cur_split_idx = cur_node.split_idx - cur_threshold = cur_node.threshold - if X[i, cur_split_idx] < cur_threshold: - cur_node = cur_node.left_child + while not cur_node.is_leaf: + dNode = cur_node + cur_split_idx = dNode.split_idx + cur_threshold = dNode.threshold + if X[i, cur_split_idx] <= cur_threshold: + cur_node = dNode.left_child else: - cur_node = cur_node.right_child + cur_node = dNode.right_child prediction[i] = np.quantile(self.Y.base[cur_node.indices, 0], quantile) return prediction - @staticmethod - def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, cnp.ndarray X_new, - trees: list[DecisionTree], parallel: ParallelModel, + @classmethod + def forest_predict(cls, + cnp.ndarray[DOUBLE_t, ndim=2] X_train, + cnp.ndarray[DOUBLE_t, ndim=2] Y_train, + cnp.ndarray[DOUBLE_t, ndim=2] X_pred, + trees:list[DecisionTree], + parallel: ParallelModel, **kwargs) -> np.ndarray: cdef: int i, j, n_obs, n_trees - list prediction_indices, pred_indices_combined, indices_combined + # cnp.ndarray + # list pred_indices_combined, indices_combined, prediction_indices if "quantile" not in kwargs.keys(): raise ValueError( "quantile called without quantile passed as argument" ) quantile = kwargs['quantile'] - n_obs = X_new.shape[0] + n_obs = X_pred.shape[0] prediction_indices = parallel.async_map(predict_quantile, - map_input=trees, X=X_new, - n_obs=n_obs) + map_input=trees, + X_pred=X_pred) # In case the leaf nodes have multiple elements and not just one, we # have to combine them together n_trees = len(prediction_indices) @@ -342,5 +383,6 @@ cdef class PredictorQuantile(Predictor): for j in range(n_trees): indices_combined.extend(prediction_indices[j][i]) pred_indices_combined.append(indices_combined) - ret 
= np.quantile(Y_old[pred_indices_combined, 0], quantile, axis=1) + ret = np.array([np.quantile(Y_train[indices, 0], quantile) for + indices in pred_indices_combined]) return ret diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index d1ff156c..95587abc 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -24,10 +24,12 @@ def tree_based_weights( X1: np.ndarray | None, size_X0: int, size_X1: int, + X_train: np.ndarray, + Y_train: np.ndarray, scaling: str, ) -> np.ndarray: - hash0 = tree.predict_leaf(X=X0) - hash1 = tree.predict_leaf(X=X1) + hash0 = tree._forest_predict_leaf(X_pred=X0, X_train=X_train, Y_train=Y_train) + hash1 = tree._forest_predict_leaf(X_pred=X1, X_train=X_train, Y_train=Y_train) return tree._tree_based_weights( hash0=hash0, hash1=hash1, @@ -63,8 +65,7 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min( - [sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -74,7 +75,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"]:], + indices[sampling_args["split"] :], size=resample_size1, replace=sampling_args["replace"], ) @@ -85,8 +86,7 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min( - [sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -96,7 +96,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"]:], + 
indices[sampling_args["split"] :], size=resample_size1, replace=sampling_args["replace"], ) @@ -151,18 +151,13 @@ def build_single_tree( leaf_builder=leaf_builder, predictor=predictor, splitter=splitter, + ensemble=True, ) - tree.fit( - X=X, - Y=Y, - sample_indices=fitting_indices, - sample_weight=sample_weight) + tree.fit(X=X, Y=Y, sample_indices=fitting_indices, sample_weight=sample_weight) if honest_tree: tree.refit_leaf_nodes( - X=X, - Y=Y, - sample_weight=sample_weight, - sample_indices=prediction_indices) + X=X, Y=Y, sample_weight=sample_weight, sample_indices=prediction_indices + ) return tree @@ -170,30 +165,24 @@ def build_single_tree( def oob_calculation( idx: np.int64, trees: list, - X_old: np.ndarray, - Y_old: np.ndarray, + X_train: np.ndarray, + Y_train: np.ndarray, parallel: ParallelModel, predictor: type[Predictor], ) -> tuple: - X_pred = np.expand_dims(X_old[idx], axis=0) + X_pred = np.expand_dims(X_train[idx], axis=0) Y_pred = predictor.forest_predict( - X_old=X_old, - Y_old=Y_old, - X_new=X_pred, + X_train=X_train, + Y_train=Y_train, + X_pred=X_pred, trees=trees, parallel=parallel, __no_parallel=True, ).astype(np.float64) - Y_true = Y_old[idx] + Y_true = Y_train[idx] return (Y_pred, Y_true) -def predict_single_tree( - tree: DecisionTree, predict_values: np.ndarray, **kwargs -) -> np.ndarray: - return tree.predict(predict_values, **kwargs) - - class RandomForest(BaseModel): """ Attributes @@ -353,8 +342,7 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict: if "size" not in sampling_args: sampling_args["size"] = self.X_n_rows elif isinstance(sampling_args["size"], float): - sampling_args["size"] = int( - sampling_args["size"] * self.X_n_rows) + sampling_args["size"] = int(sampling_args["size"] * self.X_n_rows) elif not isinstance(sampling_args["size"], int): raise ValueError( "The provided sampling_args['size'] is not an integer or float as required." 
@@ -424,7 +412,8 @@ def __build_trees(self) -> None: sampling=self.sampling, ) self.fitting_indices, self.prediction_indices, self.out_of_bag_indices = zip( - *indices) + *indices + ) self.trees = self.parallel.starmap( build_single_tree, map_input=zip(self.fitting_indices, self.prediction_indices), @@ -446,8 +435,9 @@ def __build_trees(self) -> None: sample_weight=self.sample_weight, ) - def fit(self, X: ArrayLike, Y: ArrayLike, - sample_weight: ArrayLike | None = None) -> None: + def fit( + self, X: ArrayLike, Y: ArrayLike, sample_weight: ArrayLike | None = None + ) -> None: """ Fit the random forest with training data (X, Y). @@ -479,8 +469,7 @@ def fit(self, X: ArrayLike, Y: ArrayLike, self.X = shared_numpy_array(X) self.Y = shared_numpy_array(Y) self.X_n_rows, self.n_features = self.X.shape - self.max_features = self._check_max_features( - self.max_features, X.shape[0]) + self.max_features = self._check_max_features(self.max_features, X.shape[0]) self.sample_weight = self._check_sample_weight(sample_weight) self.sampling_args = self.__get_sampling_parameter(self.sampling_args) @@ -508,8 +497,8 @@ def fit(self, X: ArrayLike, Y: ArrayLike, *self.parallel.async_starmap( oob_calculation, map_input=tree_dict.items(), - X_old=self.X, - Y_old=self.Y, + X_train=self.X, + Y_train=self.Y, parallel=self.parallel, predictor=self.predictor, ) @@ -570,9 +559,9 @@ def predict(self, X: ArrayLike, **kwargs) -> np.ndarray: predict_value = shared_numpy_array(X) prediction = self.predictor.forest_predict( - X_old=self.X, - Y_old=self.Y, - X_new=predict_value, + X_train=self.X, + Y_train=self.Y, + X_pred=predict_value, trees=self.trees, parallel=self.parallel, **kwargs, @@ -627,6 +616,8 @@ def predict_weights( X1=None, size_X0=size_0, size_X1=self.X_n_rows, + X_train=self.X, + Y_train=self.Y, scaling=scaling, ) @@ -668,6 +659,8 @@ def similarity(self, X0: ArrayLike, X1: ArrayLike): X1=X1, size_X0=size_0, size_X1=size_1, + X_train=self.X, + Y_train=self.Y, scaling="similarity", ) 
return np.mean(weight_list, axis=0) diff --git a/time_predict_tree.py b/time_predict_tree.py deleted file mode 100644 index 0232fadc..00000000 --- a/time_predict_tree.py +++ /dev/null @@ -1,55 +0,0 @@ -import numpy as np -import adaXT.criteria as crit -from adaXT.decision_tree import DecisionTree -from adaXT.random_forest import RandomForest -from sklearn.tree import DecisionTreeClassifier -import time - -from memory_profiler import profile - -low_X = 0 -high_X = 10_000 -N_TRAIN = 100_000 -M = 5 -N_PREDICT = N_TRAIN -NUM_TREES = 1 -NUM_PREDICT = 1 - - -def predict_n_times(tree): - """ - predicts NUM_PREDICT number of times using the tree on randomly generated X - data. - - Returns the mean predict time of NUM_PREDICT randomly generated values. - """ - times = [] - for _ in range(NUM_PREDICT): - X = np.random.uniform(low_X, high_X, (N_PREDICT, M)) - st = time.time() - x = tree.predict(X) - et = time.time() - times.append(et - st) - return np.mean(times) - - -def main(): - X = np.random.uniform(low_X, high_X, (N_TRAIN, M)) - Y = np.random.randint(0, M, N_TRAIN) - - times = [] - for _ in range(NUM_TREES): - tree = RandomForest(forest_type="Classification", criteria=crit.Gini_index) - # tree = DecisionTreeClassifier(criterion="gini") - tree.fit(X, Y) - times.append(predict_n_times(tree)) - - mean_predict_time = np.mean(times) - print( - f"Mean predict times for {NUM_TREES} predicting {NUM_PREDICT} times: ", - mean_predict_time, - ) - - -if __name__ == "__main__": - main() From 2eeae46c36b2d21df3a489222944caca88a07076 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Sun, 23 Feb 2025 20:36:52 +0100 Subject: [PATCH 07/33] Fixed linting --- src/adaXT/decision_tree/_decision_tree.pyx | 3 ++- src/adaXT/decision_tree/nodes.pxd | 2 +- src/adaXT/decision_tree/nodes.pyx | 6 +++--- src/adaXT/predictor/predictor.pyx | 15 ++++++--------- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx 
b/src/adaXT/decision_tree/_decision_tree.pyx index 1b83a0cc..85084e90 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -40,6 +40,7 @@ class refit_object(): def add_idx(self, idx: int) -> None: self.indices.append(idx) + @cython.auto_pickle(True) cdef class _DecisionTree(): cdef public: @@ -592,5 +593,5 @@ class DepthTreeBuilder: tree.max_depth = max_depth_seen tree.root = root tree.leaf_nodes = leaf_node_list - if not self.ensemble: + if not self.ensemble: tree.predictor_instance = self.predictor(self.X, self.Y, root) diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index 2e67b9d5..cc420d62 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -9,7 +9,7 @@ cdef class Node: bint is_leaf cdef class DecisionNode(Node): - cdef public: + cdef public: double threshold int split_idx object left_child diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index da2e2e9b..1afe72aa 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ b/src/adaXT/decision_tree/nodes.pyx @@ -1,6 +1,5 @@ # cython: embedsignature=True import numpy as np -cimport numpy as cnp cdef class Node: def __init__( @@ -28,8 +27,9 @@ cdef class DecisionNode(Node): left_child: "DecisionNode|LeafNode|None" = None, right_child: "DecisionNode|LeafNode|None" = None, parent: "DecisionNode|None" = None, - is_leaf: int=0, - visited: int=0) -> None: + is_leaf: int = 0, + visited: int = 0) -> None: + super().__init__(indices, depth, impurity) self.threshold = threshold self.split_idx = split_idx diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 0db24c99..a391d742 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -1,4 +1,3 @@ -import sys import numpy as np from numpy import float64 as DOUBLE from ..decision_tree.nodes import DecisionNode @@ -31,6 +30,7 @@ def predict_default( res = 
predictor_instance.predict(X_pred) return res + def predict_proba( tree, double[:, ::1] X_pred, @@ -95,7 +95,6 @@ cdef class Predictor(): # Make sure that x fits the dimensions. ht = {} n_obs = X.shape[0] - prediction = np.empty(n_obs, dtype=DOUBLE) for i in range(n_obs): cur_node = self.root @@ -119,7 +118,7 @@ cdef class Predictor(): cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, cnp.ndarray[DOUBLE_t, ndim=2] X_pred, - trees:list[DecisionTree], + trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: predictions = parallel.async_map(predict_default, @@ -149,7 +148,7 @@ cdef class PredictorClassification(Predictor): cur_max = i return cur_max - cdef inline cnp.ndarray __predict(self, double[:,::1] X): + cdef inline cnp.ndarray __predict(self, double[:, ::1] X): cdef: int i, cur_split_idx, n_obs double cur_threshold @@ -182,7 +181,6 @@ cdef class PredictorClassification(Predictor): double cur_threshold Node cur_node DecisionNode dNode - double[:] prediction list ret_val # Make sure that x fits the dimensions. 
@@ -216,7 +214,7 @@ cdef class PredictorClassification(Predictor): cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, cnp.ndarray[DOUBLE_t, ndim=2] X_pred, - trees:list[DecisionTree], + trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: # Forest_predict_proba @@ -358,13 +356,12 @@ cdef class PredictorQuantile(Predictor): cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, cnp.ndarray[DOUBLE_t, ndim=2] X_pred, - trees:list[DecisionTree], + trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: cdef: int i, j, n_obs, n_trees - # cnp.ndarray - # list pred_indices_combined, indices_combined, prediction_indices + list pred_indices_combined, indices_combined, prediction_indices if "quantile" not in kwargs.keys(): raise ValueError( "quantile called without quantile passed as argument" From d3ef499e8f8e49ebf38d9c77b02a88e91c1adb78 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 07:50:54 +0100 Subject: [PATCH 08/33] Remove left over comment --- src/adaXT/decision_tree/_decision_tree.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index 85084e90..8bfe5668 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -1,4 +1,3 @@ -# cython: auto_pickle=False import numpy as np import sys From 07c384a15eb5a6ecbe7beae60cc0a89df59ea20f Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 10:01:17 +0100 Subject: [PATCH 09/33] Changed to no parallel predict --- src/adaXT/criteria/__init__.pxd | 16 ++--- src/adaXT/criteria/__init__.py | 2 + src/adaXT/criteria/criteria.pyi | 26 +++++++-- src/adaXT/criteria/criteria.pyx | 2 +- src/adaXT/decision_tree/_decision_tree.pyx | 6 +- src/adaXT/decision_tree/decision_tree.py | 3 - src/adaXT/decision_tree/nodes.pyx | 3 +- 
src/adaXT/decision_tree/tree_utils.py | 32 +++------- src/adaXT/leaf_builder/leaf_builder.pxd | 2 +- src/adaXT/leaf_builder/leaf_builder.pyx | 19 +++--- src/adaXT/parallel.py | 8 ++- src/adaXT/predictor/predictor.pxd | 9 +-- src/adaXT/predictor/predictor.pyx | 68 ++++++++++++---------- src/adaXT/random_forest/random_forest.py | 1 - 14 files changed, 106 insertions(+), 91 deletions(-) diff --git a/src/adaXT/criteria/__init__.pxd b/src/adaXT/criteria/__init__.pxd index 3a26738b..3fd46c9a 100644 --- a/src/adaXT/criteria/__init__.pxd +++ b/src/adaXT/criteria/__init__.pxd @@ -1,8 +1,10 @@ from .criteria cimport ( - Criteria, - Gini_index, - Entropy, - Squared_error, - Partial_linear, - Partial_quadratic - ) + ClassificationCriteria, + RegressionCriteria, + Criteria, + Gini_index, + Entropy, + Squared_error, + Partial_linear, + Partial_quadratic +) diff --git a/src/adaXT/criteria/__init__.py b/src/adaXT/criteria/__init__.py index 1c066c94..027b0809 100644 --- a/src/adaXT/criteria/__init__.py +++ b/src/adaXT/criteria/__init__.py @@ -1,4 +1,6 @@ from .criteria import ( + ClassificationCriteria, + RegressionCriteria, Gini_index, Squared_error, Entropy, diff --git a/src/adaXT/criteria/criteria.pyi b/src/adaXT/criteria/criteria.pyi index 3e278bf2..0340d56a 100644 --- a/src/adaXT/criteria/criteria.pyi +++ b/src/adaXT/criteria/criteria.pyi @@ -5,7 +5,15 @@ class Criteria: pass -class Gini_index(Criteria): +class ClassificationCriteria(Criteria): + """ + Parent class for Criteria used in the Classification Tree Type. Can not be + used as a standalone Criteria. + """ + + pass + +class Gini_index(ClassificationCriteria): r""" Gini index based criteria, which can be used for classification. Formally, given class labels $\mathcal{L}$, the Gini index in a node @@ -19,7 +27,7 @@ class Gini_index(Criteria): pass -class Entropy(Criteria): +class Entropy(ClassificationCriteria): r""" Entropy based criteria, which can be used for classification. 
Formally, given class labels $\mathcal{L}$, the entropy in a node @@ -33,7 +41,15 @@ class Entropy(Criteria): pass -class Squared_error(Criteria): +class RegressionCriteria(Criteria): + """ + Parent class for criteria used in Regression Tree Type. Can not be used as a + standalone Criteria. + """ + + pass + +class Squared_error(RegressionCriteria): r""" Squared error based criteria, which can be used for regression and leads to standard CART splits. Formally, the squared error in a node @@ -53,7 +69,7 @@ class Squared_error(Criteria): pass -class Partial_linear(Criteria): +class Partial_linear(RegressionCriteria): r""" Criteria based on fitting a linear function in the first predictor variable in each leaf. Formally, in a node consisting of samples $I$, @@ -71,7 +87,7 @@ class Partial_linear(Criteria): pass -class Partial_quadratic(Criteria): +class Partial_quadratic(RegressionCriteria): r""" Criteria based on fitting a quadratic function in the first predictor variable in each leaf. Formally, in a node consisting of samples $I$, diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 323f1894..b1e4a01a 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -76,7 +76,7 @@ cdef class ClassificationCriteria(Criteria): super().__init__(X, Y, sample_weight) self.first_call = True - def __del__(self) -> None: + def __dealloc__(self) -> None: free(self.weight_in_class_left) free(self.weight_in_class_right) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index 8bfe5668..6e5a77e0 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -398,7 +398,6 @@ class DepthTreeBuilder: splitter: Splitter, leaf_builder: LeafBuilder, predictor: Predictor, - ensemble: bool = False, ) -> None: """ Parameters @@ -433,8 +432,6 @@ class DepthTreeBuilder: self.predictor = predictor self.leaf_builder = leaf_builder - self.ensemble 
= ensemble - def __get_feature_indices(self) -> np.ndarray: if self.max_features == -1: return self.feature_indices @@ -592,5 +589,4 @@ class DepthTreeBuilder: tree.max_depth = max_depth_seen tree.root = root tree.leaf_nodes = leaf_node_list - if not self.ensemble: - tree.predictor_instance = self.predictor(self.X, self.Y, root) + tree.predictor_instance = self.predictor(self.X, self.Y, root) diff --git a/src/adaXT/decision_tree/decision_tree.py b/src/adaXT/decision_tree/decision_tree.py index 8f5963f7..76432d70 100644 --- a/src/adaXT/decision_tree/decision_tree.py +++ b/src/adaXT/decision_tree/decision_tree.py @@ -57,7 +57,6 @@ def __init__( predictor: Type[Predictor] | None = None, splitter: Type[Splitter] | None = None, skip_check_input: bool = False, - ensemble: bool = False, ) -> None: """ Parameters @@ -97,7 +96,6 @@ def __init__( """ self.skip_check_input = skip_check_input - self.ensemble = ensemble # Input only checked on fitting. self.criteria = criteria @@ -192,7 +190,6 @@ def fit( leaf_builder=self.leaf_builder, predictor=self.predictor, splitter=self.splitter, - ensemble=self.ensemble, ) builder.build_tree(self._tree) diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index 1afe72aa..6c95b6c4 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ b/src/adaXT/decision_tree/nodes.pyx @@ -30,6 +30,7 @@ cdef class DecisionNode(Node): is_leaf: int = 0, visited: int = 0) -> None: + super().__init__(indices, depth, impurity) self.threshold = threshold self.split_idx = split_idx @@ -51,7 +52,7 @@ cdef class LeafNode(Node): self.weighted_samples = weighted_samples self.parent = parent self.id = id - self.value = np.asarray(value, dtype=np.float64) + self.value = np.asarray(value) self.is_leaf = 1 diff --git a/src/adaXT/decision_tree/tree_utils.py b/src/adaXT/decision_tree/tree_utils.py index 07de6d3d..8eea8da9 100644 --- a/src/adaXT/decision_tree/tree_utils.py +++ b/src/adaXT/decision_tree/tree_utils.py @@ -121,31 +121,25 @@ def 
get_label(**kwargs): if isinstance(node, DecisionNode): node_string += "DecisionNode" + new_line node_string += f"X{node.split_idx} <= " - node_string += str(round(node.threshold, - impurity_precision)) + new_line + node_string += str(round(node.threshold, impurity_precision)) + new_line if kwargs["impurity"]: node_string += "Impurity: " - node_string += str(round(node.impurity, - impurity_precision)) + new_line + node_string += str(round(node.impurity, impurity_precision)) + new_line elif isinstance(node, LeafNode): node_string += "LeafNode" + new_line if kwargs["impurity"]: node_string += "Impurity: " - node_string += str(round(node.impurity, - impurity_precision)) + new_line + node_string += str(round(node.impurity, impurity_precision)) + new_line node_string += "Samples: " - node_string += str(round(node.weighted_samples, - impurity_precision)) + new_line + node_string += str(round(node.weighted_samples, impurity_precision)) + new_line node_string += "Value: " if len(node.value) == 1: node_string += str(round(node.value[0], node_precision)) else: node_value_string = "\n [" value_length = len(node.value) - n_vals_per_line = max( - value_length / 3, - 4) # Number of values per line + n_vals_per_line = max(value_length / 3, 4) # Number of values per line for i in range(value_length): node_value_string += str(round(node.value[i], node_precision)) if (i + 1) % n_vals_per_line == 0 and i != value_length - 1: @@ -168,20 +162,12 @@ def __init__(self, node, parent=None, depth=0, number=1, **kwargs): if node.left_child is not None: lst.append( - DrawTree( - node.left_child, - self, - depth + 1, - number=1, - **kwargs)) + DrawTree(node.left_child, self, depth + 1, number=1, **kwargs) + ) if node.right_child is not None: lst.append( - DrawTree( - node.right_child, - self, - depth + 1, - number=2, - **kwargs)) + DrawTree(node.right_child, self, depth + 1, number=2, **kwargs) + ) self.children = lst self.parent = parent self.thread = None diff --git 
a/src/adaXT/leaf_builder/leaf_builder.pxd b/src/adaXT/leaf_builder/leaf_builder.pxd index 76be2a41..2186e25b 100644 --- a/src/adaXT/leaf_builder/leaf_builder.pxd +++ b/src/adaXT/leaf_builder/leaf_builder.pxd @@ -21,7 +21,7 @@ cdef class LeafBuilderClassification(LeafBuilder): double[::1] classes int n_classes - cdef double[::1] __get_mean(self, int[::1] indices) + cdef inline cnp.ndarray __get_mean(self, int[::1] indices) cpdef object build_leaf(self, int leaf_id, diff --git a/src/adaXT/leaf_builder/leaf_builder.pyx b/src/adaXT/leaf_builder/leaf_builder.pyx index cd9104f2..6ae8869d 100644 --- a/src/adaXT/leaf_builder/leaf_builder.pyx +++ b/src/adaXT/leaf_builder/leaf_builder.pyx @@ -23,17 +23,17 @@ cdef class LeafBuilderClassification(LeafBuilder): self.classes = np.array(np.unique(Y.base[all_idx, 0]), dtype=np.double) self.n_classes = self.classes.shape[0] - cdef double[::1] __get_mean(self, int[::1] indices): + cdef inline cnp.ndarray __get_mean(self, int[::1] indices): cdef: - cnp.ndarray[double, ndim=1] ret + cnp.ndarray[float, ndim=1] ret int i, idx, n_samples n_samples = indices.shape[0] - ret = np.zeros(self.n_classes) + ret = np.zeros(self.n_classes, dtype=np.float32) for idx in range(n_samples): for i in range(self.n_classes): if self.Y[indices[idx], 0] == self.classes[i]: - ret[i] += 1 # add 1, if the value is the same as class value + ret[i] += 1.0 # add 1, if the value is the same as class value break ret = ret / n_samples @@ -46,9 +46,14 @@ cdef class LeafBuilderClassification(LeafBuilder): double impurity, double weighted_samples, object parent): - cdef double[::1] mean = self.__get_mean(indices) - return LeafNode(leaf_id, indices, depth, impurity, weighted_samples, - mean, parent) + cdef cnp.ndarray mean = self.__get_mean(indices) + return LeafNode(id=leaf_id, + indices=indices, + depth=depth, + impurity=impurity, + weighted_samples=weighted_samples, + value=mean, + parent=parent) cdef class LeafBuilderRegression(LeafBuilder): diff --git 
a/src/adaXT/parallel.py b/src/adaXT/parallel.py index da1704b2..0f992fdd 100644 --- a/src/adaXT/parallel.py +++ b/src/adaXT/parallel.py @@ -63,7 +63,13 @@ def async_map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterab Returns the result of running function on all elements of map_input """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or ("__no_parallel" in kwargs): + + parallel = True + if "__no_parallel" in kwargs: + if kwargs["__no_parallel"]: + parallel = False + + if self.n_jobs == 1 or not parallel: ret = list(map(partial_func, map_input)) else: with self.ctx.Pool(self.n_jobs) as p: diff --git a/src/adaXT/predictor/predictor.pxd b/src/adaXT/predictor/predictor.pxd index df818be4..bb405447 100644 --- a/src/adaXT/predictor/predictor.pxd +++ b/src/adaXT/predictor/predictor.pxd @@ -2,8 +2,9 @@ cimport numpy as cnp cdef class Predictor(): cdef: - const double[:, ::1] X - const double[:, ::1] Y + # Must be ndarray such that it and all children can be pickled + cnp.ndarray X + cnp.ndarray Y int n_features object root @@ -12,9 +13,9 @@ cdef class Predictor(): cdef class PredictorClassification(Predictor): cdef: - readonly double[::1] classes + readonly cnp.ndarray classes - cdef int __find_max_index(self, double[::1] lst) + cdef int __find_max_index(self, float[::1] lst) cdef cnp.ndarray __predict_proba(self, double[:, ::1] X) diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index a391d742..130317f3 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -8,6 +8,8 @@ cimport numpy as cnp from ..parallel import ParallelModel +import time + # Circular import. Since only used for typing, this fixes the issue. 
from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -21,26 +23,18 @@ ctypedef cnp.float64_t DOUBLE_t def predict_default( tree, double[:, ::1] X_pred, - double[:, ::1] X_train, - double[:, ::1] Y_train, - predictor, **kwargs) -> np.ndarray: - predictor_instance = predictor(X_train, Y_train, tree.root) - res = predictor_instance.predict(X_pred) + res = tree.predict(X_pred, **kwargs) return res def predict_proba( tree, double[:, ::1] X_pred, - double[:, ::1] X_train, - double[:, ::1] Y_train, - predictor, **kwargs) -> np.ndarray: - predictor_instance = predictor(X_train, Y_train, tree.root) - res = predictor_instance.predict(X_pred, predict_proba=True) + res = tree.predict(X_pred, predict_proba=True, **kwargs) return res @@ -76,8 +70,8 @@ def predict_quantile( cdef class Predictor(): def __init__(self, const double[:, ::1] X, const double[:, ::1] Y, object root, **kwargs): - self.X = X - self.Y = Y + self.X = np.asarray(X) + self.Y = np.asarray(Y) self.root = root self.n_features = X.shape[1] @@ -113,9 +107,20 @@ cdef class Predictor(): ht[cur_node.id] += [i] return ht - @classmethod - def forest_predict(cls, - cnp.ndarray[DOUBLE_t, ndim=2] X_train, + def __getstate__(self): + return { + "root": self.root, + "X": np.asarray(self.X), + "Y": np.asarray(self.Y), + } + + def __setstate__(self, d: dict): + self.X = d["X"] + self.Y = d["Y"] + self.root = d["root"] + + @staticmethod + def forest_predict(cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], @@ -123,10 +128,8 @@ cdef class Predictor(): **kwargs) -> np.ndarray: predictions = parallel.async_map(predict_default, trees, - X_train = X_train, - Y_train = Y_train, X_pred=X_pred, - predictor=cls, + __no_parallel=True, **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) @@ -139,7 +142,7 @@ cdef class PredictorClassification(Predictor): super().__init__(X, Y, root, **kwargs) self.classes = np.unique(Y) - cdef int
__find_max_index(self, double[::1] lst): + cdef int __find_max_index(self, float[::1] lst): cdef: int cur_max, i cur_max = 0 @@ -209,33 +212,35 @@ cdef class PredictorClassification(Predictor): # if predict_proba = False this return is hit return self.__predict(X) - @classmethod - def forest_predict(cls, - cnp.ndarray[DOUBLE_t, ndim=2] X_train, + @staticmethod + def forest_predict(cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: + + # Remove no parallel if given + kwargs.pop("__no_parallel", None) + # Forest_predict_proba if "predict_proba" in kwargs: if kwargs["predict_proba"]: predictions = parallel.async_map(predict_proba, map_input=trees, - X_train = X_train, - Y_train = Y_train, X_pred=X_pred, - predictor=cls, + __no_parallel=True, **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) + st = time.time() predictions = parallel.async_map(predict_default, map_input=trees, - X_train = X_train, - Y_train = Y_train, X_pred=X_pred, - predictor=cls, + __no_parallel=True, **kwargs) + et = time.time() + print("Parallel time predict: ", et - st) return np.array(np.apply_along_axis(mode, 0, predictions), dtype=int) @@ -348,12 +353,11 @@ cdef class PredictorQuantile(Predictor): else: cur_node = dNode.right_child - prediction[i] = np.quantile(self.Y.base[cur_node.indices, 0], quantile) + prediction[i] = np.quantile(self.Y[cur_node.indices, 0], quantile) return prediction - @classmethod - def forest_predict(cls, - cnp.ndarray[DOUBLE_t, ndim=2] X_train, + @staticmethod + def forest_predict(cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 95587abc..c092381d 100644 --- a/src/adaXT/random_forest/random_forest.py +++ 
b/src/adaXT/random_forest/random_forest.py @@ -151,7 +151,6 @@ def build_single_tree( leaf_builder=leaf_builder, predictor=predictor, splitter=splitter, - ensemble=True, ) tree.fit(X=X, Y=Y, sample_indices=fitting_indices, sample_weight=sample_weight) if honest_tree: From e8bda8fda24c5f5e217551041b886a43b206bc43 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 12:24:30 +0100 Subject: [PATCH 10/33] Changed to sequential naming --- src/adaXT/decision_tree/__init__.pxd | 2 - src/adaXT/decision_tree/_decision_tree.pyx | 15 ++- src/adaXT/decision_tree/nodes.pxd | 8 +- src/adaXT/parallel.py | 57 +++++++---- src/adaXT/predictor/predictor.pxd | 9 +- src/adaXT/predictor/predictor.pyx | 106 ++++++++++++--------- src/adaXT/random_forest/random_forest.py | 3 +- 7 files changed, 119 insertions(+), 81 deletions(-) diff --git a/src/adaXT/decision_tree/__init__.pxd b/src/adaXT/decision_tree/__init__.pxd index 24b0312a..e5b084f8 100644 --- a/src/adaXT/decision_tree/__init__.pxd +++ b/src/adaXT/decision_tree/__init__.pxd @@ -1,3 +1 @@ - from .nodes cimport LeafNode, DecisionNode, Node -from .decision_tree cimport DecisionTree diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index 6e5a77e0..f7b7cf2d 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -12,18 +12,23 @@ from .splitter import Splitter from ..predictor import Predictor from ..criteria import Criteria from ..leaf_builder import LeafBuilder +from .nodes import DecisionNode # for c level definitions cimport cython -from .nodes import DecisionNode +from .nodes cimport DecisionNode, Node from ..utils cimport dsum cdef double EPSILON = np.finfo('double').eps +# Pseudo Node class, which will get replaced after refitting leaf nodes. 
+cdef class refit_object(Node): + cdef public: + list list_idx + bint is_left -class refit_object(): def __init__( self, idx: int, @@ -31,13 +36,13 @@ class refit_object(): parent: DecisionNode, is_left: bool) -> None: - self.indices = [idx] + self.list_idx = [idx] self.depth = depth self.parent = parent self.is_left = is_left def add_idx(self, idx: int) -> None: - self.indices.append(idx) + self.list_idx.append(idx) @cython.auto_pickle(True) @@ -252,7 +257,7 @@ cdef class _DecisionTree(): nodes = [] for i in range(n_objs): obj = refit_objs[i] - leaf_indices = np.array(obj.indices, dtype=np.int32) + leaf_indices = np.array(obj.list_idx, dtype=np.int32) weighted_samples = dsum(sample_weight, leaf_indices) new_node = leaf_builder_instance.build_leaf( leaf_id=i, diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index cc420d62..0d63c498 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -1,19 +1,21 @@ cimport numpy as cnp +cimport cython cdef class Node: cdef public: + Node parent cnp.ndarray indices int depth double impurity - object parent bint visited bint is_leaf +@cython.final cdef class DecisionNode(Node): cdef public: + Node left_child + Node right_child double threshold int split_idx - object left_child - object right_child cdef class LeafNode(Node): cdef public: diff --git a/src/adaXT/parallel.py b/src/adaXT/parallel.py index 0f992fdd..7d876b88 100644 --- a/src/adaXT/parallel.py +++ b/src/adaXT/parallel.py @@ -45,7 +45,13 @@ def __init__( self.ctx = multiprocessing.get_context("fork") self.n_jobs = n_jobs if n_jobs != -1 else cpu_count() - def async_map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterable: + def async_map( + self, + function: Callable, + map_input: Iterable, + sequential: bool = False, + **kwargs, + ) -> Iterable: """ Asynchronously applies the function to the map_input passing along any kwargs given to the function. 
@@ -64,12 +70,7 @@ def async_map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterab """ partial_func = partial(function, **kwargs) - parallel = True - if "__no_parallel" in kwargs: - if kwargs["__no_parallel"]: - parallel = False - - if self.n_jobs == 1 or not parallel: + if self.n_jobs == 1 or sequential: ret = list(map(partial_func, map_input)) else: with self.ctx.Pool(self.n_jobs) as p: @@ -77,7 +78,13 @@ def async_map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterab ret = promise.get() return ret - def map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterable: + def map( + self, + function: Callable, + map_input: Iterable, + sequential: bool = False, + **kwargs, + ) -> Iterable: """ Maps the function with map_input. Similair to async_map, but instead guarantees that the first element returned is the result of the first @@ -99,7 +106,7 @@ def map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterable: """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or ("__no_parallel" in kwargs): + if self.n_jobs == 1 or sequential: ret = list(map(partial_func, map_input)) else: with self.ctx.Pool(self.n_jobs) as p: @@ -107,7 +114,11 @@ def map(self, function: Callable, map_input: Iterable, **kwargs) -> Iterable: return ret def async_starmap( - self, function: Callable, map_input: Iterable, **kwargs + self, + function: Callable, + map_input: Iterable, + sequential: bool = False, + **kwargs, ) -> Iterable: """ Asynchronously apply function to map_input, where map_input might be a @@ -128,7 +139,7 @@ def async_starmap( Returns the result of applying function to each element of map_input """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or ("__no_parallel" in kwargs): + if self.n_jobs == 1 or sequential: ret = list(starmap(partial_func, map_input)) else: with self.ctx.Pool(self.n_jobs) as p: @@ -136,7 +147,13 @@ def async_starmap( ret = promise.get() return ret - def starmap(self, 
function: Callable, map_input: Iterable, **kwargs) -> Any: + def starmap( + self, + function: Callable, + map_input: Iterable, + sequential: bool = False, + **kwargs, + ) -> Any: """ Applies function to each elemetn of map_input but guarantees that element i of return value is the result of function applied to element i @@ -158,16 +175,16 @@ def starmap(self, function: Callable, map_input: Iterable, **kwargs) -> Any: Returns the result of applying function to each element of map_input """ partial_func = partial(function, **kwargs) - if (self.n_jobs == 1) or ( - ("__no_parallel" in kwargs) and kwargs["__no_parallel"] - ): + if (self.n_jobs == 1) or sequential: ret = list(starmap(partial_func, map_input)) else: with self.ctx.Pool(self.n_jobs) as p: ret = p.starmap(partial_func, map_input) return ret - def async_apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterable: + def async_apply( + self, function: Callable, n_iterations: int, sequential: bool = False, **kwargs + ) -> Iterable: """ Applies the function n_iterations number of times and returns the result of the n_iterations in an unknown order. 
@@ -186,7 +203,7 @@ def async_apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterab Function applied n_iterations number of times """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or ("__no_parallel" in kwargs): + if self.n_jobs == 1 or sequential: ret = [partial_func() for _ in range(n_iterations)] else: with self.ctx.Pool(self.n_jobs) as p: @@ -194,7 +211,9 @@ def async_apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterab ret = [res.get() for res in promise] return ret - def apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterable: + def apply( + self, function: Callable, n_iterations: int, sequential: bool = False, **kwargs + ) -> Iterable: """ Applies the function n_iterations number of times and returns the result of the n_iterations where element i corresponds to the i'th return value @@ -213,7 +232,7 @@ def apply(self, function: Callable, n_iterations: int, **kwargs) -> Iterable: Function applied n_iterations number of times """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or ("__no_parallel" in kwargs): + if self.n_jobs == 1 or sequential: ret = [partial_func() for _ in range(n_iterations)] else: with self.ctx.Pool(self.n_jobs) as p: diff --git a/src/adaXT/predictor/predictor.pxd b/src/adaXT/predictor/predictor.pxd index bb405447..77c18433 100644 --- a/src/adaXT/predictor/predictor.pxd +++ b/src/adaXT/predictor/predictor.pxd @@ -1,4 +1,5 @@ cimport numpy as cnp +from ..decision_tree cimport Node cdef class Predictor(): cdef: @@ -6,7 +7,7 @@ cdef class Predictor(): cnp.ndarray X cnp.ndarray Y int n_features - object root + Node root cpdef dict predict_leaf(self, double[:, ::1] X) @@ -15,18 +16,18 @@ cdef class PredictorClassification(Predictor): cdef: readonly cnp.ndarray classes - cdef int __find_max_index(self, float[::1] lst) + cdef int __find_max_index(self, float[::1] lst) noexcept nogil cdef cnp.ndarray __predict_proba(self, double[:, ::1] X) - cdef cnp.ndarray 
__predict(self, double[:, ::1] X) + cdef inline double[::1] __predict(self, double[:, ::1] X) noexcept cdef class PredictorRegression(Predictor): pass -cdef class PredictorLocalPolynomial(PredictorRegression): +cdef class PredictorLocalPolynomial(Predictor): pass diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 130317f3..5ae8b2c6 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -1,10 +1,12 @@ import numpy as np from numpy import float64 as DOUBLE from ..decision_tree.nodes import DecisionNode -from ..decision_tree.nodes cimport Node, DecisionNode from collections.abc import Sequence from statistics import mode + cimport numpy as cnp +from ..decision_tree.nodes cimport Node, DecisionNode, LeafNode +cimport cython from ..parallel import ParallelModel @@ -52,15 +54,16 @@ def predict_quantile( indices = [] for i in range(n_obs): - cur_node = tree.root + cur_node = tree.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X_pred[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child + cur_node = dNode.right_child + indices.append(cur_node.indices) @@ -68,7 +71,6 @@ def predict_quantile( cdef class Predictor(): - def __init__(self, const double[:, ::1] X, const double[:, ::1] Y, object root, **kwargs): self.X = np.asarray(X) self.Y = np.asarray(Y) @@ -91,15 +93,15 @@ cdef class Predictor(): n_obs = X.shape[0] for i in range(n_obs): - cur_node = self.root + cur_node = self.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child + cur_node = dNode.right_child if cur_node.id not in ht.keys(): ht[cur_node.id] = [i] @@ 
-126,14 +128,18 @@ cdef class Predictor(): trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: + sequential = True + if "sequential" in kwargs: + sequential = kwargs["sequential"] + kwargs.pop("sequential") predictions = parallel.async_map(predict_default, trees, X_pred=X_pred, - __no_parallel=True, + sequential=sequential, **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) - +@cython.final cdef class PredictorClassification(Predictor): def __init__(self, const double[:, ::1] X, @@ -142,41 +148,40 @@ cdef class PredictorClassification(Predictor): super().__init__(X, Y, root, **kwargs) self.classes = np.unique(Y) - cdef int __find_max_index(self, float[::1] lst): + cdef int __find_max_index(self, float[::1] lst) noexcept nogil: cdef: int cur_max, i cur_max = 0 - for i in range(1, len(lst)): + for i in range(1, lst.shape[0]): if lst[cur_max] < lst[i]: cur_max = i return cur_max - cdef inline cnp.ndarray __predict(self, double[:, ::1] X): + cdef inline double[::1] __predict(self, double[:, ::1] X) noexcept: cdef: int i, cur_split_idx, n_obs double cur_threshold Node cur_node DecisionNode dNode - double[:] prediction + double[::1] prediction # Make sure that x fits the dimensions. 
n_obs = X.shape[0] prediction = np.empty(n_obs, dtype=DOUBLE) for i in range(n_obs): - cur_node = self.root + cur_node = self.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child - + cur_node = dNode.right_child idx = self.__find_max_index(cur_node.value) prediction[i] = self.classes[idx] - return np.array(prediction) + return prediction cdef inline cnp.ndarray __predict_proba(self, double[:, ::1] X): cdef: @@ -191,15 +196,15 @@ cdef class PredictorClassification(Predictor): ret_val = [] for i in range(n_obs): - cur_node = self.root + cur_node = self.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child + cur_node = dNode.right_child ret_val.append(cur_node.value) return np.array(ret_val) @@ -210,7 +215,7 @@ cdef class PredictorClassification(Predictor): return self.__predict_proba(X) # if predict_proba = False this return is hit - return self.__predict(X) + return np.asarray(self.__predict(X)) @staticmethod def forest_predict(cnp.ndarray[DOUBLE_t, ndim=2] X_train, @@ -219,9 +224,10 @@ cdef class PredictorClassification(Predictor): trees: list[DecisionTree], parallel: ParallelModel, **kwargs) -> np.ndarray: - - # Remove no parallel if given - kwargs.pop("__no_parallel", None) + sequential = True + if "sequential" in kwargs: + sequential = kwargs["sequential"] + kwargs.pop("sequential") # Forest_predict_proba if "predict_proba" in kwargs: @@ -229,21 +235,21 @@ cdef class PredictorClassification(Predictor): predictions = parallel.async_map(predict_proba, map_input=trees, X_pred=X_pred, - __no_parallel=True, + sequential=sequential, 
**kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) st = time.time() predictions = parallel.async_map(predict_default, - map_input=trees, + trees, X_pred=X_pred, - __no_parallel=True, + sequential=sequential, **kwargs) et = time.time() print("Parallel time predict: ", et - st) return np.array(np.apply_along_axis(mode, 0, predictions), dtype=int) - +@cython.final cdef class PredictorRegression(Predictor): def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: cdef: @@ -262,15 +268,15 @@ cdef class PredictorRegression(Predictor): prediction = np.empty(n_obs, dtype=DOUBLE) for i in range(n_obs): - cur_node = self.root + cur_node = self.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child + cur_node = dNode.right_child if cur_node.value.ndim == 1: prediction[i] = cur_node.value[0] @@ -279,7 +285,7 @@ cdef class PredictorRegression(Predictor): return prediction -cdef class PredictorLocalPolynomial(PredictorRegression): +cdef class PredictorLocalPolynomial(Predictor): def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: cdef: int i, cur_split_idx, n_obs, ind, oo @@ -299,15 +305,15 @@ cdef class PredictorLocalPolynomial(PredictorRegression): deriv_mat = np.empty((n_obs, len(order)), dtype=DOUBLE) for i in range(n_obs): - cur_node = self.root + cur_node = self.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child + cur_node = dNode.right_child ind = 0 for oo in order: @@ -343,15 +349,15 @@ cdef class PredictorQuantile(Predictor): prediction = np.empty(n_obs, dtype=DOUBLE) for i in range(n_obs): - cur_node = self.root + 
cur_node = self.root while not cur_node.is_leaf: - dNode = cur_node + dNode = cur_node cur_split_idx = dNode.split_idx cur_threshold = dNode.threshold if X[i, cur_split_idx] <= cur_threshold: - cur_node = dNode.left_child + cur_node = dNode.left_child else: - cur_node = dNode.right_child + cur_node = dNode.right_child prediction[i] = np.quantile(self.Y[cur_node.indices, 0], quantile) return prediction @@ -370,10 +376,16 @@ cdef class PredictorQuantile(Predictor): raise ValueError( "quantile called without quantile passed as argument" ) + sequential = True + if "sequential" in kwargs: + sequential = kwargs["sequential"] + kwargs.pop("sequential") + quantile = kwargs['quantile'] n_obs = X_pred.shape[0] prediction_indices = parallel.async_map(predict_quantile, map_input=trees, + sequential=sequential, X_pred=X_pred) # In case the leaf nodes have multiple elements and not just one, we # have to combine them together diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index c092381d..445ea30d 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -4,6 +4,7 @@ import numpy as np from numpy.random import Generator, default_rng +from adaXT import parallel from adaXT.parallel import ParallelModel, shared_numpy_array from numpy.typing import ArrayLike @@ -176,7 +177,7 @@ def oob_calculation( X_pred=X_pred, trees=trees, parallel=parallel, - __no_parallel=True, + sequential=True, ).astype(np.float64) Y_true = Y_train[idx] return (Y_pred, Y_true) From 3b5d33678c26b80e1b0d6de92ca84e68a9b6111c Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 14:26:35 +0100 Subject: [PATCH 11/33] sequential true --- src/adaXT/random_forest/random_forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 445ea30d..86330d47 100644 --- a/src/adaXT/random_forest/random_forest.py +++ 
b/src/adaXT/random_forest/random_forest.py @@ -564,6 +564,7 @@ def predict(self, X: ArrayLike, **kwargs) -> np.ndarray: X_pred=predict_value, trees=self.trees, parallel=self.parallel, + sequential=True, **kwargs, ) return prediction From 2e53ea74b793d2537c9a94ab3aa812c3b8764d71 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 14:29:30 +0100 Subject: [PATCH 12/33] Euclidean norm criteria --- src/adaXT/criteria/criteria.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 323f1894..c31fab1d 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -439,6 +439,7 @@ cdef class Squared_error(RegressionCriteria): square_err = cur_sum/obs_weight - mu*mu return square_err +#TODO: Euclidean norm Criteria. # Partial linear criteria cdef class Partial_linear(RegressionCriteria): From 052ce5f0eb3f32f8fe900974ee96435cb1ab3a53 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 14:37:24 +0100 Subject: [PATCH 13/33] Fixed linting --- src/adaXT/decision_tree/nodes.pxd | 1 + src/adaXT/decision_tree/nodes.pyx | 1 - src/adaXT/predictor/predictor.pyx | 5 +++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index 0d63c498..2eb76497 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -9,6 +9,7 @@ cdef class Node: bint visited bint is_leaf + @cython.final cdef class DecisionNode(Node): cdef public: diff --git a/src/adaXT/decision_tree/nodes.pyx b/src/adaXT/decision_tree/nodes.pyx index 6c95b6c4..fa911570 100644 --- a/src/adaXT/decision_tree/nodes.pyx +++ b/src/adaXT/decision_tree/nodes.pyx @@ -30,7 +30,6 @@ cdef class DecisionNode(Node): is_leaf: int = 0, visited: int = 0) -> None: - super().__init__(indices, depth, impurity) self.threshold = threshold self.split_idx = split_idx diff --git 
a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 5ae8b2c6..81c8cc90 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -5,7 +5,7 @@ from collections.abc import Sequence from statistics import mode cimport numpy as cnp -from ..decision_tree.nodes cimport Node, DecisionNode, LeafNode +from ..decision_tree.nodes cimport Node, DecisionNode cimport cython from ..parallel import ParallelModel @@ -64,7 +64,6 @@ def predict_quantile( else: cur_node = dNode.right_child - indices.append(cur_node.indices) return indices @@ -139,6 +138,7 @@ cdef class Predictor(): **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) + @cython.final cdef class PredictorClassification(Predictor): def __init__(self, @@ -249,6 +249,7 @@ cdef class PredictorClassification(Predictor): print("Parallel time predict: ", et - st) return np.array(np.apply_along_axis(mode, 0, predictions), dtype=int) + @cython.final cdef class PredictorRegression(Predictor): def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: From 45eb650432ae1708dd9704b46b87ce7abb2f8559 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 25 Feb 2025 14:47:30 +0100 Subject: [PATCH 14/33] Fixed linting --- src/adaXT/criteria/criteria.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index c31fab1d..ae44753a 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -439,7 +439,7 @@ cdef class Squared_error(RegressionCriteria): square_err = cur_sum/obs_weight - mu*mu return square_err -#TODO: Euclidean norm Criteria. +# TODO: Euclidean norm Criteria. 
# Partial linear criteria cdef class Partial_linear(RegressionCriteria): From 7306c39a90de9b3531b4e903bc416b9184438f0f Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 4 Mar 2025 13:23:47 +0100 Subject: [PATCH 15/33] Changes n_jobs parameter --- src/adaXT/decision_tree/nodes.pxd | 1 - src/adaXT/parallel.py | 38 +++++++++++------------- src/adaXT/predictor/predictor.pyx | 31 +++++-------------- src/adaXT/random_forest/random_forest.py | 34 ++++++++++++++------- 4 files changed, 48 insertions(+), 56 deletions(-) diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index 2eb76497..0d63c498 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -9,7 +9,6 @@ cdef class Node: bint visited bint is_leaf - @cython.final cdef class DecisionNode(Node): cdef public: diff --git a/src/adaXT/parallel.py b/src/adaXT/parallel.py index 7d876b88..87372693 100644 --- a/src/adaXT/parallel.py +++ b/src/adaXT/parallel.py @@ -33,7 +33,6 @@ class ParallelModel: def __init__( self, - n_jobs: int = -1, ): """ Parameters @@ -43,13 +42,12 @@ def __init__( uses all available proccesors """ self.ctx = multiprocessing.get_context("fork") - self.n_jobs = n_jobs if n_jobs != -1 else cpu_count() def async_map( self, function: Callable, map_input: Iterable, - sequential: bool = False, + n_jobs: int = 1, **kwargs, ) -> Iterable: """ @@ -70,10 +68,10 @@ def async_map( """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or sequential: + if n_jobs == 1: ret = list(map(partial_func, map_input)) else: - with self.ctx.Pool(self.n_jobs) as p: + with self.ctx.Pool(n_jobs) as p: promise = p.map_async(partial_func, map_input) ret = promise.get() return ret @@ -82,7 +80,7 @@ def map( self, function: Callable, map_input: Iterable, - sequential: bool = False, + n_jobs: int = 1, **kwargs, ) -> Iterable: """ @@ -106,10 +104,10 @@ def map( """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or 
sequential: + if n_jobs == 1: ret = list(map(partial_func, map_input)) else: - with self.ctx.Pool(self.n_jobs) as p: + with self.ctx.Pool(n_jobs) as p: ret = p.map(partial_func, map_input) return ret @@ -117,7 +115,7 @@ def async_starmap( self, function: Callable, map_input: Iterable, - sequential: bool = False, + n_jobs: int = 1, **kwargs, ) -> Iterable: """ @@ -139,10 +137,10 @@ def async_starmap( Returns the result of applying function to each element of map_input """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or sequential: + if n_jobs == 1: ret = list(starmap(partial_func, map_input)) else: - with self.ctx.Pool(self.n_jobs) as p: + with self.ctx.Pool(n_jobs) as p: promise = p.starmap_async(partial_func, map_input) ret = promise.get() return ret @@ -151,7 +149,7 @@ def starmap( self, function: Callable, map_input: Iterable, - sequential: bool = False, + n_jobs: int = 1, **kwargs, ) -> Any: """ @@ -175,15 +173,15 @@ def starmap( Returns the result of applying function to each element of map_input """ partial_func = partial(function, **kwargs) - if (self.n_jobs == 1) or sequential: + if n_jobs == 1: ret = list(starmap(partial_func, map_input)) else: - with self.ctx.Pool(self.n_jobs) as p: + with self.ctx.Pool(n_jobs) as p: ret = p.starmap(partial_func, map_input) return ret def async_apply( - self, function: Callable, n_iterations: int, sequential: bool = False, **kwargs + self, function: Callable, n_iterations: int, n_jobs: int = 1, **kwargs ) -> Iterable: """ Applies the function n_iterations number of times and returns the result @@ -203,16 +201,16 @@ def async_apply( Function applied n_iterations number of times """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or sequential: + if n_jobs == 1: ret = [partial_func() for _ in range(n_iterations)] else: - with self.ctx.Pool(self.n_jobs) as p: + with self.ctx.Pool(n_jobs) as p: promise = [p.apply_async(partial_func) for _ in range(n_iterations)] ret = [res.get() for res in 
promise] return ret def apply( - self, function: Callable, n_iterations: int, sequential: bool = False, **kwargs + self, function: Callable, n_iterations: int, n_jobs: int = 1, **kwargs ) -> Iterable: """ Applies the function n_iterations number of times and returns the result @@ -232,9 +230,9 @@ def apply( Function applied n_iterations number of times """ partial_func = partial(function, **kwargs) - if self.n_jobs == 1 or sequential: + if n_jobs == 1: ret = [partial_func() for _ in range(n_iterations)] else: - with self.ctx.Pool(self.n_jobs) as p: + with self.ctx.Pool(n_jobs) as p: ret = [p.apply(partial_func) for _ in range(n_iterations)] return ret diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 81c8cc90..06bd5dbb 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -10,7 +10,6 @@ cimport cython from ..parallel import ParallelModel -import time # Circular import. Since only used for typing, this fixes the issue. 
from typing import TYPE_CHECKING @@ -126,15 +125,12 @@ cdef class Predictor(): cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, + n_jobs: int=1, **kwargs) -> np.ndarray: - sequential = True - if "sequential" in kwargs: - sequential = kwargs["sequential"] - kwargs.pop("sequential") predictions = parallel.async_map(predict_default, trees, X_pred=X_pred, - sequential=sequential, + n_jobs=n_jobs, **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) @@ -223,30 +219,21 @@ cdef class PredictorClassification(Predictor): cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, + n_jobs: int=1, **kwargs) -> np.ndarray: - sequential = True - if "sequential" in kwargs: - sequential = kwargs["sequential"] - kwargs.pop("sequential") - # Forest_predict_proba if "predict_proba" in kwargs: if kwargs["predict_proba"]: - predictions = parallel.async_map(predict_proba, - map_input=trees, - X_pred=X_pred, - sequential=sequential, + predictions = parallel.async_map(predict_proba, map_input=trees, + X_pred=X_pred, n_jobs=n_jobs **kwargs) return np.mean(predictions, axis=0, dtype=DOUBLE) - st = time.time() predictions = parallel.async_map(predict_default, trees, X_pred=X_pred, sequential=sequential, **kwargs) - et = time.time() - print("Parallel time predict: ", et - st) return np.array(np.apply_along_axis(mode, 0, predictions), dtype=int) @@ -369,6 +356,7 @@ cdef class PredictorQuantile(Predictor): cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, + n_jobs: int=1, **kwargs) -> np.ndarray: cdef: int i, j, n_obs, n_trees @@ -377,16 +365,11 @@ cdef class PredictorQuantile(Predictor): raise ValueError( "quantile called without quantile passed as argument" ) - sequential = True - if "sequential" in kwargs: - sequential = kwargs["sequential"] - kwargs.pop("sequential") - quantile = kwargs['quantile'] n_obs = X_pred.shape[0] prediction_indices = 
parallel.async_map(predict_quantile, map_input=trees, - sequential=sequential, + n_jobs=n_jobs, X_pred=X_pred) # In case the leaf nodes have multiple elements and not just one, we # have to combine them together diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 86330d47..562ba887 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -176,8 +176,8 @@ def oob_calculation( Y_train=Y_train, X_pred=X_pred, trees=trees, + n_jobs=1, parallel=parallel, - sequential=True, ).astype(np.float64) Y_true = Y_train[idx] return (Y_pred, Y_true) @@ -196,9 +196,9 @@ class RandomForest(BaseModel): (currently "Regression", "Classification", "Quantile" or "Gradient"). n_estimators : int The number of trees in the random forest. - n_jobs : int - The number of processes used to fit, and predict for the forest, -1 - uses all available proccesors. + n_jobs : int | tuple[int, int] + The number of jobs used to fit and predict. If tuple, then different + between the two sampling: str | None Either resampling, honest_tree, honest_forest or None. 
sampling_args: dict | None @@ -236,7 +236,7 @@ def __init__( self, forest_type: str | None, n_estimators: int = 100, - n_jobs: int = -1, + n_jobs: int | tuple[int, int] = 1, sampling: str | None = "resampling", sampling_args: dict | None = None, max_features: int | float | Literal["sqrt", "log2"] | None = None, @@ -326,6 +326,7 @@ def __init__( self.predictor = predictor self.n_jobs = n_jobs + self.seed = seed def __get_random_generator(self, seed) -> Generator: @@ -409,6 +410,7 @@ def __build_trees(self) -> None: map_input=self.parent_rng.spawn(self.n_estimators), sampling_args=self.sampling_args, X_n_rows=self.X_n_rows, + n_jobs=self.n_jobs_fit, sampling=self.sampling, ) self.fitting_indices, self.prediction_indices, self.out_of_bag_indices = zip( @@ -433,6 +435,7 @@ def __build_trees(self) -> None: max_features=self.max_features, skip_check_input=True, sample_weight=self.sample_weight, + n_jobs=self.n_jobs_fit, ) def fit( @@ -461,7 +464,7 @@ def fit( self.leaf_builder, self.predictor, ) - self.parallel = ParallelModel(n_jobs=self.n_jobs) + self.parallel = ParallelModel() self.parent_rng = self.__get_random_generator(self.seed) # Check input @@ -473,6 +476,16 @@ def fit( self.sample_weight = self._check_sample_weight(sample_weight) self.sampling_args = self.__get_sampling_parameter(self.sampling_args) + # Check n_jobs + if isinstance(self.n_jobs, tuple): + self.n_jobs_fit = self.n_jobs[0] + self.n_jobs_pred = self.n_jobs[1] + elif isinstance(self.n_jobs, int): + self.n_jobs_fit = self.n_jobs + self.n_jobs_pred = self.n_jobs + else: + raise ValueError("n_jobs is neither a tuple or int") + # Fit trees self.__build_trees() self.forest_fitted = True @@ -501,6 +514,7 @@ def fit( Y_train=self.Y, parallel=self.parallel, predictor=self.predictor, + n_jobs=self.n_jobs_pred, ) ) ) @@ -564,7 +578,7 @@ def predict(self, X: ArrayLike, **kwargs) -> np.ndarray: X_pred=predict_value, trees=self.trees, parallel=self.parallel, - sequential=True, + n_jobs=self.n_jobs_pred, 
**kwargs, ) return prediction @@ -617,9 +631,8 @@ def predict_weights( X1=None, size_X0=size_0, size_X1=self.X_n_rows, - X_train=self.X, - Y_train=self.Y, scaling=scaling, + n_jobs=self.n_jobs_pred, ) if scale: @@ -660,8 +673,7 @@ def similarity(self, X0: ArrayLike, X1: ArrayLike): X1=X1, size_X0=size_0, size_X1=size_1, - X_train=self.X, - Y_train=self.Y, scaling="similarity", + n_jobs=self.n_jobs_pred, ) return np.mean(weight_list, axis=0) From 8daaf291cf6362bef403086256d71a9ff4893afd Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 7 Mar 2025 08:43:14 +0100 Subject: [PATCH 16/33] Added n_jobs parameter on forest for fitting and predicting --- src/adaXT/decision_tree/_decision_tree.pyx | 9 +-------- src/adaXT/decision_tree/nodes.pxd | 1 + src/adaXT/predictor/predictor.pyx | 11 +++++------ src/adaXT/random_forest/random_forest.py | 6 ++---- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index f7b7cf2d..ccae0f58 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -122,14 +122,7 @@ cdef class _DecisionTree(): size_0, self.n_rows_predict, scaling=scaling) - def _forest_predict_leaf(self, double[:, ::1] X_train, double[:, ::1] - Y_train, double[:, ::1] X_pred, **kwargs): - if X_pred is None: - return self.__get_leaf() - predictor_instance = self.predictor(X_train, Y_train, self.root) - return predictor_instance.predict_leaf(X_pred, **kwargs) - - def predict_leaf(self, X: np.ndarray | None = None) -> dict: + def predict_leaf(self, X: double[:, ::1] | None = None) -> dict: if X is None: return self.__get_leaf() if self.predictor_instance is None: diff --git a/src/adaXT/decision_tree/nodes.pxd b/src/adaXT/decision_tree/nodes.pxd index 0d63c498..2eb76497 100644 --- a/src/adaXT/decision_tree/nodes.pxd +++ b/src/adaXT/decision_tree/nodes.pxd @@ -9,6 +9,7 @@ cdef class Node: bint visited bint is_leaf + 
@cython.final cdef class DecisionNode(Node): cdef public: diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index 06bd5dbb..ad2ab66a 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -10,7 +10,6 @@ cimport cython from ..parallel import ParallelModel - # Circular import. Since only used for typing, this fixes the issue. from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -125,8 +124,9 @@ cdef class Predictor(): cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, - n_jobs: int=1, + n_jobs: int = 1, **kwargs) -> np.ndarray: + predictions = parallel.async_map(predict_default, trees, X_pred=X_pred, @@ -219,7 +219,7 @@ cdef class PredictorClassification(Predictor): cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, - n_jobs: int=1, + n_jobs: int = 1, **kwargs) -> np.ndarray: # Forest_predict_proba if "predict_proba" in kwargs: @@ -232,12 +232,11 @@ cdef class PredictorClassification(Predictor): predictions = parallel.async_map(predict_default, trees, X_pred=X_pred, - sequential=sequential, + n_jobs=n_jobs, **kwargs) return np.array(np.apply_along_axis(mode, 0, predictions), dtype=int) -@cython.final cdef class PredictorRegression(Predictor): def predict(self, double[:, ::1] X, **kwargs) -> np.ndarray: cdef: @@ -356,7 +355,7 @@ cdef class PredictorQuantile(Predictor): cnp.ndarray[DOUBLE_t, ndim=2] X_pred, trees: list[DecisionTree], parallel: ParallelModel, - n_jobs: int=1, + n_jobs: int = 1, **kwargs) -> np.ndarray: cdef: int i, j, n_obs, n_trees diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 562ba887..1b474e47 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -25,12 +25,10 @@ def tree_based_weights( X1: np.ndarray | None, size_X0: int, size_X1: int, - X_train: np.ndarray, - Y_train: np.ndarray, scaling: str, ) -> 
np.ndarray: - hash0 = tree._forest_predict_leaf(X_pred=X0, X_train=X_train, Y_train=Y_train) - hash1 = tree._forest_predict_leaf(X_pred=X1, X_train=X_train, Y_train=Y_train) + hash0 = tree.predict_leaf(X=X0) + hash1 = tree.predict_leaf(X=X1) return tree._tree_based_weights( hash0=hash0, hash1=hash1, From c53fea36f286d59961f85f9de79d58af2832793e Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 7 Mar 2025 09:14:48 +0100 Subject: [PATCH 17/33] Naming change to camelcase for criteria --- .../creating_custom_criteria/example_usage.py | 2 +- .../creating_custom_criteria/testCrit.pyx | 2 +- docs/user_guide/creatingCriteria.md | 12 ++-- docs/user_guide/decision_tree.md | 12 ++-- docs/user_guide/random_forest.md | 8 +-- docs/user_guide/scikit_learn.md | 6 +- src/adaXT/base_model.pyx | 8 +-- src/adaXT/criteria/__init__.pxd | 8 +-- src/adaXT/criteria/__init__.py | 8 +-- src/adaXT/criteria/criteria.pxd | 8 +-- src/adaXT/criteria/criteria.pyi | 8 +-- src/adaXT/criteria/criteria.pyx | 17 +++-- src/adaXT/predictor/predictor.pyx | 12 ---- tests/test_decision_tree.py | 28 ++++---- tests/test_random_forest.py | 24 +++---- tests/test_tree_features.py | 72 +++++++++---------- time_decision_tree.py | 12 ++-- time_random_forest.py | 6 +- 18 files changed, 122 insertions(+), 131 deletions(-) diff --git a/docs/assets/examples/creating_custom_criteria/example_usage.py b/docs/assets/examples/creating_custom_criteria/example_usage.py index 597e4e94..69a980f3 100644 --- a/docs/assets/examples/creating_custom_criteria/example_usage.py +++ b/docs/assets/examples/creating_custom_criteria/example_usage.py @@ -15,7 +15,7 @@ # Initialize and fit tree tree = DecisionTree("Regression", - criteria=testCrit.Partial_linear, + criteria=testCrit.PartialLinear, max_depth=3) tree.fit(X, Y) diff --git a/docs/assets/examples/creating_custom_criteria/testCrit.pyx b/docs/assets/examples/creating_custom_criteria/testCrit.pyx index 7571579f..b0ba8340 100644 --- 
a/docs/assets/examples/creating_custom_criteria/testCrit.pyx +++ b/docs/assets/examples/creating_custom_criteria/testCrit.pyx @@ -1,6 +1,6 @@ from adaXT.criteria cimport Criteria -cdef class Partial_linear(Criteria): +cdef class PartialLinear(Criteria): # Custom mean function, such that we don't have to loop through twice. cdef (double, double) custom_mean(self, int[::1] indices): diff --git a/docs/user_guide/creatingCriteria.md b/docs/user_guide/creatingCriteria.md index 58bf69f4..a489c195 100644 --- a/docs/user_guide/creatingCriteria.md +++ b/docs/user_guide/creatingCriteria.md @@ -139,12 +139,12 @@ tree = DecisionTree("Regression", criteria=my_custom_critera.My_custom_criteria, tree.fit(X, Y) ``` -We now go over a detailed example in which we construct the `Partial_linear` +We now go over a detailed example in which we construct the `PartialLinear` criteria. -## A detailed example: `Partial_linear` +## A detailed example: `PartialLinear` -The general idea of the `Partial_linear` criteria is to fit a linear function on +The general idea of the `PartialLinear` criteria is to fit a linear function on the first feature with the $Y$ value as the response, that is, $$ @@ -170,7 +170,7 @@ and start with the following lines: ```python from adaXT.criteria cimport Criteria -cdef class Partial_linear(Criteria): +cdef class PartialLinear(Criteria): ``` ### Calculating the mean @@ -316,7 +316,7 @@ X = np.random.uniform(0, 100, (n, m)) Y = np.random.uniform(0, 10, n) # Initialize and fit tree -tree = DecisionTree("Regression", testCrit.Partial_linear, max_depth=3) +tree = DecisionTree("Regression", testCrit.PartialLinear, max_depth=3) tree.fit(X, Y) # Plot the tree @@ -324,7 +324,7 @@ plot_tree(tree) plt.show() ``` -This creates a regression tree with the newly created custom `Partial_linear` +This creates a regression tree with the newly created custom `PartialLinear` criteria class, specifies the `max_depth` to be 3 and then plots the tree using both the 
[plot_tree](../api_docs/tree_utils.md#adaXT.decision_tree.tree_utils.plot_tree) based diff --git a/docs/user_guide/decision_tree.md b/docs/user_guide/decision_tree.md index 0c2a7f19..16b8ae64 100644 --- a/docs/user_guide/decision_tree.md +++ b/docs/user_guide/decision_tree.md @@ -58,14 +58,14 @@ Below is a short example that illustrates how to use a classification tree. ```py import numpy as np from adaXT.decision_tree import DecisionTree -from adaXT.criteria import Gini_index +from adaXT.criteria import GiniIndex X = np.array([[1, 1], [1, -1], [-1, -1], [-1, 1], [1, 1], [1, -1], [-1, -1], [-1, 1]]) Xtest = np.array([[1, 1], [1, -1], [-1, -1], [-1, 1]]) Y = [0, 1, 0, 1, 0, 0, 1, 1] -tree = DecisionTree("Classification", criteria=Gini_index) +tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y) print(tree.predict(Xtest)) print(tree.predict(Xtest, predict_proba=True)) @@ -74,7 +74,7 @@ print(tree.predict(Xtest, predict_proba=True)) In this example we created and fit a classification tree using training data and then used the fitted tree to predict the response at the training data. When initializing the tree we changed the default criteria to the -[Gini Index](../api_docs/Criteria.md#adaXT.criteria.criteria.Gini_index); it is +[Gini Index](../api_docs/Criteria.md#adaXT.criteria.criteria.GiniIndex); it is always possible to overwrite any of the default components of a specific tree type. Classification trees use a majority vote in each of the leaf nodes to decide which class to predict and ties are broken by selecting the smaller @@ -96,7 +96,7 @@ the data. 
For the `Regression` tree type, the following default components are used: - Criteria class: - [Squared_error](../api_docs/Criteria.md#adaXT.criteria.criteria.Squared_error) + [SquaredError](../api_docs/Criteria.md#adaXT.criteria.criteria.SquaredError) - Predict class: [PredictRegression](../api_docs/Predictor.md#adaXT.predict.predict.PredictRegression) - LeafBuilder class: @@ -124,7 +124,7 @@ print(tree.predict(Xnew)) For the `Quantile` tree type, the following default components are used: - Criteria class: - [Squared_error](../api_docs/Criteria.md#adaXT.criteria.criteria.Squared_error) + [SquaredError](../api_docs/Criteria.md#adaXT.criteria.criteria.SquaredError) - Predict class: [PredictorQuantile](../api_docs/Predictor.md#adaXT.predictor.predictor.PredictQuantile) - LeafBuilder class: @@ -159,7 +159,7 @@ prediction and it is possible to predict several quantiles simultaneously. For the `Gradient` tree type, the following default components are used: - Criteria class: - [Partial_quadratic](../api_docs/Criteria.md#adaXT.criteria.criteria.Partial_quadratic) + [PartialQuadratic](../api_docs/Criteria.md#adaXT.criteria.criteria.PartialQuadratic) - Predict class: [PredictLocalPolynomial](../api_docs/Predictor.md#adaXT.predict.predict.PredictLocalPolynomial) - LeafBuilder class: diff --git a/docs/user_guide/random_forest.md b/docs/user_guide/random_forest.md index bcd83ebd..14f25f12 100644 --- a/docs/user_guide/random_forest.md +++ b/docs/user_guide/random_forest.md @@ -15,7 +15,7 @@ example below. 
import numpy as np import matplotlib.pyplot as plt from adaXT.random_forest import RandomForest -from adaXT.criteria import Partial_linear +from adaXT.criteria import PartialLinear from adaXT.leaf_builder import LeafBuilderPartialLinear from adaXT.predictor import PredictorLocalPolynomial @@ -28,7 +28,7 @@ Xtest = np.linspace(-1, 1, 50).reshape(-1, 1) # Fit a regular regression forest and a regression forest with linear splits rf = RandomForest("Regression", min_samples_leaf=30) rf_lin = RandomForest("Regression", - criteria=Partial_linear, + criteria=PartialLinear, leaf_builder=LeafBuilderPartialLinear, predictor=PredictorLocalPolynomial, min_samples_leaf=30) @@ -44,8 +44,8 @@ plt.show() ``` In this example, we fit a regular regression forest (which uses the -[Squared_error](../api_docs/Criteria.md)) and a regression forest that uses the -[Partial_linear](../api_docs/Criteria.md) splitting criteria and predicts a +[SquaredError](../api_docs/Criteria.md)) and a regression forest that uses the +[PartialLinear](../api_docs/Criteria.md) splitting criteria and predicts a linear function in each leaf. As can be seen when running this example, the forest with the linear splits is able to produce a better fit when both forests are grown similarly deep. 
diff --git a/docs/user_guide/scikit_learn.md b/docs/user_guide/scikit_learn.md index f21c524a..f8e68644 100644 --- a/docs/user_guide/scikit_learn.md +++ b/docs/user_guide/scikit_learn.md @@ -18,7 +18,7 @@ there is the initial setup: ```python from adaXT.decision_tree import DecisionTree -from adaXT.criteria import Gini_index, Entropy +from adaXT.criteria import GiniIndex, Entropy from sklearn.model_selection import GridSearchCV from sklearn.tree import DecisionTreeClassifier @@ -36,14 +36,14 @@ param_grid = { "min_samples_split": [2, 5, 10], } -param_grid_ada = param_grid | {"criteria": [Gini_index, Entropy]} +param_grid_ada = param_grid | {"criteria": [GiniIndex, Entropy]} param_grid_sk = param_grid | {"criterion": ["gini", "entropy"]} ``` Here, we import the necessary components and setup the parameter grids of the two decision trees. One small difference to be aware of is that the parameter names and format are different in some cases, e.g., in sklearn it is called criterion and takes a string as input, while in adaXT it is called criteria and takes a criteria class -such as Gini_index, Entropy or perhaps your own [implementation](creatingCriteria.md). +such as GiniIndex, Entropy or perhaps your own [implementation](creatingCriteria.md). Next, we define and fit the GridSearchCV instance. 
```python diff --git a/src/adaXT/base_model.pyx b/src/adaXT/base_model.pyx index adeb697c..eac86bf2 100644 --- a/src/adaXT/base_model.pyx +++ b/src/adaXT/base_model.pyx @@ -1,7 +1,7 @@ from numpy import float64 as DOUBLE from .predictor import Predictor from .criteria import Criteria -from .criteria.criteria import Entropy, Squared_error, Partial_quadratic +from .criteria.criteria import Entropy, SquaredError, PartialQuadratic from .decision_tree.splitter import Splitter from .leaf_builder import LeafBuilder @@ -131,9 +131,9 @@ class BaseModel(): tree_types = { "Classification": [Entropy, PredictorClassification, LeafBuilderClassification], - "Regression": [Squared_error, PredictorRegression, LeafBuilderRegression], - "Gradient": [Partial_quadratic, PredictorLocalPolynomial, LeafBuilderPartialQuadratic], - "Quantile": [Squared_error, PredictorQuantile, LeafBuilderRegression] + "Regression": [SquaredError, PredictorRegression, LeafBuilderRegression], + "Gradient": [PartialQuadratic, PredictorLocalPolynomial, LeafBuilderPartialQuadratic], + "Quantile": [SquaredError, PredictorQuantile, LeafBuilderRegression] } if tree_type in tree_types.keys(): # Set the defaults diff --git a/src/adaXT/criteria/__init__.pxd b/src/adaXT/criteria/__init__.pxd index 3fd46c9a..9f95b6ed 100644 --- a/src/adaXT/criteria/__init__.pxd +++ b/src/adaXT/criteria/__init__.pxd @@ -2,9 +2,9 @@ from .criteria cimport ( ClassificationCriteria, RegressionCriteria, Criteria, - Gini_index, + GiniIndex, Entropy, - Squared_error, - Partial_linear, - Partial_quadratic + SquaredError, + PartialLinear, + PartialQuadratic ) diff --git a/src/adaXT/criteria/__init__.py b/src/adaXT/criteria/__init__.py index 027b0809..eef69688 100644 --- a/src/adaXT/criteria/__init__.py +++ b/src/adaXT/criteria/__init__.py @@ -1,10 +1,10 @@ from .criteria import ( ClassificationCriteria, RegressionCriteria, - Gini_index, - Squared_error, + GiniIndex, + SquaredError, Entropy, - Partial_linear, - Partial_quadratic, + 
PartialLinear, + PartialQuadratic, Criteria, ) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index 0f9ad052..fa002a41 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -61,7 +61,7 @@ cdef class ClassificationCriteria(Criteria): cdef void reset_weight_list(self, double* class_occurences) -cdef class Gini_index(ClassificationCriteria): +cdef class GiniIndex(ClassificationCriteria): cdef void reset_weight_list(self, double* class_occurences) @@ -123,7 +123,7 @@ cdef class Entropy(ClassificationCriteria): cdef class RegressionCriteria(Criteria): pass -cdef class Squared_error(RegressionCriteria): +cdef class SquaredError(RegressionCriteria): cdef: double left_sum double right_sum @@ -152,7 +152,7 @@ cdef class Squared_error(RegressionCriteria): """ -cdef class Partial_linear(RegressionCriteria): +cdef class PartialLinear(RegressionCriteria): cdef (double, double) __custom_mean(self, int[:] indices) @@ -192,7 +192,7 @@ cdef class Partial_linear(RegressionCriteria): """ -cdef class Partial_quadratic(RegressionCriteria): +cdef class PartialQuadratic(RegressionCriteria): cdef (double, double, double) __custom_mean(self, int[:] indices) diff --git a/src/adaXT/criteria/criteria.pyi b/src/adaXT/criteria/criteria.pyi index 0340d56a..e8e6bac8 100644 --- a/src/adaXT/criteria/criteria.pyi +++ b/src/adaXT/criteria/criteria.pyi @@ -13,7 +13,7 @@ class ClassificationCriteria(Criteria): pass -class Gini_index(ClassificationCriteria): +class GiniIndex(ClassificationCriteria): r""" Gini index based criteria, which can be used for classification. Formally, given class labels $\mathcal{L}$, the Gini index in a node @@ -49,7 +49,7 @@ class RegressionCriteria(Criteria): pass -class Squared_error(RegressionCriteria): +class SquaredError(RegressionCriteria): r""" Squared error based criteria, which can be used for regression and leads to standard CART splits. 
Formally, the squared error in a node @@ -69,7 +69,7 @@ class Squared_error(RegressionCriteria): pass -class Partial_linear(RegressionCriteria): +class PartialLinear(RegressionCriteria): r""" Criteria based on fitting a linear function in the first predictor variable in each leaf. Formally, in a node consisting of samples $I$, @@ -87,7 +87,7 @@ class Partial_linear(RegressionCriteria): pass -class Partial_quadratic(RegressionCriteria): +class PartialQuadratic(RegressionCriteria): r""" Criteria based on fitting a quadratic function in the first predictor variable in each leaf. Formally, in a node consisting of samples $I$, diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 1b177d09..24a39de8 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -106,7 +106,7 @@ cdef class ClassificationCriteria(Criteria): return tot_sum / n_samples # Gini index criteria -cdef class Gini_index(ClassificationCriteria): +cdef class GiniIndex(ClassificationCriteria): cpdef double impurity(self, int[::1] indices): if self.first_call: @@ -372,7 +372,7 @@ cdef class RegressionCriteria(Criteria): # Squared error criteria -cdef class Squared_error(RegressionCriteria): +cdef class SquaredError(RegressionCriteria): cdef double update_proxy(self, int[::1] indices, int new_split): cdef: @@ -395,6 +395,7 @@ cdef class Squared_error(RegressionCriteria): int i, idx int n_obs = indices.shape[0] double y_val, weight + # More efficient data excess self.left_sum = 0.0 self.right_sum = 0.0 @@ -424,8 +425,7 @@ cdef class Squared_error(RegressionCriteria): cdef double __squared_error(self, int[::1] indices): cdef: double cur_sum = 0.0 - double[:, ::1] Y = self.Y - double mu = weighted_mean(Y[:, 0], indices, self.sample_weight) # set mu to be the mean of the dataset + double mu = weighted_mean(self.Y[:, 0], indices, self.sample_weight) # set mu to be the mean of the dataset double square_err, tmp double obs_weight = 0.0 int i, p @@ -433,16 
+433,19 @@ cdef class Squared_error(RegressionCriteria): # Calculate the variance using: variance = sum((y_i - mu)^2)/y_len for i in range(n_indices): p = indices[i] - tmp = Y[p, 0] * self.sample_weight[p] + tmp = self.Y[p, 0] * self.sample_weight[p] cur_sum += tmp*tmp obs_weight += self.sample_weight[p] square_err = cur_sum/obs_weight - mu*mu return square_err +cdef class EuclideanNorm(RegressionCriteria): + pass + # TODO: Euclidean norm Criteria. # Partial linear criteria -cdef class Partial_linear(RegressionCriteria): +cdef class PartialLinear(RegressionCriteria): # Custom mean function, such that we don't have to loop through twice. cdef (double, double) __custom_mean(self, int[:] indices): @@ -493,7 +496,7 @@ cdef class Partial_linear(RegressionCriteria): cur_sum += step_calc * step_calc return cur_sum / length -cdef class Partial_quadratic(RegressionCriteria): +cdef class PartialQuadratic(RegressionCriteria): # Custom mean function, such that we don't have to loop through twice. cdef (double, double, double) __custom_mean(self, int[:] indices): diff --git a/src/adaXT/predictor/predictor.pyx b/src/adaXT/predictor/predictor.pyx index ad2ab66a..16fb3a84 100644 --- a/src/adaXT/predictor/predictor.pyx +++ b/src/adaXT/predictor/predictor.pyx @@ -106,18 +106,6 @@ cdef class Predictor(): ht[cur_node.id] += [i] return ht - def __get_state__(self): - return { - "root": self.root, - "X": np.asarray(self.X), - "Y": np.asarray(self.Y), - } - - def __set_state__(self, d: dict): - self.X = d["X"] - self.Y = d["Y"] - self.root = d["root"] - @staticmethod def forest_predict(cnp.ndarray[DOUBLE_t, ndim=2] X_train, cnp.ndarray[DOUBLE_t, ndim=2] Y_train, diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py index f85b9e02..b59e6936 100644 --- a/tests/test_decision_tree.py +++ b/tests/test_decision_tree.py @@ -1,10 +1,10 @@ from adaXT.decision_tree import LeafNode, DecisionNode, DecisionTree from adaXT.criteria import ( - Gini_index, - Squared_error, + GiniIndex, 
+ SquaredError, Entropy, - Partial_linear, - Partial_quadratic, + PartialLinear, + PartialQuadratic, ) import numpy as np @@ -42,7 +42,7 @@ def test_gini_single(): ] ) Y_cla = np.array([1, -1, 1, -1, 1, -1, 1, -1]) - tree = DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y_cla) root = tree.root exp_val = [0.25, -0.75, 0] @@ -93,7 +93,7 @@ def test_gini_multi(): ) Y_multi = np.array([1, 2, 1, 0, 1, 0, 1, 0]) Y_unique = len(np.unique(Y_multi)) - tree = DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y_multi) root = tree.root # DIFFERENT FROM SKLEARN THEIRS IS: [0.25, -0.75, -1.5], both give pure @@ -142,7 +142,7 @@ def test_regression(): ] ) Y_reg = np.array([2.2, -0.5, 0.5, -0.5, 2, -3, 2.2, -3]) - tree = DecisionTree("Regression", criteria=Squared_error) + tree = DecisionTree("Regression", criteria=SquaredError) tree.fit(X, Y_reg) root = tree.root exp_val2 = [0.25, -0.5, 0.5, 0.25, -0.75] @@ -277,8 +277,8 @@ def sanity_regression(n, m): Y1 = np.random.randint(0, 5, n) Y2 = np.random.uniform(0, 5, n) - tree1 = DecisionTree("Regression", criteria=Squared_error) - tree2 = DecisionTree("Regression", criteria=Squared_error) + tree1 = DecisionTree("Regression", criteria=SquaredError) + tree2 = DecisionTree("Regression", criteria=SquaredError) tree1.fit(X, Y1) tree2.fit(X, Y2) pred1 = tree1.predict(X) @@ -296,7 +296,7 @@ def sanity_gini(n, m): X = np.random.uniform(0, 100, (n, m)) Y = np.random.randint(0, 5, n) - tree = DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y) pred = tree.predict(X) @@ -319,20 +319,20 @@ def sanity_entropy(n, m): def sanity_partial_linear(n, m): X = np.c_[np.linspace(-1, 1, n), np.random.uniform(-1, 1, (n, m))] Y = X[:, 0] * (X[:, 0] > 0) - tree = DecisionTree("Gradient", criteria=Partial_linear, max_depth=1) + tree = 
DecisionTree("Gradient", criteria=PartialLinear, max_depth=1) tree.fit(X, Y) # Since the response is a piece-wise linear function it can be fit - # exactly with the Partial_linear criteria, with a single split at 0 + # exactly with the PartialLinear criteria, with a single split at 0 assert (tree.leaf_nodes[0].impurity + tree.leaf_nodes[1].impurity) == 0 def sanity_partial_quadratic(n, m): X = np.c_[np.linspace(-1, 1, n), np.random.uniform(-1, 1, (n, m))] Y = X[:, 0] ** 2 * (X[:, 0] > 0) - tree = DecisionTree("Gradient", criteria=Partial_quadratic, max_depth=1) + tree = DecisionTree("Gradient", criteria=PartialQuadratic, max_depth=1) tree.fit(X, Y) # Since the response is a piece-wise quadratic function it can be fit - # exactly with the Partial_quadratic criteria, with a single split at 0 + # exactly with the PartialQuadratic criteria, with a single split at 0 assert (tree.leaf_nodes[0].impurity + tree.leaf_nodes[1].impurity) == 0 diff --git a/tests/test_random_forest.py b/tests/test_random_forest.py index d80169b1..00ab336c 100644 --- a/tests/test_random_forest.py +++ b/tests/test_random_forest.py @@ -1,9 +1,9 @@ from adaXT.decision_tree import DecisionTree from adaXT.criteria import ( - Gini_index, - Squared_error, + GiniIndex, + SquaredError, Entropy, - Partial_quadratic, + PartialQuadratic, ) from adaXT.predictor import PredictorLocalPolynomial from adaXT.leaf_builder import LeafBuilderPartialQuadratic @@ -57,7 +57,7 @@ def run_gini_index( ): forest = RandomForest( forest_type="Classification", - criteria=Gini_index, + criteria=GiniIndex, n_estimators=n_estimators, n_jobs=n_jobs, sampling=sampling, @@ -107,7 +107,7 @@ def run_squared_error( ): forest = RandomForest( forest_type="Regression", - criteria=Squared_error, + criteria=SquaredError, n_estimators=n_estimators, n_jobs=n_jobs, sampling=sampling, @@ -133,7 +133,7 @@ def test_dominant_feature(): forest = RandomForest( "Classification", n_estimators=100, - criteria=Gini_index, + criteria=GiniIndex, 
sampling="resampling", ) forest.fit(X, Y) @@ -162,7 +162,7 @@ def test_deterministic_seeding_regression(): forest1 = RandomForest( "Regression", n_estimators=100, - criteria=Squared_error, + criteria=SquaredError, seed=tree_state, sampling="resampling", ) @@ -171,7 +171,7 @@ def test_deterministic_seeding_regression(): forest2 = RandomForest( "Regression", n_estimators=100, - criteria=Squared_error, + criteria=SquaredError, seed=tree_state, sampling="resampling", ) @@ -196,7 +196,7 @@ def test_deterministic_seeding_classification(): forest1 = RandomForest( "Classification", n_estimators=100, - criteria=Gini_index, + criteria=GiniIndex, seed=tree_state, sampling="resampling", ) @@ -205,7 +205,7 @@ def test_deterministic_seeding_classification(): forest2 = RandomForest( "Classification", n_estimators=100, - criteria=Gini_index, + criteria=GiniIndex, seed=tree_state, sampling="resampling", ) @@ -291,13 +291,13 @@ def test_gradient_forest(): "Gradient", leaf_builder=LeafBuilderPartialQuadratic, predictor=PredictorLocalPolynomial, - criteria=Partial_quadratic, + criteria=PartialQuadratic, ) forest = RandomForest( "Gradient", leaf_builder=LeafBuilderPartialQuadratic, predictor=PredictorLocalPolynomial, - criteria=Partial_quadratic, + criteria=PartialQuadratic, sampling=None, ) tree.fit(X_reg, Y_reg) diff --git a/tests/test_tree_features.py b/tests/test_tree_features.py index 44a03f04..a918a02a 100644 --- a/tests/test_tree_features.py +++ b/tests/test_tree_features.py @@ -1,10 +1,10 @@ from adaXT.decision_tree import DecisionTree from adaXT.criteria import ( - Gini_index, - Squared_error, + GiniIndex, + SquaredError, Entropy, - Partial_linear, - Partial_quadratic, + PartialLinear, + PartialQuadratic, ) from adaXT.decision_tree.nodes import LeafNode, DecisionNode from adaXT.predictor import PredictorLocalPolynomial, PredictorQuantile @@ -42,7 +42,7 @@ def test_predict_leaf_matrix_classification(): ) Y_cla = np.array([1, -1, 1, -1, 1, -1, 1, -1]) - tree = 
DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y_cla) res1 = tree.predict_weights(X, scale=False) res2 = tree.predict_weights(X, scale=False) @@ -67,7 +67,7 @@ def test_predict_leaf_matrix_regression(): ] ) Y_reg = np.array([2.2, -0.5, 0.5, -0.5, 2, -3, 2.2, -3]) - tree = DecisionTree("Regression", criteria=Squared_error) + tree = DecisionTree("Regression", criteria=SquaredError) tree.fit(X, Y_reg) res1 = tree.predict_weights(X=None, scale=False) @@ -93,7 +93,7 @@ def test_predict_leaf_matrix_regression_with_scaling(): ] ) Y_reg = np.array([2.2, -0.5, 0.5, -0.5, 2, -3, 2.2, -3]) - tree = DecisionTree("Regression", criteria=Squared_error) + tree = DecisionTree("Regression", criteria=SquaredError) tree.fit(X, Y_reg) res1 = tree.predict_weights(X=None, scale=False) @@ -121,7 +121,7 @@ def test_prediction(): ] ) Y_cla = np.array([1, -1, 1, -1, 1, -1, 1, -1]) - tree = DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y_cla) prediction = tree.predict(X) for i in range(len(Y_cla)): @@ -137,7 +137,7 @@ def test_predict_proba_probability(): Y_cla = np.array([0, 1, 0, 1, 0, 0, 1, 1]) expected_probs = [[1, 0], [0.5, 0.5], [0.5, 0.5], [0, 1]] expected_class = [0, 0, 0, 1] - tree = DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y_cla) classes = np.unique(Y_cla) pred_probs = tree.predict(Xtest, predict_proba=True) @@ -158,7 +158,7 @@ def test_predict_proba_against_predict(): X = np.random.uniform(0, 100, (10000, 5)) Y = np.random.randint(0, 5, 10000) - tree = DecisionTree("Classification", criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y) predict = tree.predict(X) @@ -185,7 +185,7 @@ def test_NxN_matrix(): ] ) Y_cla = np.array([1, -1, 1, -1, 1, -1, 1, -1]) - tree = DecisionTree("Classification", 
criteria=Gini_index) + tree = DecisionTree("Classification", criteria=GiniIndex) tree.fit(X, Y_cla) leaf_matrix = tree.predict_weights(X=None, scale=False) true_weight = np.array( @@ -214,7 +214,7 @@ def test_max_depth_setting(): max_depth_desired = 20 tree = DecisionTree( - "Classification", criteria=Gini_index, max_depth=max_depth_desired + "Classification", criteria=GiniIndex, max_depth=max_depth_desired ) tree.fit(X, Y) @@ -232,7 +232,7 @@ def test_impurity_tol_setting(): tree = DecisionTree( "Classification", - criteria=Gini_index, + criteria=GiniIndex, impurity_tol=impurity_tol_desired) tree.fit(X, Y) @@ -250,7 +250,7 @@ def test_min_samples_split_setting(): tree = DecisionTree( "Classification", - criteria=Gini_index, + criteria=GiniIndex, min_samples_split=min_samples_split_desired, ) tree.fit(X, Y) @@ -268,7 +268,7 @@ def test_min_samples_leaf_setting(): tree = DecisionTree( "Classification", - criteria=Gini_index, + criteria=GiniIndex, min_samples_leaf=min_samples_leaf_desired) tree.fit(X, Y) @@ -286,7 +286,7 @@ def test_min_improvement_setting(): tree = DecisionTree( "Classification", - criteria=Gini_index, + criteria=GiniIndex, min_improvement=min_improvement_desired) tree.fit(X, Y) @@ -363,8 +363,8 @@ def test_sample_indices_classification(): Y2 = np.insert(Y2, i, i) # i is used as a bloat outcome value sample_indices.append(i - 1) - t1 = DecisionTree("Classification", criteria=Gini_index) - t2 = DecisionTree("Classification", criteria=Gini_index) + t1 = DecisionTree("Classification", criteria=GiniIndex) + t2 = DecisionTree("Classification", criteria=GiniIndex) t1.fit(X1, Y1) t2.fit(X2, Y2, sample_indices=sample_indices) @@ -391,15 +391,15 @@ def test_sample_indices_regression(): Y2 = np.insert(Y2, i, i) # i is used as a bloat outcome value sample_indices.append(i - 1) - t1 = DecisionTree("Regression", criteria=Squared_error) - t2 = DecisionTree("Regression", criteria=Squared_error) + t1 = DecisionTree("Regression", criteria=SquaredError) + t2 = 
DecisionTree("Regression", criteria=SquaredError) t1.fit(X1, Y1) t2.fit(X2, Y2, sample_indices=sample_indices) assert_tree_equality(t1, t2) - t1 = DecisionTree("Regression", criteria=Partial_linear) - t2 = DecisionTree("Regression", criteria=Partial_linear) + t1 = DecisionTree("Regression", criteria=PartialLinear) + t2 = DecisionTree("Regression", criteria=PartialLinear) t1.fit(X1, Y1) t2.fit(X2, Y2, sample_indices=sample_indices) @@ -418,8 +418,8 @@ def test_sample_weight_classification(): sample_weights.append(1) sample_weights.append(0) - t1 = DecisionTree("Classification", criteria=Gini_index) - t2 = DecisionTree("Classification", criteria=Gini_index) + t1 = DecisionTree("Classification", criteria=GiniIndex) + t2 = DecisionTree("Classification", criteria=GiniIndex) t1.fit(X1, Y1) t2.fit(X2, Y2, sample_weight=sample_weights) @@ -428,7 +428,7 @@ def test_sample_weight_classification(): t1 = DecisionTree("Classification", criteria=Entropy) t2 = DecisionTree("Classification", criteria=Entropy) - Partial_linear + PartialLinear t1.fit(X1, Y1) t2.fit(X2, Y2, sample_weight=sample_weights) @@ -447,16 +447,16 @@ def test_sample_weight_regression(): sample_weights.append(1) sample_weights.append(0) - t1 = DecisionTree("Regression", criteria=Squared_error) - t2 = DecisionTree("Regression", criteria=Squared_error) + t1 = DecisionTree("Regression", criteria=SquaredError) + t2 = DecisionTree("Regression", criteria=SquaredError) t1.fit(X1, Y1) t2.fit(X2, Y2, sample_weight=sample_weights) assert_tree_equality(t1, t2) - t1 = DecisionTree("Regression", criteria=Partial_linear) - t2 = DecisionTree("Regression", criteria=Partial_linear) + t1 = DecisionTree("Regression", criteria=PartialLinear) + t2 = DecisionTree("Regression", criteria=PartialLinear) t1.fit(X1, Y1) t2.fit(X2, Y2, sample_weight=sample_weights) @@ -468,7 +468,7 @@ def test_quantile_predict(): X, Y = uniform_x_y(10000, 5) tree = DecisionTree( "Quantile", - criteria=Squared_error, + criteria=SquaredError, 
predictor=PredictorQuantile, leaf_builder=LeafBuilderRegression, max_depth=0, @@ -487,7 +487,7 @@ def test_quantile_predict_array(): X, Y = uniform_x_y(10000, 5) tree = DecisionTree( "Quantile", - criteria=Squared_error, + criteria=SquaredError, predictor=PredictorQuantile, leaf_builder=LeafBuilderRegression, max_depth=0, @@ -504,7 +504,7 @@ def test_quantile_predict_array(): def test_local_polynomial_predict(): """ - We test both the Partial_linear and Partial_quadratic criteria by + We test both the PartialLinear and PartialQuadratic criteria by consideirng a (piece-wise) linear and a quadratic example. In both cases the prediction based on PreictLocalPolynomial should perfectly align. """ @@ -517,7 +517,7 @@ def test_local_polynomial_predict(): # trees with a single split tree1 = DecisionTree( None, - criteria=Partial_linear, + criteria=PartialLinear, predictor=PredictorLocalPolynomial, leaf_builder=LeafBuilderPartialLinear, max_depth=1, @@ -525,7 +525,7 @@ def test_local_polynomial_predict(): tree1.fit(X, Y1) tree2 = DecisionTree( None, - criteria=Partial_quadratic, + criteria=PartialQuadratic, predictor=PredictorLocalPolynomial, leaf_builder=LeafBuilderPartialQuadratic, max_depth=1, @@ -536,10 +536,10 @@ def test_local_polynomial_predict(): residuals2 = tree2.predict(X, order=0)[:, 0] - Y2 assert ( np.sum(residuals1**2) == 0.0 - ), "Partial_linear criteria and PredictLocalPolynomial does not behave as expected" + ), "PartialLinear criteria and PredictLocalPolynomial does not behave as expected" assert ( np.sum(residuals2**2) == 0.0 - ), "Partial_quadratic criteria and PredictLocalPolynomial does not behave as expected" + ), "PartialQuadratic criteria and PredictLocalPolynomial does not behave as expected" if __name__ == "__main__": diff --git a/time_decision_tree.py b/time_decision_tree.py index 3e56e4b7..72ae2fca 100644 --- a/time_decision_tree.py +++ b/time_decision_tree.py @@ -10,7 +10,7 @@ def run_classification_tree(X, Y, criteria): sk_time = 0 - if 
criteria.__name__ == "Gini_index": + if criteria.__name__ == "GiniIndex": tree = DecisionTreeClassifier(criteria="gini") st = time.time() tree.fit(X, Y) @@ -23,7 +23,7 @@ def run_classification_tree(X, Y, criteria): et = time.time() sk_time = et - st else: - raise Exception("Tree neither Entropy nor Gini_index") + raise Exception("Tree neither Entropy nor GiniIndex") tree = DecisionTree("Classification", criteria=criteria) st = time.time() @@ -36,14 +36,14 @@ def run_classification_tree(X, Y, criteria): def run_regression_tree(X, Y, criteria): sk_time = 0 - if criteria.__name__ == "Squared_error": + if criteria.__name__ == "SquaredError": tree = DecisionTreeRegressor(criteria="squared_error") st = time.time() tree.fit(X, Y) et = time.time() sk_time = et - st else: - raise Exception("Tree not a Squared_error") + raise Exception("Tree not a SquaredError") tree = DecisionTree("Regression", criteria=criteria) st = time.time() @@ -61,13 +61,13 @@ def run_num_iterations(n, m, x=[0, 100], y=[0, 5], num_trees=10): run_times = np.empty(shape=(num_trees, 6)) for i in range(num_trees): gini_diff, gini_time = run_classification_tree( - X, y_classification, crit.Gini_index + X, y_classification, crit.GiniIndex ) entropy_diff, entropy_time = run_classification_tree( X, y_classification, crit.Entropy ) squared_diff, square_time = run_regression_tree( - X, y_regression, crit.Squared_error + X, y_regression, crit.SquaredError ) run_times[i] = [ gini_diff, diff --git a/time_random_forest.py b/time_random_forest.py index 4d807984..b8aba492 100644 --- a/time_random_forest.py +++ b/time_random_forest.py @@ -1,6 +1,6 @@ from multiprocessing import cpu_count from adaXT.random_forest import RandomForest -from adaXT.criteria import Gini_index, Squared_error, Entropy +from adaXT.criteria import GiniIndex, SquaredError, Entropy import matplotlib.pyplot as plt import time import numpy as np @@ -39,7 +39,7 @@ def get_classification_data( def run_gini_index(X, Y, n_jobs, n_estimators): forest = 
RandomForest( forest_type="Classification", - criteria=Gini_index, + criteria=GiniIndex, n_estimators=n_estimators, n_jobs=n_jobs, ) @@ -65,7 +65,7 @@ def run_entropy(X, Y, n_jobs, n_estimators): def run_squared_error(X, Y, n_jobs, n_estimators): forest = RandomForest( forest_type="Regression", - criteria=Squared_error, + criteria=SquaredError, n_estimators=n_estimators, n_jobs=n_jobs, ) From 67f08b25b081bf026f77ea9ec15f0470389e95c8 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 7 Mar 2025 14:19:45 +0100 Subject: [PATCH 18/33] Initial draft --- src/adaXT/criteria/criteria.pxd | 7 ++ src/adaXT/criteria/criteria.pyx | 141 +++++++++++++++++++++++++++++++- 2 files changed, 144 insertions(+), 4 deletions(-) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index fa002a41..952f2294 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -151,6 +151,13 @@ cdef class SquaredError(RegressionCriteria): The variance of the response y """ +cdef class EuclideanNorm(RegressionCriteria): + cdef: + double left_dist_sum, right_dist_sum + double weight_left, weight_right + double[:, ::1] right_indiv_dist + double[:, ::1] left_indiv_dist + int Y_cols cdef class PartialLinear(RegressionCriteria): diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 24a39de8..cfc36607 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -4,6 +4,7 @@ from libc.string cimport memset import numpy as np from .crit_helpers cimport weighted_mean +from libc.math cimport sqrt # Abstract Criteria class cdef class Criteria: @@ -395,7 +396,6 @@ cdef class SquaredError(RegressionCriteria): int i, idx int n_obs = indices.shape[0] double y_val, weight - # More efficient data excess self.left_sum = 0.0 self.right_sum = 0.0 @@ -416,13 +416,15 @@ cdef class SquaredError(RegressionCriteria): self.right_sum += y_val self.weight_right += weight + # Instead of calculating the 
squared error fully, we calculate + # - (1/n_L sum_{i in left} y_i^2 + 1/n_R sum_{i in right} y_i^2) return -((self.left_sum*self.left_sum) / self.weight_left + (self.right_sum*self.right_sum) / self.weight_right) cpdef double impurity(self, int[::1] indices): return self.__squared_error(indices) - cdef double __squared_error(self, int[::1] indices): + cdef inline double __squared_error(self, int[::1] indices): cdef: double cur_sum = 0.0 double mu = weighted_mean(self.Y[:, 0], indices, self.sample_weight) # set mu to be the mean of the dataset @@ -440,9 +442,140 @@ cdef class SquaredError(RegressionCriteria): return square_err cdef class EuclideanNorm(RegressionCriteria): - pass + def __init__(self, double[:, ::1] X, double[:, ::1] Y, double[::1] sample_weight): + super().__init__(X, Y, sample_weight) + # Initialize two empty arrays for storing the sum of each Y[:, i] in a + # split + self.left_indiv_dist = np.zeros((Y.shape[0], Y.shape[0]-1), dtype=np.float64) + self.right_indiv_dist = np.zeros((Y.shape[0], Y.shape[0]-1), dtype=np.float64) + self.Y_cols = Y.shape[1] + + cdef inline double __sum_arr(self, double[:] arr, int n_obs): + cdef: + int i + double cur_sum = 0.0 + + for i in range(n_obs): + cur_sum += arr[i] + return cur_sum + + cdef double update_proxy(self, int[::1] indices, int new_split): + cdef: + int i, j, idx_i, idx_j + double tmp, weight, tmp_squared + double[:] tmp_arr + for i in range(self.old_split, new_split): + idx_i = indices[i] + weight = self.sample_weight[idx] + for j in range(self.old_split): + idx_j = indices[j] + tmp_arr = ( + self.Y[idx_i, :] * weight_i - + self.Y[idx_j, :] * weight_j + ) + tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + self.left_dist_sum += tmp + + for j in range(self.old_split+1, self.n_obs): + self.right_dist_sum -= self.right_indiv_dist[j, i] + + # Not needed, as we should not index like this again + # self.right_indiv_dist[i, j] = 0.0 + # self.right_indiv_dist[j, i] = 0.0 + + self.weight_left += 
weight + self.weight_right -= weight + + # No proxy for EuclideanNorm, so calculate fully + return (self.left_dist_sum * self.weight_left + + self.right_dist_sum * self.weight_right) + + cdef double proxy_improvement(self, int[::1] indices, int split_idx): + cdef: + int i, j, idx_i, idx_j + int n_obs = indices.shape[0] + double weight_i, weight_j, tmp + double[:] tmp_arr + + self.weight_left = 0.0 + self.weight_right = 0.0 + + # Create individual distances for right half, as we will be wanting to + # subtract a point by removing its distance to all other nodes + self.right_indiv_dist[i, j] = np.zeros((n_obs, n_obs)) + + # Calculate sum of each Y point + for i in range(split_idx): + idx_i = indices[i] + weight_i = self.sample_weight[i] + for j in range(split_idx, n_obs): + idx_j = indices[j] + if i == j: + continue + weight_j = self.sample_weight[idx_j] + tmp_arr = ( + self.Y[idx_i, :] * weight_i - + self.Y[idx_j, :] * weight_j + ) + tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + # i and j remain the same for current node checking + self.left_dist_sum += tmp + + self.weight_left += weight_i + + for i in range(split_idx, n_obs): + idx_i = indices[i] + weight_i = self.sample_weight[i] + for j in range(split_idx, n_obs): + idx_j = indices[j] + if i == j: + continue + weight_j = self.sample_weight[idx_j] + tmp_arr = ( + self.Y[idx_i, :] * weight_i - + self.Y[idx_j, :] * weight_j + ) + tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + self.right_indiv_dist[i, j] = tmp + self.right_dist_sum += tmp + + self.weight_right += weight_i + + # No proxy for EuclideanNorm, so calculate fully + return (self.left_dist_sum * self.weight_left + + self.right_dist_sum * self.weight_right) + + cpdef double impurity(self, int[::1] indices): + return self.__euclidean_norm(indices) + + cdef inline double __euclidean_norm(self, int[::1] indices): + cdef: + double[:] tmp_arr + double dist_sum + int i, j + int n_indices = indices.shape[0] + double weight_i, weight_j + 
+ dist_sum = np.zeros(n_indices) + for i in range(n_indices): + weight_i = self.sample_weight[i] + for j in range(n_indices): + weight_j = self.sample_weight[j] + if i == j: + continue + tmp_arr = ( + self.Y[i, :] * weight_i - + self.Y[j, :] * weight_j + ) + tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + dist_sum += tmp + -# TODO: Euclidean norm Criteria. + for j in range(self.Y_cols): + tmp = Y_sum[j] + Y_sum[j] = tmp*tmp + euc_norm = sqrt(cur_sum) + return euc_norm # Partial linear criteria cdef class PartialLinear(RegressionCriteria): From 6bc395ddaefbf2fb7162e84009de883a2e51ff2b Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 7 Mar 2025 14:42:16 +0100 Subject: [PATCH 19/33] New draft --- src/adaXT/criteria/criteria.pxd | 4 ++-- src/adaXT/criteria/criteria.pyx | 42 +++++++++++++++------------------ 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index 952f2294..e4d9626c 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -155,8 +155,8 @@ cdef class EuclideanNorm(RegressionCriteria): cdef: double left_dist_sum, right_dist_sum double weight_left, weight_right - double[:, ::1] right_indiv_dist - double[:, ::1] left_indiv_dist + double[::1] right_indiv_dist + int right_start_idx int Y_cols cdef class PartialLinear(RegressionCriteria): diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index cfc36607..061c6894 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -462,6 +462,7 @@ cdef class EuclideanNorm(RegressionCriteria): cdef double update_proxy(self, int[::1] indices, int new_split): cdef: int i, j, idx_i, idx_j + int prev_n_right double tmp, weight, tmp_squared double[:] tmp_arr for i in range(self.old_split, new_split): @@ -476,12 +477,11 @@ cdef class EuclideanNorm(RegressionCriteria): tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) 
self.left_dist_sum += tmp - for j in range(self.old_split+1, self.n_obs): - self.right_dist_sum -= self.right_indiv_dist[j, i] - # Not needed, as we should not index like this again - # self.right_indiv_dist[i, j] = 0.0 - # self.right_indiv_dist[j, i] = 0.0 + prev_n_right = self.obs - i + for j in range(self.right_start_idx, self.right_start_idx+prev_n_right): + self.right_dist_sum -= self.right_indiv_dist[j] + self.right_start_idx += prev_n_right self.weight_left += weight self.weight_right -= weight @@ -496,6 +496,10 @@ cdef class EuclideanNorm(RegressionCriteria): int n_obs = indices.shape[0] double weight_i, weight_j, tmp double[:] tmp_arr + int indiv_idx = 0 + + # reset the start idx for self.right_indiv_dist + self.right_start_idx = indiv_idx self.weight_left = 0.0 self.weight_right = 0.0 @@ -508,10 +512,8 @@ cdef class EuclideanNorm(RegressionCriteria): for i in range(split_idx): idx_i = indices[i] weight_i = self.sample_weight[i] - for j in range(split_idx, n_obs): + for j in range(i+1, n_obs): idx_j = indices[j] - if i == j: - continue weight_j = self.sample_weight[idx_j] tmp_arr = ( self.Y[idx_i, :] * weight_i - @@ -526,18 +528,20 @@ cdef class EuclideanNorm(RegressionCriteria): for i in range(split_idx, n_obs): idx_i = indices[i] weight_i = self.sample_weight[i] - for j in range(split_idx, n_obs): + for j in range(i+1, n_obs): idx_j = indices[j] - if i == j: - continue weight_j = self.sample_weight[idx_j] tmp_arr = ( self.Y[idx_i, :] * weight_i - self.Y[idx_j, :] * weight_j ) tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) - self.right_indiv_dist[i, j] = tmp self.right_dist_sum += tmp + # Just a list of all distances, should start at index 0. + # Therefore, we remove split_idx, and 1 as the distance from idx + # 0 to 1 is the first distance. 
+ self.right_indiv_dist[indiv_idx] = tmp + indiv_idx += 1 self.weight_right += weight_i @@ -551,18 +555,15 @@ cdef class EuclideanNorm(RegressionCriteria): cdef inline double __euclidean_norm(self, int[::1] indices): cdef: double[:] tmp_arr - double dist_sum + double dist_sum = 0.0 int i, j int n_indices = indices.shape[0] double weight_i, weight_j - dist_sum = np.zeros(n_indices) for i in range(n_indices): weight_i = self.sample_weight[i] - for j in range(n_indices): + for j in range(i+1, n_indices): weight_j = self.sample_weight[j] - if i == j: - continue tmp_arr = ( self.Y[i, :] * weight_i - self.Y[j, :] * weight_j @@ -570,12 +571,7 @@ cdef class EuclideanNorm(RegressionCriteria): tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) dist_sum += tmp - - for j in range(self.Y_cols): - tmp = Y_sum[j] - Y_sum[j] = tmp*tmp - euc_norm = sqrt(cur_sum) - return euc_norm + return dist_sum # Partial linear criteria cdef class PartialLinear(RegressionCriteria): From b033157061b035617a1aa3a6f2b1e8603343a457 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 7 Mar 2025 14:58:46 +0100 Subject: [PATCH 20/33] draft, which is building now --- src/adaXT/criteria/criteria.pxd | 6 +++ src/adaXT/criteria/criteria.pyx | 72 +++++++++++++++++---------------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index e4d9626c..7c024db5 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -159,6 +159,12 @@ cdef class EuclideanNorm(RegressionCriteria): int right_start_idx int Y_cols + cdef inline double __euclidean_norm(self, int[::1] indices) + + + cdef inline double __get_square_sum(self, double[::1] arr1, double val1, + double[::1] arr2, val2) + cdef class PartialLinear(RegressionCriteria): cdef (double, double) __custom_mean(self, int[:] indices) diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 061c6894..f67eeb0c 100644 
--- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -450,31 +450,34 @@ cdef class EuclideanNorm(RegressionCriteria): self.right_indiv_dist = np.zeros((Y.shape[0], Y.shape[0]-1), dtype=np.float64) self.Y_cols = Y.shape[1] - cdef inline double __sum_arr(self, double[:] arr, int n_obs): + cdef inline double __get_square_sum(self, double[::1] arr1, double val1, + double[::1] arr2, val2): cdef: int i - double cur_sum = 0.0 - - for i in range(n_obs): - cur_sum += arr[i] - return cur_sum + double tmp + double square_sum = 0.0 + for i in range(self.Y_cols): + tmp = arr1[i] * val1 - arr2[i] * val2 + square_sum += tmp*tmp + return square_sum cdef double update_proxy(self, int[::1] indices, int new_split): cdef: int i, j, idx_i, idx_j int prev_n_right - double tmp, weight, tmp_squared + double tmp, weight_i, weight_j, square_sum double[:] tmp_arr for i in range(self.old_split, new_split): idx_i = indices[i] - weight = self.sample_weight[idx] + weight_i = self.sample_weight[idx_i] for j in range(self.old_split): idx_j = indices[j] - tmp_arr = ( - self.Y[idx_i, :] * weight_i - - self.Y[idx_j, :] * weight_j + weight_j = self.sample_weight[idx_j] + square_sum = self.__get_square_sum( + self.Y[idx_i, :], weight_i, + self.Y[idx_j, :], weight_j ) - tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + tmp = sqrt(square_sum) self.left_dist_sum += tmp @@ -483,8 +486,8 @@ cdef class EuclideanNorm(RegressionCriteria): self.right_dist_sum -= self.right_indiv_dist[j] self.right_start_idx += prev_n_right - self.weight_left += weight - self.weight_right -= weight + self.weight_left += weight_i + self.weight_right -= weight_i # No proxy for EuclideanNorm, so calculate fully return (self.left_dist_sum * self.weight_left + @@ -494,8 +497,7 @@ cdef class EuclideanNorm(RegressionCriteria): cdef: int i, j, idx_i, idx_j int n_obs = indices.shape[0] - double weight_i, weight_j, tmp - double[:] tmp_arr + double weight_i, weight_j, tmp, square_sum int indiv_idx = 0 # 
reset the start idx for self.right_indiv_dist @@ -506,7 +508,7 @@ cdef class EuclideanNorm(RegressionCriteria): # Create individual distances for right half, as we will be wanting to # subtract a point by removing its distance to all other nodes - self.right_indiv_dist[i, j] = np.zeros((n_obs, n_obs)) + self.right_indiv_dist = np.zeros(n_obs) # Calculate sum of each Y point for i in range(split_idx): @@ -515,11 +517,11 @@ cdef class EuclideanNorm(RegressionCriteria): for j in range(i+1, n_obs): idx_j = indices[j] weight_j = self.sample_weight[idx_j] - tmp_arr = ( - self.Y[idx_i, :] * weight_i - - self.Y[idx_j, :] * weight_j + square_sum = self.__get_square_sum( + self.Y[idx_i, :], weight_i, + self.Y[idx_j, :], weight_j ) - tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + tmp = sqrt(square_sum) # i and j remain the same for current node checking self.left_dist_sum += tmp @@ -531,11 +533,11 @@ cdef class EuclideanNorm(RegressionCriteria): for j in range(i+1, n_obs): idx_j = indices[j] weight_j = self.sample_weight[idx_j] - tmp_arr = ( - self.Y[idx_i, :] * weight_i - - self.Y[idx_j, :] * weight_j + square_sum = self.__get_square_sum( + self.Y[idx_i, :], weight_i, + self.Y[idx_j, :], weight_j ) - tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) + tmp = sqrt(square_sum) self.right_dist_sum += tmp # Just a list of all distances, should start at index 0. 
# Therefore, we remove split_idx, and 1 as the distance from idx @@ -554,22 +556,22 @@ cdef class EuclideanNorm(RegressionCriteria): cdef inline double __euclidean_norm(self, int[::1] indices): cdef: - double[:] tmp_arr double dist_sum = 0.0 - int i, j + int i, j, idx_i, idx_j int n_indices = indices.shape[0] - double weight_i, weight_j + double weight_i, weight_j, square_sum for i in range(n_indices): - weight_i = self.sample_weight[i] + idx_i = indices[i] + weight_i = self.sample_weight[idx_i] for j in range(i+1, n_indices): - weight_j = self.sample_weight[j] - tmp_arr = ( - self.Y[i, :] * weight_i - - self.Y[j, :] * weight_j + idx_j = indices[idx_j] + weight_j = self.sample_weight[idx_j] + square_sum = self.__get_square_sum( + self.Y[idx_i, :], weight_i, + self.Y[idx_j, :], weight_j ) - tmp = sqrt(self.__sum_arr(tmp_arr * tmp_arr, self.Y_cols)) - dist_sum += tmp + dist_sum += sqrt(square_sum) return dist_sum From f453b3e489c49d76842476d3bb1ca7738c1b67f3 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Sat, 8 Mar 2025 11:37:35 +0100 Subject: [PATCH 21/33] Remove incorrect type hinting --- src/adaXT/decision_tree/_decision_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx index ccae0f58..508fb81b 100644 --- a/src/adaXT/decision_tree/_decision_tree.pyx +++ b/src/adaXT/decision_tree/_decision_tree.pyx @@ -122,7 +122,7 @@ cdef class _DecisionTree(): size_0, self.n_rows_predict, scaling=scaling) - def predict_leaf(self, X: double[:, ::1] | None = None) -> dict: + def predict_leaf(self, X: np.ndarray | None = None) -> dict: if X is None: return self.__get_leaf() if self.predictor_instance is None: From 68f05d49c3e302639cc832ecc49fa6390573b8b5 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Sat, 8 Mar 2025 11:57:31 +0100 Subject: [PATCH 22/33] Remove left over forest predict --- src/adaXT/decision_tree/decision_tree.py | 9 --------- 1 
file changed, 9 deletions(-) diff --git a/src/adaXT/decision_tree/decision_tree.py b/src/adaXT/decision_tree/decision_tree.py index 76432d70..9aa88126 100644 --- a/src/adaXT/decision_tree/decision_tree.py +++ b/src/adaXT/decision_tree/decision_tree.py @@ -275,15 +275,6 @@ def predict_weights( self._check_dimensions(X) return self._tree.predict_weights(X=X, scale=scale) - def _forest_predict_leaf( - self, X_pred: ArrayLike, X_train: ArrayLike, Y_train: ArrayLike - ): - if not self.skip_check_input: - raise ValueError("_forest_predict can only be called with skip_check_input") - return self._tree._forest_predict_leaf( - X_train=X_train, Y_train=Y_train, X_pred=X_pred - ) - def predict_leaf(self, X: ArrayLike | None) -> dict: """ Computes a hash table indexing in which LeafNodes the rows of the provided From 77928e11b8320a2a9021185543e315041388310d Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 10 Mar 2025 19:51:35 +0100 Subject: [PATCH 23/33] Work on SquaredDist criteria --- src/adaXT/criteria/criteria.pxd | 18 +++++- src/adaXT/criteria/criteria.pyx | 107 +++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 3 deletions(-) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index 7c024db5..4c79c8d4 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -135,7 +135,7 @@ cdef class SquaredError(RegressionCriteria): cpdef double impurity(self, int[::1] indices) - cdef double __squared_error(self, int[::1] indices) + cdef inline double __squared_error(self, int[::1] indices) """ Function used to calculate the squared error of y[indices] ---------- @@ -150,8 +150,22 @@ cdef class SquaredError(RegressionCriteria): double The variance of the response y """ +cdef class SquaredDistance(RegressionCriteria): + cdef: + double* left_sum + double* right_sum + double weight_left, weight_right + + cdef inline void __reset_sums(self) + + cdef double update_proxy(self, int[::1] indices, int 
new_split) + + cdef double proxy_improvement(self, int[::1] indices, int split_idx) + + cpdef double impurity(self, int[::1] indices) + -cdef class EuclideanNorm(RegressionCriteria): +cdef class PairwiseDistance(RegressionCriteria): cdef: double left_dist_sum, right_dist_sum double weight_left, weight_right diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index f67eeb0c..1adc16e7 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -441,7 +441,112 @@ cdef class SquaredError(RegressionCriteria): square_err = cur_sum/obs_weight - mu*mu return square_err -cdef class EuclideanNorm(RegressionCriteria): +cdef class SquaredDistance(RegressionCriteria): + def __init__(self, double[:, ::1] X, double[:, ::1] Y, double[::1] sample_weight): + super().__init__(X, Y, sample_weight) + self.Y_cols = Y.shape[1] + self.left_sum = malloc(sizeof(double) * self.Y_cols) + self.right_sum = malloc(sizeof(double) * self.Y_cols) + + def __dealloc__(self): + free(self.left_sum) + free(self.right_sum) + + cdef inline void __reset_sums(self): + memset(self.left_sum, 0, self.Y_cols*sizeof(double)) + memset(self.right_sum, 0, self.Y_cols*sizeof(double)) + + cdef double update_proxy(self, int[::1] indices, int new_split): + cdef: + int i, idx, j + double y_val, weight + double left_square_sum, right_square_sum + + for i in range(self.old_split, new_split): + self.weight_left += weight + self.weight_right -= weight + + left_square_sum = 0.0 + right_square_sum = 0.0 + + for j in range(self.Y_cols): + for i in range(self.old_split, new_split): + idx = indices[i] + weight = self.sample_weight[idx] + y_val = self.Y[idx, j]*weight + self.left_sum[j] += y_val + self.right_sum[j] -= y_val + left_square_sum += self.left_sum[j]*self.left_sum[j] + right_square_sum += self.right_sum[j]*self.right_sum[j] + + return -( left_square_sum / self.weight_left + + right_square_sum / self.weight_right) + + + cdef double proxy_improvement(self, int[::1] 
indices, int split_idx): + cdef: + int i, idx, j + int n_obs = indices.shape[0] + double y_val, weight, left_square_sum, right_square_sum + + self.__reset_sums() + self.weight_left = 0.0 + self.weight_right = 0.0 + left_square_sum = 0.0 + right_square_sum = 0.0 + + for i in range(split_idx): + idx = indices[i] + self.weight_left += self.sample_weight[idx] + + for j in range(self.Y_cols): + for i in range(split_idx): + idx = indices[i] + weight = self.sample_weight[idx] + y_val = self.Y[idx, j]*weight + self.left_sum[j] += y_val + left_square_sum += self.left_sum[j]*self.left_sum[j] + + for i in range(split_idx, n_obs): + idx = indices[i] + self.weight_right += self.sample_weight[idx] + + for j in range(self.Y_cols): + for i in range(split_idx, n_obs): + idx = indices[i] + weight = self.sample_weight[idx] + y_val = self.Y[idx, j]*weight + self.right_sum[j] += y_val + right_square_sum += self.left_sum[j]*self.left_sum[j] + + # Instead of calculating the squared error fully, we calculate + # - (1/n_L sum_{i in left} y_i^2 + 1/n_R sum_{i in right} y_i^2) + return -( left_square_sum / self.weight_left + + right_square_sum / self.weight_right) + + cpdef double impurity(self, int[::1] indices): + cdef: + double cur_sum = 0.0 + double mu = 0.0 + double square_dist, tmp + double obs_weight = 0.0 + int i, p, j + int n_indices = indices.shape[0] + for i in range(n_indices): + p = indices[i] + obs_weight += self.sample_weight[p] + + for j in range(self.Y_cols): + mu = weighted_mean(self.Y[:, j], indices, self.sample_weight) + for i in range(n_indices): + p = indices[i] + tmp = self.Y[p, j] * self.sample_weight[p] + cur_sum += tmp*tmp + square_dist += cur_sum / obs_weight - mu*mu + return square_dist + + +cdef class PairwiseDistance(RegressionCriteria): def __init__(self, double[:, ::1] X, double[:, ::1] Y, double[::1] sample_weight): super().__init__(X, Y, sample_weight) # Initialize two empty arrays for storing the sum of each Y[:, i] in a From 
488e6d2d471aa2951bade868816804e37de5d530 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 10 Mar 2025 20:21:27 +0100 Subject: [PATCH 24/33] Created test for SquaredDistance --- src/adaXT/base_model.pyx | 6 +- src/adaXT/criteria/__init__.pxd | 3 +- src/adaXT/criteria/__init__.py | 1 + src/adaXT/criteria/criteria.pxd | 1 + src/adaXT/criteria/criteria.pyi | 6 ++ tests/test_decision_tree.py | 180 +++++++++++++++++++------------- 6 files changed, 121 insertions(+), 76 deletions(-) diff --git a/src/adaXT/base_model.pyx b/src/adaXT/base_model.pyx index eac86bf2..92293e7e 100644 --- a/src/adaXT/base_model.pyx +++ b/src/adaXT/base_model.pyx @@ -1,7 +1,7 @@ from numpy import float64 as DOUBLE from .predictor import Predictor from .criteria import Criteria -from .criteria.criteria import Entropy, SquaredError, PartialQuadratic +from .criteria.criteria import Entropy, SquaredError, PartialQuadratic, SquaredDistance from .decision_tree.splitter import Splitter from .leaf_builder import LeafBuilder @@ -133,7 +133,9 @@ class BaseModel(): LeafBuilderClassification], "Regression": [SquaredError, PredictorRegression, LeafBuilderRegression], "Gradient": [PartialQuadratic, PredictorLocalPolynomial, LeafBuilderPartialQuadratic], - "Quantile": [SquaredError, PredictorQuantile, LeafBuilderRegression] + "Quantile": [SquaredError, PredictorQuantile, LeafBuilderRegression], + "SquaredDistance": [SquaredDistance, PredictorRegression, + LeafBuilderRegression] } if tree_type in tree_types.keys(): # Set the defaults diff --git a/src/adaXT/criteria/__init__.pxd b/src/adaXT/criteria/__init__.pxd index 9f95b6ed..1ab937c0 100644 --- a/src/adaXT/criteria/__init__.pxd +++ b/src/adaXT/criteria/__init__.pxd @@ -6,5 +6,6 @@ from .criteria cimport ( Entropy, SquaredError, PartialLinear, - PartialQuadratic + PartialQuadratic, + SquaredDistance ) diff --git a/src/adaXT/criteria/__init__.py b/src/adaXT/criteria/__init__.py index eef69688..9c71abf6 100644 --- a/src/adaXT/criteria/__init__.py 
+++ b/src/adaXT/criteria/__init__.py @@ -7,4 +7,5 @@ PartialLinear, PartialQuadratic, Criteria, + SquaredDistance, ) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index 4c79c8d4..adedf30a 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -155,6 +155,7 @@ cdef class SquaredDistance(RegressionCriteria): double* left_sum double* right_sum double weight_left, weight_right + int Y_cols cdef inline void __reset_sums(self) diff --git a/src/adaXT/criteria/criteria.pyi b/src/adaXT/criteria/criteria.pyi index e8e6bac8..492cd4a8 100644 --- a/src/adaXT/criteria/criteria.pyi +++ b/src/adaXT/criteria/criteria.pyi @@ -69,6 +69,12 @@ class SquaredError(RegressionCriteria): pass +class SquaredDistance(RegressionCriteria): + pass + +class PairwiseDistance(RegressionCriteria): + pass + class PartialLinear(RegressionCriteria): r""" Criteria based on fitting a linear function in the first predictor diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py index b59e6936..1a969e81 100644 --- a/tests/test_decision_tree.py +++ b/tests/test_decision_tree.py @@ -5,6 +5,7 @@ Entropy, PartialLinear, PartialQuadratic, + SquaredDistance, ) import numpy as np @@ -21,9 +22,9 @@ def rec_node(node: LeafNode | DecisionNode | None, depth: int) -> None: expected depth of the node """ if isinstance(node, LeafNode) or isinstance(node, DecisionNode): - assert ( - node.depth == depth - ), f"Incorrect depth, expected {depth} got {node.depth}" + assert node.depth == depth, ( + f"Incorrect depth, expected {depth} got {node.depth}" + ) if isinstance(node, DecisionNode): rec_node(node.left_child, depth + 1) @@ -47,9 +48,9 @@ def test_gini_single(): root = tree.root exp_val = [0.25, -0.75, 0] spl_idx = [0, 0, 1] - assert isinstance(root, LeafNode) or isinstance( - root, DecisionNode - ), f"root is not a node but {type(root)}" + assert isinstance(root, LeafNode) or isinstance(root, DecisionNode), ( + f"root is not a node but 
{type(root)}" + ) queue = [root] i = 0 @@ -59,21 +60,21 @@ def test_gini_single(): if isinstance( cur_node, DecisionNode ): # Check threshold and idx of decision node - assert ( - cur_node.threshold == exp_val[i] - ), f"Expected threshold {exp_val[i]} on node={i}, got {cur_node.threshold} on split_idx {cur_node.split_idx} exp: {spl_idx[i]}" - assert ( - cur_node.split_idx == spl_idx[i] - ), f"Expected split idx {spl_idx[i]} on i={i}, got {cur_node.split_idx}" + assert cur_node.threshold == exp_val[i], ( + f"Expected threshold {exp_val[i]} on node={i}, got {cur_node.threshold} on split_idx {cur_node.split_idx} exp: {spl_idx[i]}" + ) + assert cur_node.split_idx == spl_idx[i], ( + f"Expected split idx {spl_idx[i]} on i={i}, got {cur_node.split_idx}" + ) if cur_node.left_child: queue.append(cur_node.left_child) if cur_node.right_child: queue.append(cur_node.right_child) i += 1 elif isinstance(cur_node, LeafNode): # Check that the value is of length 2 - assert ( - len(cur_node.value) == 2 - ), f"Expected 2 mean values, one for each class, but got: {len(cur_node.value)}" + assert len(cur_node.value) == 2, ( + f"Expected 2 mean values, one for each class, but got: {len(cur_node.value)}" + ) rec_node(root, 0) @@ -101,29 +102,29 @@ def test_gini_multi(): exp_val = [0.25, -0.75, -0.75] # DIFFERENT FROM SKLEARN THEIRS IS: [0, 1, 1], both give pure leaf node spl_idx = [0, 1, 0] - assert isinstance(root, LeafNode) or isinstance( - root, DecisionNode - ), f"root is not a node but {type(root)}" + assert isinstance(root, LeafNode) or isinstance(root, DecisionNode), ( + f"root is not a node but {type(root)}" + ) queue = [root] i = 0 while len(queue) > 0: cur_node = queue.pop() if isinstance(cur_node, DecisionNode): - assert ( - cur_node.threshold == exp_val[i] - ), f"Expected threshold {exp_val[i]}, got {cur_node.threshold}" - assert ( - cur_node.split_idx == spl_idx[i] - ), f"Expected split idx {spl_idx[i]}, got {cur_node.split_idx}" + assert cur_node.threshold == exp_val[i], ( + 
f"Expected threshold {exp_val[i]}, got {cur_node.threshold}" + ) + assert cur_node.split_idx == spl_idx[i], ( + f"Expected split idx {spl_idx[i]}, got {cur_node.split_idx}" + ) if cur_node.left_child: queue.append(cur_node.left_child) if cur_node.right_child: queue.append(cur_node.right_child) i += 1 elif isinstance(cur_node, LeafNode): - assert ( - len(cur_node.value) == Y_unique - ), f"Expected {Y_unique} mean values, one for each class, but got: {len(cur_node.value)}" + assert len(cur_node.value) == Y_unique, ( + f"Expected {Y_unique} mean values, one for each class, but got: {len(cur_node.value)}" + ) rec_node(root, 0) @@ -147,29 +148,29 @@ def test_regression(): root = tree.root exp_val2 = [0.25, -0.5, 0.5, 0.25, -0.75] spl_idx2 = [0, 1, 1, 1, 0] - assert isinstance(root, LeafNode) or isinstance( - root, DecisionNode - ), f"root is not a node but {type(root)}" + assert isinstance(root, LeafNode) or isinstance(root, DecisionNode), ( + f"root is not a node but {type(root)}" + ) queue = [root] i = 0 while len(queue) > 0: cur_node = queue.pop() if isinstance(cur_node, DecisionNode): - assert ( - cur_node.threshold == exp_val2[i] - ), f"Expected threshold {exp_val2[i]}, got {cur_node.threshold}" - assert ( - cur_node.split_idx == spl_idx2[i] - ), f"Expected split idx {spl_idx2[i]}, got {cur_node.split_idx}" + assert cur_node.threshold == exp_val2[i], ( + f"Expected threshold {exp_val2[i]}, got {cur_node.threshold}" + ) + assert cur_node.split_idx == spl_idx2[i], ( + f"Expected split idx {spl_idx2[i]}, got {cur_node.split_idx}" + ) if cur_node.left_child: queue.append(cur_node.left_child) if cur_node.right_child: queue.append(cur_node.right_child) i += 1 elif isinstance(cur_node, LeafNode): - assert ( - len(cur_node.value) == 1 - ), f"Expected {1} mean values, but got: {len(cur_node.value)}" + assert len(cur_node.value) == 1, ( + f"Expected {1} mean values, but got: {len(cur_node.value)}" + ) rec_node(root, 0) @@ -192,9 +193,9 @@ def test_entropy_single(): root = 
tree.root exp_val = [0.25, -0.75, 0] spl_idx = [0, 0, 1] - assert isinstance(root, LeafNode) or isinstance( - root, DecisionNode - ), f"root is not a node but {type(root)}" + assert isinstance(root, LeafNode) or isinstance(root, DecisionNode), ( + f"root is not a node but {type(root)}" + ) queue = [root] i = 0 # Loop over all the nodes @@ -203,21 +204,21 @@ def test_entropy_single(): if isinstance( cur_node, DecisionNode ): # Check threshold and idx of decision node - assert ( - cur_node.threshold == exp_val[i] - ), f"Expected threshold {exp_val[i]} on node={i}, got {cur_node.threshold} on split_idx {cur_node.split_idx} exp: {spl_idx[i]}" - assert ( - cur_node.split_idx == spl_idx[i] - ), f"Expected split idx {spl_idx[i]} on i={i}, got {cur_node.split_idx}" + assert cur_node.threshold == exp_val[i], ( + f"Expected threshold {exp_val[i]} on node={i}, got {cur_node.threshold} on split_idx {cur_node.split_idx} exp: {spl_idx[i]}" + ) + assert cur_node.split_idx == spl_idx[i], ( + f"Expected split idx {spl_idx[i]} on i={i}, got {cur_node.split_idx}" + ) if cur_node.left_child: queue.append(cur_node.left_child) if cur_node.right_child: queue.append(cur_node.right_child) i += 1 elif isinstance(cur_node, LeafNode): # Check that the value is of length 2 - assert ( - len(cur_node.value) == 2 - ), f"Expected 2 mean values, one for each class, but got: {len(cur_node.value)}" + assert len(cur_node.value) == 2, ( + f"Expected 2 mean values, one for each class, but got: {len(cur_node.value)}" + ) rec_node(root, 0) @@ -245,33 +246,65 @@ def test_entropy_multi(): exp_val = [0.25, -0.75, -0.75] # DIFFERENT FROM SKLEARN THEIRS IS: [0, 1, 1], both give pure leaf node spl_idx = [0, 1, 0] - assert isinstance(root, LeafNode) or isinstance( - root, DecisionNode - ), f"root is not a node but {type(root)}" + assert isinstance(root, LeafNode) or isinstance(root, DecisionNode), ( + f"root is not a node but {type(root)}" + ) queue = [root] i = 0 while len(queue) > 0: cur_node = queue.pop() if 
isinstance(cur_node, DecisionNode): - assert ( - cur_node.threshold == exp_val[i] - ), f"Expected threshold {exp_val[i]}, got {cur_node.threshold}" - assert ( - cur_node.split_idx == spl_idx[i] - ), f"Expected split idx {spl_idx[i]}, got {cur_node.split_idx}" + assert cur_node.threshold == exp_val[i], ( + f"Expected threshold {exp_val[i]}, got {cur_node.threshold}" + ) + assert cur_node.split_idx == spl_idx[i], ( + f"Expected split idx {spl_idx[i]}, got {cur_node.split_idx}" + ) if cur_node.left_child: queue.append(cur_node.left_child) if cur_node.right_child: queue.append(cur_node.right_child) i += 1 elif isinstance(cur_node, LeafNode): - assert ( - len(cur_node.value) == Y_unique - ), f"Expected {Y_unique} mean values, one for each class, but got: {len(cur_node.value)}" + assert len(cur_node.value) == Y_unique, ( + f"Expected {Y_unique} mean values, one for each class, but got: {len(cur_node.value)}" + ) rec_node(root, 0) +def test_squared_distance(): + N = 500 + Y_M = 4 + + # Create clear split on the uneven numbers + X = np.array([[0, 1] if x % 2 == 0 else [0, 2] for x in range(N)]) + # All the Y values for the even numbers gather around 0.0, and all the + # values for the uneven numbers gather around 10 + Y = np.array( + [ + np.random.normal(0.0, 1.0, Y_M) + if x % 2 == 0 + else np.random.normal(10.0, 1.0, Y_M) + for x in range(N) + ] + ) + + tree = DecisionTree(max_depth=1, tree_type="SquaredDistance") + tree.fit(X, Y) + + # Should only predict the values of the random normal + X_pred_1 = np.array([[0, 1] for _ in range(N // 2)]) + X_pred_2 = np.array([[0, 2] for _ in range(N // 2)]) + pred_1 = tree.predict(X_pred_1) + pred_2 = tree.predict(X_pred_2) + assert np.all(pred_1 == pred_1[0]), "All elements of pred_1 are not equal" + assert np.all(pred_2 == pred_2[0]), "All elements of pred_1 are not equal" + + assert abs(np.mean(pred_1) - 0.0) < 1, "Mean of pred_1 is not approximately 0.0" + assert abs(np.mean(pred_2) - 10.0) < 1, "Mean of pred_2 is not 
approximately 10.0" + + def sanity_regression(n, m): X = np.random.uniform(0, 100, (n, m)) Y1 = np.random.randint(0, 5, n) @@ -284,12 +317,12 @@ def sanity_regression(n, m): pred1 = tree1.predict(X) pred2 = tree2.predict(X) for i in range(n): - assert ( - abs(Y1[i] - pred1[i]) < 0.00001 - ), f"Square: Expected {Y1[i]} Got {pred1[i]}" - assert ( - abs(Y2[i] - pred2[i]) < 0.00001 - ), f"Square: Expected {Y2[i]} Got {pred2[i]}" + assert abs(Y1[i] - pred1[i]) < 0.00001, ( + f"Square: Expected {Y1[i]} Got {pred1[i]}" + ) + assert abs(Y2[i] - pred2[i]) < 0.00001, ( + f"Square: Expected {Y2[i]} Got {pred2[i]}" + ) def sanity_gini(n, m): @@ -347,8 +380,9 @@ def test_sanity(): if __name__ == "__main__": - test_gini_single() - test_gini_multi() - test_entropy_single() - test_entropy_multi() + # test_gini_single() + # test_gini_multi() + # test_entropy_single() + # test_entropy_multi() + test_squared_distance() # print("Done.") From 9086545ba9d1538f3d71eaa502a9877416adc111 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Mon, 10 Mar 2025 20:25:11 +0100 Subject: [PATCH 25/33] Fixed linting --- src/adaXT/criteria/criteria.pxd | 1 - src/adaXT/criteria/criteria.pyx | 23 ++++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index adedf30a..1f19e5b1 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -176,7 +176,6 @@ cdef class PairwiseDistance(RegressionCriteria): cdef inline double __euclidean_norm(self, int[::1] indices) - cdef inline double __get_square_sum(self, double[::1] arr1, double val1, double[::1] arr2, val2) diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 1adc16e7..db641be4 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -416,7 +416,7 @@ cdef class SquaredError(RegressionCriteria): self.right_sum += y_val self.weight_right += weight - # Instead of 
calculating the squared error fully, we calculate + # Instead of calculating the squared error fully, we calculate # - (1/n_L sum_{i in left} y_i^2 + 1/n_R sum_{i in right} y_i^2) return -((self.left_sum*self.left_sum) / self.weight_left + (self.right_sum*self.right_sum) / self.weight_right) @@ -451,7 +451,7 @@ cdef class SquaredDistance(RegressionCriteria): def __dealloc__(self): free(self.left_sum) free(self.right_sum) - + cdef inline void __reset_sums(self): memset(self.left_sum, 0, self.Y_cols*sizeof(double)) memset(self.right_sum, 0, self.Y_cols*sizeof(double)) @@ -468,7 +468,7 @@ cdef class SquaredDistance(RegressionCriteria): left_square_sum = 0.0 right_square_sum = 0.0 - + for j in range(self.Y_cols): for i in range(self.old_split, new_split): idx = indices[i] @@ -479,10 +479,9 @@ cdef class SquaredDistance(RegressionCriteria): left_square_sum += self.left_sum[j]*self.left_sum[j] right_square_sum += self.right_sum[j]*self.right_sum[j] - return -( left_square_sum / self.weight_left + + return -(left_square_sum / self.weight_left + right_square_sum / self.weight_right) - cdef double proxy_improvement(self, int[::1] indices, int split_idx): cdef: int i, idx, j @@ -519,9 +518,9 @@ cdef class SquaredDistance(RegressionCriteria): self.right_sum[j] += y_val right_square_sum += self.left_sum[j]*self.left_sum[j] - # Instead of calculating the squared error fully, we calculate + # Instead of calculating the squared error fully, we calculate # - (1/n_L sum_{i in left} y_i^2 + 1/n_R sum_{i in right} y_i^2) - return -( left_square_sum / self.weight_left + + return -(left_square_sum / self.weight_left + right_square_sum / self.weight_right) cpdef double impurity(self, int[::1] indices): @@ -542,7 +541,7 @@ cdef class SquaredDistance(RegressionCriteria): p = indices[i] tmp = self.Y[p, j] * self.sample_weight[p] cur_sum += tmp*tmp - square_dist += cur_sum / obs_weight - mu*mu + square_dist += cur_sum / obs_weight - mu*mu return square_dist @@ -571,7 +570,6 @@ cdef class 
PairwiseDistance(RegressionCriteria): int i, j, idx_i, idx_j int prev_n_right double tmp, weight_i, weight_j, square_sum - double[:] tmp_arr for i in range(self.old_split, new_split): idx_i = indices[i] weight_i = self.sample_weight[idx_i] @@ -585,7 +583,6 @@ cdef class PairwiseDistance(RegressionCriteria): tmp = sqrt(square_sum) self.left_dist_sum += tmp - prev_n_right = self.obs - i for j in range(self.right_start_idx, self.right_start_idx+prev_n_right): self.right_dist_sum -= self.right_indiv_dist[j] @@ -629,7 +626,7 @@ cdef class PairwiseDistance(RegressionCriteria): tmp = sqrt(square_sum) # i and j remain the same for current node checking self.left_dist_sum += tmp - + self.weight_left += weight_i for i in range(split_idx, n_obs): @@ -649,13 +646,13 @@ cdef class PairwiseDistance(RegressionCriteria): # 0 to 1 is the first distance. self.right_indiv_dist[indiv_idx] = tmp indiv_idx += 1 - + self.weight_right += weight_i # No proxy for EuclideanNorm, so calculate fully return (self.left_dist_sum * self.weight_left + self.right_dist_sum * self.weight_right) - + cpdef double impurity(self, int[::1] indices): return self.__euclidean_norm(indices) From 6ae4e3ed0788976c2cce342599dcced4b1b3bb3e Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Tue, 11 Mar 2025 13:51:18 +0100 Subject: [PATCH 26/33] Change SquaredDistance to MultiSquaredError --- src/adaXT/base_model.pyx | 4 ++-- src/adaXT/criteria/__init__.pxd | 2 +- src/adaXT/criteria/__init__.py | 2 +- src/adaXT/criteria/criteria.pxd | 2 +- src/adaXT/criteria/criteria.pyi | 2 +- src/adaXT/criteria/criteria.pyx | 5 +++-- tests/test_decision_tree.py | 10 +++++----- 7 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/adaXT/base_model.pyx b/src/adaXT/base_model.pyx index 92293e7e..2112910d 100644 --- a/src/adaXT/base_model.pyx +++ b/src/adaXT/base_model.pyx @@ -1,7 +1,7 @@ from numpy import float64 as DOUBLE from .predictor import Predictor from .criteria import Criteria -from .criteria.criteria 
import Entropy, SquaredError, PartialQuadratic, SquaredDistance +from .criteria.criteria import Entropy, SquaredError, PartialQuadratic, MultiSquaredError from .decision_tree.splitter import Splitter from .leaf_builder import LeafBuilder @@ -134,7 +134,7 @@ class BaseModel(): "Regression": [SquaredError, PredictorRegression, LeafBuilderRegression], "Gradient": [PartialQuadratic, PredictorLocalPolynomial, LeafBuilderPartialQuadratic], "Quantile": [SquaredError, PredictorQuantile, LeafBuilderRegression], - "SquaredDistance": [SquaredDistance, PredictorRegression, + "MultiRegression": [MultiSquaredError, PredictorRegression, LeafBuilderRegression] } if tree_type in tree_types.keys(): diff --git a/src/adaXT/criteria/__init__.pxd b/src/adaXT/criteria/__init__.pxd index 1ab937c0..f448c25e 100644 --- a/src/adaXT/criteria/__init__.pxd +++ b/src/adaXT/criteria/__init__.pxd @@ -7,5 +7,5 @@ from .criteria cimport ( SquaredError, PartialLinear, PartialQuadratic, - SquaredDistance + MultiSquaredError ) diff --git a/src/adaXT/criteria/__init__.py b/src/adaXT/criteria/__init__.py index 9c71abf6..43e92394 100644 --- a/src/adaXT/criteria/__init__.py +++ b/src/adaXT/criteria/__init__.py @@ -7,5 +7,5 @@ PartialLinear, PartialQuadratic, Criteria, - SquaredDistance, + MultiSquaredError, ) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index 1f19e5b1..ed845c23 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -150,7 +150,7 @@ cdef class SquaredError(RegressionCriteria): double The variance of the response y """ -cdef class SquaredDistance(RegressionCriteria): +cdef class MultiSquaredError(RegressionCriteria): cdef: double* left_sum double* right_sum diff --git a/src/adaXT/criteria/criteria.pyi b/src/adaXT/criteria/criteria.pyi index 492cd4a8..a633a831 100644 --- a/src/adaXT/criteria/criteria.pyi +++ b/src/adaXT/criteria/criteria.pyi @@ -69,7 +69,7 @@ class SquaredError(RegressionCriteria): pass -class 
SquaredDistance(RegressionCriteria): +class MultiSquaredError(RegressionCriteria): pass class PairwiseDistance(RegressionCriteria): diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index db641be4..9749873f 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -441,7 +441,7 @@ cdef class SquaredError(RegressionCriteria): square_err = cur_sum/obs_weight - mu*mu return square_err -cdef class SquaredDistance(RegressionCriteria): +cdef class MultiSquaredError(RegressionCriteria): def __init__(self, double[:, ::1] X, double[:, ::1] Y, double[::1] sample_weight): super().__init__(X, Y, sample_weight) self.Y_cols = Y.shape[1] @@ -545,6 +545,7 @@ cdef class SquaredDistance(RegressionCriteria): return square_dist +# TODO: Rename PairwiseEuclideanDistance cdef class PairwiseDistance(RegressionCriteria): def __init__(self, double[:, ::1] X, double[:, ::1] Y, double[::1] sample_weight): super().__init__(X, Y, sample_weight) @@ -663,7 +664,7 @@ cdef class PairwiseDistance(RegressionCriteria): int n_indices = indices.shape[0] double weight_i, weight_j, square_sum - for i in range(n_indices): + for i in range(n_indices-1): idx_i = indices[i] weight_i = self.sample_weight[idx_i] for j in range(i+1, n_indices): diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py index 1a969e81..d442af4e 100644 --- a/tests/test_decision_tree.py +++ b/tests/test_decision_tree.py @@ -5,7 +5,7 @@ Entropy, PartialLinear, PartialQuadratic, - SquaredDistance, + MultiSquaredError, ) import numpy as np @@ -273,7 +273,7 @@ def test_entropy_multi(): rec_node(root, 0) -def test_squared_distance(): +def test_multi_squared(): N = 500 Y_M = 4 @@ -290,7 +290,7 @@ def test_squared_distance(): ] ) - tree = DecisionTree(max_depth=1, tree_type="SquaredDistance") + tree = DecisionTree(tree_type="MultiRegression") tree.fit(X, Y) # Should only predict the values of the random normal @@ -384,5 +384,5 @@ def test_sanity(): # test_gini_multi() # 
test_entropy_single() # test_entropy_multi() - test_squared_distance() - # print("Done.") + test_multi_squared() + print("Done.") From 6f3433299b4a4e986df12afd4338e2b7b5eb23e1 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 14 Mar 2025 08:44:26 +0100 Subject: [PATCH 27/33] Add checks back, when debugging --- setup.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/setup.py b/setup.py index b536292b..59c1d09c 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ DEBUG = False PROFILE = False -ANNOTATE = True +ANNOTATE = False # Make all pyx files for the decision_tree ext = ".pyx" if USE_CYTHON else ".cpp" @@ -100,15 +100,6 @@ def run_build(): from Cython.Compiler.Options import get_directive_defaults compiler_directives = get_directive_defaults() - compiler_directives.update( - { - "boundscheck": False, - "wraparound": False, - "cdivision": True, - "initializedcheck": False, - "nonecheck": False, - } - ) if PROFILE: compiler_directives["profile"] = True @@ -125,6 +116,27 @@ def run_build(): if ANNOTATE: arg_dir["annotate"] = True + if DEBUG: + compiler_directives.update( + { + "boundscheck": True, + "wraparound": True, + "cdivision": False, + "initializedcheck": True, + "nonecheck": True, + } + ) + else: + compiler_directives.update( + { + "boundscheck": False, + "wraparound": False, + "cdivision": True, + "initializedcheck": False, + "nonecheck": False, + } + ) + extensions = cythonize(extensions, **arg_dir) setup( name=NAME, From 0535a656c2feb03282ba642d8d91c7560a7021e6 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 14 Mar 2025 08:45:18 +0100 Subject: [PATCH 28/33] Added equivalent test between MultiSquared and Pairwise --- tests/test_decision_tree.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py index d442af4e..60191262 100644 --- a/tests/test_decision_tree.py +++ 
b/tests/test_decision_tree.py @@ -6,6 +6,7 @@ PartialLinear, PartialQuadratic, MultiSquaredError, + PairwiseEuclideanDistance, ) import numpy as np @@ -305,6 +306,38 @@ def test_multi_squared(): assert abs(np.mean(pred_2) - 10.0) < 1, "Mean of pred_2 is not approximately 10.0" +def test_Pairwise(): + N = 500 + Y_M = 4 + + # Create clear split on the uneven numbers + X = np.array([[0, 1] if x % 2 == 0 else [0, 2] for x in range(N)]) + # All the Y values for the even numbers gather around 0.0, and all the + # values for the uneven numbers gather around 10 + Y = np.array( + [ + np.random.normal(0.0, 1.0, Y_M) + if x % 2 == 0 + else np.random.normal(10.0, 1.0, Y_M) + for x in range(N) + ] + ) + + tree = DecisionTree(tree_type="MultiRegression", criteria=PairwiseEuclideanDistance) + tree.fit(X, Y) + + # Should only predict the values of the random normal + X_pred_1 = np.array([[0, 1] for _ in range(N // 2)]) + X_pred_2 = np.array([[0, 2] for _ in range(N // 2)]) + pred_1 = tree.predict(X_pred_1) + pred_2 = tree.predict(X_pred_2) + assert np.all(pred_1 == pred_1[0]), "All elements of pred_1 are not equal" + assert np.all(pred_2 == pred_2[0]), "All elements of pred_1 are not equal" + + assert abs(np.mean(pred_1) - 0.0) < 1, "Mean of pred_1 is not approximately 0.0" + assert abs(np.mean(pred_2) - 10.0) < 1, "Mean of pred_2 is not approximately 10.0" + + def sanity_regression(n, m): X = np.random.uniform(0, 100, (n, m)) Y1 = np.random.randint(0, 5, n) From 8a7035971174803247e11112f920d69c110f39e8 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 14 Mar 2025 08:46:07 +0100 Subject: [PATCH 29/33] Finished PairwiseEuclideanDistance --- src/adaXT/criteria/__init__.pxd | 1 + src/adaXT/criteria/__init__.py | 1 + src/adaXT/criteria/criteria.pxd | 2 +- src/adaXT/criteria/criteria.pyi | 40 ++++++++++++++++++++++++++++----- src/adaXT/criteria/criteria.pyx | 40 ++++++++++++++++++--------------- 5 files changed, 60 insertions(+), 24 deletions(-) diff --git 
a/src/adaXT/criteria/__init__.pxd b/src/adaXT/criteria/__init__.pxd index f448c25e..b707d060 100644 --- a/src/adaXT/criteria/__init__.pxd +++ b/src/adaXT/criteria/__init__.pxd @@ -8,4 +8,5 @@ from .criteria cimport ( PartialLinear, PartialQuadratic, MultiSquaredError + PairwiseEuclideanDistance ) diff --git a/src/adaXT/criteria/__init__.py b/src/adaXT/criteria/__init__.py index 43e92394..826011d6 100644 --- a/src/adaXT/criteria/__init__.py +++ b/src/adaXT/criteria/__init__.py @@ -8,4 +8,5 @@ PartialQuadratic, Criteria, MultiSquaredError, + PairwiseEuclideanDistance, ) diff --git a/src/adaXT/criteria/criteria.pxd b/src/adaXT/criteria/criteria.pxd index ed845c23..5f722dce 100644 --- a/src/adaXT/criteria/criteria.pxd +++ b/src/adaXT/criteria/criteria.pxd @@ -166,7 +166,7 @@ cdef class MultiSquaredError(RegressionCriteria): cpdef double impurity(self, int[::1] indices) -cdef class PairwiseDistance(RegressionCriteria): +cdef class PairwiseEuclideanDistance(RegressionCriteria): cdef: double left_dist_sum, right_dist_sum double weight_left, weight_right diff --git a/src/adaXT/criteria/criteria.pyi b/src/adaXT/criteria/criteria.pyi index a633a831..829f5562 100644 --- a/src/adaXT/criteria/criteria.pyi +++ b/src/adaXT/criteria/criteria.pyi @@ -19,7 +19,7 @@ class GiniIndex(ClassificationCriteria): Formally, given class labels $\mathcal{L}$, the Gini index in a node consisting of samples $I$, is given by $$ - \text{Gini\_index} = 1 - \sum_{k\in \mathcal{L}} P[k]^2, + \text{GiniIndex} = 1 - \sum_{k\in \mathcal{L}} P[k]^2, $$ where $P[k]$ denotes the fraction of samples in $I$ with class label $k$. @@ -55,7 +55,7 @@ class SquaredError(RegressionCriteria): leads to standard CART splits. Formally, the squared error in a node consisting of samples $I$, is given by $$ - \text{Squared\_error} = \tfrac{1}{|I|}\sum_{i\in I} + \text{SquaredError} = \tfrac{1}{|I|}\sum_{i\in I} \Big(Y[i] - \tfrac{1}{|I|}\sum_{i\in I} Y[i]\Big)^2, $$ where $Y[i]$ denotes the response value at sample $i$. 
@@ -70,9 +70,39 @@ class SquaredError(RegressionCriteria): pass class MultiSquaredError(RegressionCriteria): + r""" + Multi dimensional squared error criteria. With Y-values in one-dimension, it + is equivalent to the SquaredError criteria. However, this criteria is able + to function with Y-values in multiple dimensions. Formally, the + MultiSquaredError in a node consisting of samples $I$ and Y-values of $D$ + dimensions, is given by: + $$ + \text{MultiSquaredError} = \tfrac{1}{|I|} \sum^D_{j = 1} \sum_{i \in I} + \left(Y[i, j] - \tfrac{1}{|I|}\sum_{i \in I} Y[i, j]\right)^2 + $$ + + For a faster, but equivalent calculation, it is computed as: + $$ + \text{MultiSquaredError} = \tfrac{1}{|I|} \sum^D_{j = 1} \left(\sum_{i\in I} Y[i, j]^2 + - \tfrac{1}{|I|}\Big(\sum_{i\in I} Y[i, j]\Big)^2 \right) + $$ + """ + pass -class PairwiseDistance(RegressionCriteria): +class PairwiseEuclideanDistance(RegressionCriteria): + r""" + Pairwise Euclidean Distance criteria. Generally performs in a similar + fashion to the MultiSquaredError. However, instead of the squared error + compared with the mean, it instead minimizes the individual distance between + points in a node. Formally, the PairwiseEuclideanDistance in a node + consisting of samples $I$ and Y-values of $D$ dimensions is given by: + $$ + \text{PairwiseEuclideanDistance} = \tfrac{1}{|I|} \sum_{i = 1}^{|I| - 1} + \sum_{j = i + 1}^{|I|} \sqrt{\sum_{k = 1}^{D} (Y[I[i], k] - Y[I[j], k])^2} + $$ + """ + pass class PartialLinear(RegressionCriteria): @@ -81,7 +111,7 @@ class PartialLinear(RegressionCriteria): variable in each leaf. Formally, in a node consisting of samples $I$, it is given by $$ - \text{Partial\_linear} = \tfrac{1}{|I|}\sum_{i \in I} + \text{PartialLinear} = \tfrac{1}{|I|}\sum_{i \in I} (Y[i] - \widehat{\theta}_0 - \widehat{\theta}_1 X[i, 0])^2, $$ where $Y[i]$ and $X[i, 0]$ denote the response value and @@ -99,7 +129,7 @@ class PartialQuadratic(RegressionCriteria): variable in each leaf.
Formally, in a node consisting of samples $I$, it is given by $$ - \text{Partial\_quadratic} = \tfrac{1}{|I|}\sum_{i \in I} + \text{PartialQuadratic} = \tfrac{1}{|I|}\sum_{i \in I} (Y[i] - \widehat{\theta}_0 - \widehat{\theta}_1 X[i, 0] - \widehat{\theta}_2 X[i, 0]^2)^2, $$ where $Y[i]$ and $X[i, 0]$ denote the response value and diff --git a/src/adaXT/criteria/criteria.pyx b/src/adaXT/criteria/criteria.pyx index 9749873f..3f0c3f9f 100644 --- a/src/adaXT/criteria/criteria.pyx +++ b/src/adaXT/criteria/criteria.pyx @@ -463,6 +463,8 @@ cdef class MultiSquaredError(RegressionCriteria): double left_square_sum, right_square_sum for i in range(self.old_split, new_split): + idx = indices[i] + weight = self.sample_weight[idx] self.weight_left += weight self.weight_right -= weight @@ -545,14 +547,11 @@ cdef class MultiSquaredError(RegressionCriteria): return square_dist -# TODO: Rename PairwiseEuclideanDistance -cdef class PairwiseDistance(RegressionCriteria): +cdef class PairwiseEuclideanDistance(RegressionCriteria): def __init__(self, double[:, ::1] X, double[:, ::1] Y, double[::1] sample_weight): super().__init__(X, Y, sample_weight) - # Initialize two empty arrays for storing the sum of each Y[:, i] in a - # split - self.left_indiv_dist = np.zeros((Y.shape[0], Y.shape[0]-1), dtype=np.float64) - self.right_indiv_dist = np.zeros((Y.shape[0], Y.shape[0]-1), dtype=np.float64) + # Right individual distances + self.right_indiv_dist = np.zeros((Y.shape[0]*Y.shape[0],), dtype=np.float64) self.Y_cols = Y.shape[1] cdef inline double __get_square_sum(self, double[::1] arr1, double val1, @@ -568,13 +567,14 @@ cdef class PairwiseDistance(RegressionCriteria): cdef double update_proxy(self, int[::1] indices, int new_split): cdef: - int i, j, idx_i, idx_j + int i, j, idx_i, idx_j, n_left int prev_n_right double tmp, weight_i, weight_j, square_sum + n_left = self.old_split for i in range(self.old_split, new_split): idx_i = indices[i] weight_i = self.sample_weight[idx_i] - for j in 
range(self.old_split): + for j in range(n_left): idx_j = indices[j] weight_j = self.sample_weight[idx_j] square_sum = self.__get_square_sum( @@ -592,18 +592,20 @@ cdef class PairwiseDistance(RegressionCriteria): self.weight_left += weight_i self.weight_right -= weight_i + n_left = i + # No proxy for EuclideanNorm, so calculate fully return (self.left_dist_sum * self.weight_left + self.right_dist_sum * self.weight_right) cdef double proxy_improvement(self, int[::1] indices, int split_idx): cdef: - int i, j, idx_i, idx_j + int i, j, idx_i, idx_j, indiv_idx int n_obs = indices.shape[0] double weight_i, weight_j, tmp, square_sum - int indiv_idx = 0 # reset the start idx for self.right_indiv_dist + indiv_idx = 0 self.right_start_idx = indiv_idx self.weight_left = 0.0 @@ -611,12 +613,12 @@ cdef class PairwiseDistance(RegressionCriteria): # Create individual distances for right half, as we will be wanting to # subtract a point by removing its distance to all other nodes - self.right_indiv_dist = np.zeros(n_obs) + self.right_indiv_dist = np.zeros((n_obs*n_obs,), dtype=np.float64) # Calculate sum of each Y point - for i in range(split_idx): + for i in range(split_idx-1): idx_i = indices[i] - weight_i = self.sample_weight[i] + weight_i = self.sample_weight[idx_i] for j in range(i+1, n_obs): idx_j = indices[j] weight_j = self.sample_weight[idx_j] @@ -630,9 +632,9 @@ cdef class PairwiseDistance(RegressionCriteria): self.weight_left += weight_i - for i in range(split_idx, n_obs): + for i in range(split_idx, n_obs-1): idx_i = indices[i] - weight_i = self.sample_weight[i] + weight_i = self.sample_weight[idx_i] for j in range(i+1, n_obs): idx_j = indices[j] weight_j = self.sample_weight[idx_j] @@ -662,21 +664,23 @@ cdef class PairwiseDistance(RegressionCriteria): double dist_sum = 0.0 int i, j, idx_i, idx_j int n_indices = indices.shape[0] - double weight_i, weight_j, square_sum + double weight_i, weight_j, square_sum, tot_weight + tot_weight = 0.0 for i in range(n_indices-1): 
idx_i = indices[i] weight_i = self.sample_weight[idx_i] for j in range(i+1, n_indices): - idx_j = indices[idx_j] + idx_j = indices[j] weight_j = self.sample_weight[idx_j] square_sum = self.__get_square_sum( self.Y[idx_i, :], weight_i, self.Y[idx_j, :], weight_j ) dist_sum += sqrt(square_sum) + tot_weight += weight_i - return dist_sum + return dist_sum / tot_weight # Partial linear criteria cdef class PartialLinear(RegressionCriteria): From 101d0a108d1af6dbe663d06ea95d81a2e2ab6f39 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 14 Mar 2025 08:49:48 +0100 Subject: [PATCH 30/33] Added mkdocs serve to makefile, for easier documentation --- Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile b/Makefile index e61ab036..f706895c 100644 --- a/Makefile +++ b/Makefile @@ -14,5 +14,12 @@ clean: lint: cython-lint src/* --max-line-length=127 +mkdocs_install: + pip install mkdocs mkdocs-material mkdocstrings 'mkdocstrings[python, cython]' mkdocs-autorefs pymdown-extensions + +mkdocs: mkdocs_install + mkdocs serve + + test_pypi: pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple adaXT From 6ad4c917fc751e450b1786b79c72e35d290eb6d9 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 14 Mar 2025 08:55:18 +0100 Subject: [PATCH 31/33] Added random seed to tests --- tests/test_decision_tree.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py index 60191262..45913224 100644 --- a/tests/test_decision_tree.py +++ b/tests/test_decision_tree.py @@ -275,6 +275,7 @@ def test_entropy_multi(): def test_multi_squared(): + np.random.seed(2025) N = 500 Y_M = 4 @@ -307,6 +308,7 @@ def test_multi_squared(): def test_Pairwise(): + np.random.seed(2025) N = 500 Y_M = 4 From 58f47a13783b698dc6bdcea709c2b047addc27d0 Mon Sep 17 00:00:00 2001 From: Simon Vinding Brodersen Date: Fri, 14 Mar 2025 14:01:59 +0100 Subject: [PATCH 32/33] Version number 
update 1.5.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 59c1d09c..3358c711 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import os NAME = "adaXT" -VERSION = "1.4.0" +VERSION = "1.5.0" DESCRIPTION = "A Python package for tree-based regression and classification" PROJECT_URLS = { "Documentation": "https://NiklasPfister.github.io/adaXT/", From eb6fbfe06f0e2d58c181fc6761dd2c979ed727c8 Mon Sep 17 00:00:00 2001 From: Niklas Andreas Pfister Date: Fri, 14 Mar 2025 13:04:29 +0000 Subject: [PATCH 33/33] Automated autopep8 fixes --- src/adaXT/decision_tree/decision_tree.py | 24 +++++++++++----- src/adaXT/decision_tree/tree_utils.py | 32 +++++++++++++++------ src/adaXT/parallel.py | 6 ++-- src/adaXT/random_forest/random_forest.py | 36 +++++++++++++++--------- tests/test_decision_tree.py | 16 +++++++---- 5 files changed, 77 insertions(+), 37 deletions(-) diff --git a/src/adaXT/decision_tree/decision_tree.py b/src/adaXT/decision_tree/decision_tree.py index 9aa88126..77a59d56 100644 --- a/src/adaXT/decision_tree/decision_tree.py +++ b/src/adaXT/decision_tree/decision_tree.py @@ -156,7 +156,8 @@ def fit( self.leaf_builder, self.predictor, ) - self.max_features = self._check_max_features(self.max_features, X.shape[0]) + self.max_features = self._check_max_features( + self.max_features, X.shape[0]) self._tree = _DecisionTree( max_depth=self.max_depth, @@ -177,8 +178,10 @@ def fit( self._tree.n_features = X.shape[1] if not self.skip_check_input: - sample_weight = self._check_sample_weight(sample_weight=sample_weight) - sample_indices = self._check_sample_indices(sample_indices=sample_indices) + sample_weight = self._check_sample_weight( + sample_weight=sample_weight) + sample_indices = self._check_sample_indices( + sample_indices=sample_indices) builder = DepthTreeBuilder( X=X, @@ -298,11 +301,18 @@ def predict_leaf(self, X: ArrayLike | None) -> dict: return self._tree.predict_leaf(X=X) def _tree_based_weights( - 
self, hash0: dict, hash1: dict, size_X0: int, size_X1: int, scaling: str - ) -> np.ndarray: + self, + hash0: dict, + hash1: dict, + size_X0: int, + size_X1: int, + scaling: str) -> np.ndarray: return self._tree._tree_based_weights( - hash0=hash0, hash1=hash1, size_X0=size_X0, size_X1=size_X1, scaling=scaling - ) + hash0=hash0, + hash1=hash1, + size_X0=size_X0, + size_X1=size_X1, + scaling=scaling) def similarity(self, X0: ArrayLike, X1: ArrayLike) -> np.ndarray: """ diff --git a/src/adaXT/decision_tree/tree_utils.py b/src/adaXT/decision_tree/tree_utils.py index 8eea8da9..07de6d3d 100644 --- a/src/adaXT/decision_tree/tree_utils.py +++ b/src/adaXT/decision_tree/tree_utils.py @@ -121,25 +121,31 @@ def get_label(**kwargs): if isinstance(node, DecisionNode): node_string += "DecisionNode" + new_line node_string += f"X{node.split_idx} <= " - node_string += str(round(node.threshold, impurity_precision)) + new_line + node_string += str(round(node.threshold, + impurity_precision)) + new_line if kwargs["impurity"]: node_string += "Impurity: " - node_string += str(round(node.impurity, impurity_precision)) + new_line + node_string += str(round(node.impurity, + impurity_precision)) + new_line elif isinstance(node, LeafNode): node_string += "LeafNode" + new_line if kwargs["impurity"]: node_string += "Impurity: " - node_string += str(round(node.impurity, impurity_precision)) + new_line + node_string += str(round(node.impurity, + impurity_precision)) + new_line node_string += "Samples: " - node_string += str(round(node.weighted_samples, impurity_precision)) + new_line + node_string += str(round(node.weighted_samples, + impurity_precision)) + new_line node_string += "Value: " if len(node.value) == 1: node_string += str(round(node.value[0], node_precision)) else: node_value_string = "\n [" value_length = len(node.value) - n_vals_per_line = max(value_length / 3, 4) # Number of values per line + n_vals_per_line = max( + value_length / 3, + 4) # Number of values per line for i in 
range(value_length): node_value_string += str(round(node.value[i], node_precision)) if (i + 1) % n_vals_per_line == 0 and i != value_length - 1: @@ -162,12 +168,20 @@ def __init__(self, node, parent=None, depth=0, number=1, **kwargs): if node.left_child is not None: lst.append( - DrawTree(node.left_child, self, depth + 1, number=1, **kwargs) - ) + DrawTree( + node.left_child, + self, + depth + 1, + number=1, + **kwargs)) if node.right_child is not None: lst.append( - DrawTree(node.right_child, self, depth + 1, number=2, **kwargs) - ) + DrawTree( + node.right_child, + self, + depth + 1, + number=2, + **kwargs)) self.children = lst self.parent = parent self.thread = None diff --git a/src/adaXT/parallel.py b/src/adaXT/parallel.py index 87372693..fcb16918 100644 --- a/src/adaXT/parallel.py +++ b/src/adaXT/parallel.py @@ -19,7 +19,8 @@ def shared_numpy_array(array) -> np.ndarray: elif array.ndim == 1: row = array.shape[0] shared_array = RawArray(ctypes.c_double, row) - shared_array_np = np.ndarray(shape=row, dtype=np.double, buffer=shared_array) + shared_array_np = np.ndarray( + shape=row, dtype=np.double, buffer=shared_array) else: raise ValueError("Array is neither 1 dimensional nor 2 dimensional") np.copyto(shared_array_np, array) @@ -205,7 +206,8 @@ def async_apply( ret = [partial_func() for _ in range(n_iterations)] else: with self.ctx.Pool(n_jobs) as p: - promise = [p.apply_async(partial_func) for _ in range(n_iterations)] + promise = [p.apply_async(partial_func) + for _ in range(n_iterations)] ret = [res.get() for res in promise] return ret diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py index 1b474e47..d0483e3b 100644 --- a/src/adaXT/random_forest/random_forest.py +++ b/src/adaXT/random_forest/random_forest.py @@ -64,7 +64,8 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) + 
resample_size0 = np.min( + [sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -74,7 +75,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"] :], + indices[sampling_args["split"]:], size=resample_size1, replace=sampling_args["replace"], ) @@ -85,7 +86,8 @@ def get_sample_indices( resample_size0 = sampling_args["size"] resample_size1 = sampling_args["size"] else: - resample_size0 = np.min([sampling_args["split"], sampling_args["size"]]) + resample_size0 = np.min( + [sampling_args["split"], sampling_args["size"]]) resample_size1 = np.min( [X_n_rows - sampling_args["split"], sampling_args["size"]] ) @@ -95,7 +97,7 @@ def get_sample_indices( replace=sampling_args["replace"], ) pred_indices = gen.choice( - indices[sampling_args["split"] :], + indices[sampling_args["split"]:], size=resample_size1, replace=sampling_args["replace"], ) @@ -151,11 +153,17 @@ def build_single_tree( predictor=predictor, splitter=splitter, ) - tree.fit(X=X, Y=Y, sample_indices=fitting_indices, sample_weight=sample_weight) + tree.fit( + X=X, + Y=Y, + sample_indices=fitting_indices, + sample_weight=sample_weight) if honest_tree: tree.refit_leaf_nodes( - X=X, Y=Y, sample_weight=sample_weight, sample_indices=prediction_indices - ) + X=X, + Y=Y, + sample_weight=sample_weight, + sample_indices=prediction_indices) return tree @@ -341,7 +349,8 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict: if "size" not in sampling_args: sampling_args["size"] = self.X_n_rows elif isinstance(sampling_args["size"], float): - sampling_args["size"] = int(sampling_args["size"] * self.X_n_rows) + sampling_args["size"] = int( + sampling_args["size"] * self.X_n_rows) elif not isinstance(sampling_args["size"], int): raise ValueError( "The provided sampling_args['size'] is not an integer or float as required." 
@@ -412,8 +421,7 @@ def __build_trees(self) -> None: sampling=self.sampling, ) self.fitting_indices, self.prediction_indices, self.out_of_bag_indices = zip( - *indices - ) + *indices) self.trees = self.parallel.starmap( build_single_tree, map_input=zip(self.fitting_indices, self.prediction_indices), @@ -436,9 +444,8 @@ def __build_trees(self) -> None: n_jobs=self.n_jobs_fit, ) - def fit( - self, X: ArrayLike, Y: ArrayLike, sample_weight: ArrayLike | None = None - ) -> None: + def fit(self, X: ArrayLike, Y: ArrayLike, + sample_weight: ArrayLike | None = None) -> None: """ Fit the random forest with training data (X, Y). @@ -470,7 +477,8 @@ def fit( self.X = shared_numpy_array(X) self.Y = shared_numpy_array(Y) self.X_n_rows, self.n_features = self.X.shape - self.max_features = self._check_max_features(self.max_features, X.shape[0]) + self.max_features = self._check_max_features( + self.max_features, X.shape[0]) self.sample_weight = self._check_sample_weight(sample_weight) self.sampling_args = self.__get_sampling_parameter(self.sampling_args) diff --git a/tests/test_decision_tree.py b/tests/test_decision_tree.py index 45913224..d842c6f9 100644 --- a/tests/test_decision_tree.py +++ b/tests/test_decision_tree.py @@ -303,8 +303,10 @@ def test_multi_squared(): assert np.all(pred_1 == pred_1[0]), "All elements of pred_1 are not equal" assert np.all(pred_2 == pred_2[0]), "All elements of pred_1 are not equal" - assert abs(np.mean(pred_1) - 0.0) < 1, "Mean of pred_1 is not approximately 0.0" - assert abs(np.mean(pred_2) - 10.0) < 1, "Mean of pred_2 is not approximately 10.0" + assert abs(np.mean(pred_1) - + 0.0) < 1, "Mean of pred_1 is not approximately 0.0" + assert abs(np.mean(pred_2) - + 10.0) < 1, "Mean of pred_2 is not approximately 10.0" def test_Pairwise(): @@ -325,7 +327,9 @@ def test_Pairwise(): ] ) - tree = DecisionTree(tree_type="MultiRegression", criteria=PairwiseEuclideanDistance) + tree = DecisionTree( + tree_type="MultiRegression", + 
criteria=PairwiseEuclideanDistance) tree.fit(X, Y) # Should only predict the values of the random normal @@ -336,8 +340,10 @@ def test_Pairwise(): assert np.all(pred_1 == pred_1[0]), "All elements of pred_1 are not equal" assert np.all(pred_2 == pred_2[0]), "All elements of pred_1 are not equal" - assert abs(np.mean(pred_1) - 0.0) < 1, "Mean of pred_1 is not approximately 0.0" - assert abs(np.mean(pred_2) - 10.0) < 1, "Mean of pred_2 is not approximately 10.0" + assert abs(np.mean(pred_1) - + 0.0) < 1, "Mean of pred_1 is not approximately 0.0" + assert abs(np.mean(pred_2) - + 10.0) < 1, "Mean of pred_2 is not approximately 10.0" def sanity_regression(n, m):