diff --git a/pyproject.toml b/pyproject.toml
index b1701892..51c7fe31 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools", "Cython", "numpy", "wheel"]
+requires = ["setuptools", "Cython==3.1.0", "numpy", "wheel"]
 
 [tool.cython-lint]
 max-line-length = 127
diff --git a/requirements.txt b/requirements.txt
index 1d98842e..efc2e9e4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 numpy>=1.25.0
 matplotlib>=3.7.1
-cython>=3.0.0
\ No newline at end of file
+cython==3.1.0
diff --git a/setup.py b/setup.py
index 3358c711..f6a0931d 100644
--- a/setup.py
+++ b/setup.py
@@ -140,6 +140,7 @@ def run_build():
     extensions = cythonize(extensions, **arg_dir)
     setup(
         name=NAME,
+        license="BSD-3-Clause",
         version=VERSION,
         description=DESCRIPTION,
         long_description=LONG_DESCRIPTION,
@@ -160,7 +161,6 @@ def run_build():
         classifiers=[
             "Programming Language :: Python :: 3",
             "Intended Audience :: Science/Research",
-            "License :: OSI Approved :: BSD License",
             "Operating System :: OS Independent",
         ],
         extras_require=extras,
diff --git a/src/adaXT/decision_tree/_decision_tree.pyx b/src/adaXT/decision_tree/_decision_tree.pyx
index 508fb81b..cf86e3e4 100644
--- a/src/adaXT/decision_tree/_decision_tree.pyx
+++ b/src/adaXT/decision_tree/_decision_tree.pyx
@@ -3,7 +3,7 @@ import sys
 cimport numpy as cnp
 ctypedef cnp.float64_t DOUBLE_t
-ctypedef cnp.int64_t LONG_t
+ctypedef cnp.int32_t INT32_T
 
 from libcpp cimport bool
 
@@ -16,7 +16,6 @@ from .nodes import DecisionNode
 
 # for c level definitions
-cimport cython
 from .nodes cimport DecisionNode, Node
 from ..utils cimport dsum
 
@@ -27,7 +26,7 @@ cdef double EPSILON = np.finfo('double').eps
 cdef class refit_object(Node):
     cdef public:
         list list_idx
-        bint is_left
+        bool is_left
 
     def __init__(
         self,
@@ -44,9 +43,7 @@ cdef class refit_object(Node):
 
     def add_idx(self, idx: int) -> None:
         self.list_idx.append(idx)
-
-@cython.auto_pickle(True)
-cdef class _DecisionTree():
+cdef class _DecisionTree:
     cdef public:
         object criteria
         object splitter
@@ -180,7 +177,7 @@ cdef class _DecisionTree():
     cdef void __fit_new_leaf_nodes(self, cnp.ndarray[DOUBLE_t, ndim=2] X,
                                    cnp.ndarray[DOUBLE_t, ndim=2] Y,
                                    cnp.ndarray[DOUBLE_t, ndim=1] sample_weight,
-                                   cnp.ndarray[LONG_t, ndim=1] sample_indices):
+                                   cnp.ndarray[INT32_T, ndim=1] sample_indices):
         cdef:
             int idx, n_objs, depth, cur_split_idx
             double cur_threshold
@@ -328,7 +325,7 @@ cdef class _DecisionTree():
                          cnp.ndarray[DOUBLE_t, ndim=2] X,
                          cnp.ndarray[DOUBLE_t, ndim=2] Y,
                          cnp.ndarray[DOUBLE_t, ndim=1] sample_weight,
-                         cnp.ndarray[LONG_t, ndim=1] sample_indices) -> None:
+                         cnp.ndarray[INT32_T, ndim=1] sample_indices) -> None:
 
         if self.root is None:
             raise ValueError("The tree has not been trained before trying to\
@@ -343,6 +340,10 @@ cdef class _DecisionTree():
         # Now squash all the DecisionNodes not visited
         self.__squash_tree()
 
+        # Make sure that predictor_instance points to the same root, if we have
+        # changed it
+        self.predictor_instance.root = self.root
+
 # From below here, it is the DepthTreeBuilder
 class queue_obj:
diff --git a/src/adaXT/predictor/predictor.pxd b/src/adaXT/predictor/predictor.pxd
index 77c18433..82300fec 100644
--- a/src/adaXT/predictor/predictor.pxd
+++ b/src/adaXT/predictor/predictor.pxd
@@ -7,7 +7,7 @@ cdef class Predictor():
         cnp.ndarray X
         cnp.ndarray Y
         int n_features
-        Node root
+    cdef public Node root
 
     cpdef dict predict_leaf(self, double[:, ::1] X)
diff --git a/src/adaXT/random_forest/random_forest.py b/src/adaXT/random_forest/random_forest.py
index d0483e3b..185ad4f7 100644
--- a/src/adaXT/random_forest/random_forest.py
+++ b/src/adaXT/random_forest/random_forest.py
@@ -48,24 +48,23 @@ def get_sample_indices(
     Assumes there has been a previous call to self.__get_sample_indices on the
    RandomForest.
    """
+    indices = np.arange(0, X_n_rows, dtype=np.int32)
     if sampling == "resampling":
         ret = (
             gen.choice(
-                np.arange(0, X_n_rows),
+                indices,
                 size=sampling_args["size"],
                 replace=sampling_args["replace"],
             ),
             None,
         )
     elif sampling == "honest_tree":
-        indices = np.arange(0, X_n_rows)
         gen.shuffle(indices)
         if sampling_args["replace"]:
             resample_size0 = sampling_args["size"]
             resample_size1 = sampling_args["size"]
         else:
-            resample_size0 = np.min(
-                [sampling_args["split"], sampling_args["size"]])
+            resample_size0 = np.min([sampling_args["split"], sampling_args["size"]])
             resample_size1 = np.min(
                 [X_n_rows - sampling_args["split"], sampling_args["size"]]
             )
@@ -75,19 +74,17 @@
             replace=sampling_args["replace"],
         )
         pred_indices = gen.choice(
-            indices[sampling_args["split"]:],
+            indices[sampling_args["split"] :],
             size=resample_size1,
             replace=sampling_args["replace"],
         )
         ret = (fit_indices, pred_indices)
     elif sampling == "honest_forest":
-        indices = np.arange(0, X_n_rows)
         if sampling_args["replace"]:
             resample_size0 = sampling_args["size"]
             resample_size1 = sampling_args["size"]
         else:
-            resample_size0 = np.min(
-                [sampling_args["split"], sampling_args["size"]])
+            resample_size0 = np.min([sampling_args["split"], sampling_args["size"]])
             resample_size1 = np.min(
                 [X_n_rows - sampling_args["split"], sampling_args["size"]]
             )
@@ -97,13 +94,13 @@
             replace=sampling_args["replace"],
         )
         pred_indices = gen.choice(
-            indices[sampling_args["split"]:],
+            indices[sampling_args["split"] :],
             size=resample_size1,
             replace=sampling_args["replace"],
         )
         ret = (fit_indices, pred_indices)
     else:
-        ret = (np.arange(0, X_n_rows), None)
+        ret = (indices, None)
 
     if sampling_args["OOB"]:
         # Only fitting indices
@@ -153,18 +150,11 @@ def build_single_tree(
         predictor=predictor,
         splitter=splitter,
     )
-    tree.fit(
-        X=X,
-        Y=Y,
-        sample_indices=fitting_indices,
-        sample_weight=sample_weight)
+    tree.fit(X=X, Y=Y, sample_indices=fitting_indices, sample_weight=sample_weight)
     if honest_tree:
         tree.refit_leaf_nodes(
-            X=X,
-            Y=Y,
-            sample_weight=sample_weight,
-            sample_indices=prediction_indices)
-
+            X=X, Y=Y, sample_weight=sample_weight, sample_indices=prediction_indices
+        )
     return tree
@@ -349,8 +339,7 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict:
         if "size" not in sampling_args:
             sampling_args["size"] = self.X_n_rows
         elif isinstance(sampling_args["size"], float):
-            sampling_args["size"] = int(
-                sampling_args["size"] * self.X_n_rows)
+            sampling_args["size"] = int(sampling_args["size"] * self.X_n_rows)
         elif not isinstance(sampling_args["size"], int):
             raise ValueError(
                 "The provided sampling_args['size'] is not an integer or float as required."
             )
@@ -366,11 +355,11 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict:
                 sampling_args["split"] = np.min(
                     [int(0.5 * self.X_n_rows), self.X_n_rows - 1]
                 )
-            elif isinstance(sampling_args["size"], float):
+            elif isinstance(sampling_args["split"], float):
                 sampling_args["split"] = np.min(
                     [int(sampling_args["split"] * self.X_n_rows), self.X_n_rows - 1]
                 )
-            elif not isinstance(sampling_args["size"], int):
+            elif not isinstance(sampling_args["split"], (int, np.integer)):
                 raise ValueError(
                     "The provided sampling_args['split'] is not an integer or float as required."
                 )
@@ -380,7 +369,7 @@ def __get_sampling_parameter(self, sampling_args: dict | None) -> dict:
                 sampling_args["size"] = int(
                     sampling_args["size"] * sampling_args["split"]
                 )
-            elif not isinstance(sampling_args["size"], int):
+            elif not isinstance(sampling_args["size"], (np.integer, int)):
                 raise ValueError(
                     "The provided sampling_args['size'] is not an integer or float as required."
                 )
@@ -421,7 +410,8 @@ def __build_trees(self) -> None:
             sampling=self.sampling,
         )
         self.fitting_indices, self.prediction_indices, self.out_of_bag_indices = zip(
-            *indices)
+            *indices
+        )
         self.trees = self.parallel.starmap(
             build_single_tree,
             map_input=zip(self.fitting_indices, self.prediction_indices),
@@ -444,8 +434,9 @@ def __build_trees(self) -> None:
             n_jobs=self.n_jobs_fit,
         )
 
-    def fit(self, X: ArrayLike, Y: ArrayLike,
-            sample_weight: ArrayLike | None = None) -> None:
+    def fit(
+        self, X: ArrayLike, Y: ArrayLike, sample_weight: ArrayLike | None = None
+    ) -> None:
         """
         Fit the random forest with training data (X, Y).
@@ -477,8 +468,7 @@ def fit(self, X: ArrayLike, Y: ArrayLike,
         self.X = shared_numpy_array(X)
         self.Y = shared_numpy_array(Y)
         self.X_n_rows, self.n_features = self.X.shape
-        self.max_features = self._check_max_features(
-            self.max_features, X.shape[0])
+        self.max_features = self._check_max_features(self.max_features, X.shape[0])
         self.sample_weight = self._check_sample_weight(sample_weight)
         self.sampling_args = self.__get_sampling_parameter(self.sampling_args)