Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1e9f6b4
Testpypi install in makefile
svbrodersen Jan 17, 2025
086d5cc
Small documentation updates
svbrodersen Feb 8, 2025
eef958e
Move from def nodes to cdef nodes
svbrodersen Feb 9, 2025
331bbe8
Work on Node optimisations
svbrodersen Feb 11, 2025
7008760
Merge branch 'main' into predict_optimisation
svbrodersen Feb 11, 2025
52384a7
changes to pickle
svbrodersen Feb 11, 2025
76e319c
Work on prediction
svbrodersen Feb 23, 2025
2eeae46
Fixed linting
svbrodersen Feb 23, 2025
d3ef499
Remove left over comment
svbrodersen Feb 25, 2025
07c384a
Changed to no parallel predict
svbrodersen Feb 25, 2025
e8bda8f
Changed to sequential naming
svbrodersen Feb 25, 2025
3b5d336
sequential true
svbrodersen Feb 25, 2025
2e53ea7
Euclidean norm criteria
svbrodersen Feb 25, 2025
052ce5f
Fixed linting
svbrodersen Feb 25, 2025
45eb650
Fixed linting
svbrodersen Feb 25, 2025
08f8a0f
Merge branch 'Development' into predict_optimisation
svbrodersen Feb 25, 2025
7306c39
Changes n_jobs parameter
svbrodersen Mar 4, 2025
8daaf29
Added n_jobs parameter on forest for fitting and predicting
svbrodersen Mar 7, 2025
c53fea3
Naming change to camelcase for criteria
svbrodersen Mar 7, 2025
67f08b2
Initial draft
svbrodersen Mar 7, 2025
6bc395d
New draft
svbrodersen Mar 7, 2025
b033157
draft, which is building now
svbrodersen Mar 7, 2025
85f8eb9
Merge pull request #123 from svbrodersen/predict_optimisation
svbrodersen Mar 8, 2025
f453b3e
Remove incorrect type hinting
svbrodersen Mar 8, 2025
68f05d4
Remove left over forest predict
svbrodersen Mar 8, 2025
7deb06c
Merge pull request #124 from svbrodersen/criteria_naming_update
svbrodersen Mar 8, 2025
77928e1
Work on SquaredDist criteria
svbrodersen Mar 10, 2025
488e6d2
Created test for SquaredDistance
svbrodersen Mar 10, 2025
67e7ad0
Merge branch 'Development' into EuclideanNorm
svbrodersen Mar 10, 2025
9086545
Fixed linting
svbrodersen Mar 10, 2025
6ae4e3e
Change SquaredDistance to MultiSquaredError
svbrodersen Mar 11, 2025
6f34332
Add checks back, when debugging
svbrodersen Mar 14, 2025
0535a65
Added equivalent test between MultiSquared and Pairwise
svbrodersen Mar 14, 2025
8a70359
Finished PairwiseEuclideanDistance
svbrodersen Mar 14, 2025
101d0a1
Added mkdocs serve to makefile, for easier documentation
svbrodersen Mar 14, 2025
6ad4c91
Added random seed to tests
svbrodersen Mar 14, 2025
372e96f
Merge pull request #125 from svbrodersen/EuclideanNorm
svbrodersen Mar 14, 2025
58f47a1
Version number update 1.5.0
svbrodersen Mar 14, 2025
eb6fbfe
Automated autopep8 fixes
NiklasPfister Mar 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,13 @@ clean:

lint:
cython-lint src/* --max-line-length=127

mkdocs_install:
pip install mkdocs mkdocs-material mkdocstrings 'mkdocstrings[python, cython]' mkdocs-autorefs pymdown-extensions

mkdocs: mkdocs_install
mkdocs serve


test_pypi:
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple adaXT
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

# Initialize and fit tree
tree = DecisionTree("Regression",
criteria=testCrit.Partial_linear,
criteria=testCrit.PartialLinear,
max_depth=3)
tree.fit(X, Y)

Expand Down
2 changes: 1 addition & 1 deletion docs/assets/examples/creating_custom_criteria/testCrit.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from adaXT.criteria cimport Criteria

cdef class Partial_linear(Criteria):
cdef class PartialLinear(Criteria):

# Custom mean function, such that we don't have to loop through twice.
cdef (double, double) custom_mean(self, int[::1] indices):
Expand Down
20 changes: 12 additions & 8 deletions docs/user_guide/creatingCriteria.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,15 @@ should be computed. To access the feature and response you can make use of
`self.y[indices]` are the feature and response samples for which the impurity
needs to be computed. With this in place you should be able to implement almost
any criteria function you can imagine. Keep in mind that the `impurity` method
is extremely often (approximately $n\log(n)$ times). Therefore you should invest
a bit of time in optimizing the function in order to avoid long fitting times.
is used often (approximately $n\log(n)$ times). Therefore you should invest a
bit of time in optimizing the function in order to avoid long fitting times.
Further computational speed-ups can be achieved by implementing
`proxy_improvement` and `update_proxy` methods in the criteria class. If these
are not explicitly defined the code defaults to using the `impurity` method.
Although we do not provide in-depth examples of those functionalities here, feel
free to look at
[criteria.pyx](https://github.com/NiklasPfister/adaXT/blob/main/src/adaXT/criteria/criteria.pyx)
where the default criteria make use of both.

Once you have finished defining your criteria class and saved the .pyx file, you
can compile the Cython code and use it as part of adaXT.
Expand Down Expand Up @@ -135,12 +139,12 @@ tree = DecisionTree("Regression", criteria=my_custom_critera.My_custom_criteria,
tree.fit(X, Y)
```

We now go over a detailed example in which we construct the `Partial_linear`
We now go over a detailed example in which we construct the `PartialLinear`
criteria.

## A detailed example: `Partial_linear`
## A detailed example: `PartialLinear`

The general idea of the `Partial_linear` criteria is to fit a linear function on
The general idea of the `PartialLinear` criteria is to fit a linear function on
the first feature with the $Y$ value as the response, that is,

$$
Expand All @@ -166,7 +170,7 @@ and start with the following lines:
```python
from adaXT.criteria cimport Criteria

cdef class Partial_linear(Criteria):
cdef class PartialLinear(Criteria):
```

### Calculating the mean
Expand Down Expand Up @@ -312,15 +316,15 @@ X = np.random.uniform(0, 100, (n, m))
Y = np.random.uniform(0, 10, n)

# Initialize and fit tree
tree = DecisionTree("Regression", testCrit.Partial_linear, max_depth=3)
tree = DecisionTree("Regression", testCrit.PartialLinear, max_depth=3)
tree.fit(X, Y)

# Plot the tree
plot_tree(tree)
plt.show()
```

This creates a regression tree with the newly created custom `Partial_linear`
This creates a regression tree with the newly created custom `PartialLinear`
criteria class, specifies the `max_depth` to be 3 and then plots the tree using
both the
[plot_tree](../api_docs/tree_utils.md#adaXT.decision_tree.tree_utils.plot_tree) based
Expand Down
10 changes: 5 additions & 5 deletions docs/user_guide/creatingPredictor.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ cdef class MyPredictorClass(Predictor):
# Define your own custom predict function

@staticmethod
def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, cnp.ndarray X_new,
def forest_predict(cnp.ndarray X_train, cnp.ndarray Y_train, cnp.ndarray X_pred,
trees: list[DecisionTree], parallel: ParallelModel,
**kwargs) -> np.ndarray:
# Define special handling for the RandomForest predict.
Expand Down Expand Up @@ -151,7 +151,7 @@ def predict_quantile(

cdef class PredictorQuantile(Predictor):
@staticmethod
def forest_predict(cnp.ndarray X_old, cnp.ndarray Y_old, cnp.ndarray X_new,
def forest_predict(cnp.ndarray X_train, cnp.ndarray Y_train, cnp.ndarray X_pred,
trees: list[DecisionTree], parallel: ParallelModel,
**kwargs) -> np.ndarray:
cdef:
Expand All @@ -162,9 +162,9 @@ cdef class PredictorQuantile(Predictor):
"quantile called without quantile passed as argument"
)
quantile = kwargs['quantile']
n_obs = X_new.shape[0]
n_obs = X_pred.shape[0]
prediction_indices = parallel.async_map(predict_quantile,
map_input=trees, X=X_new,
map_input=trees, X=X_pred,
n_obs=n_obs)
# In case the leaf nodes have multiple elements and not just one, we
# have to combine them together
Expand All @@ -175,7 +175,7 @@ cdef class PredictorQuantile(Predictor):
for j in range(n_trees):
indices_combined.extend(prediction_indices[j][i])
pred_indices_combined.append(indices_combined)
ret = np.quantile(Y_old[pred_indices_combined], quantile)
ret = np.quantile(Y_train[pred_indices_combined], quantile)
return np.array(ret, dtype=DOUBLE)
```

Expand Down
12 changes: 6 additions & 6 deletions docs/user_guide/decision_tree.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,14 @@ Below is a short example that illustrates how to use a classification tree.
```py
import numpy as np
from adaXT.decision_tree import DecisionTree
from adaXT.criteria import Gini_index
from adaXT.criteria import GiniIndex

X = np.array([[1, 1], [1, -1], [-1, -1], [-1, 1],
[1, 1], [1, -1], [-1, -1], [-1, 1]])
Xtest = np.array([[1, 1], [1, -1], [-1, -1], [-1, 1]])
Y = [0, 1, 0, 1, 0, 0, 1, 1]

tree = DecisionTree("Classification", criteria=Gini_index)
tree = DecisionTree("Classification", criteria=GiniIndex)
tree.fit(X, Y)
print(tree.predict(Xtest))
print(tree.predict(Xtest, predict_proba=True))
Expand All @@ -74,7 +74,7 @@ print(tree.predict(Xtest, predict_proba=True))
In this example we created and fit a classification tree using training data and
then used the fitted tree to predict the response at the training data. When
initializing the tree we changed the default criteria to the
[Gini Index](../api_docs/Criteria.md#adaXT.criteria.criteria.Gini_index); it is
[Gini Index](../api_docs/Criteria.md#adaXT.criteria.criteria.GiniIndex); it is
always possible to overwrite any of the default components of a specific tree
type. Classification trees use a majority vote in each of the leaf nodes to
decide which class to predict and ties are broken by selecting the smaller
Expand All @@ -96,7 +96,7 @@ the data.
For the `Regression` tree type, the following default components are used:

- Criteria class:
[Squared_error](../api_docs/Criteria.md#adaXT.criteria.criteria.Squared_error)
[SquaredError](../api_docs/Criteria.md#adaXT.criteria.criteria.SquaredError)
- Predict class:
[PredictRegression](../api_docs/Predictor.md#adaXT.predict.predict.PredictRegression)
- LeafBuilder class:
Expand Down Expand Up @@ -124,7 +124,7 @@ print(tree.predict(Xnew))
For the `Quantile` tree type, the following default components are used:

- Criteria class:
[Squared_error](../api_docs/Criteria.md#adaXT.criteria.criteria.Squared_error)
[SquaredError](../api_docs/Criteria.md#adaXT.criteria.criteria.SquaredError)
- Predict class:
[PredictorQuantile](../api_docs/Predictor.md#adaXT.predictor.predictor.PredictorQuantile)
- LeafBuilder class:
Expand Down Expand Up @@ -159,7 +159,7 @@ prediction and it is possible to predict several quantiles simultaneously.
For the `Gradient` tree type, the following default components are used:

- Criteria class:
[Partial_quadratic](../api_docs/Criteria.md#adaXT.criteria.criteria.Partial_quadratic)
[PartialQuadratic](../api_docs/Criteria.md#adaXT.criteria.criteria.PartialQuadratic)
- Predict class:
[PredictLocalPolynomial](../api_docs/Predictor.md#adaXT.predict.predict.PredictLocalPolynomial)
- LeafBuilder class:
Expand Down
8 changes: 4 additions & 4 deletions docs/user_guide/random_forest.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ example below.
import numpy as np
import matplotlib.pyplot as plt
from adaXT.random_forest import RandomForest
from adaXT.criteria import Partial_linear
from adaXT.criteria import PartialLinear
from adaXT.leaf_builder import LeafBuilderPartialLinear
from adaXT.predictor import PredictorLocalPolynomial

Expand All @@ -28,7 +28,7 @@ Xtest = np.linspace(-1, 1, 50).reshape(-1, 1)
# Fit a regular regression forest and a regression forest with linear splits
rf = RandomForest("Regression", min_samples_leaf=30)
rf_lin = RandomForest("Regression",
criteria=Partial_linear,
criteria=PartialLinear,
leaf_builder=LeafBuilderPartialLinear,
predictor=PredictorLocalPolynomial,
min_samples_leaf=30)
Expand All @@ -44,8 +44,8 @@ plt.show()
```

In this example, we fit a regular regression forest (which uses the
[Squared_error](../api_docs/Criteria.md)) and a regression forest that uses the
[Partial_linear](../api_docs/Criteria.md) splitting criteria and predicts a
[SquaredError](../api_docs/Criteria.md)) and a regression forest that uses the
[PartialLinear](../api_docs/Criteria.md) splitting criteria and predicts a
linear function in each leaf. As can be seen when running this example, the
forest with the linear splits is able to produce a better fit when both forests
are grown similarly deep.
Expand Down
6 changes: 3 additions & 3 deletions docs/user_guide/scikit_learn.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ there is the initial setup:

```python
from adaXT.decision_tree import DecisionTree
from adaXT.criteria import Gini_index, Entropy
from adaXT.criteria import GiniIndex, Entropy
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
Expand All @@ -36,14 +36,14 @@ param_grid = {
"min_samples_split": [2, 5, 10],
}

param_grid_ada = param_grid | {"criteria": [Gini_index, Entropy]}
param_grid_ada = param_grid | {"criteria": [GiniIndex, Entropy]}
param_grid_sk = param_grid | {"criterion": ["gini", "entropy"]}
```
Here, we import the necessary components and setup the parameter grids of the
two decision trees. One small difference to be aware of is that the parameter names
and format are different in some cases, e.g., in sklearn it is called criterion and
takes a string as input, while in adaXT it is called criteria and takes a criteria class
such as Gini_index, Entropy or perhaps your own [implementation](creatingCriteria.md).
such as GiniIndex, Entropy or perhaps your own [implementation](creatingCriteria.md).
Next, we define and fit the GridSearchCV instance.

```python
Expand Down
52 changes: 34 additions & 18 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os

NAME = "adaXT"
VERSION = "1.4.0"
VERSION = "1.5.0"
DESCRIPTION = "A Python package for tree-based regression and classification"
PROJECT_URLS = {
"Documentation": "https://NiklasPfister.github.io/adaXT/",
Expand All @@ -29,6 +29,7 @@
DEBUG = False

PROFILE = False
ANNOTATE = False

# Make all pyx files for the decision_tree
ext = ".pyx" if USE_CYTHON else ".cpp"
Expand Down Expand Up @@ -99,29 +100,44 @@ def run_build():
from Cython.Compiler.Options import get_directive_defaults

compiler_directives = get_directive_defaults()
compiler_directives.update(
{
"boundscheck": False,
"wraparound": False,
"cdivision": True,
"initializedcheck": False,
"nonecheck": False,
}
)

if PROFILE:
compiler_directives["profile"] = True
compiler_directives["linetrace"] = True
compiler_directives["binding"] = True

extensions = cythonize(
extensions,
gdb_debug=False,
annotate=True,
language_level="3",
compiler_directives=compiler_directives,
verbose=True,
)
arg_dir = {
"gdb_debug": False,
"language_level": "3",
"compiler_directives": compiler_directives,
"verbose": True,
}

if ANNOTATE:
arg_dir["annotate"] = True

if DEBUG:
compiler_directives.update(
{
"boundscheck": True,
"wraparound": True,
"cdivision": False,
"initializedcheck": True,
"nonecheck": True,
}
)
else:
compiler_directives.update(
{
"boundscheck": False,
"wraparound": False,
"cdivision": True,
"initializedcheck": False,
"nonecheck": False,
}
)

extensions = cythonize(extensions, **arg_dir)
setup(
name=NAME,
version=VERSION,
Expand Down
10 changes: 6 additions & 4 deletions src/adaXT/base_model.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from numpy import float64 as DOUBLE
from .predictor import Predictor
from .criteria import Criteria
from .criteria.criteria import Entropy, Squared_error, Partial_quadratic
from .criteria.criteria import Entropy, SquaredError, PartialQuadratic, MultiSquaredError
from .decision_tree.splitter import Splitter
from .leaf_builder import LeafBuilder

Expand Down Expand Up @@ -131,9 +131,11 @@ class BaseModel():
tree_types = {
"Classification": [Entropy, PredictorClassification,
LeafBuilderClassification],
"Regression": [Squared_error, PredictorRegression, LeafBuilderRegression],
"Gradient": [Partial_quadratic, PredictorLocalPolynomial, LeafBuilderPartialQuadratic],
"Quantile": [Squared_error, PredictorQuantile, LeafBuilderRegression]
"Regression": [SquaredError, PredictorRegression, LeafBuilderRegression],
"Gradient": [PartialQuadratic, PredictorLocalPolynomial, LeafBuilderPartialQuadratic],
"Quantile": [SquaredError, PredictorQuantile, LeafBuilderRegression],
"MultiRegression": [MultiSquaredError, PredictorRegression,
LeafBuilderRegression]
}
if tree_type in tree_types.keys():
# Set the defaults
Expand Down
18 changes: 11 additions & 7 deletions src/adaXT/criteria/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from .criteria cimport (
Criteria,
Gini_index,
Entropy,
Squared_error,
Partial_linear,
Partial_quadratic
)
ClassificationCriteria,
RegressionCriteria,
Criteria,
GiniIndex,
Entropy,
SquaredError,
PartialLinear,
PartialQuadratic,
MultiSquaredError,
PairwiseEuclideanDistance
)
12 changes: 8 additions & 4 deletions src/adaXT/criteria/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from .criteria import (
Gini_index,
Squared_error,
ClassificationCriteria,
RegressionCriteria,
GiniIndex,
SquaredError,
Entropy,
Partial_linear,
Partial_quadratic,
PartialLinear,
PartialQuadratic,
Criteria,
MultiSquaredError,
PairwiseEuclideanDistance,
)
Loading