From eee2ea61a0246f631da725ac987fda0a918827a2 Mon Sep 17 00:00:00 2001 From: Boris Tseytlin Date: Sun, 30 Jan 2022 16:29:51 +0300 Subject: [PATCH 1/3] Remove old version of algorithm, add new recursive version --- benchmark.py | 58 ++-- .../ClassRegressor-new_mode_tests.ipynb | 0 .../ClassRegressor.ipynb | 0 .../housing_dataset_EDA_bench.ipynb | 2 +- regression_classifier/__init__.py | 3 +- regression_classifier/class_regressor.py | 162 ---------- regression_classifier/ensemble.py | 186 ------------ .../recursive_class_regressor.py | 185 ++++++++++++ tests/test_class_regressor.py | 131 -------- tests/test_ensemble.py | 70 ----- tests/test_ensemble_onelevel.py | 117 -------- tests/test_recursive_class_regressor.py | 284 ++++++++++++++++++ 12 files changed, 502 insertions(+), 696 deletions(-) rename ClassRegressor-new_mode_tests.ipynb => notebooks/ClassRegressor-new_mode_tests.ipynb (100%) rename ClassRegressor.ipynb => notebooks/ClassRegressor.ipynb (100%) rename housing_dataset_EDA_bench.ipynb => notebooks/housing_dataset_EDA_bench.ipynb (99%) delete mode 100644 regression_classifier/class_regressor.py delete mode 100644 regression_classifier/ensemble.py create mode 100644 regression_classifier/recursive_class_regressor.py delete mode 100644 tests/test_class_regressor.py delete mode 100644 tests/test_ensemble.py delete mode 100644 tests/test_ensemble_onelevel.py create mode 100644 tests/test_recursive_class_regressor.py diff --git a/benchmark.py b/benchmark.py index bacf979..51a89ed 100644 --- a/benchmark.py +++ b/benchmark.py @@ -1,3 +1,5 @@ +import json + import numpy as np import pandas as pd import scipy.stats @@ -10,9 +12,10 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression, ElasticNet +from sklearn.ensemble import GradientBoostingRegressor from lightgbm import LGBMRegressor -from regression_classifier import ClassRegressorEnsemble, ClassRegressorOnelevelEnsemble +from regression_classifier import RecursiveClassRegressor def load_dataframe(): @@ -22,7 +25,7 @@ def load_dataframe(): return df -def run_benchmark(train_X, test_X, train_Y, test_Y, model, hparam_space, search_n_iter=30): +def run_benchmark(train_X, test_X, train_Y, test_Y, model, hparam_space, search_n_iter=50): search = RandomizedSearchCV(model, cv=KFold(n_splits=4), param_distributions=hparam_space, @@ -40,7 +43,7 @@ def run_benchmark(train_X, test_X, train_Y, test_Y, model, hparam_space, search_ 'score': mae, } print(benchmark_result) - return benchmark_result + return search, benchmark_result def run_benchmarks(): @@ -53,17 +56,17 @@ def run_benchmarks(): Pipeline([ ('inputer', SimpleImputer()), ('scaler', StandardScaler()), - ('model', ClassRegressorEnsemble()), + ('model', RecursiveClassRegressor()), ]), Pipeline([ ('inputer', SimpleImputer()), ('scaler', StandardScaler()), - ('model', ClassRegressorOnelevelEnsemble()), + ('model', ElasticNet()), ]), Pipeline([ ('inputer', SimpleImputer()), ('scaler', StandardScaler()), - ('model', ElasticNet()), + ('model', GradientBoostingRegressor()), ]), Pipeline([ ('inputer', SimpleImputer()), @@ -73,37 +76,38 @@ def run_benchmarks(): ] hparam_spaces = [ - { - 'model__n_bins': [2, 5], - 'model__n_levels': [2, 5, 10, 30], - 'model__bin_calc_method': ['equal', 'percentile'], + { # RecursiveClassRegressor + 'model__n_bins': [2, 3, 5], + 'model__n_splits': [2, 3, 5, 10], + 'model__bins_calc_method': ['equal', 'percentile'], 'model__leaf_size': [10, 50, 100], - 'model__leaf_model_cls': [DummyRegressor, LinearRegression], + 'model__leaf_model_cls_name': ['DummyRegressor', 'LinearRegression'], }, - { - 'model__n_bins': [10, 20, 30], - 'model__bin_calc_method': ['equal', 'percentile'], - 'model__leaf_model_cls': [DummyRegressor, LinearRegression, None], - }, - { + { # ElasticNet 'model__alpha': scipy.stats.norm(0.5, 1), 'model__l1_ratio': scipy.stats.norm(0.5, 0.15), }, - { - 'model__max_depth': np.arange(-1, 20, 2), - 'model__subsample': np.arange(0.2, 1.2, 0.2), - 'model__n_estimators': np.arange(10, 310, 40), - }, - + # { # GradientBoostingRegressor + # 'model__max_depth': np.arange(-1, 20, 2), + # 'model__subsample': np.arange(0.2, 1.2, 0.2), + # 'model__n_estimators': np.arange(10, 310, 40), + # }, + # { # LGBMRegressor + # 'model__max_depth': np.arange(-1, 20, 2), + # 'model__subsample': np.arange(0.2, 1.2, 0.2), + # 'model__n_estimators': np.arange(10, 310, 40), + # }, ] + searches = {} results = {} for model, hparam_space in tqdm(zip(pipelines, hparam_spaces), total=len(pipelines)): - results[model.named_steps.model.__class__.__name__] = run_benchmark(train_X, test_X, train_Y, test_Y, model, hparam_space) + model_name = model.named_steps.model.__class__.__name__ + searches[model_name], results[model_name] = run_benchmark(train_X, test_X, train_Y, test_Y, model, hparam_space) - return results + return searches, results if __name__ == '__main__': - results = run_benchmarks() - print(results) + searches, results = run_benchmarks() + print(json.dumps(results, sort_keys=True, indent=4, default=str)) diff --git a/ClassRegressor-new_mode_tests.ipynb b/notebooks/ClassRegressor-new_mode_tests.ipynb similarity index 100% rename from ClassRegressor-new_mode_tests.ipynb rename to notebooks/ClassRegressor-new_mode_tests.ipynb diff --git a/ClassRegressor.ipynb b/notebooks/ClassRegressor.ipynb similarity index 100% rename from ClassRegressor.ipynb rename to notebooks/ClassRegressor.ipynb diff --git a/housing_dataset_EDA_bench.ipynb b/notebooks/housing_dataset_EDA_bench.ipynb similarity index 99% rename from housing_dataset_EDA_bench.ipynb rename to notebooks/housing_dataset_EDA_bench.ipynb index 109b6b9..f98bdf1 100644 --- a/housing_dataset_EDA_bench.ipynb +++ b/notebooks/housing_dataset_EDA_bench.ipynb @@ -504,4 +504,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/regression_classifier/__init__.py b/regression_classifier/__init__.py index dd13b71..ddb88c8 100644 --- a/regression_classifier/__init__.py +++ b/regression_classifier/__init__.py @@ -1,3 +1,2 @@ -from .class_regressor import ClassRegressor, ClassRegressorOnelevel -from .ensemble import ClassRegressorEnsemble, ClassRegressorOnelevelEnsemble +from .recursive_class_regressor import ClassRegressorSplit, ClassRegressorTree, RecursiveClassRegressor from .utils import * diff --git a/regression_classifier/class_regressor.py b/regression_classifier/class_regressor.py deleted file mode 100644 index c96ea9b..0000000 --- a/regression_classifier/class_regressor.py +++ /dev/null @@ -1,162 +0,0 @@ -import numpy as np -import pandas as pd -from sklearn.dummy import DummyClassifier, DummyRegressor - -from sklearn.linear_model import LogisticRegression - -from .utils import bins_calc - - -class ClassRegressor(): - """Модель, обучающая классификатор по заданным границам таргета""" - def __init__(self, n_bins=2, bins_calc_method='equal', leaf_model_cls=DummyRegressor): - """ - Инициализация - n_bins - количество бинов, на которые делятся данные на каждом уровне - bins_calc_method - метод разделения таргет-переменной на бины ('equal', 'percentile') - leaf_model_cls - модель регрессии на листовых бинах - """ - self.n_bins = n_bins - self.bins_calc_method = bins_calc_method - self.leaf_model_cls = leaf_model_cls - - self.bin_borders = None - self.leaf_model_ex = {} - - def set_params(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - def fit(self, X, y): - """ - Обучение модели - X - таблица с входными данными - y - столбец с таргет-переменной - """ - - if isinstance(X, pd.DataFrame): - X = X.values - if isinstance(y, pd.Series): - y = y.values - - X = np.array(X) - y = np.array(y) - - bin_edges = bins_calc(y, n_bins=self.n_bins, method=self.bins_calc_method) - self.bin_borders = np.zeros((len(bin_edges) - 1, 2)) - - for i in range(len(bin_edges) - 1): - self.bin_borders[i] = np.array([bin_edges[i], bin_edges[i+1]]) - - self.y_classes = pd.cut(y, bins=bin_edges, labels=False, include_lowest=True) - for label, _ in enumerate(self.bin_borders): - bin_y = y[self.y_classes == label] - if len(bin_y) == 0: - continue - bin_X = X[self.y_classes == label] - self.leaf_model_ex[label] = self.leaf_model_cls() - self.leaf_model_ex[label].fit(bin_X, bin_y) - - if X.shape[1] > X.shape[0]: - self.model = DummyClassifier(strategy='most_frequent') - else: - self.model = LogisticRegression(n_jobs=1) - self.model.fit(X, self.y_classes) - - return self - - def predict(self, X, regression=False): - """ - Предиктор - X - таблица с входными данными - """ - - if isinstance(X, pd.DataFrame): - X = X.values - X = np.array(X) - - pred = self.model.predict(X) - if regression: - preds = np.zeros((len(X),)) - for pred_class in np.unique(pred): - idx = np.array(range(len(X)))[pred==pred_class] - class_X = X[idx] - preds[idx] = self.leaf_model_ex[pred_class].predict(class_X) - return preds - return pred - - -class ClassRegressorOnelevel(): - """Модель, обучающая бинарный классификатор по заданной границе таргета""" - - def __init__(self, bin_edges, leaf_model_cls=None): - """ - Инициализация - bin_edges - граница для деления данных на 2 бина - leaf_model_cls - модель регрессии на листовых бинах - """ - self.bin_edges = bin_edges - self.leaf_model_cls = leaf_model_cls - - self.bin_borders = {} - self.bin_predictions = np.zeros((2, )) - self.leaf_model_ex = {} - - def fit(self, X, y): - """ - Обучение модели - X - таблица с входными данными - y - столбец с таргет-переменной - """ - - if isinstance(X, pd.DataFrame): - X = X.values - if isinstance(y, pd.Series): - y = y.values - - X = np.array(X) - y = np.array(y) - - for i in range(len(self.bin_edges) - 1): - self.bin_borders[i] = np.array([self.bin_edges[i], self.bin_edges[i+1]]) - - self.y_classes = np.digitize(y, self.bin_edges, right=True) - 1 - - if not self.leaf_model_cls: - self.bin_predictions[0] = self.bin_edges[1] - self.bin_predictions[1] = self.bin_edges[1] - else: - for label in [0, 1]: - bin_y = y[self.y_classes == label] - bin_X = X[self.y_classes == label] - self.leaf_model_ex[label] = self.leaf_model_cls() - self.leaf_model_ex[label].fit(bin_X, bin_y) - - if X.shape[1] > X.shape[0]: - self.model = DummyClassifier(strategy='most_frequent') - else: - self.model = LogisticRegression(n_jobs=1) - - self.model.fit(X, self.y_classes) - - return self - - def predict(self, X, regression=False): - """ - Предиктор - X - таблица с входными данными - """ - - if isinstance(X, pd.DataFrame): - X = X.values - X = np.array(X) - - pred = self.model.predict(X) - - if regression: - if not self.leaf_model_cls: - pred = self.bin_predictions[pred] - else: - pred = [self.leaf_model_ex[p].predict(X[i].reshape(1, -1)) for i, p in enumerate(pred)] - - return pred diff --git a/regression_classifier/ensemble.py b/regression_classifier/ensemble.py deleted file mode 100644 index 349271c..0000000 --- a/regression_classifier/ensemble.py +++ /dev/null @@ -1,186 +0,0 @@ -import numpy as np -import pandas as pd -from sklearn import metrics -from sklearn.dummy import DummyRegressor - -from .class_regressor import ClassRegressor, ClassRegressorOnelevel - -from .utils import bins_calc - - -class ClassRegressorEnsemble(): - """Комплексная модель с ансамблем одноуровневых моделей классификации""" - - def __init__(self, n_bins=2, n_levels=2, bins_calc_method='equal', leaf_size=1, leaf_model_cls=DummyRegressor): - """ - Инициализация - n_bins - количество бинов, на которые делятся данные на каждом уровне - n_levels - количество уровней деления - bins_calc_method - метод разделения таргет-переменной на бины ('equal', 'percentile') - leaf_size - минимальный размер листового (неделимого) бина - leaf_model_cls - модель регрессора для предсказаний на листовых бинах - """ - self.n_bins = n_bins - self.n_levels = n_levels - self.bins_calc_method = bins_calc_method - self.leaf_size = leaf_size - self.leaf_model_cls = leaf_model_cls - - self.models = {} - - def set_params(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - def _fit_recur(self, X, y, level, bin_index): - - bin_index_tuple = tuple(bin_index) - - y_uniq = len(np.unique(y)) - - if (level >= self.n_levels) or (len(y) < self.leaf_size) or (y_uniq < self.n_bins) or (y_uniq < 2): - return - - model = ClassRegressor(n_bins=self.n_bins, bins_calc_method=self.bins_calc_method, leaf_model_cls=self.leaf_model_cls) - model.fit(X, y) - self.models[(level, bin_index_tuple)] = model - - for i, bin_border in enumerate(model.bin_borders): - if i > 0: - bin_idx = (y > bin_border[0]) & (y <= bin_border[1]) - else: - bin_idx = (y >= bin_border[0]) & (y <= bin_border[1]) - - X_subset, y_subset = X[bin_idx], y[bin_idx] - if len(y_subset) == 0: - continue - - self._fit_recur( - X_subset, - y_subset, - level=level+1, - bin_index=bin_index_tuple + (i,), - ) - - def fit(self, X, y): - """ - Обучение модели - X - таблица с входными данными - y - столбец с таргет-переменной - """ - - if isinstance(X, pd.DataFrame): - X = X.values - if isinstance(y, pd.Series): - y = y.values - - X = np.array(X) - y = np.array(y) - - self._fit_recur(X, y, 0, [0]) - - def predict(self, X): - if isinstance(X, pd.DataFrame): - X = X.values - X = np.array(X) - - pred = np.empty((X.shape[0], )) - for i, x in enumerate(X): - cur_level = 0 - cur_bin = tuple([0]) - clf = None - - while cur_level <= self.n_levels: - if (cur_level, cur_bin) in self.models: - clf = self.models[(cur_level, cur_bin)] - predicted_class = clf.predict([x])[0] - cur_level += 1 - cur_bin += (predicted_class,) - else: - pred[i] = clf.predict([x], regression=True)[0] - break - - return pred - - -class ClassRegressorOnelevelEnsemble(): - """Комплексная модель, состоящая из ансамбля бинарных моделей классификации с переменной границей между классами""" - - def __init__(self, n_bins=100, bins_calc_method='equal', leaf_model_cls=None): - """ - Инициализация - n_bins - количество вариантов деления даанных на два бина - bins_calc_method - метод разделения таргет-переменной на бины ('equal', 'percentile') - leaf_model_cls - модель регрессора для предсказаний на листовых бинах - """ - self.n_bins = n_bins - self.bins_calc_method = bins_calc_method - self.leaf_model_cls = leaf_model_cls - - self.bin_edges = {} - self.models = {} - - def set_params(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - def fit(self, X, y): - """ - Обучение модели - X - таблица с входными данными - y - столбец с таргет-переменной - """ - - if isinstance(X, pd.DataFrame): - X = X.values - if isinstance(y, pd.Series): - y = y.values - - self.bin_edges = bins_calc(y, n_bins=self.n_bins, method=self.bins_calc_method) - self.bin_edges[0] = self.bin_edges[0] - 1e-10 - - for bin_i, bin_border in enumerate(self.bin_edges[1:-1]): - bin_edges = np.array([self.bin_edges[0], bin_border, self.bin_edges[-1]]) - - model = ClassRegressorOnelevel(bin_edges=bin_edges, leaf_model_cls=self.leaf_model_cls) - model.fit(X, y) - self.models[bin_i+1] = model - - def predict(self, X): - if isinstance(X, pd.DataFrame): - X = X.values - X = np.array(X) - - pred = np.empty((X.shape[0], )) - - for i, x in enumerate(X): - start_bin = int(self.n_bins / 2) - - clf = self.models[start_bin] - start_class = clf.predict([x])[0] - - if start_class == 0: - bins_range = list(range(start_bin, 0, -1)) - elif start_class == 1: - bins_range = list(range(start_bin, len(self.bin_edges)-1, 1)) - else: - raise Exception('Bin error') - - prev_class = start_class - cur_class = prev_class - prev_clf = clf - for cur_bin in bins_range[1:]: - clf = self.models[cur_bin] - cur_class = clf.predict([x])[0] - - if cur_class != prev_class: - break - prev_class = cur_class - prev_clf = clf - - if cur_class != prev_class: - pred[i] = np.mean([clf.predict([x], regression=True)[0], prev_clf.predict([x], regression=True)[0]]) - else: - pred[i] = clf.predict([x], regression=True)[0] - - return pred diff --git a/regression_classifier/recursive_class_regressor.py b/regression_classifier/recursive_class_regressor.py new file mode 100644 index 0000000..bf93bc0 --- /dev/null +++ b/regression_classifier/recursive_class_regressor.py @@ -0,0 +1,185 @@ +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin +from sklearn.dummy import DummyClassifier, DummyRegressor + +from sklearn.linear_model import LogisticRegression, LinearRegression +from .utils import bins_calc + + +class ClassRegressorSplit(BaseEstimator, ClassifierMixin): + def __init__(self, n_bins=2, bins_calc_method='equal'): + self.n_bins = n_bins + self.bins_calc_method = bins_calc_method + + self.bin_borders = None + self.bin_idx = None + + def fit(self, X, y): + if isinstance(X, pd.DataFrame): + X = X.values + if isinstance(y, pd.Series): + y = y.values + + X = np.array(X) + y = np.array(y) + + bin_edges = bins_calc(y, n_bins=self.n_bins, method=self.bins_calc_method) + self.y_classes = pd.cut(y, bins=bin_edges, labels=False, include_lowest=True) + + self.bin_borders = np.zeros((len(bin_edges) - 1, 2)) + for i in range(len(bin_edges) - 1): + self.bin_borders[i] = np.array([bin_edges[i], bin_edges[i+1]]) + + + if X.shape[1] > X.shape[0]: + self.model = DummyClassifier(strategy='most_frequent') + else: + self.model = LogisticRegression(n_jobs=1) + self.model.fit(X, self.y_classes) + return self + + def predict_proba(self, X): + if isinstance(X, pd.DataFrame): + X = X.values + X = np.array(X) + + return self.model.predict_proba(X) + + def predict(self, X): + if isinstance(X, pd.DataFrame): + X = X.values + X = np.array(X) + + return self.model.predict(X) + + +class ClassRegressorTree(BaseEstimator, RegressorMixin): + def __init__(self, + n_bins=2, + n_splits=2, + bins_calc_method='equal', + leaf_model_cls_name='DummyRegressor', + leaf_model_kwargs={}, + leaf_size=1, + level=0): + self.n_bins = n_bins + self.n_splits = n_splits + self.bins_calc_method = bins_calc_method + self.leaf_model_cls_name = leaf_model_cls_name + self.leaf_model_kwargs = leaf_model_kwargs + self.leaf_size = leaf_size + + self.level = level + self.split = None + self.child_models = {} + self.bin_idx = None + + @property + def leaf_model_cls(self): + str_to_cls = { + 'DummyRegressor': DummyRegressor, + 'LinearRegression': LinearRegression, + } + return str_to_cls[self.leaf_model_cls_name] + + def get_child_model(self, X, y): + y_uniq = len(np.unique(y)) + if (self.level >= self.n_splits) or (len(y) < self.leaf_size) or (y_uniq < self.n_bins) or (y_uniq < 2): + return self.leaf_model_cls(**(self.leaf_model_kwargs)) + else: + return ClassRegressorTree( + level=self.level+1, + n_bins=self.n_bins, + n_splits=self.n_splits, + bins_calc_method=self.bins_calc_method, + leaf_model_cls_name=self.leaf_model_cls_name, + leaf_size=self.leaf_size, + ) + + def fit(self, X, y): + if isinstance(X, pd.DataFrame): + X = X.values + if isinstance(y, pd.Series): + y = y.values + + X = np.array(X) + y = np.array(y) + + split_model = ClassRegressorSplit( + n_bins=self.n_bins, + bins_calc_method=self.bins_calc_method, + ) + split_model.fit(X, y) + self.split = split_model + + for i, bin_border in enumerate(self.split.bin_borders): + if i > 0: + bin_idx = (y > bin_border[0]) & (y <= bin_border[1]) + else: + bin_idx = (y >= bin_border[0]) & (y <= bin_border[1]) + + X_subset, y_subset = X[bin_idx], y[bin_idx] + if len(y_subset) == 0: + continue + + child_model = self.get_child_model(X_subset, y_subset) + child_model.fit(X_subset, y_subset) + self.child_models[i] = child_model + + def predict(self, X, classification=False): + + proba = self.split.predict_proba(X) + pred = np.argmax(proba, axis=1) + if classification: + return pred, proba + + preds = np.zeros((len(X),)) + for bin_i, child_model in self.child_models.items(): + child_prediction = child_model.predict(X) + preds += proba[:, bin_i] * child_prediction + return preds + + +class RecursiveClassRegressor(BaseEstimator, RegressorMixin): + def __init__(self, + n_bins=2, + n_splits=2, + bins_calc_method='equal', + leaf_model_cls_name='DummyRegressor', # Have to make it a string or sklearn hparam opt doesnt work + leaf_model_kwargs={}, + leaf_size=1, + ): + self.n_bins = n_bins + self.n_splits = n_splits + self.bins_calc_method = bins_calc_method + self.leaf_model_cls_name = leaf_model_cls_name + self.leaf_model_kwargs = leaf_model_kwargs + self.leaf_size = leaf_size + + self.tree = ClassRegressorTree( + n_bins=self.n_bins, + n_splits=self.n_splits, + bins_calc_method=self.bins_calc_method, + leaf_model_cls_name=self.leaf_model_cls_name, + leaf_model_kwargs=self.leaf_model_kwargs, + leaf_size=self.leaf_size, + ) + + def fit(self, X, y): + if isinstance(X, pd.DataFrame): + X = X.values + if isinstance(y, pd.Series): + y = y.values + + X = np.array(X) + y = np.array(y) + + self.tree.fit(X, y) + + def predict(self, X): + if isinstance(X, pd.DataFrame): + X = X.values + X = np.array(X) + + return self.tree.predict(X) diff --git a/tests/test_class_regressor.py b/tests/test_class_regressor.py deleted file mode 100644 index dbb395e..0000000 --- a/tests/test_class_regressor.py +++ /dev/null @@ -1,131 +0,0 @@ -import numpy as np -import pytest -from sklearn.dummy import DummyRegressor -from sklearn.linear_model import LinearRegression -from sklearn.metrics import mean_absolute_error, mean_squared_error - -from regression_classifier import ClassRegressor - - -class TestClassRegressor: - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_fit_two_bins(self, bins_calc_method): - clf = ClassRegressor(n_bins=2, bins_calc_method=bins_calc_method) - - X = [[1], [2]] - y = [1, 2] - - clf.fit(X, y) - - assert clf.bin_borders.shape == (2, 2) - assert clf.y_classes.tolist() == [0, 1] - - assert clf.predict(X).tolist() == [0, 1] - assert clf.predict(X, regression=True).tolist() == [1, 2] - - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_fit_three_bins(self, bins_calc_method): - clf = ClassRegressor(n_bins=3, bins_calc_method=bins_calc_method) - - X = [[1], [2], [3]] - y = [1, 2, 3] - - clf.fit(X, y) - - assert clf.bin_borders.shape == (3, 2) - assert clf.y_classes.tolist() == [0, 1, 2] - - assert clf.predict(X).tolist() == [0, 1, 2] - assert clf.predict(X, regression=True).tolist() == [1, 2, 3] - - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_better_than_dummy(self, airbnb_split, bins_calc_method): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - clf = ClassRegressor(n_bins=2, bins_calc_method=bins_calc_method) - clf.fit(X_train_scaled, y_train) - - pred_train = clf.predict(X_train_scaled, regression=True) - pred_test = clf.predict(X_test_scaled, regression=True) - train_mae = mean_absolute_error(y_train, pred_train) - test_mae = mean_absolute_error(y_test, pred_test) - - dummy_regr = DummyRegressor(strategy="mean") - dummy_regr.fit(X_train_scaled, y_train) - - dummy_pred_train = dummy_regr.predict(X_train_scaled) - dummy_pred_test = dummy_regr.predict(X_test_scaled) - dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) - dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) - - assert train_mae <= dummy_train_mae - assert test_mae <= dummy_test_mae - - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_classes_are_classes(self, airbnb_split, bins_calc_method): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - N_BINS = 10 - clf = ClassRegressor(n_bins=N_BINS, bins_calc_method=bins_calc_method) - clf.fit(X_train_scaled, y_train) - - classes_list = clf.y_classes.tolist() - assert min(classes_list) == 0 - assert max(classes_list) == N_BINS-1 - assert classes_list == [int(classes_list) for classes_list in classes_list] - - pred_test_classes = clf.predict(X_test_scaled) - pred_classes_list = np.unique(pred_test_classes).tolist() - assert min(pred_classes_list) >= 0 - assert max(pred_classes_list) <= N_BINS-1 - assert pred_classes_list == [int(pred_classes_list) for pred_classes_list in pred_classes_list] - - @pytest.mark.parametrize("bins_calc_method, output", [('equal', [[1.0, 5.0], [5.0, 9.0]]), - ('percentile', [[1.0, 2.5], [2.5, 9.0]])]) - def test_bins_equal(self, bins_calc_method, output): - clf = ClassRegressor(n_bins=2, bins_calc_method=bins_calc_method) - - X = [[1], [2], [3], [9]] - y = [1, 2, 3, 9] - - clf.fit(X, y) - - assert clf.bin_borders.tolist() == output - - def test_linreg_is_better_than_none(self): - X = np.array(list(range(100))).reshape(-1, 1).tolist() - y = list(range(100)) - - clf = ClassRegressor(n_bins=5) - clf.fit(X, y) - - pred_train = clf.predict(X, regression=True) - train_mae = mean_absolute_error(y, pred_train) - - clf_linreg = ClassRegressor(n_bins=5, leaf_model_cls=LinearRegression) - clf_linreg.fit(X, y) - - pred_train_linreg = clf_linreg.predict(X, regression=True) - train_mae_linreg = mean_absolute_error(y, pred_train_linreg) - - assert train_mae_linreg < train_mae - - def test_perc_better_than_equal(self, airbnb_split): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - - clf_eq = ClassRegressor(n_bins=2, bins_calc_method='equal') - clf_eq.fit(X_train_scaled, y_train) - - pred_train_eq = clf_eq.predict(X_train_scaled, regression=True) - pred_test_eq = clf_eq.predict(X_test_scaled, regression=True) - train_mse_eq = mean_squared_error(y_train, pred_train_eq) - test_mse_eq = mean_squared_error(y_test, pred_test_eq) - - clf_perc = ClassRegressor(n_bins=2, bins_calc_method='percentile') - clf_perc.fit(X_train_scaled, y_train) - - pred_train_perc = clf_perc.predict(X_train_scaled, regression=True) - pred_test_perc = clf_perc.predict(X_test_scaled, regression=True) - train_mse_perc = mean_squared_error(y_train, pred_train_perc) - test_mse_perc = mean_squared_error(y_test, pred_test_perc) - - assert train_mse_perc < train_mse_eq - assert test_mse_perc < test_mse_eq diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py deleted file mode 100644 index 3c377ea..0000000 --- a/tests/test_ensemble.py +++ /dev/null @@ -1,70 +0,0 @@ -import pytest -from sklearn.dummy import DummyRegressor -from sklearn.metrics import mean_absolute_error - -from regression_classifier import ClassRegressorEnsemble - - -class TestEnsemble: - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_fit_two_bins_two_levels(self, bins_calc_method): - X = [[1], [2], [3], [4]] - y = [1, 2, 3, 4] - - model = ClassRegressorEnsemble(n_bins=2, n_levels=2, bins_calc_method=bins_calc_method) - - model.fit(X, y) - - print(model.models) - assert len(model.models) == 3 - assert model.models[(0, (0,))].predict(X).tolist() == [0, 0, 1, 1] - assert model.models[(1, (0, 0))].predict(X).tolist() == [0, 1, 1, 1] - assert model.models[(1, (0, 1))].predict(X).tolist() == [0, 0, 0, 1] - - assert model.predict(X).tolist() == y - - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_better_than_dummy(self, airbnb_split, bins_calc_method): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - model = ClassRegressorEnsemble(n_bins=2, n_levels=2, bins_calc_method=bins_calc_method) - model.fit(X_train_scaled, y_train) - - pred_train = model.predict(X_train_scaled) - pred_test = model.predict(X_test_scaled) - train_mae = mean_absolute_error(y_train, pred_train) - test_mae = mean_absolute_error(y_test, pred_test) - - dummy_regr = DummyRegressor(strategy="mean") - dummy_regr.fit(X_train_scaled, y_train) - - dummy_pred_train = dummy_regr.predict(X_train_scaled) - dummy_pred_test = dummy_regr.predict(X_test_scaled) - dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) - dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) - - assert train_mae <= dummy_train_mae - assert test_mae <= dummy_test_mae - - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_fit_many_levels_better_than_dummy(self, airbnb_split, bins_calc_method): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - model = ClassRegressorEnsemble(n_bins=2, n_levels=4, bins_calc_method=bins_calc_method) - model.fit(X_train_scaled, y_train) - - assert len(model.models) == 1 + 2 + 2*2 + 4*2 - - pred_train = model.predict(X_train_scaled) - pred_test = model.predict(X_test_scaled) - train_mae = mean_absolute_error(y_train, pred_train) - test_mae = mean_absolute_error(y_test, pred_test) - - dummy_regr = DummyRegressor(strategy="mean") - dummy_regr.fit(X_train_scaled, y_train) - - dummy_pred_train = dummy_regr.predict(X_train_scaled) - dummy_pred_test = dummy_regr.predict(X_test_scaled) - dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) - dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) - - assert train_mae <= dummy_train_mae - assert test_mae <= dummy_test_mae diff --git a/tests/test_ensemble_onelevel.py b/tests/test_ensemble_onelevel.py deleted file mode 100644 index f3677a2..0000000 --- a/tests/test_ensemble_onelevel.py +++ /dev/null @@ -1,117 +0,0 @@ -import numpy as np -import pytest -from sklearn.dummy import DummyRegressor -from sklearn.linear_model import LinearRegression -from sklearn.metrics import mean_absolute_error, mean_squared_error - -from regression_classifier import ClassRegressorOnelevel, ClassRegressorOnelevelEnsemble, ClassRegressorEnsemble - - -class TestOnelevelEnsemble: - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_fit_two_bins_two_levels(self, bins_calc_method): - X = [[1], [2]] - y = [1, 2] - - model = ClassRegressorOnelevelEnsemble(n_bins=4, bins_calc_method=bins_calc_method) - - model.fit(X, y) - - assert len(model.models) == 3 - assert model.predict(X).tolist() == [1.25, 1.75] - - @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) - def test_better_than_dummy(self, airbnb_split, bins_calc_method): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - model = ClassRegressorOnelevelEnsemble(n_bins=10, bins_calc_method=bins_calc_method) - model.fit(X_train_scaled, y_train) - - pred_train = model.predict(X_train_scaled) - pred_test = model.predict(X_test_scaled) - train_mae = mean_absolute_error(y_train, pred_train) - test_mae = mean_absolute_error(y_test, pred_test) - - dummy_regr = DummyRegressor(strategy="mean") - dummy_regr.fit(X_train_scaled, y_train) - - dummy_pred_train = dummy_regr.predict(X_train_scaled) - dummy_pred_test = dummy_regr.predict(X_test_scaled) - dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) - dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) - - assert train_mae <= dummy_train_mae - assert test_mae <= dummy_test_mae - - @pytest.mark.parametrize("bins_calc_method, output", [('equal', [3, 5, 7, 9]), - ('percentile', [1.75, 2.5, 4.5, 9.0])]) - def test_bins_equal(self, bins_calc_method, output): - clf = ClassRegressorOnelevelEnsemble(n_bins=4, bins_calc_method=bins_calc_method) - - X = [[1], [2], [3], [9]] - y = [1, 2, 3, 9] - - clf.fit(X, y) - - assert clf.bin_edges[1:].tolist() == output - - def test_linreg_is_better_than_none(self): - X = np.array(list(range(100))).reshape(-1, 1).tolist() - y = list(range(100)) - - clf = ClassRegressorOnelevelEnsemble(n_bins=10) - clf.fit(X, y) - - pred_train = clf.predict(X) - train_mae = mean_absolute_error(y, pred_train) - - clf_linreg = ClassRegressorOnelevelEnsemble(n_bins=10, leaf_model_cls=LinearRegression) - clf_linreg.fit(X, y) - - pred_train_linreg = clf_linreg.predict(X) - train_mae_linreg = mean_absolute_error(y, pred_train_linreg) - - assert train_mae_linreg < train_mae - - def test_perc_better_than_equal(self, airbnb_split): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - - clf_eq = ClassRegressorOnelevelEnsemble(n_bins=10, bins_calc_method='equal') - clf_eq.fit(X_train_scaled, y_train) - - pred_train_eq = clf_eq.predict(X_train_scaled) - pred_test_eq = clf_eq.predict(X_test_scaled) - train_mse_eq = mean_squared_error(y_train, pred_train_eq) - test_mse_eq = mean_squared_error(y_test, pred_test_eq) - - clf_perc = ClassRegressorOnelevelEnsemble(n_bins=10, bins_calc_method='percentile') - clf_perc.fit(X_train_scaled, y_train) - - pred_train_perc = clf_perc.predict(X_train_scaled) - pred_test_perc = clf_perc.predict(X_test_scaled) - train_mse_perc = mean_squared_error(y_train, pred_train_perc) - test_mse_perc = mean_squared_error(y_test, pred_test_perc) - - assert train_mse_perc < train_mse_eq - assert test_mse_perc < test_mse_eq - - def test_onelevel_is_better_than_normal(self, airbnb_split): - X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split - - clf_onelevel = ClassRegressorOnelevelEnsemble(n_bins=20, bins_calc_method='equal') - clf_onelevel.fit(X_train_scaled, y_train) - - pred_train_onelevel = clf_onelevel.predict(X_train_scaled) - pred_test_onelevel = clf_onelevel.predict(X_test_scaled) - train_mse_onelevel = mean_squared_error(y_train, pred_train_onelevel) - test_mse_onelevel = mean_squared_error(y_test, pred_test_onelevel) - - clf_norm = ClassRegressorEnsemble(n_bins=2, n_levels=5, bins_calc_method='equal') - clf_norm.fit(X_train_scaled, y_train) - - pred_train_norm = clf_norm.predict(X_train_scaled) - pred_test_norm = clf_norm.predict(X_test_scaled) - train_mse_norm = mean_squared_error(y_train, pred_train_norm) - test_mse_norm = mean_squared_error(y_test, pred_test_norm) - - assert train_mse_norm > train_mse_onelevel - assert test_mse_norm > test_mse_onelevel diff --git a/tests/test_recursive_class_regressor.py b/tests/test_recursive_class_regressor.py new file mode 100644 index 0000000..ff2e8f8 --- /dev/null +++ b/tests/test_recursive_class_regressor.py @@ -0,0 +1,284 @@ +import numpy as np +import pytest +from sklearn.dummy import DummyRegressor +from sklearn.metrics import mean_absolute_error, mean_squared_error + +from regression_classifier import ClassRegressorSplit, ClassRegressorTree, RecursiveClassRegressor + + +class TestClassRegressorTree: + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_fit_two_bins(self, bins_calc_method): + clf = ClassRegressorTree(n_bins=2, + n_splits=1, + bins_calc_method=bins_calc_method) + + X = [[1], [2]] + y = [1, 2] + + clf.fit(X, y) + + assert clf.split.bin_borders.shape == (2, 2) + assert clf.split.y_classes.tolist() == [0, 1] + + assert isinstance(clf.child_models[0], clf.leaf_model_cls) + assert isinstance(clf.child_models[1], clf.leaf_model_cls) + + pred, proba = clf.predict(X, classification=True) + assert pred.tolist() == [0, 1] + reg_pred = clf.predict(X) + assert 1.5 > reg_pred[0] >= 1 + assert 2 >= reg_pred[1] > 1.5 + + def test_fit_two_splits(self): + clf = ClassRegressorTree(n_bins=2, + n_splits=2) + + X = [[1], [2], [3], [4]] + y = [1, 2, 3, 4] + + clf.fit(X, y) + + assert clf.split.bin_borders.shape == (2, 2) + assert clf.split.y_classes.tolist() == [0, 0, 1, 1] + + assert isinstance(clf.child_models[0], ClassRegressorTree) + assert isinstance(clf.child_models[0].child_models[0], clf.leaf_model_cls) + assert isinstance(clf.child_models[0].child_models[1], clf.leaf_model_cls) + assert isinstance(clf.child_models[1], ClassRegressorTree) + assert isinstance(clf.child_models[1].child_models[0], clf.leaf_model_cls) + assert isinstance(clf.child_models[1].child_models[1], clf.leaf_model_cls) + + pred, proba = clf.predict(X, classification=True) + assert pred.tolist() == [0, 0, 1, 1] + reg_pred = clf.predict(X) + assert 2 > reg_pred[0] + assert 3 >= reg_pred[1] + assert 3 >= reg_pred[2] + assert 4 >= reg_pred[3] + + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_fit_three_bins(self, bins_calc_method): + clf = ClassRegressorTree(n_bins=3, + n_splits=1, + bins_calc_method=bins_calc_method) + + X = [[1], [2], [3]] + y = [1, 2, 3] + + clf.fit(X, y) + + assert clf.split.bin_borders.shape == (3, 2) + assert clf.split.y_classes.tolist() == [0, 1, 2] + + assert isinstance(clf.child_models[0], clf.leaf_model_cls) + assert isinstance(clf.child_models[1], clf.leaf_model_cls) + + pred, proba = clf.predict(X, classification=True) + assert pred.tolist() == [0, 1, 2] + reg_pred = clf.predict(X) + assert 2 > reg_pred[0] >= 1 + assert 2.2 >= reg_pred[1] > 1.5 + assert 3 >= reg_pred[2] > 2.2 + + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_better_than_dummy(self, airbnb_split, bins_calc_method): + X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split + clf = ClassRegressorTree(n_bins=2, + n_splits=1, + bins_calc_method=bins_calc_method) + clf.fit(X_train_scaled, y_train) + + pred_train = clf.predict(X_train_scaled) + pred_test = clf.predict(X_test_scaled) + train_mae = mean_absolute_error(y_train, pred_train) + test_mae = mean_absolute_error(y_test, pred_test) + + dummy_regr = DummyRegressor(strategy="mean") + dummy_regr.fit(X_train_scaled, y_train) + + dummy_pred_train = dummy_regr.predict(X_train_scaled) + dummy_pred_test = dummy_regr.predict(X_test_scaled) + dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) + dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) + + assert train_mae <= dummy_train_mae + assert test_mae <= dummy_test_mae + + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_classes_are_classes(self, airbnb_split, bins_calc_method): + X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split + n_bins = 5 + clf = ClassRegressorTree(n_bins=n_bins, + n_splits=1, + bins_calc_method=bins_calc_method) + clf.fit(X_train_scaled, y_train) + + classes_list = clf.split.y_classes.tolist() + assert min(classes_list) == 0 + assert max(classes_list) == n_bins - 1 + assert classes_list == [int(classes_list) for classes_list in classes_list] + + pred_test_classes, probas = clf.predict(X_test_scaled, classification=True) + pred_classes_list = np.unique(pred_test_classes).tolist() + assert min(pred_classes_list) >= 0 + assert max(pred_classes_list) <= n_bins - 1 + assert pred_classes_list == [int(pred_classes_list) for pred_classes_list in pred_classes_list] + + def test_linreg_is_better_than_none(self): + X = np.array(list(range(100))).reshape(-1, 1).tolist() + y = list(range(100)) + + clf = ClassRegressorTree(n_bins=5) + clf.fit(X, y) + + pred_train = clf.predict(X) + train_mae = mean_absolute_error(y, pred_train) + + clf_linreg = ClassRegressorTree(n_bins=5, leaf_model_cls_name='LinearRegression') + clf_linreg.fit(X, y) + + pred_train_linreg = clf_linreg.predict(X) + train_mae_linreg = mean_absolute_error(y, pred_train_linreg) + + assert train_mae_linreg < train_mae + + def test_perc_better_than_equal(self, airbnb_split): + X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split + + clf_eq = ClassRegressorTree(n_bins=2, bins_calc_method='equal') + clf_eq.fit(X_train_scaled, y_train) + + pred_train_eq = clf_eq.predict(X_train_scaled) + pred_test_eq = clf_eq.predict(X_test_scaled) + train_mse_eq = mean_squared_error(y_train, pred_train_eq) + test_mse_eq = mean_squared_error(y_test, pred_test_eq) + + clf_perc = ClassRegressorTree(n_bins=2, bins_calc_method='percentile') + clf_perc.fit(X_train_scaled, y_train) + + pred_train_perc = clf_perc.predict(X_train_scaled) + pred_test_perc = clf_perc.predict(X_test_scaled) + train_mse_perc = mean_squared_error(y_train, pred_train_perc) + test_mse_perc = mean_squared_error(y_test, pred_test_perc) + + assert train_mse_perc < train_mse_eq + assert test_mse_perc < test_mse_eq + + +class TestClassRegressorSplit: + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_fit_two_bins(self, bins_calc_method): + clf = ClassRegressorSplit(n_bins=2, bins_calc_method=bins_calc_method) + + X = [[1], [2]] + y = [1, 2] + + clf.fit(X, y) + + assert clf.bin_borders.shape == (2, 2) + assert clf.y_classes.tolist() == [0, 1] + + pred = clf.predict(X) + proba = clf.predict_proba(X) + assert pred.tolist() == [0, 1] + assert proba.shape == (2, 2) + + @pytest.mark.parametrize("bins_calc_method, output", [('equal', [[1.0, 5.0], [5.0, 9.0]]), + ('percentile', [[1.0, 2.5], [2.5, 9.0]])]) + def test_bins_equal(self, bins_calc_method, output): + clf = ClassRegressorSplit(n_bins=2, bins_calc_method=bins_calc_method) + + X = [[1], [2], [3], [9]] + y = [1, 2, 3, 9] + + clf.fit(X, y) + + assert clf.bin_borders.tolist() == output + + +class TestRecursiveClassRegressor: + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_fit_two_bins_two_levels(self, bins_calc_method): + X = [[1], [2], [3], [4]] + y = [1, 2, 3, 4] + + model = RecursiveClassRegressor( + n_bins=2, + n_splits=2, + bins_calc_method=bins_calc_method, + ) + model.fit(X, y) + + + assert isinstance(model.tree, ClassRegressorTree) + assert isinstance(model.tree.split, ClassRegressorSplit) + assert len(model.tree.child_models) == 2 + assert isinstance(model.tree.child_models[0], ClassRegressorTree) + assert isinstance(model.tree.child_models[1], ClassRegressorTree) + + pred, proba = model.tree.predict(X, classification=True) + assert pred.tolist() == [0, 0, 1, 1] + + child = model.tree.child_models[0] + assert isinstance(child, ClassRegressorTree) + assert len(child.child_models) == 2 + assert isinstance(child.child_models[0], child.leaf_model_cls) + assert isinstance(child.child_models[1], child.leaf_model_cls) + pred, proba = child.predict(X, classification=True) + assert pred.tolist() == [0, 1, 1, 1] + + child = model.tree.child_models[1] + assert isinstance(child, ClassRegressorTree) + assert len(child.child_models) == 2 + assert isinstance(child.child_models[0], child.leaf_model_cls) + assert isinstance(child.child_models[1], child.leaf_model_cls) + pred, proba = child.predict(X, classification=True) + assert pred.tolist() == [0, 0, 0, 1] + + pred = model.predict(X) + + assert len(pred) == len(y) + + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_better_than_dummy(self, airbnb_split, bins_calc_method): + X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split + model = RecursiveClassRegressor(n_bins=2, n_splits=2, bins_calc_method=bins_calc_method) + model.fit(X_train_scaled, y_train) + + pred_train = model.predict(X_train_scaled) + pred_test = model.predict(X_test_scaled) + train_mae = mean_absolute_error(y_train, pred_train) + test_mae = mean_absolute_error(y_test, pred_test) + + dummy_regr = DummyRegressor(strategy="mean") + dummy_regr.fit(X_train_scaled, y_train) + + dummy_pred_train = dummy_regr.predict(X_train_scaled) + dummy_pred_test = dummy_regr.predict(X_test_scaled) + dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) + dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) + + assert train_mae <= dummy_train_mae + assert test_mae <= dummy_test_mae + + @pytest.mark.parametrize("bins_calc_method", ['equal', 'percentile']) + def test_fit_many_levels_better_than_dummy(self, airbnb_split, bins_calc_method): + X_train_scaled, X_test_scaled, y_train, y_test = airbnb_split + model = RecursiveClassRegressor(n_bins=2, n_splits=4, bins_calc_method=bins_calc_method) + model.fit(X_train_scaled, y_train) + + pred_train = model.predict(X_train_scaled) + pred_test = model.predict(X_test_scaled) + train_mae = mean_absolute_error(y_train, pred_train) + test_mae = mean_absolute_error(y_test, pred_test) + + dummy_regr = DummyRegressor(strategy="mean") + dummy_regr.fit(X_train_scaled, y_train) + + dummy_pred_train = dummy_regr.predict(X_train_scaled) + dummy_pred_test = dummy_regr.predict(X_test_scaled) + dummy_train_mae = mean_absolute_error(y_train, dummy_pred_train) + dummy_test_mae = mean_absolute_error(y_test, dummy_pred_test) + assert train_mae <= dummy_train_mae + assert test_mae <= dummy_test_mae From 44d06c5cc06869b9ac88f12ddb993a260c9f16a0 Mon Sep 17 00:00:00 2001 From: Boris Tseytlin Date: Sun, 30 Jan 2022 16:50:27 +0300 Subject: [PATCH 2/3] uncomment baseline models --- benchmark.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark.py b/benchmark.py index 51a89ed..925de07 100644 --- a/benchmark.py +++ b/benchmark.py @@ -87,16 +87,16 @@ def run_benchmarks(): 'model__alpha': scipy.stats.norm(0.5, 1), 'model__l1_ratio': scipy.stats.norm(0.5, 0.15), }, - # { # GradientBoostingRegressor - # 'model__max_depth': np.arange(-1, 20, 2), - # 'model__subsample': np.arange(0.2, 1.2, 0.2), - # 'model__n_estimators': np.arange(10, 310, 40), - # }, - # { # LGBMRegressor - # 'model__max_depth': np.arange(-1, 20, 2), - # 'model__subsample': np.arange(0.2, 1.2, 0.2), - # 'model__n_estimators': np.arange(10, 310, 40), - # }, + { # GradientBoostingRegressor + 'model__max_depth': np.arange(-1, 20, 2), + 'model__subsample': np.arange(0.2, 1.2, 0.2), + 'model__n_estimators': np.arange(10, 310, 40), + }, + { # LGBMRegressor + 'model__max_depth': np.arange(-1, 20, 2), + 'model__subsample': np.arange(0.2, 1.2, 0.2), + 'model__n_estimators': np.arange(10, 310, 40), + }, ] searches = {} From 92b56fe9aa375a2064a854b798298a02d4a2a12b Mon Sep 17 00:00:00 2001 From: Boris Tseytlin Date: Sun, 30 Jan 2022 17:01:20 +0300 Subject: [PATCH 3/3] Add numba parallelization --- .../recursive_class_regressor.py | 47 ++++++++++++------- requirements.txt | 3 +- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/regression_classifier/recursive_class_regressor.py b/regression_classifier/recursive_class_regressor.py index bf93bc0..072f669 100644 --- a/regression_classifier/recursive_class_regressor.py +++ b/regression_classifier/recursive_class_regressor.py @@ -4,6 +4,8 @@ from sklearn.dummy import DummyClassifier, DummyRegressor from sklearn.linear_model import LogisticRegression, LinearRegression +from numba import njit, prange, jit + from .utils import bins_calc @@ -97,6 +99,23 @@ def get_child_model(self, X, y): leaf_size=self.leaf_size, ) + @jit(parallel=True, nopython=False, forceobj=True) + def fit_child_models(self, X, y): + for i in prange(len(self.split.bin_borders)): + bin_border = self.split.bin_borders[i] + if i > 0: + bin_idx = (y > bin_border[0]) & (y <= bin_border[1]) + else: + bin_idx = (y >= bin_border[0]) & (y <= bin_border[1]) + + X_subset, y_subset = X[bin_idx], y[bin_idx] + if len(y_subset) == 0: + continue + + child_model = self.get_child_model(X_subset, y_subset) + child_model.fit(X_subset, y_subset) + self.child_models[i] = child_model + def fit(self, X, y): if isinstance(X, pd.DataFrame): X = X.values @@ -113,32 +132,24 @@ def fit(self, X, y): split_model.fit(X, y) self.split = split_model - for i, bin_border in enumerate(self.split.bin_borders): - if i > 0: - bin_idx = (y > bin_border[0]) & (y <= bin_border[1]) - else: - bin_idx = (y >= bin_border[0]) & (y <= bin_border[1]) - - X_subset, y_subset = X[bin_idx], y[bin_idx] - if len(y_subset) == 0: - continue + self.fit_child_models(X, y) - child_model = self.get_child_model(X_subset, y_subset) - child_model.fit(X_subset, y_subset) - self.child_models[i] = child_model + @jit(nopython=False, parallel=True, forceobj=True) + def get_child_model_preds(self, X, proba): + preds = np.zeros((len(X),)) + for bin_i in prange(len(self.child_models)): + child_model = self.child_models[bin_i] + child_prediction = child_model.predict(X) + preds += proba[:, bin_i] * child_prediction + return preds def predict(self, X, classification=False): - proba = self.split.predict_proba(X) pred = np.argmax(proba, axis=1) if classification: return pred, proba - preds = np.zeros((len(X),)) - for bin_i, child_model in self.child_models.items(): - child_prediction = child_model.predict(X) - preds += proba[:, bin_i] * child_prediction - return preds + return self.get_child_model_preds(X, proba) class RecursiveClassRegressor(BaseEstimator, RegressorMixin): diff --git a/requirements.txt b/requirements.txt index b442b6c..111cd6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ scikit-learn numpy pandas -pytest \ No newline at end of file +pytest +numba