diff --git a/corrai/surrogate.py b/corrai/surrogate.py
index 08255e7..1c15405 100644
--- a/corrai/surrogate.py
+++ b/corrai/surrogate.py
@@ -1,13 +1,21 @@
+import time
+
 import numpy as np
 import pandas as pd
+from scipy.stats import loguniform, randint
+
 from sklearn.base import BaseEstimator, RegressorMixin, clone
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.linear_model import LinearRegression
-from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
+from sklearn.model_selection import (
+    train_test_split,
+    cross_val_score,
+    RandomizedSearchCV,
+)
 from sklearn.neural_network import MLPRegressor
 from sklearn.pipeline import Pipeline
-from sklearn.preprocessing import PolynomialFeatures
+from sklearn.preprocessing import PolynomialFeatures, StandardScaler
 from sklearn.svm import SVR
 from sklearn.utils.validation import check_is_fitted
@@ -15,43 +23,24 @@
 from corrai.base.utils import as_1_column_dataframe
 from corrai.base.model import Model
 
-MODEL_MAP = {
-    "TREE_REGRESSOR": RandomForestRegressor(),
-    "RANDOM_FOREST": RandomForestRegressor(),
-    "LINEAR_REGRESSION": LinearRegression(),
-    "LINEAR_SECOND_ORDER": Pipeline(
-        [
-            ("poly", PolynomialFeatures(2)),
-            # Intercept is already added by PolynomialFeatures
-            ("Line_reg", LinearRegression(fit_intercept=False)),
-        ]
-    ),
-    "LINEAR_THIRD_ORDER": Pipeline(
-        [
-            ("poly", PolynomialFeatures(3)),
-            ("Line_reg", LinearRegression(fit_intercept=False)),
-        ]
-    ),
-    "SUPPORT_VECTOR": SVR(),
-    "MULTI_LAYER_PERCEPTRON": MLPRegressor(max_iter=5000),
-}
-
 GRID_DICT = {
     "TREE_REGRESSOR": [
         {
-            "n_estimators": [100, 300, 500],
-            "max_depth": [None, 10, 20, 40],
+            "n_estimators": [100, 200, 300],
+            "max_depth": [10, 20, 30, None],
             "min_samples_split": [2, 5, 10],
             "min_samples_leaf": [1, 2, 4],
-            "max_features": ["sqrt", "log2", 0.5],  # allow fraction
-            "bootstrap": [True, False],
+            "max_features": ["sqrt", "log2"],
+            "bootstrap": [True],
         }
     ],
     "RANDOM_FOREST": [
         {
-            "n_estimators": [100, 200, 400, 800],
+            "n_estimators": [100, 200, 300],
+            "max_depth": [10, 20, 30, None],
             "max_features": ["sqrt", "log2"],
             "min_samples_leaf": [1, 2, 5],
+            "min_samples_split": [2, 5],
         }
     ],
     "LINEAR_REGRESSION": [{"fit_intercept": [True, False]}],
@@ -59,27 +48,122 @@
     "LINEAR_THIRD_ORDER": [{"poly__interaction_only": [True, False]}],
     "SUPPORT_VECTOR": [
         {
-            "kernel": ["linear", "poly", "rbf", "sigmoid"],
+            "kernel": ["rbf", "linear"],
             "C": [0.1, 1, 10, 100],
-            "gamma": ["scale", "auto"],
-            "epsilon": [0.001, 0.01, 0.1],
-            "degree": [2, 3, 4],  # only relevant if kernel="poly"
+            "gamma": ["scale", "auto", 0.001, 0.01],
+            "epsilon": [0.01, 0.1],
         }
     ],
     "MULTI_LAYER_PERCEPTRON": [
         {
-            "hidden_layer_sizes": [(50,), (100,), (150,), (50, 50), (100, 50)],
-            "activation": [
-                "relu",
-                "tanh",
-            ],  # identity/logistic rarely perform well in regression
-            "solver": ["adam", "lbfgs"],  # "sgd" often unstable unless tuned further
-            "alpha": [0.0001, 0.001, 0.01],
-            "learning_rate": ["constant", "adaptive"],
+            "MLP__hidden_layer_sizes": [
+                (100,),
+                (150,),
+                (100, 50),
+                (150, 100),
+            ],
+            "MLP__activation": ["relu", "tanh"],
+            "MLP__solver": ["adam"],
+            "MLP__alpha": [0.0001, 0.001, 0.01],
+            "MLP__learning_rate": ["adaptive"],
+            "MLP__max_iter": [3000, 5000],
+            "MLP__early_stopping": [True],
+            "MLP__validation_fraction": [0.15],
+            "MLP__n_iter_no_change": [15],
+            "MLP__tol": [1e-4],
         }
     ],
 }
+
+# For use with RandomizedSearchCV - continuous distributions
+# This allows exploring MORE distinct values
+GRID_DICT_CONTINUOUS = {
+    "TREE_REGRESSOR": {
+        "n_estimators": randint(100, 400),
+        "max_depth": [10, 20, 30, 40, None],
+        "min_samples_split": randint(2, 15),
+        "min_samples_leaf": randint(1, 8),
+        "max_features": ["sqrt", "log2"],
+        "bootstrap": [True],
+    },
+    "RANDOM_FOREST": {
+        "n_estimators": randint(100, 400),
+        "max_depth": [10, 20, 30, 40, None],
+        "max_features": ["sqrt", "log2"],
+        "min_samples_leaf": randint(1, 10),
+        "min_samples_split": randint(2, 15),
+    },
+    "LINEAR_REGRESSION": {"fit_intercept": [True, False]},
+    "LINEAR_SECOND_ORDER": {"poly__interaction_only": [True, False]},
+    "LINEAR_THIRD_ORDER": {"poly__interaction_only": [True, False]},
+    "SUPPORT_VECTOR": {
+        "kernel": ["rbf", "linear"],
+        "C": loguniform(0.1, 100),  # Continuous log scale
+        "gamma": ["scale", "auto"]
+        + list(loguniform(1e-4, 1e-1).rvs(5, random_state=42)),
+        "epsilon": loguniform(0.001, 0.5),
+    },
+    "MULTI_LAYER_PERCEPTRON": {
+        "MLP__hidden_layer_sizes": [
+            (50,),
+            (100,),
+            (150,),
+            (200,),
+            (50, 50),
+            (100, 50),
+            (150, 100),
+            (200, 100),
+            (100, 50, 25),
+            # (200, 50, 25),
+        ],
+        "MLP__activation": ["relu", "tanh"],
+        "MLP__solver": ["adam", "lbfgs"],
+        "MLP__alpha": loguniform(1e-5, 1e-2),
+        "MLP__learning_rate": ["constant", "adaptive"],
+        "MLP__max_iter": [3000, 5000],
+        "MLP__early_stopping": [True],
+        "MLP__validation_fraction": [0.05, 0.1],
+        "MLP__n_iter_no_change": [15, 30, 50],
+        "MLP__tol": [1e-4, 1e-5],
+    },
+}
+
+
+# RECOMMENDED: Different n_iter for different model complexities
+TUNING_N_ITER_BY_MODEL = {
+    "TREE_REGRESSOR": 30,
+    "RANDOM_FOREST": 30,
+    "LINEAR_REGRESSION": 2,
+    "LINEAR_SECOND_ORDER": 2,
+    "LINEAR_THIRD_ORDER": 2,
+    "SUPPORT_VECTOR": 25,
+    "MULTI_LAYER_PERCEPTRON": 20,
+}
+
+
+MODEL_MAP = {
+    "TREE_REGRESSOR": RandomForestRegressor(),
+    "RANDOM_FOREST": RandomForestRegressor(),
+    "LINEAR_REGRESSION": LinearRegression(),
+    "LINEAR_SECOND_ORDER": Pipeline(
+        [
+            ("poly", PolynomialFeatures(2)),
+            # Intercept is already added by PolynomialFeatures
+            ("Line_reg", LinearRegression(fit_intercept=False)),
+        ]
+    ),
+    "LINEAR_THIRD_ORDER": Pipeline(
+        [
+            ("poly", PolynomialFeatures(3)),
+            ("Line_reg", LinearRegression(fit_intercept=False)),
+        ]
+    ),
+    "SUPPORT_VECTOR": SVR(),
+    "MULTI_LAYER_PERCEPTRON": Pipeline(
+        [("scaler", StandardScaler()), ("MLP", MLPRegressor(max_iter=5000))]
+    ),
+}
+
 
 class ModelTrainer:
     def __init__(self, model, test_size: float = 0.2, random_state: float = 42):
@@ -158,7 +242,7 @@ class MultiModelSO(BaseEstimator, RegressorMixin):
         "LINEAR_SECOND_ORDER", "LINEAR_THIRD_ORDER", "SUPPORT_VECTOR",
         "MULTI_LAYER_PERCEPTRON"
         If ``None`` (default), all models in ``MODEL_MAP`` are evaluated.
-    cv : int, default=10
+    cv : int, default=3
         Number of cross-validation folds to use for model comparison.
     fine_tuning : bool, default=True
         If True, perform a grid search on the best model to fine-tune its
@@ -223,19 +307,44 @@
     72     134.038756
     321    291.417029
     73     123.789659
+
+    >>> # Fast configuration and training (development)
+    >>> model = MultiModelSO(
+    ...     models=["LINEAR_REGRESSION", "RANDOM_FOREST", "MULTI_LAYER_PERCEPTRON"],
+    ...     cv=3,
+    ...     fine_tuning=True,
+    ...     tuning_n_iter=TUNING_N_ITER_BY_MODEL,
+    ...     use_continuous_distributions=False,
+    ...     n_jobs=-1,
+    ... )
+
+    >>> # Optimal configuration (production)
+    >>> model = MultiModelSO(
+    ...     models=None,
+    ...     cv=5,
+    ...     fine_tuning=True,
+    ...     tuning_n_iter=TUNING_N_ITER_BY_MODEL,
+    ...     use_continuous_distributions=True,
+    ...     n_jobs=-1,
+    ...     random_state=42,
+    ... )
     """

     def __init__(
         self,
         models: list[str] = None,
-        cv: int = 10,
-        fine_tuning: bool = True,
+        cv: int = 3,
         scoring: str = "neg_mean_squared_error",
+        fine_tuning: bool = True,
+        tuning_n_iter: int | dict = None,
+        use_continuous_distributions: bool = False,
         n_jobs: int = -1,
         random_state: int = None,
     ):
         self.cv = cv
         self.fine_tuning = fine_tuning
+        self.tuning_n_iter = tuning_n_iter
+        self.use_continuous_distributions = use_continuous_distributions
         self.scoring = scoring
         self.n_jobs = n_jobs
         self.random_state = random_state
@@ -243,31 +352,62 @@ def __init__(
         self.models = models if models is not None else list(MODEL_MAP.keys())
         self.model_map = {mod: clone(MODEL_MAP[mod]) for mod in self.models}
 
+        self.training_times_ = {}
+        self.cv_scores_ = {}
+
     @property
     def feature_names_in_(self):
         check_is_fitted(self, ["_is_fitted"])
         return self.get_model().feature_names_in_
 
     def __sklearn_is_fitted__(self):
-        """
-        Check fitted status and return a Boolean value.
-        """
         return hasattr(self, "_is_fitted") and self._is_fitted
 
     def fit(self, X: pd.DataFrame, y: pd.DataFrame | pd.Series, verbose=True):
         y = y.squeeze() if isinstance(y, pd.DataFrame) else y
 
-        for mod in self.model_map.values():
-            mod.fit(X, y)
-
+        # Train all models with timing
+        for mod_name, mod in self.model_map.items():
+            start_time = time.time()
+            try:
+                mod.fit(X, y)
+                self.training_times_[mod_name] = time.time() - start_time
+            except Exception as e:
+                if verbose:
+                    print(f"Warning: {mod_name} failed to train: {e}")
+                self.training_times_[mod_name] = None
+
+        # Cross-validation scores
         score_frame = pd.DataFrame(
-            columns=[f"mean({self.scoring})", f"std({self.scoring})"], index=self.models
+            columns=[
+                f"mean({self.scoring})",
+                f"std({self.scoring})",
+                "train_time_sec",
+            ],
+            index=self.models,
         )
+
         for mod_name, mod in self.model_map.items():
-            cv_scores = cross_val_score(mod, X, y, scoring=self.scoring, cv=self.cv)
+            if self.training_times_[mod_name] is None:
+                continue
+
+            cv_scores = cross_val_score(
+                mod,
+                X,
+                y,
+                scoring=self.scoring,
+                cv=self.cv,
+                n_jobs=self.n_jobs,  # NEW: parallel CV
+            )
+
             score_frame.loc[mod_name, f"mean({self.scoring})"] = np.mean(cv_scores)
             score_frame.loc[mod_name, f"std({self.scoring})"] = np.std(cv_scores)
+            score_frame.loc[mod_name, "train_time_sec"] = self.training_times_[mod_name]
+            self.cv_scores_[mod_name] = cv_scores
+
+        # Remove failed models
+        score_frame = score_frame.dropna()
 
         score_frame.sort_values(f"mean({self.scoring})", ascending=False, inplace=True)
 
         if verbose:
@@ -277,17 +417,22 @@ def fit(self, X: pd.DataFrame, y: pd.DataFrame | pd.Series, verbose=True):
                 f"{score_frame}"
             )
 
+        if len(score_frame) == 0:
+            raise ValueError("All models failed to train!")
+
         self.best_model_key = score_frame.index[0]
 
         if self.fine_tuning:
             if verbose:
                 print(f"\n === Fine tuning === \n" f"Fine tuning {self.best_model_key}")
-            self.fine_tune(X, y, self.best_model_key)
+            self.fine_tune(X, y, self.best_model_key, verbose=verbose)
 
         self.target_name_ = y.name
         self._is_fitted = True
 
+        return self
+
     def predict(
         self, X: pd.DataFrame | np.ndarray | pd.Series, model: str = None
     ) -> pd.DataFrame:
@@ -321,29 +466,83 @@ def fine_tune(
         X: pd.DataFrame,
         y: pd.DataFrame | pd.Series,
         model: str = None,
-        verbose=3,
+        verbose=True,
     ):
-        if GRID_DICT[model] is None:
-            raise ValueError(f"Fine tuning for {model} not yet implemented")
+        if model is None:
+            model = self.best_model_key
+
+        if self.use_continuous_distributions:
+            param_dist = GRID_DICT_CONTINUOUS.get(model)
+        else:
+            param_dist = GRID_DICT.get(model)
+
+        if param_dist is None:
+            if verbose:
+                print(f"Fine tuning for {model} not available - skipping")
+            return
 
         model_to_tune = self.get_model(model)
 
-        grid_search = GridSearchCV(
+        if isinstance(self.tuning_n_iter, dict):
+            n_iter = self.tuning_n_iter.get(model, 20)
+        elif self.tuning_n_iter is not None:
+            n_iter = self.tuning_n_iter
+        else:
+            n_iter = TUNING_N_ITER_BY_MODEL.get(model, 20)
+
+        verbose_level = 2 if verbose else 0
+
+        grid_search = RandomizedSearchCV(
            model_to_tune,
-            GRID_DICT[model],
+            param_distributions=param_dist,
             cv=self.cv,
+            n_iter=n_iter,
             scoring=self.scoring,
             return_train_score=True,
-            verbose=verbose,
+            verbose=verbose_level,
             n_jobs=self.n_jobs,
+            random_state=self.random_state,
         )
+
+        start_time = time.time()
         grid_search.fit(X, y)
+        tuning_time = time.time() - start_time
 
         self.model_map[model] = grid_search.best_estimator_
-        if verbose > 0:
+
+        if verbose:
+            print(f"\nFine tuning completed in {tuning_time:.2f}s")
+            print(f"Best params: {grid_search.best_params_}")
+            print(f"Best CV score: {grid_search.best_score_:.4f}")
+
+            # Show top 5 configurations
             cvres = grid_search.cv_results_
-            for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
-                print(np.sqrt(-mean_score), params)
+            results = sorted(
+                zip(cvres["mean_test_score"], cvres["params"]), reverse=True
+            )[:5]
+
+            print("\nTop 5 configurations:")
+            for i, (score, params) in enumerate(results, 1):
+                print(f"{i}. Score: {score:.4f} | Params: {params}")
+
+    def get_feature_importance(self, model: str = None, top_n: int = 10):
+        check_is_fitted(self)
+        mod = self.get_model(model)
+
+        # Handle Pipeline
+        if isinstance(mod, Pipeline):
+            mod = mod.steps[-1][1]
+
+        if not hasattr(mod, "feature_importances_"):
+            raise ValueError(
+                f"Model {model or self.best_model_key} doesn't support feature_importances_"
+            )
+
+        importance_df = pd.DataFrame(
+            {"feature": self.feature_names_in_, "importance": mod.feature_importances_}
+        ).sort_values("importance", ascending=False)
+
+        return importance_df.head(top_n) if top_n else importance_df
 
 
 class StaticScikitModel(Model):
diff --git a/tests/test_surrogate.py b/tests/test_surrogate.py
index f7a7300..03edc7b 100644
--- a/tests/test_surrogate.py
+++ b/tests/test_surrogate.py
@@ -39,7 +39,9 @@ def test_mumoso_and_trainer(self):
         x_df = pd.DataFrame(list(set(itertools.product(*list(x_dict.values())))))
         y = pd.Series(np.random.randn(x_df.shape[0]))
 
-        model_pipe = make_pipeline(OneHotEncoder(), MultiModelSO(fine_tuning=False))
+        model_pipe = make_pipeline(
+            OneHotEncoder(sparse_output=False), MultiModelSO(fine_tuning=False)
+        )
 
         trainer = ModelTrainer(model_pipe)
         trainer.train(X=x_df, y=y)
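For context, a minimal end-to-end sketch of how the revised API is meant to be used. The toy dataset, variable names, and model selection below are illustrative assumptions, not part of the patch; only the MultiModelSO constructor arguments, fit, training_times_, best_model_key, and get_feature_importance come from the change itself.

import pandas as pd
from sklearn.datasets import make_regression

from corrai.surrogate import MultiModelSO, TUNING_N_ITER_BY_MODEL

# Illustrative toy data; any pandas DataFrame X and Series y would do.
X_arr, y_arr = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
X = pd.DataFrame(X_arr, columns=[f"x{i}" for i in range(X_arr.shape[1])])
y = pd.Series(y_arr, name="target")

# Compare models with 3-fold CV, then fine-tune the best one with
# RandomizedSearchCV using the per-model iteration budgets.
surrogate = MultiModelSO(
    models=["RANDOM_FOREST", "SUPPORT_VECTOR"],
    cv=3,
    fine_tuning=True,
    tuning_n_iter=TUNING_N_ITER_BY_MODEL,
    use_continuous_distributions=True,
    n_jobs=-1,
    random_state=42,
)
surrogate.fit(X, y)

print(surrogate.training_times_)   # per-model fit time in seconds
print(surrogate.best_model_key)    # key of the best-scoring model

# feature_importances_ only exists for the tree-based estimators, so this
# call assumes RANDOM_FOREST (or TREE_REGRESSOR) won the comparison.
print(surrogate.get_feature_importance(top_n=5))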