From c04f2186857380f4ec4f66181274f176f0696b48 Mon Sep 17 00:00:00 2001 From: Engelsgeduld Date: Wed, 30 Apr 2025 03:05:27 +0300 Subject: [PATCH 1/2] refactoring: separate preprocessing transformers for each dataset, unite pipeline builder functions into transformers. Create different pipelines for each dataset. Refactor TSM model, now all predictions below zero are equal to zero. Change configs to more proper parameters --- configs/models_configs.py | 14 +-- src/models/time_series_model.py | 15 +-- .../features_extraction.py | 46 +++++++ .../series_comp.py | 5 +- .../series_decomposition.py | 0 .../pipeline.py} | 42 +------ .../preprocessing.py | 0 .../second_special_pipeline/pipeline.py | 24 ++++ .../second_special_pipeline/preprocessing.py | 117 ++++++++++++++++++ 9 files changed, 203 insertions(+), 60 deletions(-) rename src/special_preprocessing/{transformers => date_transformers}/features_extraction.py (75%) rename src/special_preprocessing/{transformers => date_transformers}/series_comp.py (96%) rename src/special_preprocessing/{transformers => date_transformers}/series_decomposition.py (100%) rename src/special_preprocessing/{preprocessing_pipeline.py => first_special_pipeline/pipeline.py} (52%) rename src/special_preprocessing/{transformers => first_special_pipeline}/preprocessing.py (100%) create mode 100644 src/special_preprocessing/second_special_pipeline/pipeline.py create mode 100644 src/special_preprocessing/second_special_pipeline/preprocessing.py diff --git a/configs/models_configs.py b/configs/models_configs.py index 605142a..eca813f 100644 --- a/configs/models_configs.py +++ b/configs/models_configs.py @@ -8,25 +8,20 @@ XGBRegressorConfig = ( xgb.XGBRegressor, { - "n_estimators": [100, 200, 500], "learning_rate": np.logspace(-2, -1, 3), "max_depth": [3, 5, 7], - "subsample": np.linspace(0.7, 1.0, 3), - "colsample_bytree": np.linspace(0.7, 1.0, 3), }, ) -LassoConfig = (Lasso, {"alpha": np.logspace(-4, 1, 6)}) +LassoConfig = (Lasso, {"alpha": np.logspace(-3, 2, 6)}) -RidgeConfig = (Ridge, {"alpha": np.logspace(-1, 3, 5)}) +RidgeConfig = (Ridge, {"alpha": np.logspace(-3, 2, 5)}) GradientBoostingRegressorConfig = ( GradientBoostingRegressor, { - "n_estimators": [100, 200, 500], "learning_rate": np.logspace(-2, -1, 3), "max_depth": [3, 5, 7], - "subsample": np.linspace(0.7, 1.0, 3), }, ) @@ -34,17 +29,14 @@ KNeighborsRegressor, { "n_neighbors": [3, 5, 10, 15], - "weights": ["uniform", "distance"], - "metric": ["euclidean", "manhattan", "minkowski"], }, ) SVRConfig = ( SVR, - {"C": np.logspace(-1, 2, 4), "epsilon": np.linspace(0.01, 0.5, 4), "kernel": ["linear", "rbf", "poly"]}, + {"C": np.logspace(-1, 2, 4), "epsilon": np.linspace(0.01, 0.5, 4)}, ) - ModelsConfigs = { "XGBRegressor": XGBRegressorConfig, "Lasso": LassoConfig, diff --git a/src/models/time_series_model.py b/src/models/time_series_model.py index 17f7b55..e1f16cc 100644 --- a/src/models/time_series_model.py +++ b/src/models/time_series_model.py @@ -143,6 +143,7 @@ def predict(self, X: pd.DataFrame) -> pd.DataFrame: forecast_list.extend(zip(X.loc[mask, self.keys_index], X.loc[mask, self.date_index], forecast_values)) forecast_df = pd.DataFrame(forecast_list, columns=[self.keys_index, self.date_index, "Forecast"]) X = X.merge(forecast_df, on=[self.keys_index, self.date_index], how="left") + X["Forecast"] = X["Forecast"].apply(lambda x: x if x >= 0 else 0) return X def score( @@ -265,20 +266,10 @@ def _setup_searchers(self) -> tuple[GridSearchCV, GridSearchCV]: s_models = self._create_pipelines(self.seasonal_models) grid_search_trend = GridSearchCV( - trend_pipe, - t_models, - cv=self.cv, - scoring=self.scoring, - verbose=0, - n_jobs=-1, + trend_pipe, t_models, cv=self.cv, scoring=self.scoring, verbose=0, n_jobs=-1, refit=True ) grid_search_seasonal = GridSearchCV( - seasonal_pipe, - s_models, - cv=self.cv, - scoring=self.scoring, - verbose=0, - n_jobs=-1, + seasonal_pipe, s_models, cv=self.cv, scoring=self.scoring, verbose=0, n_jobs=-1, refit=True ) return grid_search_trend, grid_search_seasonal diff --git a/src/special_preprocessing/transformers/features_extraction.py b/src/special_preprocessing/date_transformers/features_extraction.py similarity index 75% rename from src/special_preprocessing/transformers/features_extraction.py rename to src/special_preprocessing/date_transformers/features_extraction.py index 1446343..cf0d371 100644 --- a/src/special_preprocessing/transformers/features_extraction.py +++ b/src/special_preprocessing/date_transformers/features_extraction.py @@ -3,7 +3,10 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess from workalendar.europe import Russia @@ -113,3 +116,46 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X.drop(columns=columns_to_drop, inplace=True, errors="ignore") return X + + +class FeatureExtractionTransformer(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self.pipe: Optional[Pipeline] = None + + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "FeatureExtractionTransformer": + self.pipe = self.features_pipeline() + self.pipe.fit(X, y) + return self + + @staticmethod + def features_pipeline() -> Pipeline: + ohe = ColumnTransformer( + transformers=[ + ( + "ohe", + OneHotEncoder(handle_unknown="ignore", sparse_output=False), + ["holiday"], + ) + ], + remainder="passthrough", + verbose_feature_names_out=False, + force_int_remainder_cols=False, + ) + ohe.set_output(transform="pandas") + pipline = Pipeline( + steps=[ + ("date_feature_transform", HolidayTransformer()), + ("ohe", ohe), + ("mean_ship_feature", MeanWeekMonthTransformer()), + ( + "fourier_features", + FourierFeaturesTransformer(), + ), + ] + ) + return pipline + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + if self.pipe is None: + raise NotFittedError() + return self.pipe.transform(X) diff --git a/src/special_preprocessing/transformers/series_comp.py b/src/special_preprocessing/date_transformers/series_comp.py similarity index 96% rename from src/special_preprocessing/transformers/series_comp.py rename to src/special_preprocessing/date_transformers/series_comp.py index c8f80a4..f30aa21 100644 --- a/src/special_preprocessing/transformers/series_comp.py +++ b/src/special_preprocessing/date_transformers/series_comp.py @@ -30,8 +30,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class DateRangeFilledTransformer(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self._is_fitted_: bool = False + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "DateRangeFilledTransformer": - self._is_fitted_: bool = True + self._is_fitted_ = True return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/src/special_preprocessing/transformers/series_decomposition.py b/src/special_preprocessing/date_transformers/series_decomposition.py similarity index 100% rename from src/special_preprocessing/transformers/series_decomposition.py rename to src/special_preprocessing/date_transformers/series_decomposition.py diff --git a/src/special_preprocessing/preprocessing_pipeline.py b/src/special_preprocessing/first_special_pipeline/pipeline.py similarity index 52% rename from src/special_preprocessing/preprocessing_pipeline.py rename to src/special_preprocessing/first_special_pipeline/pipeline.py index 03a85ec..309ddf4 100644 --- a/src/special_preprocessing/preprocessing_pipeline.py +++ b/src/special_preprocessing/first_special_pipeline/pipeline.py @@ -2,46 +2,15 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder -from src.special_preprocessing.transformers.features_extraction import ( - FourierFeaturesTransformer, - HolidayTransformer, - MeanWeekMonthTransformer, -) -from src.special_preprocessing.transformers.preprocessing import ( +from src.special_preprocessing.date_transformers.features_extraction import FeatureExtractionTransformer +from src.special_preprocessing.date_transformers.series_comp import DateRangeFilledTransformer, GroupByDateTransformer +from src.special_preprocessing.date_transformers.series_decomposition import Separation, SeriesDecompositionTransformer +from src.special_preprocessing.first_special_pipeline.preprocessing import ( ChangeTypesTransformer, DropDuplicatesTransformer, KeyIndexTransformer, NaNHandlerTransformer, ) -from src.special_preprocessing.transformers.series_comp import DateRangeFilledTransformer, GroupByDateTransformer -from src.special_preprocessing.transformers.series_decomposition import Separation, SeriesDecompositionTransformer - - -def features() -> Pipeline: - ohe = ColumnTransformer( - transformers=[ - ( - "ohe", - OneHotEncoder(handle_unknown="ignore", sparse_output=False), - ["holiday"], - ) - ], - remainder="passthrough", - verbose_feature_names_out=False, - ) - ohe.set_output(transform="pandas") - pipline = Pipeline( - steps=[ - ("date_feature_transform", HolidayTransformer()), - ("ohe", ohe), - ("mean_ship_feature", MeanWeekMonthTransformer()), - ( - "fourier_features", - FourierFeaturesTransformer(), - ), - ] - ) - return pipline def preprocessing() -> Pipeline: @@ -56,6 +25,7 @@ def preprocessing() -> Pipeline: ], remainder="passthrough", verbose_feature_names_out=False, + force_int_remainder_cols=False, ) ohe.set_output(transform="pandas") pipline = Pipeline( @@ -75,7 +45,7 @@ def preprocessing() -> Pipeline: ("base preprocessing", preprocessing()), ("fill_data_range", DateRangeFilledTransformer()), ("grouping", GroupByDateTransformer()), - ("features extraction", features()), + ("features extraction", FeatureExtractionTransformer()), ("decomposition", SeriesDecompositionTransformer()), ("separation", Separation()), ] diff --git a/src/special_preprocessing/transformers/preprocessing.py b/src/special_preprocessing/first_special_pipeline/preprocessing.py similarity index 100% rename from src/special_preprocessing/transformers/preprocessing.py rename to src/special_preprocessing/first_special_pipeline/preprocessing.py diff --git a/src/special_preprocessing/second_special_pipeline/pipeline.py b/src/special_preprocessing/second_special_pipeline/pipeline.py new file mode 100644 index 0000000..c51d8dc --- /dev/null +++ b/src/special_preprocessing/second_special_pipeline/pipeline.py @@ -0,0 +1,24 @@ +from sklearn.pipeline import Pipeline + +from src.special_preprocessing.date_transformers.features_extraction import FeatureExtractionTransformer +from src.special_preprocessing.date_transformers.series_decomposition import Separation, SeriesDecompositionTransformer +from src.special_preprocessing.second_special_pipeline.preprocessing import ( + CategoricalFeaturesTransform, + DateRangeFilledTransformerSec, + DiscountTransformer, + KeyTransformer, + RenameColumns, +) + +preprocessing_pipeline = Pipeline( + steps=[ + ("rename", RenameColumns()), + ("key", KeyTransformer()), + ("discount", DiscountTransformer()), + ("fill_data_range", DateRangeFilledTransformerSec()), + ("categorical_features_prep", CategoricalFeaturesTransform()), + ("features extraction", FeatureExtractionTransformer()), + ("decomposition", SeriesDecompositionTransformer()), + ("separation", Separation()), + ] +) diff --git a/src/special_preprocessing/second_special_pipeline/preprocessing.py b/src/special_preprocessing/second_special_pipeline/preprocessing.py new file mode 100644 index 0000000..2478b55 --- /dev/null +++ b/src/special_preprocessing/second_special_pipeline/preprocessing.py @@ -0,0 +1,117 @@ +from typing import Optional + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.compose import ColumnTransformer +from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder + + +class RenameColumns(BaseEstimator, TransformerMixin): + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "RenameColumns": + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + X = X.rename( + columns={ + "CUSTOMER_CODE": "channel", + "PROMO_FAMILY_CD": "product", + "MASTER_FAMILY_CD": "group", + "PLAN_INVEST_V": "discount.1", + "PHY_CS_V": "ship", + } + ) + return X + + +class KeyTransformer(BaseEstimator, TransformerMixin): + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "KeyTransformer": + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + X["key"] = X["channel"] + "/" + X["group"] + "/" + X["product"] + return X + + +class DiscountTransformer(BaseEstimator, TransformerMixin): + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "DiscountTransformer": + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + X["discount"] = X["discount.1"].apply(lambda x: True if x > 0 else False) + return X + + +class CategoricalFeaturesTransform(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self._transformer: Optional[ColumnTransformer] = None + + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "CategoricalFeaturesTransform": + ohe_features = ["channel", "group"] + label_features = ["product"] + self._transformer = ColumnTransformer( + transformers=[ + ( + "OHE", + OneHotEncoder(handle_unknown="ignore", sparse_output=False), + ohe_features, + ), + ("OrdinalTransform", OrdinalEncoder(handle_unknown="error"), label_features), + ], + remainder="passthrough", + verbose_feature_names_out=False, + force_int_remainder_cols=False, + ) + self._transformer.set_output(transform="pandas") + self._transformer.fit(X, y) + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + if self._transformer is None: + raise NotFittedError() + return self._transformer.transform(X) + + +class DateRangeFilledTransformerSec(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self._is_fitted_: bool = False + + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "DateRangeFilledTransformerSec": + self._is_fitted_ = True + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + test_separator = X[X["mark"] == "train"]["date"].max() + special_columns = ["discount", "price", "discount.1", "key", "date", "ship", "mark"] + other = [col for col in X.columns if col not in special_columns] + + date_range = pd.date_range(X["date"].min(), X["date"].max(), freq="D") + missing_data = [] + unique_keys = X["key"].unique() + + for key in unique_keys: + product_data = X[X["key"] == key] + existing_dates = list(set(product_data["date"])) + missing_dates = date_range.difference(existing_dates) + + if missing_dates.empty: + continue + new_dt = pd.DataFrame( + { + "date": missing_dates, + "ship": 0, + "discount": 0, + "discount.1": 0, + "key": key, + "mark": np.where(missing_dates <= test_separator, "train", "test"), + } + ) + + for col in other: + new_dt[col] = product_data[col].iloc[0] + missing_data.append(new_dt) + + if missing_data: + X = pd.concat([X] + missing_data, ignore_index=True) + return X From 13249958c125235085dd504b3de16d80dfa9bfcc Mon Sep 17 00:00:00 2001 From: Engelsgeduld Date: Wed, 30 Apr 2025 03:08:27 +0300 Subject: [PATCH 2/2] ci: change python setups in actions, change requirements to actual --- .github/workflows/main.yaml | 4 ++-- .gitignore | 2 +- requirements.dev.txt | 14 ++++++++------ requirements.txt | 11 ++++++----- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 67c073a..7befc5e 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -6,8 +6,8 @@ jobs: main: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: 3.12.4 cache: "pip" diff --git a/.gitignore b/.gitignore index b276f05..7bf07bd 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ docs/Makefile docs/make.bat datasets - +.DS_Store/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/requirements.dev.txt b/requirements.dev.txt index 6ddbe26..8d44385 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,6 +1,8 @@ -mypy~=1.10.0 -black~=24.4.2 -isort~=5.13.2 -pytest~=7.4.4 -pandas-stubs~=2.2.3.250308 -hypothesis~=6.115.6 +mypy>=1.10.0 +black>=24.4.2 +isort>=5.13.2 +pytest>=7.4.4 +pandas-stubs>=2.2.3.250308 +hypothesis>=6.115.6 +seaborn >= 0.13.2 +matplotlib>=3.8.4 diff --git a/requirements.txt b/requirements.txt index c2b07e5..783c8b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -numpy~=1.26.4 -scipy~=1.13.1 -matplotlib~=3.8.4 -scikit-learn~=1.6.1 -pandas~=2.2.3 +numpy>=2.1.3 +scipy>=1.15.2 +scikit-learn>=1.6.1 +pandas>=2.2.3 +xgboost >= 3.0.0 +workalendar >= 17.0.0