diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 67c073a..7befc5e 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -6,8 +6,8 @@ jobs: main: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: 3.12.4 cache: "pip" diff --git a/.gitignore b/.gitignore index b276f05..7bf07bd 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ docs/Makefile docs/make.bat datasets - +.DS_Store # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/configs/models_configs.py b/configs/models_configs.py index 605142a..eca813f 100644 --- a/configs/models_configs.py +++ b/configs/models_configs.py @@ -8,25 +8,20 @@ XGBRegressorConfig = ( xgb.XGBRegressor, { - "n_estimators": [100, 200, 500], "learning_rate": np.logspace(-2, -1, 3), "max_depth": [3, 5, 7], - "subsample": np.linspace(0.7, 1.0, 3), - "colsample_bytree": np.linspace(0.7, 1.0, 3), }, ) -LassoConfig = (Lasso, {"alpha": np.logspace(-4, 1, 6)}) +LassoConfig = (Lasso, {"alpha": np.logspace(-3, 2, 6)}) -RidgeConfig = (Ridge, {"alpha": np.logspace(-1, 3, 5)}) +RidgeConfig = (Ridge, {"alpha": np.logspace(-3, 2, 5)}) GradientBoostingRegressorConfig = ( GradientBoostingRegressor, { - "n_estimators": [100, 200, 500], "learning_rate": np.logspace(-2, -1, 3), "max_depth": [3, 5, 7], - "subsample": np.linspace(0.7, 1.0, 3), }, ) @@ -34,17 +29,14 @@ KNeighborsRegressor, { "n_neighbors": [3, 5, 10, 15], - "weights": ["uniform", "distance"], - "metric": ["euclidean", "manhattan", "minkowski"], }, ) SVRConfig = ( SVR, - {"C": np.logspace(-1, 2, 4), "epsilon": np.linspace(0.01, 0.5, 4), "kernel": ["linear", "rbf", "poly"]}, + {"C": np.logspace(-1, 2, 4), "epsilon": np.linspace(0.01, 0.5, 4)}, ) - ModelsConfigs = { "XGBRegressor": XGBRegressorConfig, "Lasso": LassoConfig, diff --git a/requirements.dev.txt b/requirements.dev.txt index 6ddbe26..8d44385 
100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,6 +1,8 @@ -mypy~=1.10.0 -black~=24.4.2 -isort~=5.13.2 -pytest~=7.4.4 -pandas-stubs~=2.2.3.250308 -hypothesis~=6.115.6 +mypy>=1.10.0 +black>=24.4.2 +isort>=5.13.2 +pytest>=7.4.4 +pandas-stubs>=2.2.3.250308 +hypothesis>=6.115.6 +seaborn >= 0.13.2 +matplotlib>=3.8.4 diff --git a/requirements.txt b/requirements.txt index c2b07e5..783c8b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ -numpy~=1.26.4 -scipy~=1.13.1 -matplotlib~=3.8.4 -scikit-learn~=1.6.1 -pandas~=2.2.3 +numpy>=2.1.3 +scipy>=1.15.2 +scikit-learn>=1.6.1 +pandas>=2.2.3 +xgboost >= 3.0.0 +workalendar >= 17.0.0 diff --git a/src/models/time_series_model.py b/src/models/time_series_model.py index 17f7b55..e1f16cc 100644 --- a/src/models/time_series_model.py +++ b/src/models/time_series_model.py @@ -143,6 +143,7 @@ def predict(self, X: pd.DataFrame) -> pd.DataFrame: forecast_list.extend(zip(X.loc[mask, self.keys_index], X.loc[mask, self.date_index], forecast_values)) forecast_df = pd.DataFrame(forecast_list, columns=[self.keys_index, self.date_index, "Forecast"]) X = X.merge(forecast_df, on=[self.keys_index, self.date_index], how="left") + X["Forecast"] = X["Forecast"].apply(lambda x: x if x >= 0 else 0) return X def score( @@ -265,20 +266,10 @@ def _setup_searchers(self) -> tuple[GridSearchCV, GridSearchCV]: s_models = self._create_pipelines(self.seasonal_models) grid_search_trend = GridSearchCV( - trend_pipe, - t_models, - cv=self.cv, - scoring=self.scoring, - verbose=0, - n_jobs=-1, + trend_pipe, t_models, cv=self.cv, scoring=self.scoring, verbose=0, n_jobs=-1, refit=True ) grid_search_seasonal = GridSearchCV( - seasonal_pipe, - s_models, - cv=self.cv, - scoring=self.scoring, - verbose=0, - n_jobs=-1, + seasonal_pipe, s_models, cv=self.cv, scoring=self.scoring, verbose=0, n_jobs=-1, refit=True ) return grid_search_trend, grid_search_seasonal diff --git a/src/special_preprocessing/transformers/features_extraction.py 
b/src/special_preprocessing/date_transformers/features_extraction.py similarity index 75% rename from src/special_preprocessing/transformers/features_extraction.py rename to src/special_preprocessing/date_transformers/features_extraction.py index 1446343..cf0d371 100644 --- a/src/special_preprocessing/transformers/features_extraction.py +++ b/src/special_preprocessing/date_transformers/features_extraction.py @@ -3,7 +3,10 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.compose import ColumnTransformer from sklearn.exceptions import NotFittedError +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess from workalendar.europe import Russia @@ -113,3 +116,46 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X.drop(columns=columns_to_drop, inplace=True, errors="ignore") return X + + +class FeatureExtractionTransformer(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self.pipe: Optional[Pipeline] = None + + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "FeatureExtractionTransformer": + self.pipe = self.features_pipeline() + self.pipe.fit(X, y) + return self + + @staticmethod + def features_pipeline() -> Pipeline: + ohe = ColumnTransformer( + transformers=[ + ( + "ohe", + OneHotEncoder(handle_unknown="ignore", sparse_output=False), + ["holiday"], + ) + ], + remainder="passthrough", + verbose_feature_names_out=False, + force_int_remainder_cols=False, + ) + ohe.set_output(transform="pandas") + pipline = Pipeline( + steps=[ + ("date_feature_transform", HolidayTransformer()), + ("ohe", ohe), + ("mean_ship_feature", MeanWeekMonthTransformer()), + ( + "fourier_features", + FourierFeaturesTransformer(), + ), + ] + ) + return pipline + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + if self.pipe is None: + raise NotFittedError() + return self.pipe.transform(X) diff 
--git a/src/special_preprocessing/transformers/series_comp.py b/src/special_preprocessing/date_transformers/series_comp.py similarity index 96% rename from src/special_preprocessing/transformers/series_comp.py rename to src/special_preprocessing/date_transformers/series_comp.py index c8f80a4..f30aa21 100644 --- a/src/special_preprocessing/transformers/series_comp.py +++ b/src/special_preprocessing/date_transformers/series_comp.py @@ -30,8 +30,11 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class DateRangeFilledTransformer(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self._is_fitted_: bool = False + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "DateRangeFilledTransformer": - self._is_fitted_: bool = True + self._is_fitted_ = True return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: diff --git a/src/special_preprocessing/transformers/series_decomposition.py b/src/special_preprocessing/date_transformers/series_decomposition.py similarity index 100% rename from src/special_preprocessing/transformers/series_decomposition.py rename to src/special_preprocessing/date_transformers/series_decomposition.py diff --git a/src/special_preprocessing/preprocessing_pipeline.py b/src/special_preprocessing/first_special_pipeline/pipeline.py similarity index 52% rename from src/special_preprocessing/preprocessing_pipeline.py rename to src/special_preprocessing/first_special_pipeline/pipeline.py index 03a85ec..309ddf4 100644 --- a/src/special_preprocessing/preprocessing_pipeline.py +++ b/src/special_preprocessing/first_special_pipeline/pipeline.py @@ -2,46 +2,15 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder -from src.special_preprocessing.transformers.features_extraction import ( - FourierFeaturesTransformer, - HolidayTransformer, - MeanWeekMonthTransformer, -) -from src.special_preprocessing.transformers.preprocessing import ( +from 
src.special_preprocessing.date_transformers.features_extraction import FeatureExtractionTransformer +from src.special_preprocessing.date_transformers.series_comp import DateRangeFilledTransformer, GroupByDateTransformer +from src.special_preprocessing.date_transformers.series_decomposition import Separation, SeriesDecompositionTransformer +from src.special_preprocessing.first_special_pipeline.preprocessing import ( ChangeTypesTransformer, DropDuplicatesTransformer, KeyIndexTransformer, NaNHandlerTransformer, ) -from src.special_preprocessing.transformers.series_comp import DateRangeFilledTransformer, GroupByDateTransformer -from src.special_preprocessing.transformers.series_decomposition import Separation, SeriesDecompositionTransformer - - -def features() -> Pipeline: - ohe = ColumnTransformer( - transformers=[ - ( - "ohe", - OneHotEncoder(handle_unknown="ignore", sparse_output=False), - ["holiday"], - ) - ], - remainder="passthrough", - verbose_feature_names_out=False, - ) - ohe.set_output(transform="pandas") - pipline = Pipeline( - steps=[ - ("date_feature_transform", HolidayTransformer()), - ("ohe", ohe), - ("mean_ship_feature", MeanWeekMonthTransformer()), - ( - "fourier_features", - FourierFeaturesTransformer(), - ), - ] - ) - return pipline def preprocessing() -> Pipeline: @@ -56,6 +25,7 @@ def preprocessing() -> Pipeline: ], remainder="passthrough", verbose_feature_names_out=False, + force_int_remainder_cols=False, ) ohe.set_output(transform="pandas") pipline = Pipeline( @@ -75,7 +45,7 @@ def preprocessing() -> Pipeline: ("base preprocessing", preprocessing()), ("fill_data_range", DateRangeFilledTransformer()), ("grouping", GroupByDateTransformer()), - ("features extraction", features()), + ("features extraction", FeatureExtractionTransformer()), ("decomposition", SeriesDecompositionTransformer()), ("separation", Separation()), ] diff --git a/src/special_preprocessing/transformers/preprocessing.py 
b/src/special_preprocessing/first_special_pipeline/preprocessing.py similarity index 100% rename from src/special_preprocessing/transformers/preprocessing.py rename to src/special_preprocessing/first_special_pipeline/preprocessing.py diff --git a/src/special_preprocessing/second_special_pipeline/pipeline.py b/src/special_preprocessing/second_special_pipeline/pipeline.py new file mode 100644 index 0000000..c51d8dc --- /dev/null +++ b/src/special_preprocessing/second_special_pipeline/pipeline.py @@ -0,0 +1,24 @@ +from sklearn.pipeline import Pipeline + +from src.special_preprocessing.date_transformers.features_extraction import FeatureExtractionTransformer +from src.special_preprocessing.date_transformers.series_decomposition import Separation, SeriesDecompositionTransformer +from src.special_preprocessing.second_special_pipeline.preprocessing import ( + CategoricalFeaturesTransform, + DateRangeFilledTransformerSec, + DiscountTransformer, + KeyTransformer, + RenameColumns, +) + +preprocessing_pipeline = Pipeline( + steps=[ + ("rename", RenameColumns()), + ("key", KeyTransformer()), + ("discount", DiscountTransformer()), + ("fill_data_range", DateRangeFilledTransformerSec()), + ("categorical_features_prep", CategoricalFeaturesTransform()), + ("features extraction", FeatureExtractionTransformer()), + ("decomposition", SeriesDecompositionTransformer()), + ("separation", Separation()), + ] +) diff --git a/src/special_preprocessing/second_special_pipeline/preprocessing.py b/src/special_preprocessing/second_special_pipeline/preprocessing.py new file mode 100644 index 0000000..2478b55 --- /dev/null +++ b/src/special_preprocessing/second_special_pipeline/preprocessing.py @@ -0,0 +1,117 @@ +from typing import Optional + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.compose import ColumnTransformer +from sklearn.exceptions import NotFittedError +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder + 
+ +class RenameColumns(BaseEstimator, TransformerMixin): + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "RenameColumns": + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + X = X.rename( + columns={ + "CUSTOMER_CODE": "channel", + "PROMO_FAMILY_CD": "product", + "MASTER_FAMILY_CD": "group", + "PLAN_INVEST_V": "discount.1", + "PHY_CS_V": "ship", + } + ) + return X + + +class KeyTransformer(BaseEstimator, TransformerMixin): + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "KeyTransformer": + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + X["key"] = X["channel"] + "/" + X["group"] + "/" + X["product"] + return X + + +class DiscountTransformer(BaseEstimator, TransformerMixin): + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "DiscountTransformer": + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + X["discount"] = X["discount.1"].apply(lambda x: True if x > 0 else False) + return X + + +class CategoricalFeaturesTransform(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self._transformer: Optional[ColumnTransformer] = None + + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "CategoricalFeaturesTransform": + ohe_features = ["channel", "group"] + label_features = ["product"] + self._transformer = ColumnTransformer( + transformers=[ + ( + "OHE", + OneHotEncoder(handle_unknown="ignore", sparse_output=False), + ohe_features, + ), + ("OrdinalTransform", OrdinalEncoder(handle_unknown="error"), label_features), + ], + remainder="passthrough", + verbose_feature_names_out=False, + force_int_remainder_cols=False, + ) + self._transformer.set_output(transform="pandas") + self._transformer.fit(X, y) + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + if self._transformer is None: + raise NotFittedError() + return self._transformer.transform(X) + + +class 
DateRangeFilledTransformerSec(BaseEstimator, TransformerMixin): + def __init__(self) -> None: + self._is_fitted_: bool = False + + def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> "DateRangeFilledTransformerSec": + self._is_fitted_ = True + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + test_separator = X[X["mark"] == "train"]["date"].max() + special_columns = ["discount", "price", "discount.1", "key", "date", "ship", "mark"] + other = [col for col in X.columns if col not in special_columns] + + date_range = pd.date_range(X["date"].min(), X["date"].max(), freq="D") + missing_data = [] + unique_keys = X["key"].unique() + + for key in unique_keys: + product_data = X[X["key"] == key] + existing_dates = list(set(product_data["date"])) + missing_dates = date_range.difference(existing_dates) + + if missing_dates.empty: + continue + new_dt = pd.DataFrame( + { + "date": missing_dates, + "ship": 0, + "discount": 0, + "discount.1": 0, + "key": key, + "mark": np.where(missing_dates <= test_separator, "train", "test"), + } + ) + + for col in other: + new_dt[col] = product_data[col].iloc[0] + missing_data.append(new_dt) + + if missing_data: + X = pd.concat([X] + missing_data, ignore_index=True) + return X