From aa18d122b49bedff4c3666754892ac64bc1b3c2c Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Wed, 8 Jan 2025 09:05:11 +0100 Subject: [PATCH 1/2] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20trash=20useless=20chec?= =?UTF-8?q?k=5Fand=5Freturn=5Fdt=5Findex=5Fdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tide/base.py b/tide/base.py index 5a20a4b..2fe687d 100644 --- a/tide/base.py +++ b/tide/base.py @@ -157,8 +157,6 @@ def __init__( def _pre_fit(self, X: pd.Series | pd.DataFrame): self.stl_kwargs = {} if self.stl_kwargs is None else self.stl_kwargs - - X = check_and_return_dt_index_df(X) check_array(X) self.stl_kwargs["period"] = timedelta_to_int(self.period, X) From e9ac7f154af610c351b3a0458c1d5f6c731b3356 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Wed, 8 Jan 2025 16:26:06 +0100 Subject: [PATCH 2/2] =?UTF-8?q?=E2=9C=A8=20FillGapsAR=20can=20now=20resamp?= =?UTF-8?q?le=20training=20data=20before=20fitting/predicting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 27 +++++++++++++++++++++++++- tide/processing.py | 41 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index 12a6d0d..99583a9 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -644,12 +644,37 @@ def test_pd_fill_gap(self): toy_df_gaps.loc[gap[0], gap[1]] = np.nan filler = FillGapsAR() - res = filler.fit_transform(toy_df_gaps) + res = filler.fit_transform(toy_df_gaps.copy()) for gap in holes_pairs[1:]: # Skip the first one. 
r2_score doesn't work for only value assert r2_score(toy_df.loc[gap[0], gap[1]], res.loc[gap[0], gap[1]]) > 0.99 + toy_df_15min = toy_df.resample("15min").mean().interpolate() + hole_backast = pd.date_range("2009-06-05", "2009-06-06 01:15:00", freq="15min") + hole_forecast = pd.date_range("2009-08-05", "2009-08-06 01:45:00", freq="15min") + toy_df_15min_hole = toy_df_15min.copy() + toy_df_15min_hole.loc[hole_backast, "Temp_1"] = np.nan + toy_df_15min_hole.loc[hole_forecast, "Temp_1"] = np.nan + + filler = FillGapsAR(resample_at_td="1h") + res = filler.fit_transform(toy_df_15min_hole.copy()) + + assert ( + r2_score( + res.loc[hole_backast, "Temp_1"], + toy_df_15min.loc[hole_backast, "Temp_1"], + ) + > 0.95 + ) + assert ( + r2_score( + res.loc[hole_forecast, "Temp_1"], + toy_df_15min.loc[hole_forecast, "Temp_1"], + ) + > 0.95 + ) + def test_combiner(self): test_df = pd.DataFrame( { diff --git a/tide/processing.py b/tide/processing.py index b4e8f15..50456ab 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1087,6 +1087,12 @@ class FillGapsAR(BaseFiller, BaseProcessing): thresholds. 2- The biggest group of valid data is identified and is used to fit the model. 3- The neighboring gaps are filled using backcasting or forecasting. + 4- OPTIONAL When the data's timestep is too short compared to the periodic behavior + (e.g., 5-min data for a 24h pattern): + - Resample data to a larger timestep + - Perform predictions at the resampled timestep + - Use linear interpolation to restore original data resolution + The process is repeated at step 2 until there are no more gaps to fill @@ -1101,6 +1107,8 @@ class FillGapsAR(BaseFiller, BaseProcessing): The lower threshold for the size of gaps to be considered, by default None. upper_gap_threshold : str or datetime.datetime, optional The upper threshold for the size of gaps to be considered, by default None. 
+ resample_at_td: str or time delta, optional + The time delta to resample fitting data before prediction Attributes ---------- @@ -1118,19 +1126,50 @@ def __init__( model_kwargs: dict = {}, gaps_lte: str | dt.datetime | pd.Timestamp = None, gaps_gte: str | dt.datetime | pd.Timestamp = None, + resample_at_td: str | dt.timedelta | pd.Timedelta = None, ): BaseFiller.__init__(self, gaps_lte, gaps_gte) BaseProcessing.__init__(self) self.model_name = model_name self.model_kwargs = model_kwargs + self.resample_at_td = resample_at_td + gaps_lte = pd.Timedelta(gaps_lte) if isinstance(gaps_lte, str) else gaps_lte + resample_at_td = ( + pd.Timedelta(resample_at_td) + if isinstance(resample_at_td, str) + else resample_at_td + ) + if ( + resample_at_td is not None + and gaps_lte is not None + and gaps_lte < resample_at_td + ): + raise ValueError( + f"Cannot predict data for gaps LTE to {gaps_lte} with data" + f"at a {resample_at_td} timestep" + ) def _fit_and_fill_x(self, X, biggest_group, col, idx, backcast): check_is_fitted(self, attributes=["model_"]) bc_model = self.model_(backcast=backcast, **self.model_kwargs) - bc_model.fit(X.loc[biggest_group, col]) + if self.resample_at_td is not None: + x_fit = X.loc[biggest_group, col].resample(self.resample_at_td).mean() + idx_origin = idx + idx = pd.date_range(idx[0], idx[-1], freq=self.resample_at_td) + if not backcast and x_fit.index[-1] == idx[0]: + x_fit = x_fit[:-1] + elif x_fit.index[0] == idx[-1]: + x_fit = x_fit[1:] + else: + x_fit = X.loc[biggest_group, col] + idx_origin = None + + bc_model.fit(x_fit) to_predict = idx.to_series() to_predict.name = col X.loc[idx, col] = bc_model.predict(to_predict).to_numpy().flatten() + if self.resample_at_td is not None: + X.loc[idx_origin, col] = X.loc[idx_origin, col].interpolate() def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): self.model_ = MODEL_MAP[self.model_name]