From 0c0ba616cc789a8e14a7520b546282a1a9b5e69e Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Fri, 7 Feb 2025 15:46:43 +0100 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Allow=20FillGapsAR=20to=20?= =?UTF-8?q?perform=20prediction=20on=20data=20used=20for=20fitting.=20Main?= =?UTF-8?q?ly=20for=20Prophet.=20Added=20tests=20for=20STL.=20Takes=20fore?= =?UTF-8?q?ver?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 35 +++++++++-- tide/processing.py | 133 +++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 66 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index 5374c3b..dcce6a4 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -669,12 +669,17 @@ def test_pd_fill_gap(self): for gap in holes_pairs: toy_df_gaps.loc[gap[0], gap[1]] = np.nan - filler = FillGapsAR() + filler = FillGapsAR(recursive_fill=False) res = filler.fit_transform(toy_df_gaps.copy()) for gap in holes_pairs[1:]: - # Skip the first one. r2_score doesn't work for only value - assert r2_score(toy_df.loc[gap[0], gap[1]], res.loc[gap[0], gap[1]]) > 0.99 + assert r2_score(toy_df.loc[gap[0], gap[1]], res.loc[gap[0], gap[1]]) > 0.80 + + filler = FillGapsAR(model_name="STL", recursive_fill=True) + res = filler.fit_transform(toy_df_gaps.copy()) + + for gap in holes_pairs[1:]: + assert r2_score(toy_df.loc[gap[0], gap[1]], res.loc[gap[0], gap[1]]) > 0.80 toy_df_15min = toy_df.resample("15min").mean().interpolate() hole_backast = pd.date_range( @@ -689,7 +694,25 @@ def test_pd_fill_gap(self): toy_df_15min_hole.iloc[:12, 0] = np.nan toy_df_15min_hole.iloc[-12:, 0] = np.nan - filler = FillGapsAR(resample_at_td="1h") + filler = FillGapsAR(resample_at_td="1h", recursive_fill=False) + res = filler.fit_transform(toy_df_15min_hole.copy()) + + assert ( + r2_score( + res.loc[hole_backast, "Temp_1"], + toy_df_15min.loc[hole_backast, "Temp_1"], + ) + > 0.80 + ) + assert ( + r2_score( + res.loc[hole_forecast, "Temp_1"], + toy_df_15min.loc[hole_forecast, "Temp_1"], + ) + > 0.80 + ) + + filler = FillGapsAR(model_name="STL", resample_at_td="1h", recursive_fill=True) res = filler.fit_transform(toy_df_15min_hole.copy()) assert ( @@ -697,14 +720,14 @@ def test_pd_fill_gap(self): res.loc[hole_backast, "Temp_1"], toy_df_15min.loc[hole_backast, "Temp_1"], ) - > 0.95 + > 0.80 ) assert ( r2_score( res.loc[hole_forecast, "Temp_1"], toy_df_15min.loc[hole_forecast, "Temp_1"], ) - > 0.95 + > 0.80 ) def test_combiner(self): diff --git a/tide/processing.py b/tide/processing.py index 1a84f57..8501237 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1147,11 +1147,12 @@ class FillGapsAR(BaseFiller, BaseProcessing): def __init__( self, - model_name: str = "STL", + model_name: str = "Prophet", model_kwargs: dict = {}, gaps_lte: str | dt.datetime | pd.Timestamp = None, gaps_gte: str | dt.datetime | pd.Timestamp = None, resample_at_td: str | dt.timedelta | pd.Timedelta = None, + recursive_fill: bool = False, ): BaseFiller.__init__(self, gaps_lte, gaps_gte) BaseProcessing.__init__(self) @@ -1173,79 +1174,91 @@ def __init__( f"Cannot predict data for gaps LTE to {gaps_lte} with data" f"at a {resample_at_td} timestep" ) + self.recursive_fill = recursive_fill - def _fit_and_fill_x(self, X, biggest_group, col, idx, backcast): - check_is_fitted(self, attributes=["model_"]) - bc_model = self.model_(backcast=backcast, **self.model_kwargs) + def _check_forecast_horizon(self, idx): + idx_dt = idx[-1] - idx[0] + if idx_dt == dt.timedelta(0): + idx_dt = idx.freq + if idx_dt < pd.to_timedelta(self.resample_at_td): + raise ValueError( + f"Forecaster is asked to predict at {idx_dt} in the future " + f"or in the past." + f" But data used for fitting have a {self.resample_at_td} frequency" + ) + + def _get_x_and_idx_at_freq(self, x, idx, backcast): if self.resample_at_td is not None: - idx_dt = idx[-1] - idx[0] - if idx_dt == dt.timedelta(0): - idx_dt = idx.freq - if idx_dt < pd.to_timedelta(self.resample_at_td): - raise ValueError( - f"Forecaster is asked to predict at {idx_dt} in the future " - f"or in the past." - f" But data used for fitting have a {self.resample_at_td} frequency" - ) - x_fit = X.loc[biggest_group, col].resample(self.resample_at_td).mean() - idx_origin = idx - if backcast: - idx = pd.date_range( - idx[0], - x_fit.index[0] - pd.Timedelta(self.resample_at_td), - freq=self.resample_at_td, - ) - else: - idx = pd.date_range( - x_fit.index[-1] + pd.Timedelta(self.resample_at_td), - idx[-1], - freq=self.resample_at_td, - ) + self._check_forecast_horizon(idx) + x_out = x.resample(self.resample_at_td).mean() + idx_out = pd.date_range(idx[0], idx[-1], freq=self.resample_at_td).floor( + self.resample_at_td + ) + idx_out.freq = idx_out.inferred_freq else: - x_fit = X.loc[biggest_group, col] - idx_origin = None - + x_out = x + idx_out = idx + + return x_out, idx_out + + def _fill_up_sampling(self, X, idx, col): + beg = idx[0] - idx.freq + end = idx[-1] + idx.freq + # Interpolate linearly between inferred values and using neighbor data + X.loc[idx, col] = X.loc[beg:end, col].interpolate() + # If gap is at boundaries + if beg < X.index[0]: + X.loc[idx, col] = X.loc[idx, col].bfill() + if end > X.index[-1]: + X.loc[idx, col] = X.loc[idx, col].ffill() + + def fill_x(self, X, group, col, idx, backcast): + check_is_fitted(self, attributes=["model_"]) + bc_model = self.model_(backcast=backcast, **self.model_kwargs) + if self.resample_at_td: + self._check_forecast_horizon(idx) + x_fit, idx_pred = self._get_x_and_idx_at_freq(X.loc[group, col], idx, backcast) bc_model.fit(x_fit) - to_predict = idx.to_series() + to_predict = idx_pred.to_series() to_predict.name = col - X.loc[idx, col] = bc_model.predict(to_predict).to_numpy().flatten() + # Here a bit dirty. STL doesn't allow forecast on its fitting set + if self.model_name == "STL": + to_predict = to_predict[~to_predict.isin(x_fit.index)] + + X.loc[to_predict, col] = bc_model.predict(to_predict).to_numpy().flatten() + if self.resample_at_td is not None: - beg = idx_origin[0] - idx_origin.freq - end = idx_origin[-1] + idx_origin.freq - # Interpolate linearly between inferred values and using neighbor data - X.loc[idx_origin, col] = X.loc[beg:end, col].interpolate() - # If gap is at boundaries - if beg < X.index[0]: - X.loc[idx_origin, col] = X.loc[idx_origin, col].bfill() - if end > X.index[-1]: - X.loc[idx_origin, col] = X.loc[idx_origin, col].ffill() + self._fill_up_sampling(X, idx, col) def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): self.model_ = MODEL_MAP[self.model_name] - return self def _transform_implementation(self, X: pd.Series | pd.DataFrame): check_is_fitted(self, attributes=["model_"]) gaps = self.get_gaps_dict_to_fill(X) for col in X: - while gaps[col]: - data_blocks = get_data_blocks(X[col], return_combination=False)[col] - data_timedelta = [block[-1] - block[0] for block in data_blocks] - biggest_group = data_blocks[data_timedelta.index(max(data_timedelta))] - start, end = get_outer_timestamps(biggest_group, X.index) - - indices_to_delete = [] - for i, idx in enumerate(gaps[col]): - if start in idx: - self._fit_and_fill_x(X, biggest_group, col, idx, backcast=True) - indices_to_delete.append(i) - elif end in idx: - self._fit_and_fill_x(X, biggest_group, col, idx, backcast=False) - indices_to_delete.append(i) - - for i in sorted(indices_to_delete, reverse=True): - del gaps[col][i] - + if not self.recursive_fill: + for idx in gaps[col]: + self.fill_x(X, X.index, col, idx, backcast=None) + else: + while gaps[col]: + data_blocks = get_data_blocks(X[col], return_combination=False)[col] + data_timedelta = [block[-1] - block[0] for block in data_blocks] + biggest_group = data_blocks[ + data_timedelta.index(max(data_timedelta)) + ] + start, end = get_outer_timestamps(biggest_group, X.index) + indices_to_delete = [] + for i, idx in enumerate(gaps[col]): + if start in idx: + self.fill_x(X, biggest_group, col, idx, backcast=True) + indices_to_delete.append(i) + elif end in idx: + self.fill_x(X, biggest_group, col, idx, backcast=False) + indices_to_delete.append(i) + + for i in sorted(indices_to_delete, reverse=True): + del gaps[col][i] return X