From fd53e4d4fa3f9f57a5a4b9221e92bd72521d44cf Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Thu, 9 Jan 2025 10:36:23 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20FillGapsAR=20fix=20interpolation?= =?UTF-8?q?=20and=20boundaries=20effect=20when=20resampling=20data?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 2 ++ tide/processing.py | 27 +++++++++++++++++++++------ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index 99583a9..29c4453 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -656,6 +656,8 @@ def test_pd_fill_gap(self): toy_df_15min_hole = toy_df_15min.copy() toy_df_15min_hole.loc[hole_backast, "Temp_1"] = np.nan toy_df_15min_hole.loc[hole_forecast, "Temp_1"] = np.nan + toy_df_15min_hole.iloc[:12, 0] = np.nan + toy_df_15min_hole.iloc[-12:, 0] = np.nan filler = FillGapsAR(resample_at_td="1h") res = filler.fit_transform(toy_df_15min_hole.copy()) diff --git a/tide/processing.py b/tide/processing.py index 50456ab..69c5b04 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1155,11 +1155,18 @@ def _fit_and_fill_x(self, X, biggest_group, col, idx, backcast): if self.resample_at_td is not None: x_fit = X.loc[biggest_group, col].resample(self.resample_at_td).mean() idx_origin = idx - idx = pd.date_range(idx[0], idx[-1], freq=self.resample_at_td) - if not backcast and x_fit.index[-1] == idx[0]: - x_fit = x_fit[:-1] - elif x_fit.index[0] == idx[-1]: - x_fit = x_fit[1:] + if backcast: + idx = pd.date_range( + idx[0], + x_fit.index[0] - pd.Timedelta(self.resample_at_td), + freq=self.resample_at_td, + ) + else: + idx = pd.date_range( + x_fit.index[-1] + pd.Timedelta(self.resample_at_td), + idx[-1], + freq=self.resample_at_td, + ) else: x_fit = X.loc[biggest_group, col] idx_origin = None @@ -1169,7 +1176,15 @@ def _fit_and_fill_x(self, X, biggest_group, col, idx, backcast): to_predict.name = col X.loc[idx, col] = bc_model.predict(to_predict).to_numpy().flatten() if self.resample_at_td is not None: - X.loc[idx_origin, col] = X.loc[idx_origin, col].interpolate() + beg = idx_origin[0] - idx_origin.freq + end = idx_origin[-1] + idx_origin.freq + # Interpolate linearly between inferred values and using neighbor data + X.loc[idx_origin, col] = X.loc[beg:end, col].interpolate() + # If gap is at boundaries + if beg < X.index[0]: + X.loc[idx_origin, col] = X.loc[idx_origin, col].bfill() + if end > X.index[-1]: + X.loc[idx_origin, col] = X.loc[idx_origin, col].ffill() def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None): self.model_ = MODEL_MAP[self.model_name]