Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion tests/test_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,12 +644,37 @@ def test_pd_fill_gap(self):
toy_df_gaps.loc[gap[0], gap[1]] = np.nan

filler = FillGapsAR()
res = filler.fit_transform(toy_df_gaps)
res = filler.fit_transform(toy_df_gaps.copy())

for gap in holes_pairs[1:]:
# Skip the first one. r2_score doesn't work for only value
assert r2_score(toy_df.loc[gap[0], gap[1]], res.loc[gap[0], gap[1]]) > 0.99

toy_df_15min = toy_df.resample("15min").mean().interpolate()
hole_backast = pd.date_range("2009-06-05", "2009-06-06 01:15:00", freq="15min")
hole_forecast = pd.date_range("2009-08-05", "2009-08-06 01:45:00", freq="15min")
toy_df_15min_hole = toy_df_15min.copy()
toy_df_15min_hole.loc[hole_backast, "Temp_1"] = np.nan
toy_df_15min_hole.loc[hole_forecast, "Temp_1"] = np.nan

filler = FillGapsAR(resample_at_td="1h")
res = filler.fit_transform(toy_df_15min_hole.copy())

assert (
r2_score(
res.loc[hole_backast, "Temp_1"],
toy_df_15min.loc[hole_backast, "Temp_1"],
)
> 0.95
)
assert (
r2_score(
res.loc[hole_forecast, "Temp_1"],
toy_df_15min.loc[hole_forecast, "Temp_1"],
)
> 0.95
)

def test_combiner(self):
test_df = pd.DataFrame(
{
Expand Down
2 changes: 0 additions & 2 deletions tide/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,6 @@ def __init__(

def _pre_fit(self, X: pd.Series | pd.DataFrame):
self.stl_kwargs = {} if self.stl_kwargs is None else self.stl_kwargs

X = check_and_return_dt_index_df(X)
check_array(X)

self.stl_kwargs["period"] = timedelta_to_int(self.period, X)
Expand Down
41 changes: 40 additions & 1 deletion tide/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,12 @@ class FillGapsAR(BaseFiller, BaseProcessing):
thresholds.
2- The biggest group of valid data is identified and is used to fit the model.
3- The neighboring gaps are filled using backcasting or forecasting.
4- OPTIONAL When the data's timestep is too short compared to the periodic behavior
(e.g., 5-min data for a 24h pattern):
- Resample data to a larger timestep
- Perform predictions at the resampled timestep
- Use linear interpolation to restore original data resolution


The process is repeated at step 2 until there are no more gaps to fill

Expand All @@ -1101,6 +1107,8 @@ class FillGapsAR(BaseFiller, BaseProcessing):
The lower threshold for the size of gaps to be considered, by default None.
upper_gap_threshold : str or datetime.datetime, optional
The upper threshold for the size of gaps to be considered, by default None.
resample_at_td : str or timedelta, optional
The time delta at which the fitting data is resampled before prediction, by default None.

Attributes
----------
Expand All @@ -1118,19 +1126,50 @@ def __init__(
model_kwargs: dict = {},
gaps_lte: str | dt.datetime | pd.Timestamp = None,
gaps_gte: str | dt.datetime | pd.Timestamp = None,
resample_at_td: str | dt.timedelta | pd.Timedelta = None,
):
BaseFiller.__init__(self, gaps_lte, gaps_gte)
BaseProcessing.__init__(self)
self.model_name = model_name
self.model_kwargs = model_kwargs
self.resample_at_td = resample_at_td
gaps_lte = pd.Timedelta(gaps_lte) if isinstance(gaps_lte, str) else gaps_lte
resample_at_td = (
pd.Timedelta(resample_at_td)
if isinstance(resample_at_td, str)
else resample_at_td
)
if (
resample_at_td is not None
and gaps_lte is not None
and gaps_lte < resample_at_td
):
raise ValueError(
f"Cannot predict data for gaps LTE to {gaps_lte} with data"
f"at a {resample_at_td} timestep"
)

def _fit_and_fill_x(self, X, biggest_group, col, idx, backcast):
    """Fit a model on one group of valid data and fill the gap ``idx`` in place.

    When ``self.resample_at_td`` is set, the fitting data is resampled (mean)
    to that time delta, predictions are made on a date range of the same
    frequency spanning the gap, and the original gap timestamps are then
    restored with ``interpolate()``.

    Parameters
    ----------
    X : pd.DataFrame
        Data to fill. Modified in place. Presumably indexed by a
        DatetimeIndex (``resample`` is called on its column) -- TODO confirm.
    biggest_group : index-like
        Row labels of ``X`` used as fitting data.
    col : hashable
        Label of the column to fit on and to fill.
    idx : pd.DatetimeIndex
        Timestamps of the gap to predict.
    backcast : bool
        Forwarded to the model constructor as ``backcast``.

    Returns
    -------
    None
    """
    check_is_fitted(self, attributes=["model_"])
    bc_model = self.model_(backcast=backcast, **self.model_kwargs)

    if self.resample_at_td is not None:
        x_fit = X.loc[biggest_group, col].resample(self.resample_at_td).mean()
        # Keep the original gap timestamps: they are interpolated at the end.
        idx_origin = idx
        idx = pd.date_range(idx[0], idx[-1], freq=self.resample_at_td)
        # Drop the resampled bin whose label coincides with a boundary of the
        # prediction range, so the same timestamp is not both fitted and
        # predicted.
        if not backcast and x_fit.index[-1] == idx[0]:
            x_fit = x_fit.iloc[:-1]
        elif x_fit.index[0] == idx[-1]:
            x_fit = x_fit.iloc[1:]
    else:
        x_fit = X.loc[biggest_group, col]
        idx_origin = None

    # Fit exactly once, on the (possibly resampled) series.
    bc_model.fit(x_fit)
    to_predict = idx.to_series()
    to_predict.name = col
    X.loc[idx, col] = bc_model.predict(to_predict).to_numpy().flatten()
    if self.resample_at_td is not None:
        # Fill the timestamps lying between the resampled predictions.
        X.loc[idx_origin, col] = X.loc[idx_origin, col].interpolate()

def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None):
self.model_ = MODEL_MAP[self.model_name]
Expand Down
Loading