From d2d6d6a6bd7d90f5dd4df4c2ff96b08c7656d5f2 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 11:10:54 +0100 Subject: [PATCH 01/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20get=5Fgaps=5Fmask=5F?= =?UTF-8?q?from=5Fblocks=20moved=20to=20utils=20and=20get=5Fdata=5Fblocks?= =?UTF-8?q?=20include=20inner=5Fselection=20handling=20if=20gte=20<=20lte?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/base.py | 35 ++++++------------- tide/utils.py | 97 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 88 insertions(+), 44 deletions(-) diff --git a/tide/base.py b/tide/base.py index 3e66543..877c442 100644 --- a/tide/base.py +++ b/tide/base.py @@ -20,6 +20,7 @@ get_idx_freq_delta_or_min_time_interval, get_tags_max_level, NAME_LEVEL_MAP, + get_gaps_mask_from_blocks, ) from tide.meteo import get_oikolab_df @@ -205,39 +206,25 @@ def __init__( self.gaps_gte = gaps_gte def get_gaps_dict_to_fill(self, X: pd.Series | pd.DataFrame): - X = check_and_return_dt_index_df(X) - lower_th, upper_th = self.gaps_lte, self.gaps_gte - select_inner = False - - if lower_th is not None and upper_th is not None: - if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): - lower_th, upper_th = upper_th, lower_th - select_inner = True - return get_data_blocks( X, is_null=True, - select_inner=select_inner, - lower_td_threshold=lower_th, - upper_td_threshold=upper_th, + lower_td_threshold=self.gaps_lte, + upper_td_threshold=self.gaps_gte, upper_threshold_inclusive=True, lower_threshold_inclusive=True, return_combination=False, ) def get_gaps_mask(self, X: pd.Series | pd.DataFrame): - gaps_dict = self.get_gaps_dict_to_fill(X) - mask_data = {} - - for col, idx_list in gaps_dict.items(): - if idx_list: - combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index - mask_data[col] = X.index.isin(combined_idx) - else: - mask_data[col] = np.zeros(X.shape[0], dtype=bool) - - df_mask = pd.DataFrame(mask_data, index=X.index) - return df_mask + return get_gaps_mask_from_blocks( + X, + is_null=True, + lower_td_threshold=self.gaps_lte, + upper_td_threshold=self.gaps_gte, + lower_threshold_inclusive=True, + upper_threshold_inclusive=True, + ) class BaseOikoMeteo: diff --git a/tide/utils.py b/tide/utils.py index b250d35..c42c122 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -318,12 +318,11 @@ def get_data_blocks( data: pd.Series | pd.DataFrame, is_null: bool = False, cols: str | list[str] = None, - select_inner: bool = True, lower_td_threshold: str | dt.timedelta = None, upper_td_threshold: str | dt.timedelta = None, lower_threshold_inclusive: bool = True, upper_threshold_inclusive: bool = True, - return_combination=True, + return_combination: bool = True, ): """ Identifies groups of valid data if is_null = False, or groups of nan if @@ -350,10 +349,9 @@ def get_data_blocks( Whether to return groups with valid data, or groups of Nan values (is_null = True) cols : str or list[str], optional - The columns in the DataFrame for which to detect gaps. If None (default), all - columns are considered. - select_inner : Bool, default True - Select the groups of data inside or outside the given boundaries + Columns to analyze. If None, uses all columns. + select_inner : bool, default True + If True, select groups within thresholds. If False, select groups outside thresholds. lower_td_threshold : str or timedelta, optional The minimum duration of a period for it to be considered valid. Can be passed as a string (e.g., '1d' for one day) or a `timedelta`. @@ -381,33 +379,38 @@ def get_data_blocks( timestamps where the values in the corresponding column were NaN and exceeded the gap threshold. """ - data = check_and_return_dt_index_df(data) - - if isinstance(cols, str): - cols = [cols] - elif cols is None: - cols = list(data.columns) - - idx_dict = {} - for col in cols: - idx_dict[col] = get_series_bloc( + columns = ensure_list(columns) or list(data.columns) + + # Handle threshold order and adjust select_inner if needed + lower_th, upper_th = lower_td_threshold, upper_td_threshold + select_inner = False + if lower_th is not None and upper_th is not None: + if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): + lower_th, upper_th = upper_th, lower_th + select_inner = True + + # Process each column + idx_dict = { + col: get_series_bloc( data[col], is_null, select_inner, - lower_td_threshold, - upper_td_threshold, + lower_th, + upper_th, lower_threshold_inclusive, upper_threshold_inclusive, ) + for col in cols + } if return_combination: idx_dict["combination"] = get_series_bloc( ~data.isnull().any(axis=1), is_null, select_inner, - lower_td_threshold, - upper_td_threshold, + lower_th, + upper_th, lower_threshold_inclusive, upper_threshold_inclusive, ) @@ -487,3 +490,57 @@ def ensure_list(item): if item is None: return [] return item if isinstance(item, list) else [item] + + +def get_gaps_mask_from_blocks( + data: pd.Series | pd.DataFrame, + is_null: bool = False, + lower_td_threshold: str | dt.timedelta = None, + upper_td_threshold: str | dt.timedelta = None, + lower_threshold_inclusive: bool = True, + upper_threshold_inclusive: bool = True, +) -> pd.DataFrame: + """ + Creates a boolean mask DataFrame indicating the location of data blocks or gaps. + + Parameters + ---------- + data : pd.Series or pd.DataFrame + The input time series data with a DateTime index + is_null : bool, default False + Whether to find NaN blocks (True) or valid data blocks (False) + lower_td_threshold : str or timedelta, optional + The minimum duration threshold + upper_td_threshold : str or timedelta, optional + The maximum duration threshold + lower_threshold_inclusive : bool, default True + Include the blocks of exactly lower_td_threshold duration + upper_threshold_inclusive : bool, default True + Include the blocks of exactly upper_td_threshold duration + + Returns + ------- + pd.DataFrame + Boolean mask DataFrame with same index as input data and columns + corresponding to the input data columns. True values indicate + the presence of a block matching the criteria. + """ + gaps_dict = get_data_blocks( + data, + is_null=is_null, + lower_td_threshold=lower_td_threshold, + upper_td_threshold=upper_td_threshold, + lower_threshold_inclusive=lower_threshold_inclusive, + upper_threshold_inclusive=upper_threshold_inclusive, + return_combination=False, + ) + + mask_data = {} + for col, idx_list in gaps_dict.items(): + if idx_list: + combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index + mask_data[col] = data.index.isin(combined_idx) + else: + mask_data[col] = np.zeros(data.shape[0], dtype=bool) + + return pd.DataFrame(mask_data, index=data.index) From 3392ab648bb44fb37b80cc72bcc72214bc8a185a Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 11:12:23 +0100 Subject: [PATCH 02/12] =?UTF-8?q?=F0=9F=9A=A7=20stats=20dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_plumbing.py | 3 +-- tide/plumbing.py | 51 ++++++++++++++++++++++++++++++++++++++++++ tide/utils.py | 15 +++++++++---- 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py index 8e1167e..ac54e9b 100644 --- a/tests/test_plumbing.py +++ b/tests/test_plumbing.py @@ -184,7 +184,6 @@ def test_plumber(self): plumber.pipe_dict = pipe plumber.get_pipeline() plumber.get_pipeline(steps=["fill_3", "combine"]) - plumber.plot() - + plumber.get_gaps_description() assert True diff --git a/tide/plumbing.py b/tide/plumbing.py index 115c5ec..2beb39b 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -14,6 +14,7 @@ get_data_level_values, get_tree_depth_from_level, NamedList, + get_data_blocks ) from tide.plot import ( plot_gaps_heatmap, @@ -150,6 +151,56 @@ def show( depth_level = get_tree_depth_from_level(loc_tree.max_depth, depth_level) loc_tree.show(max_depth=depth_level) + def get_gaps_description( + self, + select: str | pd.Index | list[str] = None, + steps: None | str | list[str] | slice = slice(None), + verbose:bool = False, + gaps_lte: str | pd.Timedelta | dt.timedelta = None, + gaps_gte: str | pd.Timedelta | dt.timedelta = None, + return_combination:bool = True + ): + data = self.get_corrected_data(select, steps=steps, verbose=verbose) + + lower_th, upper_th = gaps_lte, gaps_gte + select_inner = False + if lower_th is not None and upper_th is not None: + if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): + lower_th, upper_th = upper_th, lower_th + select_inner = True + + nan_blocks = get_data_blocks( + data=data, + is_null=True, + select_inner=select_inner, + lower_td_threshold=lower_th, + upper_td_threshold=upper_th, + return_combination=return_combination, + ) + + + ser_list = [] + for col, gaps_list in nan_blocks.items(): + gaps_ser = [] + if gaps_list: + for gap in gaps_list: + if gap.shape[0] > 1: + gaps_ser.append(gap[-1] - gap[0]) + elif gap.shape[0] == 1: + gaps_ser.append(pd.to_timedelta(gap.freq)) + + ser_list.append(pd.Series(gaps_ser, name=col).describe()) + + try: + res = pd.concat(ser_list, axis=1) + except ValueError: + res = pd.DataFrame() + + pass + + + + def set_data(self, data: pd.Series | pd.DataFrame): self.data = check_and_return_dt_index_df(data) self.root = data_columns_to_tree(data.columns) diff --git a/tide/utils.py b/tide/utils.py index b250d35..1e99420 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -389,14 +389,21 @@ def get_data_blocks( elif cols is None: cols = list(data.columns) + lower_th, upper_th = lower_td_threshold, upper_td_threshold + select_inner = False + if lower_th is not None and upper_th is not None: + if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): + lower_th, upper_th = upper_th, lower_th + select_inner = True + idx_dict = {} for col in cols: idx_dict[col] = get_series_bloc( data[col], is_null, select_inner, - lower_td_threshold, - upper_td_threshold, + lower_th, + upper_th, lower_threshold_inclusive, upper_threshold_inclusive, ) @@ -406,8 +413,8 @@ def get_data_blocks( ~data.isnull().any(axis=1), is_null, select_inner, - lower_td_threshold, - upper_td_threshold, + lower_th, + upper_th, lower_threshold_inclusive, upper_threshold_inclusive, ) From ae39b718b6a6f839b6c233ee15c6e67535a7ab43 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 11:18:34 +0100 Subject: [PATCH 03/12] =?UTF-8?q?=F0=9F=94=80=20merge=20and=20adapt=20plum?= =?UTF-8?q?ber=20stats?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/base.py | 1 - tide/plumbing.py | 31 +++++++++++++------------------ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/tide/base.py b/tide/base.py index 877c442..12ced32 100644 --- a/tide/base.py +++ b/tide/base.py @@ -4,7 +4,6 @@ import typing from abc import ABC, abstractmethod -import numpy as np import pandas as pd from sklearn.base import TransformerMixin, BaseEstimator diff --git a/tide/plumbing.py b/tide/plumbing.py index 2beb39b..6929668 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -14,7 +14,7 @@ get_data_level_values, get_tree_depth_from_level, NamedList, - get_data_blocks + get_data_blocks, ) from tide.plot import ( plot_gaps_heatmap, @@ -152,13 +152,13 @@ def show( loc_tree.show(max_depth=depth_level) def get_gaps_description( - self, - select: str | pd.Index | list[str] = None, - steps: None | str | list[str] | slice = slice(None), - verbose:bool = False, - gaps_lte: str | pd.Timedelta | dt.timedelta = None, - gaps_gte: str | pd.Timedelta | dt.timedelta = None, - return_combination:bool = True + self, + select: str | pd.Index | list[str] = None, + steps: None | str | list[str] | slice = slice(None), + verbose: bool = False, + gaps_lte: str | pd.Timedelta | dt.timedelta = None, + gaps_gte: str | pd.Timedelta | dt.timedelta = None, + return_combination: bool = True, ): data = self.get_corrected_data(select, steps=steps, verbose=verbose) @@ -170,15 +170,13 @@ def get_gaps_description( select_inner = True nan_blocks = get_data_blocks( - data=data, - is_null=True, - select_inner=select_inner, - lower_td_threshold=lower_th, - upper_td_threshold=upper_th, - return_combination=return_combination, + data=data, + is_null=True, + lower_td_threshold=lower_th, + upper_td_threshold=upper_th, + return_combination=return_combination, ) - ser_list = [] for col, gaps_list in nan_blocks.items(): gaps_ser = [] @@ -198,9 +196,6 @@ def get_gaps_description( pass - - - def set_data(self, data: pd.Series | pd.DataFrame): self.data = check_and_return_dt_index_df(data) self.root = data_columns_to_tree(data.columns) From 2d4f8ba42bff178652caa7b24f5ba4cd53b3cf9f Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 16:24:34 +0100 Subject: [PATCH 04/12] =?UTF-8?q?=E2=9C=A8=20get=5Fblocks=5Flte=5Fand=5Fgt?= =?UTF-8?q?e,=20get=5Fblocks=5Fmask=5Flte=5Fand=5Fgte?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_utils.py | 112 +++++++++++++++++++++++++++--- tide/base.py | 21 +++--- tide/utils.py | 165 ++++++++++++++++++++++++++------------------ 3 files changed, 207 insertions(+), 91 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index d7a6b82..9461b9c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,7 +13,9 @@ parse_request_to_col_names, timedelta_to_int, NamedList, - get_series_bloc, + _get_series_bloc, + get_blocks_lte_and_gte, + get_blocks_mask_lte_and_gte, edit_tag_value_by_level, ) @@ -125,7 +127,7 @@ def test_get_series_bloc(self): toy_holes.loc["2009-01-01 05:00:00":"2009-01-01 08:00:00"] = np.nan toy_holes.loc["2009-01-01 12:00:00":"2009-01-01 16:00:00"] = np.nan - get_series_bloc( + _get_series_bloc( toy_holes, is_null=True, upper_td_threshold="3h", @@ -133,15 +135,15 @@ def test_get_series_bloc(self): ) # All data groups - assert len(get_series_bloc(toy_holes)) == 4 + assert len(_get_series_bloc(toy_holes)) == 4 # All gaps groups - assert len(get_series_bloc(toy_holes, is_null=True)) == 3 + assert len(_get_series_bloc(toy_holes, is_null=True)) == 3 # Gaps Inner bounds, one inclusive assert ( len( - get_series_bloc( + _get_series_bloc( toy_holes, is_null=True, select_inner=True, @@ -157,7 +159,7 @@ def test_get_series_bloc(self): # Gaps outer selection, one inclusive assert ( len( - get_series_bloc( + _get_series_bloc( toy_holes, is_null=True, select_inner=False, @@ -175,7 +177,7 @@ def test_get_series_bloc(self): [np.nan, 1, 2, np.nan, 3, 4, np.nan], index=pd.date_range("2009", freq="h", periods=7, tz="UTC"), ) - res = get_series_bloc(ser, is_null=True) + res = _get_series_bloc(ser, is_null=True) assert len(res) == 3 # No gaps case @@ -183,7 +185,7 @@ def test_get_series_bloc(self): [0.0, 1.0, 2.0, 2.5, 3, 4, 5.0], index=pd.date_range("2009", freq="h", periods=7, tz="UTC"), ) - res = get_series_bloc(ser, is_null=True) + res = _get_series_bloc(ser, is_null=True) assert res == [] @@ -192,7 +194,7 @@ def test_get_series_bloc(self): [0.0, 1.0, 2.0, np.nan, 3, 4, 5.0], index=pd.date_range("2009", freq="h", periods=7, tz="UTC"), ) - res = get_series_bloc(ser, is_null=True) + res = _get_series_bloc(ser, is_null=True) assert len(res) == 1 @@ -259,6 +261,98 @@ def test_get_data_blocks(self): ) assert res["data_1"] == [] + def test_get_blocks_lte_and_gte(self): + toy_df = pd.DataFrame( + {"data_1": np.random.randn(24), "data_2": np.random.randn(24)}, + index=pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC"), + ) + + toy_df.loc["2009-01-01 01:00:00", "data_1"] = np.nan + toy_df.loc["2009-01-01 10:00:00":"2009-01-01 12:00:00", "data_1"] = np.nan + toy_df.loc["2009-01-01 15:00:00":"2009-01-01 23:00:00", "data_2"] = np.nan + + res = get_blocks_lte_and_gte(toy_df, "1h30min", "8h", True) + assert len(res["data_1"]) == 1 and len(res["data_2"]) == 1 + + res = get_blocks_lte_and_gte(toy_df, lte="8h", gte="1h30min", is_null=True) + assert len(res["data_1"]) == 1 and len(res["data_2"]) == 0 + + def test_get_blocks_mask_lte_and_gte(self): + toy_df = pd.DataFrame( + {"data_1": np.random.randn(24), "data_2": np.random.randn(24)}, + index=pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC"), + ) + + toy_df.loc["2009-01-01 01:00:00", "data_1"] = np.nan + toy_df.loc["2009-01-01 10:00:00":"2009-01-01 12:00:00", "data_1"] = np.nan + toy_df.loc["2009-01-01 15:00:00":"2009-01-01 23:00:00", "data_2"] = np.nan + + res = get_blocks_mask_lte_and_gte(toy_df, "1h30min", "8h", True) + np.testing.assert_array_equal( + res.values, + np.array( + [ + [False, False], + [True, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, True], + [False, True], + [False, True], + [False, True], + [False, True], + [False, True], + [False, True], + [False, True], + [False, True], + ] + ), + ) + + res = get_blocks_mask_lte_and_gte(toy_df, lte="8h", gte="1h30min", is_null=True) + np.testing.assert_array_equal( + res.values, + np.array( + [ + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [True, False], + [True, False], + [True, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + [False, False], + ] + ), + ) + def test_outer_timestamps(self): ref_index = pd.date_range("2009-01-01", freq="d", periods=5, tz="UTC") idx = pd.date_range("2009-01-02", freq="d", periods=2, tz="UTC") diff --git a/tide/base.py b/tide/base.py index 12ced32..52693d7 100644 --- a/tide/base.py +++ b/tide/base.py @@ -15,11 +15,11 @@ timedelta_to_int, validate_odd_param, process_stl_odd_args, - get_data_blocks, + get_blocks_lte_and_gte, get_idx_freq_delta_or_min_time_interval, get_tags_max_level, NAME_LEVEL_MAP, - get_gaps_mask_from_blocks, + get_blocks_mask_lte_and_gte, ) from tide.meteo import get_oikolab_df @@ -205,24 +205,19 @@ def __init__( self.gaps_gte = gaps_gte def get_gaps_dict_to_fill(self, X: pd.Series | pd.DataFrame): - return get_data_blocks( + return get_blocks_lte_and_gte( X, is_null=True, - lower_td_threshold=self.gaps_lte, - upper_td_threshold=self.gaps_gte, - upper_threshold_inclusive=True, - lower_threshold_inclusive=True, - return_combination=False, + lte=self.gaps_lte, + gte=self.gaps_gte, ) def get_gaps_mask(self, X: pd.Series | pd.DataFrame): - return get_gaps_mask_from_blocks( + return get_blocks_mask_lte_and_gte( X, is_null=True, - lower_td_threshold=self.gaps_lte, - upper_td_threshold=self.gaps_gte, - lower_threshold_inclusive=True, - upper_threshold_inclusive=True, + lte=self.gaps_lte, + gte=self.gaps_gte, ) diff --git a/tide/utils.py b/tide/utils.py index 1cb4f6d..cbbfdb4 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -245,7 +245,7 @@ def _upper_bound(series, bound, bound_inclusive: bool, inner: bool): return op(series, bound) -def get_series_bloc( +def _get_series_bloc( date_series: pd.Series, is_null: bool = False, select_inner: bool = True, @@ -314,12 +314,101 @@ def get_series_bloc( ] +def get_blocks_lte_and_gte( + data: pd.Series | pd.DataFrame, + lte: str | dt.timedelta = None, + gte: str | dt.timedelta = None, + is_null: bool = False, +): + """ + Get blocks of data ore gaps (nan) based on duration thresholds. + + Returns them in a dictionary as list of DateTimeIndex. The keys values are + data columns (or name if data is a Series). + + + Parameters: + ----------- + data : pd.Series or pd.DataFrame + The input data to be processed. + lte : str or datetime.timedelta, optional + The upper time threshold. Can be a string (e.g., '1h') or a timedelta object. + gte : str or datetime.timedelta, optional + The lower time threshold. Can be a string (e.g., '30min') or a timedelta object. + is_null : bool, default False + Whether to select blocks where the data is null. + + Notes: + ------ + - If both `lte` and `gte` are provided, and `lte` is smaller than `gte`, they + will be swapped. The function determines whether to select data within or outside + the boundaries based on the order of thresholds. + """ + + lower_th, upper_th = lte, gte + select_inner = False + if lower_th is not None and upper_th is not None: + if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): + lower_th, upper_th = upper_th, lower_th + select_inner = True + + return get_data_blocks( + data=data, + is_null=is_null, + lower_td_threshold=lower_th, + upper_td_threshold=upper_th, + select_inner=select_inner, + return_combination=False, + ) + + +def get_blocks_mask_lte_and_gte( + data: pd.Series | pd.DataFrame, + lte: str | dt.timedelta = None, + gte: str | dt.timedelta = None, + is_null: bool = False, +) -> pd.DataFrame: + """ + Creates a boolean mask DataFrame indicating the location of data blocks or gaps. + + Parameters + ---------- + data : pd.Series or pd.DataFrame + The input time series data with a DateTime index + lte : str or timedelta, optional + The minimum duration threshold + gte : str or timedelta, optional + The maximum duration threshold + is_null : bool, default False + Whether to find NaN blocks (True) or valid data blocks (False) + + Returns + ------- + pd.DataFrame + Boolean mask DataFrame with same index as input data and columns + corresponding to the input data columns. True values indicate + the presence of a block matching the criteria. + """ + gaps_dict = get_blocks_lte_and_gte(data, lte, gte, is_null) + + mask_data = {} + for col, idx_list in gaps_dict.items(): + if idx_list: + combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index + mask_data[col] = data.index.isin(combined_idx) + else: + mask_data[col] = np.zeros(data.shape[0], dtype=bool) + + return pd.DataFrame(mask_data, index=data.index) + + def get_data_blocks( data: pd.Series | pd.DataFrame, is_null: bool = False, cols: str | list[str] = None, lower_td_threshold: str | dt.timedelta = None, upper_td_threshold: str | dt.timedelta = None, + select_inner: bool = True, lower_threshold_inclusive: bool = True, upper_threshold_inclusive: bool = True, return_combination: bool = True, @@ -382,22 +471,14 @@ def get_data_blocks( data = check_and_return_dt_index_df(data) cols = ensure_list(cols) or list(data.columns) - # Handle threshold order and adjust select_inner if needed - lower_th, upper_th = lower_td_threshold, upper_td_threshold - select_inner = False - if lower_th is not None and upper_th is not None: - if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): - lower_th, upper_th = upper_th, lower_th - select_inner = True - # Process each column idx_dict = { - col: get_series_bloc( + col: _get_series_bloc( data[col], is_null, select_inner, - lower_th, - upper_th, + lower_td_threshold, + upper_td_threshold, lower_threshold_inclusive, upper_threshold_inclusive, ) @@ -405,12 +486,12 @@ def get_data_blocks( } if return_combination: - idx_dict["combination"] = get_series_bloc( + idx_dict["combination"] = _get_series_bloc( ~data.isnull().any(axis=1), is_null, select_inner, - lower_th, - upper_th, + lower_td_threshold, + upper_td_threshold, lower_threshold_inclusive, upper_threshold_inclusive, ) @@ -490,57 +571,3 @@ def ensure_list(item): if item is None: return [] return item if isinstance(item, list) else [item] - - -def get_gaps_mask_from_blocks( - data: pd.Series | pd.DataFrame, - is_null: bool = False, - lower_td_threshold: str | dt.timedelta = None, - upper_td_threshold: str | dt.timedelta = None, - lower_threshold_inclusive: bool = True, - upper_threshold_inclusive: bool = True, -) -> pd.DataFrame: - """ - Creates a boolean mask DataFrame indicating the location of data blocks or gaps. - - Parameters - ---------- - data : pd.Series or pd.DataFrame - The input time series data with a DateTime index - is_null : bool, default False - Whether to find NaN blocks (True) or valid data blocks (False) - lower_td_threshold : str or timedelta, optional - The minimum duration threshold - upper_td_threshold : str or timedelta, optional - The maximum duration threshold - lower_threshold_inclusive : bool, default True - Include the blocks of exactly lower_td_threshold duration - upper_threshold_inclusive : bool, default True - Include the blocks of exactly upper_td_threshold duration - - Returns - ------- - pd.DataFrame - Boolean mask DataFrame with same index as input data and columns - corresponding to the input data columns. True values indicate - the presence of a block matching the criteria. - """ - gaps_dict = get_data_blocks( - data, - is_null=is_null, - lower_td_threshold=lower_td_threshold, - upper_td_threshold=upper_td_threshold, - lower_threshold_inclusive=lower_threshold_inclusive, - upper_threshold_inclusive=upper_threshold_inclusive, - return_combination=False, - ) - - mask_data = {} - for col, idx_list in gaps_dict.items(): - if idx_list: - combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index - mask_data[col] = data.index.isin(combined_idx) - else: - mask_data[col] = np.zeros(data.shape[0], dtype=bool) - - return pd.DataFrame(mask_data, index=data.index) From b5926c051107e867d9db48fd947358bf745f895b Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 17:05:00 +0100 Subject: [PATCH 05/12] =?UTF-8?q?=F0=9F=9A=A7=20untest=20get=5Fgaps=5Fdesc?= =?UTF-8?q?ription?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/plumbing.py | 106 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/tide/plumbing.py b/tide/plumbing.py index 6929668..d899801 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -14,7 +14,8 @@ get_data_level_values, get_tree_depth_from_level, NamedList, - get_data_blocks, + get_blocks_lte_and_gte, + get_blocks_mask_lte_and_gte, ) from tide.plot import ( plot_gaps_heatmap, @@ -159,42 +160,85 @@ def get_gaps_description( gaps_lte: str | pd.Timedelta | dt.timedelta = None, gaps_gte: str | pd.Timedelta | dt.timedelta = None, return_combination: bool = True, - ): + ) -> pd.DataFrame: + """ + Get statistical description of gaps durations in the data. + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Data selection using tide's tag system + steps : None or str or list[str] or slice, default slice(None) + Pipeline steps to apply before analyzing gaps + verbose : bool, default False + Whether to print information about pipeline steps + gaps_lte : str or pd.Timedelta or dt.timedelta, optional + Upper threshold for gap duration + gaps_gte : str or pd.Timedelta or dt.timedelta, optional + Lower threshold for gap duration + return_combination : bool, default True + Whether to include statistics for gaps present in any column + + Returns + ------- + pd.DataFrame + DataFrame containing statistics about gap durations for each column. + Statistics include: + - data_presence_%: percentage of non-gap data points + - count: number of gaps + - mean: average gap duration + - std: standard deviation of gap durations + - min: shortest gap + - 25%: first quartile + - 50%: median + - 75%: third quartile + - max: longest gap + Empty DataFrame if no gaps are found. + """ data = self.get_corrected_data(select, steps=steps, verbose=verbose) - - lower_th, upper_th = gaps_lte, gaps_gte - select_inner = False - if lower_th is not None and upper_th is not None: - if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th): - lower_th, upper_th = upper_th, lower_th - select_inner = True - - nan_blocks = get_data_blocks( + + # Get gaps and calculate durations + gaps_dict = get_blocks_lte_and_gte( data=data, + lte=gaps_lte, + gte=gaps_gte, is_null=True, - lower_td_threshold=lower_th, - upper_td_threshold=upper_th, return_combination=return_combination, ) - ser_list = [] - for col, gaps_list in nan_blocks.items(): - gaps_ser = [] - if gaps_list: - for gap in gaps_list: - if gap.shape[0] > 1: - gaps_ser.append(gap[-1] - gap[0]) - elif gap.shape[0] == 1: - gaps_ser.append(pd.to_timedelta(gap.freq)) - - ser_list.append(pd.Series(gaps_ser, name=col).describe()) - - try: - res = pd.concat(ser_list, axis=1) - except ValueError: - res = pd.DataFrame() - - pass + gap_durations = {} + for col, gaps_list in gaps_dict.items(): + if not gaps_list: + continue + + durations = [] + for gap in gaps_list: + if len(gap) > 1: + durations.append(gap[-1] - gap[0]) + else: + durations.append(pd.to_timedelta(gap.freq)) + + if durations: + gap_durations[col] = pd.Series(durations, name=col) + + if not gap_durations: + return pd.DataFrame() + + stats_df = pd.concat([ser.describe() for ser in gap_durations.values()], axis=1) + + gaps_mask = get_blocks_mask_lte_and_gte( + data=data, + lte=gaps_lte, + gte=gaps_gte, + is_null=True, + return_combination=return_combination, + ) + + presence_percentages = (1 - gaps_mask.mean()) * 100 + + stats_df.loc["data_presence_%"] = presence_percentages[stats_df.columns] + row_order = ["data_presence_%"] + [idx for idx in stats_df.index if idx != "data_presence_%"] + return stats_df.reindex(row_order) def set_data(self, data: pd.Series | pd.DataFrame): self.data = check_and_return_dt_index_df(data) From 447266bce90e80fe24b5f2e718d0f924bf9c72ff Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 17:05:25 +0100 Subject: [PATCH 06/12] =?UTF-8?q?=E2=9C=85=20TestGapsDescription?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_plumbing.py | 125 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py index ac54e9b..9633fd3 100644 --- a/tests/test_plumbing.py +++ b/tests/test_plumbing.py @@ -10,6 +10,7 @@ ) import plotly.io as pio +import pytest pio.renderers.default = "browser" @@ -187,3 +188,127 @@ def test_plumber(self): plumber.plot() plumber.get_gaps_description() assert True + + +class TestGapsDescription: + @pytest.fixture + def sample_data(self): + # Create sample data with known gaps + idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC") + data = pd.DataFrame({ + "temp__°C__Building": np.ones(24), + "humidity__%__Building": np.ones(24), + "power__W__Building": np.ones(24) + }, index=idx) + + # Create gaps of different durations + data.loc["2023-01-01 02:00":"2023-01-01 04:00", "temp__°C__Building"] = np.nan # 3h gap + data.loc["2023-01-01 08:00", "temp__°C__Building"] = np.nan # 1h gap + data.loc["2023-01-01 12:00":"2023-01-01 14:00", "humidity__%__Building"] = np.nan # 3h gap + data.loc["2023-01-01 06:00":"2023-01-01 18:00", "power__W__Building"] = np.nan # 13h gap + + return data + + def test_basic_gaps_description(self, sample_data): + """Test basic functionality with default parameters""" + plumber = Plumber(sample_data) + result = plumber.get_gaps_description() + + # Check presence of all columns + assert all(col in result.columns for col in sample_data.columns) + + # Check presence of all statistics + expected_stats = ["data_presence_%", "count", "mean", "std", "min", "25%", "50%", "75%", "max"] + assert all(stat in result.index for stat in expected_stats) + + # Check specific values for temp column + temp_col = "temp__°C__Building" + assert result[temp_col]["count"] == 2 # Two gaps + assert result[temp_col]["data_presence_%"] == pytest.approx(83.33, rel=1e-2) # 20/24 hours present + + def test_with_duration_thresholds(self, sample_data): + """Test with gap duration thresholds""" + plumber = Plumber(sample_data) + + # Only gaps >= 3h + result = plumber.get_gaps_description(gaps_gte="3h") + assert result["temp__°C__Building"]["count"] == 1 # Only one 3h gap + assert result["power__W__Building"]["count"] == 1 # One 13h gap + + # Only gaps <= 2h + result = plumber.get_gaps_description(gaps_lte="2h") + assert result["temp__°C__Building"]["count"] == 1 # Only one 1h gap + assert "power__W__Building" not in result.columns # No gaps <= 2h + + def test_with_data_selection(self, sample_data): + """Test with data selection using tags""" + plumber = Plumber(sample_data) + + # Select by unit + result = plumber.get_gaps_description(select="°C") + assert list(result.columns) == ["temp__°C__Building"] + + # Select by bloc + result = plumber.get_gaps_description(select="Building") + assert len(result.columns) == 3 + + def test_empty_cases(self): + """Test cases that should return empty DataFrame""" + # Data with no gaps + idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC") + clean_data = pd.DataFrame({ + "temp__°C__Building": np.ones(24) + }, index=idx) + plumber = Plumber(clean_data) + + result = plumber.get_gaps_description() + assert result.empty + + # Data selection that returns no columns + plumber = Plumber(clean_data) + result = plumber.get_gaps_description(select="nonexistent") + assert result.empty + + def test_combination_flag(self, sample_data): + """Test with and without return_combination flag""" + plumber = Plumber(sample_data) + + # With combination + result = plumber.get_gaps_description(return_combination=True) + assert "combination" in result.columns + + # Without combination + result = plumber.get_gaps_description(return_combination=False) + assert "combination" not in result.columns + + def test_single_point_gaps(self): + """Test handling of single-point gaps""" + idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC") + data = pd.DataFrame({ + "temp__°C__Building": np.ones(24) + }, index=idx) + + # Create single point gap + data.loc["2023-01-01 12:00", "temp__°C__Building"] = np.nan + + plumber = Plumber(data) + result = plumber.get_gaps_description() + + assert result["temp__°C__Building"]["count"] == 1 + assert pd.Timedelta(result["temp__°C__Building"]["mean"]) == pd.Timedelta("1h") + + def test_pipeline_steps(self, sample_data): + """Test with pipeline steps""" + plumber = Plumber(sample_data) + plumber.pipe_dict = { + "step1": [["Identity"]], # Simple identity transformation + "step2": [["Identity"]] + } + + # Test with specific steps + result = plumber.get_gaps_description(steps=["step1"]) + assert not result.empty + + # Test with no steps + result = plumber.get_gaps_description(steps=None) + assert not result.empty From 68d725209bd7cf0347df26678c62a95e21f59795 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 17:19:21 +0100 Subject: [PATCH 07/12] =?UTF-8?q?=E2=9C=85=20update=20plumbing=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_plumbing.py | 468 +++++++++++++++++++++-------------------- 1 file changed, 238 insertions(+), 230 deletions(-) diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py index 9633fd3..850e53b 100644 --- a/tests/test_plumbing.py +++ b/tests/test_plumbing.py @@ -1,7 +1,6 @@ import pandas as pd - import numpy as np - +import pytest from tide.plumbing import ( _get_pipe_from_proc_list, _get_column_wise_transformer, @@ -10,131 +9,149 @@ ) import plotly.io as pio -import pytest pio.renderers.default = "browser" -TEST_DF = pd.DataFrame( - { - "Tin__°C__building": [10.0, 20.0, 30.0], - "Text__°C__outdoor": [-1.0, 5.0, 4.0], - "radiation__W/m2__outdoor": [50, 100, 400], - "Humidity__%HR": [10, 15, 13], - "Humidity__%HR__room1": [20, 30, 50], - "Humidity_2": [10, 15, 13], - "light__DIMENSIONLESS__building": [100, 200, 300], - "mass_flwr__m3/h__hvac": [300, 500, 600], - }, - index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), -) - -TEST_DF_2 = pd.DataFrame( - { - "a__°C__zone_1": np.random.randn(24), - "b__°C__zone_1": np.random.randn(24), - "c__Wh__zone_2": np.random.randn(24) * 100, - }, - index=pd.date_range("2009", freq="h", periods=24, tz="UTC"), -) - -TEST_DF_2["c__Wh__zone_2"] = abs(TEST_DF_2).cumsum()["c__Wh__zone_2"] - -TEST_DF_2.loc["2009-01-01 05:00:00":"2009-01-01 09:00:00", "a__°C__zone_1"] = np.nan -TEST_DF_2.loc["2009-01-01 15:00:00", "b__°C__zone_1"] = np.nan -TEST_DF_2.loc["2009-01-01 17:00:00", "b__°C__zone_1"] = np.nan -TEST_DF_2.loc["2009-01-01 20:00:00", "c__Wh__zone_2"] = np.nan - -PIPE_DICT = { - "pre_processing": { - "°C": [["ReplaceThreshold", {"upper": 25}]], - "W/m2__outdoor": [["DropTimeGradient", {"upper_rate": -100}]], - }, - "common": [["Interpolate", ["linear"]], ["Ffill"], ["Bfill", {"limit": 3}]], - "resampling": [["Resample", ["3h", "mean", {"W/m2": "sum"}]]], - "compute_energy": [ - [ - "ExpressionCombine", +@pytest.fixture +def time_index(): + """Create a standard time index for test data.""" + return pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC") + +@pytest.fixture +def basic_data(time_index): + """Create basic test data with various units and tags.""" + return pd.DataFrame( + { + "Tin__°C__building": np.random.randn(24) * 5 + 20, + "Text__°C__outdoor": np.random.randn(24) * 3 + 10, + "radiation__W/m2__outdoor": np.abs(np.random.randn(24)) * 100, + "Humidity__%HR": np.random.randn(24) * 5 + 50, + "Humidity__%HR__room1": np.random.randn(24) * 5 + 45, + "Humidity_2": np.random.randn(24) * 5 + 55, + "light__DIMENSIONLESS__building": np.abs(np.random.randn(24)) * 200, + "mass_flwr__m3/h__hvac": np.abs(np.random.randn(24)) * 400 + 500, + }, + index=time_index, + ) + +@pytest.fixture +def gapped_data(time_index): + """Create test data with specific gaps for testing gap-related functionality.""" + data = pd.DataFrame( + { + "a__°C__zone_1": np.random.randn(24), + "b__°C__zone_1": np.random.randn(24), + "c__Wh__zone_2": np.abs(np.random.randn(24) * 100), + }, + index=time_index, + ) + + # Add cumulative sum to energy data + data["c__Wh__zone_2"] = data["c__Wh__zone_2"].cumsum() + + # Add specific gaps + data.loc["2009-01-01 05:00":"2009-01-01 09:00", "a__°C__zone_1"] = np.nan # 5h gap + data.loc["2009-01-01 15:00", "b__°C__zone_1"] = np.nan # 1h gap + data.loc["2009-01-01 17:00", "b__°C__zone_1"] = np.nan # 1h gap + data.loc["2009-01-01 20:00", "c__Wh__zone_2"] = np.nan # 1h gap + + return data + +@pytest.fixture +def pipe_dict(): + """Create a standard pipeline dictionary for testing.""" + return { + "pre_processing": { + "°C": [["ReplaceThreshold", {"upper": 25}]], + "W/m2__outdoor": [["DropTimeGradient", {"upper_rate": -100}]], + }, + "common": [ + ["Interpolate", ["linear"]], + ["Ffill"], + ["Bfill", {"limit": 3}] + ], + "resampling": [ + ["Resample", ["3h", "mean", {"W/m2": "sum"}]] + ], + "compute_energy": [ [ - { - "T1": "Tin__°C__building", - "T2": "Text__°C__outdoor", - "m": "mass_flwr__m3/h__hvac", - }, - "(T1 - T2) * m * 1004 * 1.204", - "Air_flow_energy__hvac__J", - True, - ], - ] - ], -} - - -class TestPlumbing: - def test__get_all_data_step(self): - test_df = TEST_DF.copy() + "ExpressionCombine", + [ + { + "T1": "Tin__°C__building", + "T2": "Text__°C__outdoor", + "m": "mass_flwr__m3/h__hvac", + }, + "(T1 - T2) * m * 1004 * 1.204", + "Air_flow_energy__hvac__J", + True, + ], + ] + ], + } + +class TestPipelineComponents: + """Tests for individual pipeline components and transformers.""" + + def test_pipe_from_proc_list(self, basic_data, pipe_dict): + """Test creation and application of processing pipeline from list.""" + # Create gap in data + test_df = basic_data.copy() test_df.iloc[1, 0] = np.nan test_df.iloc[0, 1] = np.nan - pipe = _get_pipe_from_proc_list(test_df.columns, PIPE_DICT["common"], tz="UTC") - - res = pipe.fit_transform(test_df) - + + # Create and apply pipeline + pipe = _get_pipe_from_proc_list(test_df.columns, pipe_dict["common"], tz="UTC") + result = pipe.fit_transform(test_df) + + # Check original data preserved where no gaps pd.testing.assert_series_equal( - res["Tin__°C__building"], TEST_DF["Tin__°C__building"] + result["Tin__°C__building"], + basic_data["Tin__°C__building"] ) - assert float(res.iloc[0, 1]) == 5.0 - - def test__get_column_wise_transformer(self): - col_trans = _get_column_wise_transformer( - proc_dict=PIPE_DICT["pre_processing"], - data_columns=TEST_DF.columns, + + # Check gap filling worked + assert not result.isna().any().any() + assert result.iloc[0, 1] == pytest.approx(test_df.iloc[1, 1]) + + def test_column_wise_transformer(self, basic_data, pipe_dict): + """Test column-wise transformer creation and application.""" + # Test with all columns + transformer = _get_column_wise_transformer( + proc_dict=pipe_dict["pre_processing"], + data_columns=basic_data.columns, tz="UTC", process_name="test", ) - - res = col_trans.fit_transform(TEST_DF.copy()) - - np.testing.assert_array_equal(res.iloc[:, 0].to_list(), [10.0, 20.0, np.nan]) - np.testing.assert_array_equal(res.iloc[:, 2].to_list(), [50.0, 100.0, np.nan]) - - col_trans = _get_column_wise_transformer( - proc_dict=PIPE_DICT["pre_processing"], - data_columns=TEST_DF[ - [col for col in TEST_DF.columns if col != "radiation__W/m2__outdoor"] - ].columns, + result = transformer.fit_transform(basic_data.copy()) + + # Check temperature threshold applied + assert (result["Tin__°C__building"] <= 25).all() + + # Test with subset of columns + temp_cols = [col for col in basic_data.columns if "°C" in col] + transformer = _get_column_wise_transformer( + proc_dict=pipe_dict["pre_processing"], + data_columns=temp_cols, tz="UTC", process_name="test", ) - - res = col_trans.fit_transform( - TEST_DF[ - [col for col in TEST_DF.columns if col != "radiation__W/m2__outdoor"] - ].copy() - ) - - np.testing.assert_array_equal(res.iloc[:, 0].to_list(), [10.0, 20.0, np.nan]) - assert len(col_trans.transformers_) == 2 - - cols_none = [ - "Humidity__%HR", - "Humidity__%HR__room1", - "Humidity_2", - "light__DIMENSIONLESS__building", - "mass_flwr__m3/h__hvac", - ] - - col_trans = _get_column_wise_transformer( - proc_dict=PIPE_DICT["pre_processing"], - data_columns=cols_none, + assert len(transformer.transformers_) == 1 + + # Test with no matching columns + humidity_cols = [col for col in basic_data.columns if "%HR" in col] + transformer = _get_column_wise_transformer( + proc_dict=pipe_dict["pre_processing"], + data_columns=humidity_cols, tz="UTC", process_name="test", ) + assert transformer is None - assert col_trans is None - - def test_get_pipeline_from_dict(self): + def test_pipeline_from_dict(self, gapped_data): + """Test creation of full pipeline from dictionary configuration.""" pipe_dict = { "fill_1": {"a__°C__zone_1": [["Interpolate"]]}, - # "fill_2": {"b": [["Interpolate"]]}, "combine": [ [ "ExpressionCombine", @@ -149,166 +166,157 @@ def test_get_pipeline_from_dict(self): ], ] ], - "fill_3": [["Interpolate"]], - } - - pipe = get_pipeline_from_dict(TEST_DF_2.columns, pipe_dict, verbose=True) - pipe.fit_transform(TEST_DF_2.copy()) - - assert True - - def test_plumber(self): - pipe = { - "fill_1": {"a__°C__zone_1": [["Interpolate"]]}, - "fill_2": {"b": [["Interpolate"]]}, - "combine": { - "zone_1": [ - [ - "ExpressionCombine", - [ - { - "T1": "a__°C__zone_1", - "T2": "b__°C__zone_1", - }, - "T1 * T2", - "new_unit__°C²__zone_1", - True, - ], - ] - ], - }, - "fill_3": [["Interpolate"]], + "fill_final": [["Interpolate"]], } - - plumber = Plumber() - plumber.set_data(TEST_DF_2) - plumber.pipe_dict = pipe - plumber.get_pipeline() - plumber.get_pipeline(steps=["fill_3", "combine"]) - plumber.plot() - plumber.get_gaps_description() - assert True - + + pipe = get_pipeline_from_dict(gapped_data.columns, pipe_dict, verbose=True) + result = pipe.fit_transform(gapped_data.copy()) + + # Check new column created + assert "new_unit__°C²__zone_1" in result.columns + + # Check gaps filled + assert not result.isna().any().any() + +class TestPlumber: + """Tests for the Plumber class functionality.""" + + def test_initialization(self, gapped_data, pipe_dict): + """Test Plumber initialization and basic attributes.""" + plumber = Plumber(gapped_data, pipe_dict) + assert plumber.data is not None + assert plumber.root is not None + assert plumber.pipe_dict == pipe_dict + + def test_data_selection(self, gapped_data): + """Test data selection using tags.""" + plumber = Plumber(gapped_data) + + # Test unit selection + temp_cols = plumber.select("°C") + assert len(temp_cols) == 2 + assert all("°C" in col for col in temp_cols) + + # Test zone selection + zone_1_cols = plumber.select("zone_1") + assert len(zone_1_cols) == 2 + assert all("zone_1" in col for col in zone_1_cols) + + def test_pipeline_execution(self, gapped_data, pipe_dict): + """Test pipeline execution with different step selections.""" + plumber = Plumber(gapped_data, pipe_dict) + + # Test full pipeline + full_pipe = plumber.get_pipeline() + assert len(full_pipe.steps) > 0 + + # Test partial pipeline + partial_pipe = plumber.get_pipeline(steps=["pre_processing"]) + assert len(partial_pipe.steps) == 1 + + # Test with no pipeline + identity_pipe = plumber.get_pipeline(steps=None) + assert len(identity_pipe.steps) == 1 + assert identity_pipe.steps[0][0] == "Identity" + + def test_corrected_data(self, gapped_data, pipe_dict): + """Test data correction through pipeline.""" + plumber = Plumber(gapped_data, pipe_dict) + + # Test with time slice + result = plumber.get_corrected_data( + start="2009-01-01 05:00", + stop="2009-01-01 10:00" + ) + assert len(result) == 6 + + # Test with column selection + result = plumber.get_corrected_data(select="°C") + assert len(result.columns) == 2 + assert all("°C" in col for col in result.columns) class TestGapsDescription: + """Tests for gap analysis functionality.""" + @pytest.fixture - def sample_data(self): - # Create sample data with known gaps - idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC") + def gaps_data(self, time_index): + """Create data with specific gaps for testing gap analysis.""" data = pd.DataFrame({ "temp__°C__Building": np.ones(24), "humidity__%__Building": np.ones(24), "power__W__Building": np.ones(24) - }, index=idx) + }, index=time_index) # Create gaps of different durations - data.loc["2023-01-01 02:00":"2023-01-01 04:00", "temp__°C__Building"] = np.nan # 3h gap - data.loc["2023-01-01 08:00", "temp__°C__Building"] = np.nan # 1h gap - data.loc["2023-01-01 12:00":"2023-01-01 14:00", "humidity__%__Building"] = np.nan # 3h gap - data.loc["2023-01-01 06:00":"2023-01-01 18:00", "power__W__Building"] = np.nan # 13h gap + data.loc["2009-01-01 02:00":"2009-01-01 04:00", "temp__°C__Building"] = np.nan + data.loc["2009-01-01 08:00", "temp__°C__Building"] = np.nan + data.loc["2009-01-01 12:00":"2009-01-01 14:00", "humidity__%__Building"] = np.nan + data.loc["2009-01-01 06:00":"2009-01-01 18:00", "power__W__Building"] = np.nan return data - def test_basic_gaps_description(self, sample_data): - """Test basic functionality with default parameters""" - plumber = Plumber(sample_data) + def test_basic_gaps_description(self, gaps_data): + """Test basic gap analysis functionality.""" + plumber = Plumber(gaps_data) result = plumber.get_gaps_description() - # Check presence of all columns - assert all(col in result.columns for col in sample_data.columns) - - # Check presence of all statistics + # Check structure + assert all(col in result.columns for col in gaps_data.columns) expected_stats = ["data_presence_%", "count", "mean", "std", "min", "25%", "50%", "75%", "max"] assert all(stat in result.index for stat in expected_stats) - # Check specific values for temp column + # Check specific values temp_col = "temp__°C__Building" - assert result[temp_col]["count"] == 2 # Two gaps - assert result[temp_col]["data_presence_%"] == pytest.approx(83.33, rel=1e-2) # 20/24 hours present + assert result[temp_col]["count"] == 2 + assert result[temp_col]["data_presence_%"] == pytest.approx(83.33, rel=1e-2) - def test_with_duration_thresholds(self, sample_data): - """Test with gap duration thresholds""" - plumber = Plumber(sample_data) + def test_gap_thresholds(self, gaps_data): + """Test gap analysis with duration thresholds.""" + plumber = Plumber(gaps_data) - # Only gaps >= 3h + # Test minimum duration threshold result = plumber.get_gaps_description(gaps_gte="3h") - assert result["temp__°C__Building"]["count"] == 1 # Only one 3h gap - assert result["power__W__Building"]["count"] == 1 # One 13h gap + assert result["temp__°C__Building"]["count"] == 1 + assert result["power__W__Building"]["count"] == 1 - # Only gaps <= 2h + # Test maximum duration threshold result = plumber.get_gaps_description(gaps_lte="2h") - assert result["temp__°C__Building"]["count"] == 1 # Only one 1h gap - assert "power__W__Building" not in result.columns # No gaps <= 2h - - def test_with_data_selection(self, sample_data): - """Test with data selection using tags""" - plumber = Plumber(sample_data) - - # Select by unit - result = plumber.get_gaps_description(select="°C") - assert list(result.columns) == ["temp__°C__Building"] - - # Select by bloc - result = plumber.get_gaps_description(select="Building") - assert len(result.columns) == 3 + assert result["temp__°C__Building"]["count"] == 1 + assert "power__W__Building" not in result.columns - def test_empty_cases(self): - """Test cases that should return empty DataFrame""" - # Data with no gaps - idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC") + def test_gap_analysis_edge_cases(self, time_index): + """Test gap analysis edge cases.""" + # Test with no gaps clean_data = pd.DataFrame({ "temp__°C__Building": np.ones(24) - }, index=idx) + }, index=time_index) plumber = Plumber(clean_data) - result = plumber.get_gaps_description() assert result.empty - # Data selection that returns no columns - plumber = Plumber(clean_data) + # Test with invalid selection result = plumber.get_gaps_description(select="nonexistent") assert result.empty - - def test_combination_flag(self, sample_data): - """Test with and without return_combination flag""" - plumber = Plumber(sample_data) - - # With combination - result = plumber.get_gaps_description(return_combination=True) - assert "combination" in result.columns - - # Without combination - result = plumber.get_gaps_description(return_combination=False) - assert "combination" not in result.columns - - def test_single_point_gaps(self): - """Test handling of single-point gaps""" - idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC") - data = pd.DataFrame({ - "temp__°C__Building": np.ones(24) - }, index=idx) - - # Create single point gap - data.loc["2023-01-01 12:00", "temp__°C__Building"] = np.nan + # Test single point gap + data = clean_data.copy() + data.loc[data.index[12], "temp__°C__Building"] = np.nan plumber = Plumber(data) result = plumber.get_gaps_description() - assert result["temp__°C__Building"]["count"] == 1 assert pd.Timedelta(result["temp__°C__Building"]["mean"]) == pd.Timedelta("1h") - def test_pipeline_steps(self, sample_data): - """Test with pipeline steps""" - plumber = Plumber(sample_data) - plumber.pipe_dict = { - "step1": [["Identity"]], # Simple identity transformation - "step2": [["Identity"]] - } - - # Test with specific steps - result = plumber.get_gaps_description(steps=["step1"]) - assert not result.empty +class TestPlotting: + """Tests for plotting functionality.""" + + def test_basic_plot(self, gapped_data): + """Test basic plotting functionality.""" + plumber = Plumber(gapped_data) + fig = plumber.plot() - # Test with no steps - result = plumber.get_gaps_description(steps=None) - assert not result.empty + # Check figure was created + assert fig is not None + # Check data is present in figure + assert len(fig.data) > 0 + # Check all columns are plotted + assert all(col in [trace.name for trace in fig.data] for col in gapped_data.columns) From 0c2f73d09cd1e72143dd1aac5195493d584357b9 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 18:09:40 +0100 Subject: [PATCH 08/12] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20get=5Fblocks=5Flte?= =?UTF-8?q?=5Fand=5Fgte,=20get=5Fblocks=5Fmask=5Flte=5Fand=5Fgte,=20return?= =?UTF-8?q?=5Fcombination=20added?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/utils.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tide/utils.py b/tide/utils.py index cbbfdb4..9f5ba1b 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -319,6 +319,7 @@ def get_blocks_lte_and_gte( lte: str | dt.timedelta = None, gte: str | dt.timedelta = None, is_null: bool = False, + return_combination: bool = False ): """ Get blocks of data ore gaps (nan) based on duration thresholds. @@ -343,6 +344,10 @@ def get_blocks_lte_and_gte( - If both `lte` and `gte` are provided, and `lte` is smaller than `gte`, they will be swapped. The function determines whether to select data within or outside the boundaries based on the order of thresholds. + return_combination : bool, optional + If True (default), a combination column is created that checks for NaNs + across all columns in the DataFrame. Gaps in this combination column represent + rows where NaNs are present in any of the columns. """ lower_th, upper_th = lte, gte @@ -358,7 +363,7 @@ def get_blocks_lte_and_gte( lower_td_threshold=lower_th, upper_td_threshold=upper_th, select_inner=select_inner, - return_combination=False, + return_combination=return_combination, ) @@ -367,6 +372,7 @@ def get_blocks_mask_lte_and_gte( lte: str | dt.timedelta = None, gte: str | dt.timedelta = None, is_null: bool = False, + return_combination: bool = False ) -> pd.DataFrame: """ Creates a boolean mask DataFrame indicating the location of data blocks or gaps. @@ -381,6 +387,10 @@ def get_blocks_mask_lte_and_gte( The maximum duration threshold is_null : bool, default False Whether to find NaN blocks (True) or valid data blocks (False) + return_combination : bool, optional + If True (default), a combination column is created that checks for NaNs + across all columns in the DataFrame. Gaps in this combination column represent + rows where NaNs are present in any of the columns. Returns ------- @@ -389,7 +399,7 @@ def get_blocks_mask_lte_and_gte( corresponding to the input data columns. True values indicate the presence of a block matching the criteria. """ - gaps_dict = get_blocks_lte_and_gte(data, lte, gte, is_null) + gaps_dict = get_blocks_lte_and_gte(data, lte, gte, is_null, return_combination) mask_data = {} for col, idx_list in gaps_dict.items(): From 19fd13ecdd33ff850ac5aa5f4f28041b2a713b95 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 18:10:46 +0100 Subject: [PATCH 09/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20big=20plumber=20test?= =?UTF-8?q?=20refactoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_plumbing.py | 104 +++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 39 deletions(-) diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py index 850e53b..1d80f96 100644 --- a/tests/test_plumbing.py +++ b/tests/test_plumbing.py @@ -93,60 +93,91 @@ def pipe_dict(): class TestPipelineComponents: """Tests for individual pipeline components and transformers.""" - def test_pipe_from_proc_list(self, basic_data, pipe_dict): + def test_pipe_from_proc_list(self, pipe_dict): """Test creation and application of processing pipeline from list.""" - # Create gap in data - test_df = basic_data.copy() - test_df.iloc[1, 0] = np.nan - test_df.iloc[0, 1] = np.nan - - # Create and apply pipeline + test_df = pd.DataFrame({ + "temp__°C__building": [10.0, np.nan, 20.0, 30.0], + "humid__%HR__building": [50.0, 60.0, np.nan, 80.0] + }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC")) + pipe = _get_pipe_from_proc_list(test_df.columns, pipe_dict["common"], tz="UTC") result = pipe.fit_transform(test_df) - - # Check original data preserved where no gaps - pd.testing.assert_series_equal( - result["Tin__°C__building"], - basic_data["Tin__°C__building"] - ) - - # Check gap filling worked + + # Check that gaps were filled with interpolation assert not result.isna().any().any() - assert result.iloc[0, 1] == pytest.approx(test_df.iloc[1, 1]) + # For temp: 10 -> [15] -> 20 -> 30 (linear interpolation) + assert result.iloc[1]["temp__°C__building"] == pytest.approx(15.0) + # For humid: 50 -> 60 -> [70] -> 80 (linear interpolation) + assert result.iloc[2]["humid__%HR__building"] == pytest.approx(70.0) + + # Check that non-gap values remain unchanged + assert result.iloc[0]["temp__°C__building"] == 10.0 + assert result.iloc[3]["temp__°C__building"] == 30.0 + assert result.iloc[0]["humid__%HR__building"] == 50.0 + assert result.iloc[1]["humid__%HR__building"] == 60.0 - def test_column_wise_transformer(self, basic_data, pipe_dict): + def test_column_wise_transformer(self, pipe_dict): """Test column-wise transformer creation and application.""" + # Create controlled test data with known values + test_df = pd.DataFrame({ + "temp1__°C__zone1": [24.0, 26.0, np.nan, 28.0], + # Two values above threshold + "temp2__°C__zone2": [23.0, 25.0, 27.0, np.nan], + # One value above threshold + "radiation__W/m2__outdoor": [100, 200, 50, 150], # For gradient test + "humid__%HR__zone1": [50.0, 60.0, 70.0, 80.0] # Should be unaffected + }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC")) + # Test with all columns transformer = _get_column_wise_transformer( proc_dict=pipe_dict["pre_processing"], - data_columns=basic_data.columns, + data_columns=test_df.columns, tz="UTC", process_name="test", ) - result = transformer.fit_transform(basic_data.copy()) - - # Check temperature threshold applied - assert (result["Tin__°C__building"] <= 25).all() - - # Test with subset of columns - temp_cols = [col for col in basic_data.columns if "°C" in col] + result = transformer.fit_transform(test_df.copy()) + + # Check temperature threshold applied (excluding NaN) + temp1_mask = ~pd.isna(result["temp1__°C__zone1"]) + temp2_mask = ~pd.isna(result["temp2__°C__zone2"]) + assert (result["temp1__°C__zone1"][temp1_mask] <= 25).all() + assert (result["temp2__°C__zone2"][temp2_mask] <= 25).all() + + # Verify specific values + assert result.iloc[0]["temp1__°C__zone1"] == 24.0 # Unchanged + assert result.iloc[1]["temp2__°C__zone2"] == 25.0 # Capped + assert pd.isna(result.iloc[2]["temp1__°C__zone1"]) # NaN preserved + assert pd.isna(result.iloc[3]["temp1__°C__zone1"]) # Capped + + # Check radiation gradient (should drop when rate < -100) + assert pd.isna(result.iloc[2][ + "radiation__W/m2__outdoor"]) # Dropped due to steep negative gradient + + # Check humidity unaffected + pd.testing.assert_series_equal( + result["humid__%HR__zone1"], + test_df["humid__%HR__zone1"] + ) + + # Test with subset of columns (temperature only) + temp_cols = [col for col in test_df.columns if "°C" in col] transformer = _get_column_wise_transformer( proc_dict=pipe_dict["pre_processing"], data_columns=temp_cols, tz="UTC", process_name="test", ) - assert len(transformer.transformers_) == 1 - + assert len(transformer.transformers_) == 1 # Only temperature transformer + # Test with no matching columns - humidity_cols = [col for col in basic_data.columns if "%HR" in col] + humidity_cols = [col for col in test_df.columns if "%HR" in col] transformer = _get_column_wise_transformer( proc_dict=pipe_dict["pre_processing"], data_columns=humidity_cols, tz="UTC", process_name="test", ) - assert transformer is None + assert transformer is None # No transformers needed def test_pipeline_from_dict(self, gapped_data): """Test creation of full pipeline from dictionary configuration.""" @@ -202,9 +233,9 @@ def test_data_selection(self, gapped_data): assert len(zone_1_cols) == 2 assert all("zone_1" in col for col in zone_1_cols) - def test_pipeline_execution(self, gapped_data, pipe_dict): + def test_pipeline_execution(self, basic_data, pipe_dict): """Test pipeline execution with different step selections.""" - plumber = Plumber(gapped_data, pipe_dict) + plumber = Plumber(basic_data, pipe_dict) # Test full pipeline full_pipe = plumber.get_pipeline() @@ -219,21 +250,16 @@ def test_pipeline_execution(self, gapped_data, pipe_dict): assert len(identity_pipe.steps) == 1 assert identity_pipe.steps[0][0] == "Identity" - def test_corrected_data(self, gapped_data, pipe_dict): + def test_corrected_data(self, basic_data, pipe_dict): """Test data correction through pipeline.""" - plumber = Plumber(gapped_data, pipe_dict) + plumber = Plumber(basic_data, pipe_dict) # Test with time slice result = plumber.get_corrected_data( start="2009-01-01 05:00", stop="2009-01-01 10:00" ) - assert len(result) == 6 - - # Test with column selection - result = plumber.get_corrected_data(select="°C") - assert len(result.columns) == 2 - assert all("°C" in col for col in result.columns) + assert len(result) == 3 class TestGapsDescription: """Tests for gap analysis functionality.""" From e4c5ea03a795a4ab72077a77ff0ea2ad3643f103 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 18:12:05 +0100 Subject: [PATCH 10/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20big=20plumber=20test?= =?UTF-8?q?=20refactoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_plumbing.py | 144 ++++++++++++++++++++++++----------------- tide/plumbing.py | 16 +++-- tide/utils.py | 4 +- 3 files changed, 94 insertions(+), 70 deletions(-) diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py index 1d80f96..079e818 100644 --- a/tests/test_plumbing.py +++ b/tests/test_plumbing.py @@ -12,11 +12,13 @@ pio.renderers.default = "browser" + @pytest.fixture def time_index(): """Create a standard time index for test data.""" return pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC") + @pytest.fixture def basic_data(time_index): """Create basic test data with various units and tags.""" @@ -34,6 +36,7 @@ def basic_data(time_index): index=time_index, ) + @pytest.fixture def gapped_data(time_index): """Create test data with specific gaps for testing gap-related functionality.""" @@ -45,18 +48,19 @@ def gapped_data(time_index): }, index=time_index, ) - + # Add cumulative sum to energy data data["c__Wh__zone_2"] = data["c__Wh__zone_2"].cumsum() - + # Add specific gaps data.loc["2009-01-01 05:00":"2009-01-01 09:00", "a__°C__zone_1"] = np.nan # 5h gap data.loc["2009-01-01 15:00", "b__°C__zone_1"] = np.nan # 1h gap data.loc["2009-01-01 17:00", "b__°C__zone_1"] = np.nan # 1h gap data.loc["2009-01-01 20:00", "c__Wh__zone_2"] = np.nan # 1h gap - + return data + @pytest.fixture def pipe_dict(): """Create a standard pipeline dictionary for testing.""" @@ -65,14 +69,8 @@ def pipe_dict(): "°C": [["ReplaceThreshold", {"upper": 25}]], "W/m2__outdoor": [["DropTimeGradient", {"upper_rate": -100}]], }, - "common": [ - ["Interpolate", ["linear"]], - ["Ffill"], - ["Bfill", {"limit": 3}] - ], - "resampling": [ - ["Resample", ["3h", "mean", {"W/m2": "sum"}]] - ], + "common": [["Interpolate", ["linear"]], ["Ffill"], ["Bfill", {"limit": 3}]], + "resampling": [["Resample", ["3h", "mean", {"W/m2": "sum"}]]], "compute_energy": [ [ "ExpressionCombine", @@ -90,15 +88,19 @@ def pipe_dict(): ], } + class TestPipelineComponents: """Tests for individual pipeline components and transformers.""" - + def test_pipe_from_proc_list(self, pipe_dict): """Test creation and application of processing pipeline from list.""" - test_df = pd.DataFrame({ - "temp__°C__building": [10.0, np.nan, 20.0, 30.0], - "humid__%HR__building": [50.0, 60.0, np.nan, 80.0] - }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC")) + test_df = pd.DataFrame( + { + "temp__°C__building": [10.0, np.nan, 20.0, 30.0], + "humid__%HR__building": [50.0, 60.0, np.nan, 80.0], + }, + index=pd.date_range("2009", freq="h", periods=4, tz="UTC"), + ) pipe = _get_pipe_from_proc_list(test_df.columns, pipe_dict["common"], tz="UTC") result = pipe.fit_transform(test_df) @@ -119,14 +121,17 @@ def test_pipe_from_proc_list(self, pipe_dict): def test_column_wise_transformer(self, pipe_dict): """Test column-wise transformer creation and application.""" # Create controlled test data with known values - test_df = pd.DataFrame({ - "temp1__°C__zone1": [24.0, 26.0, np.nan, 28.0], - # Two values above threshold - "temp2__°C__zone2": [23.0, 25.0, 27.0, np.nan], - # One value above threshold - "radiation__W/m2__outdoor": [100, 200, 50, 150], # For gradient test - "humid__%HR__zone1": [50.0, 60.0, 70.0, 80.0] # Should be unaffected - }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC")) + test_df = pd.DataFrame( + { + "temp1__°C__zone1": [24.0, 26.0, np.nan, 28.0], + # Two values above threshold + "temp2__°C__zone2": [23.0, 25.0, 27.0, np.nan], + # One value above threshold + "radiation__W/m2__outdoor": [100, 200, 50, 150], # For gradient test + "humid__%HR__zone1": [50.0, 60.0, 70.0, 80.0], # Should be unaffected + }, + index=pd.date_range("2009", freq="h", periods=4, tz="UTC"), + ) # Test with all columns transformer = _get_column_wise_transformer( @@ -150,13 +155,13 @@ def test_column_wise_transformer(self, pipe_dict): assert pd.isna(result.iloc[3]["temp1__°C__zone1"]) # Capped # Check radiation gradient (should drop when rate < -100) - assert pd.isna(result.iloc[2][ - "radiation__W/m2__outdoor"]) # Dropped due to steep negative gradient + assert pd.isna( + result.iloc[2]["radiation__W/m2__outdoor"] + ) # Dropped due to steep negative gradient # Check humidity unaffected pd.testing.assert_series_equal( - result["humid__%HR__zone1"], - test_df["humid__%HR__zone1"] + result["humid__%HR__zone1"], test_df["humid__%HR__zone1"] ) # Test with subset of columns (temperature only) @@ -199,19 +204,20 @@ def test_pipeline_from_dict(self, gapped_data): ], "fill_final": [["Interpolate"]], } - + pipe = get_pipeline_from_dict(gapped_data.columns, pipe_dict, verbose=True) result = pipe.fit_transform(gapped_data.copy()) - + # Check new column created assert "new_unit__°C²__zone_1" in result.columns - + # Check gaps filled assert not result.isna().any().any() + class TestPlumber: """Tests for the Plumber class functionality.""" - + def test_initialization(self, gapped_data, pipe_dict): """Test Plumber initialization and basic attributes.""" plumber = Plumber(gapped_data, pipe_dict) @@ -222,12 +228,12 @@ def test_initialization(self, gapped_data, pipe_dict): def test_data_selection(self, gapped_data): """Test data selection using tags.""" plumber = Plumber(gapped_data) - + # Test unit selection temp_cols = plumber.select("°C") assert len(temp_cols) == 2 assert all("°C" in col for col in temp_cols) - + # Test zone selection zone_1_cols = plumber.select("zone_1") assert len(zone_1_cols) == 2 @@ -236,15 +242,15 @@ def test_data_selection(self, gapped_data): def test_pipeline_execution(self, basic_data, pipe_dict): """Test pipeline execution with different step selections.""" plumber = Plumber(basic_data, pipe_dict) - + # Test full pipeline full_pipe = plumber.get_pipeline() assert len(full_pipe.steps) > 0 - + # Test partial pipeline partial_pipe = plumber.get_pipeline(steps=["pre_processing"]) assert len(partial_pipe.steps) == 1 - + # Test with no pipeline identity_pipe = plumber.get_pipeline(steps=None) assert len(identity_pipe.steps) == 1 @@ -253,44 +259,59 @@ def test_pipeline_execution(self, basic_data, pipe_dict): def test_corrected_data(self, basic_data, pipe_dict): """Test data correction through pipeline.""" plumber = Plumber(basic_data, pipe_dict) - + # Test with time slice result = plumber.get_corrected_data( - start="2009-01-01 05:00", - stop="2009-01-01 10:00" + start="2009-01-01 05:00", stop="2009-01-01 10:00" ) assert len(result) == 3 + class TestGapsDescription: """Tests for gap analysis functionality.""" - + @pytest.fixture def gaps_data(self, time_index): """Create data with specific gaps for testing gap analysis.""" - data = pd.DataFrame({ - "temp__°C__Building": np.ones(24), - "humidity__%__Building": np.ones(24), - "power__W__Building": np.ones(24) - }, index=time_index) - + data = pd.DataFrame( + { + "temp__°C__Building": np.ones(24), + "humidity__%__Building": np.ones(24), + "power__W__Building": np.ones(24), + }, + index=time_index, + ) + # Create gaps of different durations data.loc["2009-01-01 02:00":"2009-01-01 04:00", "temp__°C__Building"] = np.nan data.loc["2009-01-01 08:00", "temp__°C__Building"] = np.nan - data.loc["2009-01-01 12:00":"2009-01-01 14:00", "humidity__%__Building"] = np.nan + data.loc["2009-01-01 12:00":"2009-01-01 14:00", "humidity__%__Building"] = ( + np.nan + ) data.loc["2009-01-01 06:00":"2009-01-01 18:00", "power__W__Building"] = np.nan - + return data def test_basic_gaps_description(self, gaps_data): """Test basic gap analysis functionality.""" plumber = Plumber(gaps_data) result = plumber.get_gaps_description() - + # Check structure assert all(col in result.columns for col in gaps_data.columns) - expected_stats = ["data_presence_%", "count", "mean", "std", "min", "25%", "50%", "75%", "max"] + expected_stats = [ + "data_presence_%", + "count", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ] assert all(stat in result.index for stat in expected_stats) - + # Check specific values temp_col = "temp__°C__Building" assert result[temp_col]["count"] == 2 @@ -299,12 +320,12 @@ def test_basic_gaps_description(self, gaps_data): def test_gap_thresholds(self, gaps_data): """Test gap analysis with duration thresholds.""" plumber = Plumber(gaps_data) - + # Test minimum duration threshold result = plumber.get_gaps_description(gaps_gte="3h") assert result["temp__°C__Building"]["count"] == 1 assert result["power__W__Building"]["count"] == 1 - + # Test maximum duration threshold result = plumber.get_gaps_description(gaps_lte="2h") assert result["temp__°C__Building"]["count"] == 1 @@ -313,17 +334,15 @@ def test_gap_thresholds(self, gaps_data): def test_gap_analysis_edge_cases(self, time_index): """Test gap analysis edge cases.""" # Test with no gaps - clean_data = pd.DataFrame({ - "temp__°C__Building": np.ones(24) - }, index=time_index) + clean_data = pd.DataFrame({"temp__°C__Building": np.ones(24)}, index=time_index) plumber = Plumber(clean_data) result = plumber.get_gaps_description() assert result.empty - + # Test with invalid selection result = plumber.get_gaps_description(select="nonexistent") assert result.empty - + # Test single point gap data = clean_data.copy() data.loc[data.index[12], "temp__°C__Building"] = np.nan @@ -332,6 +351,7 @@ def test_gap_analysis_edge_cases(self, time_index): assert result["temp__°C__Building"]["count"] == 1 assert pd.Timedelta(result["temp__°C__Building"]["mean"]) == pd.Timedelta("1h") + class TestPlotting: """Tests for plotting functionality.""" @@ -339,10 +359,12 @@ def test_basic_plot(self, gapped_data): """Test basic plotting functionality.""" plumber = Plumber(gapped_data) fig = plumber.plot() - + # Check figure was created assert fig is not None # Check data is present in figure assert len(fig.data) > 0 # Check all columns are plotted - assert all(col in [trace.name for trace in fig.data] for col in gapped_data.columns) + assert all( + col in [trace.name for trace in fig.data] for col in gapped_data.columns + ) diff --git a/tide/plumbing.py b/tide/plumbing.py index d899801..0a49450 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -196,7 +196,7 @@ def get_gaps_description( Empty DataFrame if no gaps are found. """ data = self.get_corrected_data(select, steps=steps, verbose=verbose) - + # Get gaps and calculate durations gaps_dict = get_blocks_lte_and_gte( data=data, @@ -210,14 +210,14 @@ def get_gaps_description( for col, gaps_list in gaps_dict.items(): if not gaps_list: continue - + durations = [] for gap in gaps_list: if len(gap) > 1: durations.append(gap[-1] - gap[0]) else: durations.append(pd.to_timedelta(gap.freq)) - + if durations: gap_durations[col] = pd.Series(durations, name=col) @@ -225,7 +225,7 @@ def get_gaps_description( return pd.DataFrame() stats_df = pd.concat([ser.describe() for ser in gap_durations.values()], axis=1) - + gaps_mask = get_blocks_mask_lte_and_gte( data=data, lte=gaps_lte, @@ -233,11 +233,13 @@ def get_gaps_description( is_null=True, return_combination=return_combination, ) - + presence_percentages = (1 - gaps_mask.mean()) * 100 - + stats_df.loc["data_presence_%"] = presence_percentages[stats_df.columns] - row_order = ["data_presence_%"] + [idx for idx in stats_df.index if idx != "data_presence_%"] + row_order = ["data_presence_%"] + [ + idx for idx in stats_df.index if idx != "data_presence_%" + ] return stats_df.reindex(row_order) def set_data(self, data: pd.Series | pd.DataFrame): diff --git a/tide/utils.py b/tide/utils.py index 9f5ba1b..0ec969c 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -319,7 +319,7 @@ def get_blocks_lte_and_gte( lte: str | dt.timedelta = None, gte: str | dt.timedelta = None, is_null: bool = False, - return_combination: bool = False + return_combination: bool = False, ): """ Get blocks of data ore gaps (nan) based on duration thresholds. @@ -372,7 +372,7 @@ def get_blocks_mask_lte_and_gte( lte: str | dt.timedelta = None, gte: str | dt.timedelta = None, is_null: bool = False, - return_combination: bool = False + return_combination: bool = False, ) -> pd.DataFrame: """ Creates a boolean mask DataFrame indicating the location of data blocks or gaps. From 266a991334339d4a1140a662c7e70ef1c28ed55d Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 18:40:03 +0100 Subject: [PATCH 11/12] =?UTF-8?q?=F0=9F=93=9D=20Add=20Plumber=20doc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/plumbing.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/tide/plumbing.py b/tide/plumbing.py index 0a49450..e9cf1bc 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -120,7 +120,50 @@ def get_pipeline_from_dict( class Plumber: + """A class for managing and transforming time series data through configurable processing pipelines. + + The Plumber class provides a high-level interface for: + - Managing time series data with hierarchical column naming (name__unit__bloc__sub_bloc) + - Creating and executing data processing pipelines + - Analyzing and visualizing data gaps + - Plotting time series with customizable layouts + + The class uses a tree structure to organize data columns based on their tags, + allowing for flexible data selection and manipulation. + + Attributes + ---------- + data : pd.DataFrame + The input time series data with datetime index + root : Node + Root node of the tree structure organizing column names + pipe_dict : dict + Configuration dictionary defining the processing pipeline steps + + Examples + -------- + >>> data = pd.DataFrame({ + ... "temp__°C__zone1": [20, 21, np.nan, 23], + ... "humid__%HR__zone1": [50, 55, 60, np.nan] + ... }, index=pd.date_range("2023", freq="h", periods=4)) + >>> pipe_dict = { + ... "pre_processing": {"°C": [["ReplaceThreshold", {"upper": 25}]]}, + ... "common": [["Interpolate", ["linear"]]] + ... } + >>> plumber = Plumber(data, pipe_dict) + >>> corrected = plumber.get_corrected_data() + """ + def __init__(self, data: pd.Series | pd.DataFrame = None, pipe_dict: dict = None): + """ + Parameters + ---------- + data : pd.Series or pd.DataFrame, optional + Input time series data. Must have a datetime index. + pipe_dict : dict, optional + Pipeline configuration dictionary. Each key represents a processing step + and contains the corresponding transformation parameters. + """ self.data = check_and_return_dt_index_df(data) if data is not None else None self.root = data_columns_to_tree(data.columns) if data is not None else None self.pipe_dict = pipe_dict @@ -146,6 +189,18 @@ def show( steps: None | str | list[str] | slice = slice(None), depth_level: int | str = None, ): + """Display the tree structure of selected data columns at selected steps for + a given depth level. + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Data selection using tide's tag system + steps : None or str or list[str] or slice, default slice(None) + Pipeline steps to apply before showing the tree + depth_level : int or str, optional + Maximum depth level to display in the tree + """ pipe = self.get_pipeline(select=select, steps=steps) loc_tree = data_columns_to_tree(pipe.get_feature_names_out()) if depth_level is not None: @@ -243,6 +298,13 @@ def get_gaps_description( return stats_df.reindex(row_order) def set_data(self, data: pd.Series | pd.DataFrame): + """Set new data for the Plumber instance. + + Parameters + ---------- + data : pd.Series or pd.DataFrame + New time series data to process. Must have a datetime index. + """ self.data = check_and_return_dt_index_df(data) self.root = data_columns_to_tree(data.columns) @@ -250,6 +312,20 @@ def select( self, select: str | pd.Index | list[str] = None, ): + """Select columns based on tags. + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Selection criteria using tide's tag system. + Can be a unit (e.g., "°C"), location (e.g., "zone_1"), + or any other tag in the column names. + + Returns + ------- + pd.Index + Selected column names + """ return parse_request_to_col_names(self.data, select) def get_pipeline( @@ -258,6 +334,22 @@ def get_pipeline( steps: None | str | list[str] | slice = slice(None), verbose: bool = False, ) -> Pipeline: + """Create a scikit-learn pipeline from the configuration. + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Data selection using tide's tag system + steps : None or str or list[str] or slice, default slice(None) + Pipeline steps to include. If None, returns an Identity transformer. + verbose : bool, default False + Whether to print information about pipeline steps + + Returns + ------- + Pipeline + Scikit-learn pipeline configured with the selected steps + """ if self.data is None: raise ValueError("data is required to build a pipeline") selection = parse_request_to_col_names(self.data, select) @@ -280,6 +372,26 @@ def get_corrected_data( steps: None | str | list[str] | slice = slice(None), verbose: bool = False, ) -> pd.DataFrame: + """Apply pipeline transformations to selected data. + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Data selection using tide's tag system + start : str or datetime or Timestamp, optional + Start time for data slice + stop : str or datetime or Timestamp, optional + End time for data slice + steps : None or str or list[str] or slice, default slice(None) + Pipeline steps to apply + verbose : bool, default False + Whether to print information about pipeline steps + + Returns + ------- + pd.DataFrame + Transformed data + """ if self.data is None: raise ValueError("Cannot get corrected data. data are missing") select = parse_request_to_col_names(self.data, select) @@ -299,6 +411,30 @@ def plot_gaps_heatmap( title: str = None, verbose: bool = False, ): + """Create a heatmap visualization of data gaps. + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Data selection using tide's tag system + start : str or datetime or Timestamp, optional + Start time for visualization + stop : str or datetime or Timestamp, optional + End time for visualization + steps : None or str or list[str] or slice, default slice(None) + Pipeline steps to apply before visualization + time_step : str or Timedelta or timedelta, optional + Time step for aggregating gaps + title : str, optional + Plot title + verbose : bool, default False + Whether to print information about pipeline steps + + Returns + ------- + go.Figure + Plotly figure object containing the heatmap + """ data = self.get_corrected_data(select, start, stop, steps, verbose) return plot_gaps_heatmap(data, time_step=time_step, title=title) @@ -328,6 +464,68 @@ def plot( y_title_standoff: int | float = 5, verbose: bool = False, ): + """Create an interactive time series plot. + + Creates a highly customizable plot that can show: + - Multiple time series with automatic different y-axes based on unit + - Two different versions of the data (e.g., raw and processed) + - Data gaps visualization + - Custom styling and layout + + Parameters + ---------- + select : str or pd.Index or list[str], optional + Data selection using tide's tag system + start : str or datetime or Timestamp, optional + Start time for plot + stop : str or datetime or Timestamp, optional + End time for plot + y_axis_level : str, optional + Tag level to use for y-axis grouping + y_tag_list : list[str], optional + List of tags for custom y-axis ordering + steps : None or str or list[str] or slice, default slice(None) + Pipeline steps to apply for main data + data_mode : str, default "lines" + Plot mode for main data ("lines", "markers", or "lines+markers") + steps_2 : None or str or list[str] or slice, optional + Pipeline steps to apply for secondary data + data_2_mode : str, default "markers" + Plot mode for secondary data + markers_opacity : float, default 0.8 + Opacity for markers + lines_width : float, default 2.0 + Width of plot lines + title : str, optional + Plot title + plot_gaps : bool, default False + Whether to highlight gaps in main data + gaps_lower_td : str or Timedelta or timedelta, optional + Minimum duration for gap highlighting + gaps_rgb : tuple[int, int, int], default (31, 73, 125) + RGB color for main data gaps + gaps_alpha : float, default 0.5 + Opacity for main data gaps + plot_gaps_2 : bool, default False + Whether to highlight gaps in secondary data + gaps_2_lower_td : str or Timedelta or timedelta, optional + Minimum duration for secondary data gap highlighting + gaps_2_rgb : tuple[int, int, int], default (254, 160, 34) + RGB color for secondary data gaps + gaps_2_alpha : float, default 0.5 + Opacity for secondary data gaps + axis_space : float, default 0.03 + Space between multiple y-axes + y_title_standoff : int or float, default 5 + Distance between y-axis title and axis + verbose : bool, default False + Whether to print information about pipeline steps + + Returns + ------- + go.Figure + Plotly figure object containing the plot + """ # A bit dirty. Here we assume that if you ask a selection # that is not found in original data columns, it is because it # has not yet been computed (using ExpressionCombine processor From 6f2eeb288dbc19dc8fb7d2871b1809ec0cc28289 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Mon, 3 Mar 2025 18:45:12 +0100 Subject: [PATCH 12/12] =?UTF-8?q?=F0=9F=9A=A8=20plumbing=20linter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/plumbing.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tide/plumbing.py b/tide/plumbing.py index e9cf1bc..8842ab4 100644 --- a/tide/plumbing.py +++ b/tide/plumbing.py @@ -142,13 +142,16 @@ class Plumber: Examples -------- - >>> data = pd.DataFrame({ - ... "temp__°C__zone1": [20, 21, np.nan, 23], - ... "humid__%HR__zone1": [50, 55, 60, np.nan] - ... }, index=pd.date_range("2023", freq="h", periods=4)) + >>> data = pd.DataFrame( + ... { + ... "temp__°C__zone1": [20, 21, np.nan, 23], + ... "humid__%HR__zone1": [50, 55, 60, np.nan], + ... }, + ... index=pd.date_range("2023", freq="h", periods=4), + ... ) >>> pipe_dict = { ... "pre_processing": {"°C": [["ReplaceThreshold", {"upper": 25}]]}, - ... "common": [["Interpolate", ["linear"]]] + ... "common": [["Interpolate", ["linear"]]], ... } >>> plumber = Plumber(data, pipe_dict) >>> corrected = plumber.get_corrected_data() @@ -189,7 +192,7 @@ def show( steps: None | str | list[str] | slice = slice(None), depth_level: int | str = None, ): - """Display the tree structure of selected data columns at selected steps for + """Display the tree structure of selected data columns at selected steps for a given depth level. Parameters