From d2d6d6a6bd7d90f5dd4df4c2ff96b08c7656d5f2 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 11:10:54 +0100
Subject: [PATCH 01/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20get=5Fgaps=5Fmask=5F?=
 =?UTF-8?q?from=5Fblocks=20moved=20to=20utils=20and=20get=5Fdata=5Fblocks?=
 =?UTF-8?q?=20include=20inner=5Fselection=20handling=20if=20gte=20<=20lte?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tide/base.py  | 35 ++++++-------------
 tide/utils.py | 97 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 88 insertions(+), 44 deletions(-)

diff --git a/tide/base.py b/tide/base.py
index 3e66543..877c442 100644
--- a/tide/base.py
+++ b/tide/base.py
@@ -20,6 +20,7 @@
     get_idx_freq_delta_or_min_time_interval,
     get_tags_max_level,
     NAME_LEVEL_MAP,
+    get_gaps_mask_from_blocks,
 )
 
 from tide.meteo import get_oikolab_df
@@ -205,39 +206,25 @@ def __init__(
         self.gaps_gte = gaps_gte
 
     def get_gaps_dict_to_fill(self, X: pd.Series | pd.DataFrame):
-        X = check_and_return_dt_index_df(X)
-        lower_th, upper_th = self.gaps_lte, self.gaps_gte
-        select_inner = False
-
-        if lower_th is not None and upper_th is not None:
-            if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
-                lower_th, upper_th = upper_th, lower_th
-                select_inner = True
-
         return get_data_blocks(
             X,
             is_null=True,
-            select_inner=select_inner,
-            lower_td_threshold=lower_th,
-            upper_td_threshold=upper_th,
+            lower_td_threshold=self.gaps_lte,
+            upper_td_threshold=self.gaps_gte,
             upper_threshold_inclusive=True,
             lower_threshold_inclusive=True,
             return_combination=False,
         )
 
     def get_gaps_mask(self, X: pd.Series | pd.DataFrame):
-        gaps_dict = self.get_gaps_dict_to_fill(X)
-        mask_data = {}
-
-        for col, idx_list in gaps_dict.items():
-            if idx_list:
-                combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index
-                mask_data[col] = X.index.isin(combined_idx)
-            else:
-                mask_data[col] = np.zeros(X.shape[0], dtype=bool)
-
-        df_mask = pd.DataFrame(mask_data, index=X.index)
-        return df_mask
+        return get_gaps_mask_from_blocks(
+            X,
+            is_null=True,
+            lower_td_threshold=self.gaps_lte,
+            upper_td_threshold=self.gaps_gte,
+            lower_threshold_inclusive=True,
+            upper_threshold_inclusive=True,
+        )
 
 
 class BaseOikoMeteo:
diff --git a/tide/utils.py b/tide/utils.py
index b250d35..c42c122 100644
--- a/tide/utils.py
+++ b/tide/utils.py
@@ -318,12 +318,11 @@ def get_data_blocks(
     data: pd.Series | pd.DataFrame,
     is_null: bool = False,
     cols: str | list[str] = None,
-    select_inner: bool = True,
     lower_td_threshold: str | dt.timedelta = None,
     upper_td_threshold: str | dt.timedelta = None,
     lower_threshold_inclusive: bool = True,
     upper_threshold_inclusive: bool = True,
-    return_combination=True,
+    return_combination: bool = True,
 ):
     """
     Identifies groups of valid data if is_null = False, or groups of nan if
@@ -350,10 +349,9 @@ def get_data_blocks(
         Whether to return groups with valid data, or groups of Nan values
         (is_null = True)
     cols : str or list[str], optional
-        The columns in the DataFrame for which to detect gaps. If None (default), all
-        columns are considered.
-    select_inner : Bool, default True
-        Select the groups of data inside or outside the given boundaries
+        Columns to analyze. If None, uses all columns.
+    select_inner : bool, default True
+        If True, select groups within thresholds. If False, select groups outside thresholds.
     lower_td_threshold : str or timedelta, optional
         The minimum duration of a period for it to be considered valid.
         Can be passed as a string (e.g., '1d' for one day) or a `timedelta`.
@@ -381,33 +379,38 @@ def get_data_blocks(
         timestamps where the values in the corresponding column were NaN and
         exceeded the gap threshold.
     """
-
     data = check_and_return_dt_index_df(data)
-
-    if isinstance(cols, str):
-        cols = [cols]
-    elif cols is None:
-        cols = list(data.columns)
-
-    idx_dict = {}
-    for col in cols:
-        idx_dict[col] = get_series_bloc(
+    columns = ensure_list(columns) or list(data.columns)
+
+    # Handle threshold order and adjust select_inner if needed
+    lower_th, upper_th = lower_td_threshold, upper_td_threshold
+    select_inner = False
+    if lower_th is not None and upper_th is not None:
+        if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
+            lower_th, upper_th = upper_th, lower_th
+            select_inner = True
+
+    # Process each column
+    idx_dict = {
+        col: get_series_bloc(
             data[col],
             is_null,
             select_inner,
-            lower_td_threshold,
-            upper_td_threshold,
+            lower_th,
+            upper_th,
             lower_threshold_inclusive,
             upper_threshold_inclusive,
         )
+        for col in cols
+    }
 
     if return_combination:
         idx_dict["combination"] = get_series_bloc(
             ~data.isnull().any(axis=1),
             is_null,
             select_inner,
-            lower_td_threshold,
-            upper_td_threshold,
+            lower_th,
+            upper_th,
             lower_threshold_inclusive,
             upper_threshold_inclusive,
         )
@@ -487,3 +490,57 @@ def ensure_list(item):
     if item is None:
         return []
     return item if isinstance(item, list) else [item]
+
+
+def get_gaps_mask_from_blocks(
+    data: pd.Series | pd.DataFrame,
+    is_null: bool = False,
+    lower_td_threshold: str | dt.timedelta = None,
+    upper_td_threshold: str | dt.timedelta = None,
+    lower_threshold_inclusive: bool = True,
+    upper_threshold_inclusive: bool = True,
+) -> pd.DataFrame:
+    """
+    Creates a boolean mask DataFrame indicating the location of data blocks or gaps.
+
+    Parameters
+    ----------
+    data : pd.Series or pd.DataFrame
+        The input time series data with a DateTime index
+    is_null : bool, default False
+        Whether to find NaN blocks (True) or valid data blocks (False)
+    lower_td_threshold : str or timedelta, optional
+        The minimum duration threshold
+    upper_td_threshold : str or timedelta, optional
+        The maximum duration threshold
+    lower_threshold_inclusive : bool, default True
+        Include the blocks of exactly lower_td_threshold duration
+    upper_threshold_inclusive : bool, default True
+        Include the blocks of exactly upper_td_threshold duration
+
+    Returns
+    -------
+    pd.DataFrame
+        Boolean mask DataFrame with same index as input data and columns
+        corresponding to the input data columns. True values indicate
+        the presence of a block matching the criteria.
+    """
+    gaps_dict = get_data_blocks(
+        data,
+        is_null=is_null,
+        lower_td_threshold=lower_td_threshold,
+        upper_td_threshold=upper_td_threshold,
+        lower_threshold_inclusive=lower_threshold_inclusive,
+        upper_threshold_inclusive=upper_threshold_inclusive,
+        return_combination=False,
+    )
+
+    mask_data = {}
+    for col, idx_list in gaps_dict.items():
+        if idx_list:
+            combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index
+            mask_data[col] = data.index.isin(combined_idx)
+        else:
+            mask_data[col] = np.zeros(data.shape[0], dtype=bool)
+
+    return pd.DataFrame(mask_data, index=data.index)

From 3392ab648bb44fb37b80cc72bcc72214bc8a185a Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 11:12:23 +0100
Subject: [PATCH 02/12] =?UTF-8?q?=F0=9F=9A=A7=20stats=20dev?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_plumbing.py |  3 +--
 tide/plumbing.py       | 51 ++++++++++++++++++++++++++++++++++++++++++
 tide/utils.py          | 15 +++++++++----
 3 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py
index 8e1167e..ac54e9b 100644
--- a/tests/test_plumbing.py
+++ b/tests/test_plumbing.py
@@ -184,7 +184,6 @@ def test_plumber(self):
         plumber.pipe_dict = pipe
         plumber.get_pipeline()
         plumber.get_pipeline(steps=["fill_3", "combine"])
-
         plumber.plot()
-
+        plumber.get_gaps_description()
         assert True
diff --git a/tide/plumbing.py b/tide/plumbing.py
index 115c5ec..2beb39b 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -14,6 +14,7 @@
     get_data_level_values,
     get_tree_depth_from_level,
     NamedList,
+    get_data_blocks
 )
 from tide.plot import (
     plot_gaps_heatmap,
@@ -150,6 +151,56 @@ def show(
             depth_level = get_tree_depth_from_level(loc_tree.max_depth, depth_level)
         loc_tree.show(max_depth=depth_level)
 
+    def get_gaps_description(
+            self,
+            select: str | pd.Index | list[str] = None,
+            steps: None | str | list[str] | slice = slice(None),
+            verbose:bool = False,
+            gaps_lte: str | pd.Timedelta | dt.timedelta = None,
+            gaps_gte: str | pd.Timedelta | dt.timedelta = None,
+            return_combination:bool = True
+    ):
+        data = self.get_corrected_data(select, steps=steps, verbose=verbose)
+
+        lower_th, upper_th = gaps_lte, gaps_gte
+        select_inner = False
+        if lower_th is not None and upper_th is not None:
+            if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
+                lower_th, upper_th = upper_th, lower_th
+                select_inner = True
+
+        nan_blocks = get_data_blocks(
+                data=data,
+                is_null=True,
+                select_inner=select_inner,
+                lower_td_threshold=lower_th,
+                upper_td_threshold=upper_th,
+                return_combination=return_combination,
+        )
+
+
+        ser_list = []
+        for col, gaps_list in nan_blocks.items():
+            gaps_ser = []
+            if gaps_list:
+                for gap in gaps_list:
+                    if gap.shape[0] > 1:
+                        gaps_ser.append(gap[-1] - gap[0])
+                    elif gap.shape[0] == 1:
+                        gaps_ser.append(pd.to_timedelta(gap.freq))
+
+                ser_list.append(pd.Series(gaps_ser, name=col).describe())
+
+        try:
+            res = pd.concat(ser_list, axis=1)
+        except ValueError:
+            res = pd.DataFrame()
+
+        pass
+
+
+
+
     def set_data(self, data: pd.Series | pd.DataFrame):
         self.data = check_and_return_dt_index_df(data)
         self.root = data_columns_to_tree(data.columns)
diff --git a/tide/utils.py b/tide/utils.py
index b250d35..1e99420 100644
--- a/tide/utils.py
+++ b/tide/utils.py
@@ -389,14 +389,21 @@ def get_data_blocks(
     elif cols is None:
         cols = list(data.columns)
 
+    lower_th, upper_th = lower_td_threshold, upper_td_threshold
+    select_inner = False
+    if lower_th is not None and upper_th is not None:
+        if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
+            lower_th, upper_th = upper_th, lower_th
+            select_inner = True
+
     idx_dict = {}
     for col in cols:
         idx_dict[col] = get_series_bloc(
             data[col],
             is_null,
             select_inner,
-            lower_td_threshold,
-            upper_td_threshold,
+            lower_th,
+            upper_th,
             lower_threshold_inclusive,
             upper_threshold_inclusive,
         )
@@ -406,8 +413,8 @@ def get_data_blocks(
             ~data.isnull().any(axis=1),
             is_null,
             select_inner,
-            lower_td_threshold,
-            upper_td_threshold,
+            lower_th,
+            upper_th,
             lower_threshold_inclusive,
             upper_threshold_inclusive,
         )

From ae39b718b6a6f839b6c233ee15c6e67535a7ab43 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 11:18:34 +0100
Subject: [PATCH 03/12] =?UTF-8?q?=F0=9F=94=80=20merge=20and=20adapt=20plum?=
 =?UTF-8?q?ber=20stats?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tide/base.py     |  1 -
 tide/plumbing.py | 31 +++++++++++++------------------
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/tide/base.py b/tide/base.py
index 877c442..12ced32 100644
--- a/tide/base.py
+++ b/tide/base.py
@@ -4,7 +4,6 @@
 import typing
 from abc import ABC, abstractmethod
 
-import numpy as np
 import pandas as pd
 
 from sklearn.base import TransformerMixin, BaseEstimator
diff --git a/tide/plumbing.py b/tide/plumbing.py
index 2beb39b..6929668 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -14,7 +14,7 @@
     get_data_level_values,
     get_tree_depth_from_level,
     NamedList,
-    get_data_blocks
+    get_data_blocks,
 )
 from tide.plot import (
     plot_gaps_heatmap,
@@ -152,13 +152,13 @@ def show(
         loc_tree.show(max_depth=depth_level)
 
     def get_gaps_description(
-            self,
-            select: str | pd.Index | list[str] = None,
-            steps: None | str | list[str] | slice = slice(None),
-            verbose:bool = False,
-            gaps_lte: str | pd.Timedelta | dt.timedelta = None,
-            gaps_gte: str | pd.Timedelta | dt.timedelta = None,
-            return_combination:bool = True
+        self,
+        select: str | pd.Index | list[str] = None,
+        steps: None | str | list[str] | slice = slice(None),
+        verbose: bool = False,
+        gaps_lte: str | pd.Timedelta | dt.timedelta = None,
+        gaps_gte: str | pd.Timedelta | dt.timedelta = None,
+        return_combination: bool = True,
     ):
         data = self.get_corrected_data(select, steps=steps, verbose=verbose)
 
@@ -170,15 +170,13 @@ def get_gaps_description(
                 select_inner = True
 
         nan_blocks = get_data_blocks(
-                data=data,
-                is_null=True,
-                select_inner=select_inner,
-                lower_td_threshold=lower_th,
-                upper_td_threshold=upper_th,
-                return_combination=return_combination,
+            data=data,
+            is_null=True,
+            lower_td_threshold=lower_th,
+            upper_td_threshold=upper_th,
+            return_combination=return_combination,
         )
 
-
         ser_list = []
         for col, gaps_list in nan_blocks.items():
             gaps_ser = []
@@ -198,9 +196,6 @@ def get_gaps_description(
 
         pass
 
-
-
-
     def set_data(self, data: pd.Series | pd.DataFrame):
         self.data = check_and_return_dt_index_df(data)
         self.root = data_columns_to_tree(data.columns)

From 2d4f8ba42bff178652caa7b24f5ba4cd53b3cf9f Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 16:24:34 +0100
Subject: [PATCH 04/12] =?UTF-8?q?=E2=9C=A8=20get=5Fblocks=5Flte=5Fand=5Fgt?=
 =?UTF-8?q?e,=20get=5Fblocks=5Fmask=5Flte=5Fand=5Fgte?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_utils.py | 112 +++++++++++++++++++++++++++---
 tide/base.py        |  21 +++---
 tide/utils.py       | 165 ++++++++++++++++++++++++++------------------
 3 files changed, 207 insertions(+), 91 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index d7a6b82..9461b9c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -13,7 +13,9 @@
     parse_request_to_col_names,
     timedelta_to_int,
     NamedList,
-    get_series_bloc,
+    _get_series_bloc,
+    get_blocks_lte_and_gte,
+    get_blocks_mask_lte_and_gte,
     edit_tag_value_by_level,
 )
 
@@ -125,7 +127,7 @@ def test_get_series_bloc(self):
         toy_holes.loc["2009-01-01 05:00:00":"2009-01-01 08:00:00"] = np.nan
         toy_holes.loc["2009-01-01 12:00:00":"2009-01-01 16:00:00"] = np.nan
 
-        get_series_bloc(
+        _get_series_bloc(
             toy_holes,
             is_null=True,
             upper_td_threshold="3h",
@@ -133,15 +135,15 @@ def test_get_series_bloc(self):
         )
 
         # All data groups
-        assert len(get_series_bloc(toy_holes)) == 4
+        assert len(_get_series_bloc(toy_holes)) == 4
 
         # All gaps groups
-        assert len(get_series_bloc(toy_holes, is_null=True)) == 3
+        assert len(_get_series_bloc(toy_holes, is_null=True)) == 3
 
         # Gaps Inner bounds, one inclusive
         assert (
             len(
-                get_series_bloc(
+                _get_series_bloc(
                     toy_holes,
                     is_null=True,
                     select_inner=True,
@@ -157,7 +159,7 @@ def test_get_series_bloc(self):
         # Gaps outer selection, one inclusive
         assert (
             len(
-                get_series_bloc(
+                _get_series_bloc(
                     toy_holes,
                     is_null=True,
                     select_inner=False,
@@ -175,7 +177,7 @@ def test_get_series_bloc(self):
             [np.nan, 1, 2, np.nan, 3, 4, np.nan],
             index=pd.date_range("2009", freq="h", periods=7, tz="UTC"),
         )
-        res = get_series_bloc(ser, is_null=True)
+        res = _get_series_bloc(ser, is_null=True)
         assert len(res) == 3
 
         # No gaps case
@@ -183,7 +185,7 @@ def test_get_series_bloc(self):
             [0.0, 1.0, 2.0, 2.5, 3, 4, 5.0],
             index=pd.date_range("2009", freq="h", periods=7, tz="UTC"),
         )
-        res = get_series_bloc(ser, is_null=True)
+        res = _get_series_bloc(ser, is_null=True)
 
         assert res == []
 
@@ -192,7 +194,7 @@ def test_get_series_bloc(self):
             [0.0, 1.0, 2.0, np.nan, 3, 4, 5.0],
             index=pd.date_range("2009", freq="h", periods=7, tz="UTC"),
         )
-        res = get_series_bloc(ser, is_null=True)
+        res = _get_series_bloc(ser, is_null=True)
 
         assert len(res) == 1
 
@@ -259,6 +261,98 @@ def test_get_data_blocks(self):
         )
         assert res["data_1"] == []
 
+    def test_get_blocks_lte_and_gte(self):
+        toy_df = pd.DataFrame(
+            {"data_1": np.random.randn(24), "data_2": np.random.randn(24)},
+            index=pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC"),
+        )
+
+        toy_df.loc["2009-01-01 01:00:00", "data_1"] = np.nan
+        toy_df.loc["2009-01-01 10:00:00":"2009-01-01 12:00:00", "data_1"] = np.nan
+        toy_df.loc["2009-01-01 15:00:00":"2009-01-01 23:00:00", "data_2"] = np.nan
+
+        res = get_blocks_lte_and_gte(toy_df, "1h30min", "8h", True)
+        assert len(res["data_1"]) == 1 and len(res["data_2"]) == 1
+
+        res = get_blocks_lte_and_gte(toy_df, lte="8h", gte="1h30min", is_null=True)
+        assert len(res["data_1"]) == 1 and len(res["data_2"]) == 0
+
+    def test_get_blocks_mask_lte_and_gte(self):
+        toy_df = pd.DataFrame(
+            {"data_1": np.random.randn(24), "data_2": np.random.randn(24)},
+            index=pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC"),
+        )
+
+        toy_df.loc["2009-01-01 01:00:00", "data_1"] = np.nan
+        toy_df.loc["2009-01-01 10:00:00":"2009-01-01 12:00:00", "data_1"] = np.nan
+        toy_df.loc["2009-01-01 15:00:00":"2009-01-01 23:00:00", "data_2"] = np.nan
+
+        res = get_blocks_mask_lte_and_gte(toy_df, "1h30min", "8h", True)
+        np.testing.assert_array_equal(
+            res.values,
+            np.array(
+                [
+                    [False, False],
+                    [True, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                    [False, True],
+                ]
+            ),
+        )
+
+        res = get_blocks_mask_lte_and_gte(toy_df, lte="8h", gte="1h30min", is_null=True)
+        np.testing.assert_array_equal(
+            res.values,
+            np.array(
+                [
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [True, False],
+                    [True, False],
+                    [True, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                    [False, False],
+                ]
+            ),
+        )
+
     def test_outer_timestamps(self):
         ref_index = pd.date_range("2009-01-01", freq="d", periods=5, tz="UTC")
         idx = pd.date_range("2009-01-02", freq="d", periods=2, tz="UTC")
diff --git a/tide/base.py b/tide/base.py
index 12ced32..52693d7 100644
--- a/tide/base.py
+++ b/tide/base.py
@@ -15,11 +15,11 @@
     timedelta_to_int,
     validate_odd_param,
     process_stl_odd_args,
-    get_data_blocks,
+    get_blocks_lte_and_gte,
     get_idx_freq_delta_or_min_time_interval,
     get_tags_max_level,
     NAME_LEVEL_MAP,
-    get_gaps_mask_from_blocks,
+    get_blocks_mask_lte_and_gte,
 )
 
 from tide.meteo import get_oikolab_df
@@ -205,24 +205,19 @@ def __init__(
         self.gaps_gte = gaps_gte
 
     def get_gaps_dict_to_fill(self, X: pd.Series | pd.DataFrame):
-        return get_data_blocks(
+        return get_blocks_lte_and_gte(
             X,
             is_null=True,
-            lower_td_threshold=self.gaps_lte,
-            upper_td_threshold=self.gaps_gte,
-            upper_threshold_inclusive=True,
-            lower_threshold_inclusive=True,
-            return_combination=False,
+            lte=self.gaps_lte,
+            gte=self.gaps_gte,
         )
 
     def get_gaps_mask(self, X: pd.Series | pd.DataFrame):
-        return get_gaps_mask_from_blocks(
+        return get_blocks_mask_lte_and_gte(
             X,
             is_null=True,
-            lower_td_threshold=self.gaps_lte,
-            upper_td_threshold=self.gaps_gte,
-            lower_threshold_inclusive=True,
-            upper_threshold_inclusive=True,
+            lte=self.gaps_lte,
+            gte=self.gaps_gte,
         )
 
 
diff --git a/tide/utils.py b/tide/utils.py
index 1cb4f6d..cbbfdb4 100644
--- a/tide/utils.py
+++ b/tide/utils.py
@@ -245,7 +245,7 @@ def _upper_bound(series, bound, bound_inclusive: bool, inner: bool):
     return op(series, bound)
 
 
-def get_series_bloc(
+def _get_series_bloc(
     date_series: pd.Series,
     is_null: bool = False,
     select_inner: bool = True,
@@ -314,12 +314,101 @@ def get_series_bloc(
     ]
 
 
+def get_blocks_lte_and_gte(
+    data: pd.Series | pd.DataFrame,
+    lte: str | dt.timedelta = None,
+    gte: str | dt.timedelta = None,
+    is_null: bool = False,
+):
+    """
+    Get blocks of data or gaps (nan) based on duration thresholds.
+
+    Returns them in a dictionary of lists of DatetimeIndex. The keys are the
+    data column names (or the Series name if data is a Series).
+
+
+    Parameters:
+    -----------
+    data : pd.Series or pd.DataFrame
+        The input data to be processed.
+    lte : str or datetime.timedelta, optional
+        The upper time threshold. Can be a string (e.g., '1h') or a timedelta object.
+    gte : str or datetime.timedelta, optional
+        The lower time threshold. Can be a string (e.g., '30min') or a timedelta object.
+    is_null : bool, default False
+        Whether to select blocks where the data is null.
+
+    Notes:
+    ------
+    - If both `lte` and `gte` are provided and `lte` is greater than `gte`, blocks
+    whose duration lies between `gte` and `lte` are selected. Otherwise, blocks
+    with a duration below `lte` or above `gte` are selected.
+    """
+
+    lower_th, upper_th = lte, gte
+    select_inner = False
+    if lower_th is not None and upper_th is not None:
+        if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
+            lower_th, upper_th = upper_th, lower_th
+            select_inner = True
+
+    return get_data_blocks(
+        data=data,
+        is_null=is_null,
+        lower_td_threshold=lower_th,
+        upper_td_threshold=upper_th,
+        select_inner=select_inner,
+        return_combination=False,
+    )
+
+
+def get_blocks_mask_lte_and_gte(
+    data: pd.Series | pd.DataFrame,
+    lte: str | dt.timedelta = None,
+    gte: str | dt.timedelta = None,
+    is_null: bool = False,
+) -> pd.DataFrame:
+    """
+    Creates a boolean mask DataFrame indicating the location of data blocks or gaps.
+
+    Parameters
+    ----------
+    data : pd.Series or pd.DataFrame
+        The input time series data with a DateTime index
+    lte : str or timedelta, optional
+        The upper time threshold (blocks lasting at most `lte`)
+    gte : str or timedelta, optional
+        The lower time threshold (blocks lasting at least `gte`)
+    is_null : bool, default False
+        Whether to find NaN blocks (True) or valid data blocks (False)
+
+    Returns
+    -------
+    pd.DataFrame
+        Boolean mask DataFrame with same index as input data and columns
+        corresponding to the input data columns. True values indicate
+        the presence of a block matching the criteria.
+    """
+    gaps_dict = get_blocks_lte_and_gte(data, lte, gte, is_null)
+
+    mask_data = {}
+    for col, idx_list in gaps_dict.items():
+        if idx_list:
+            combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index
+            mask_data[col] = data.index.isin(combined_idx)
+        else:
+            mask_data[col] = np.zeros(data.shape[0], dtype=bool)
+
+    return pd.DataFrame(mask_data, index=data.index)
+
+
 def get_data_blocks(
     data: pd.Series | pd.DataFrame,
     is_null: bool = False,
     cols: str | list[str] = None,
     lower_td_threshold: str | dt.timedelta = None,
     upper_td_threshold: str | dt.timedelta = None,
+    select_inner: bool = True,
     lower_threshold_inclusive: bool = True,
     upper_threshold_inclusive: bool = True,
     return_combination: bool = True,
@@ -382,22 +471,14 @@ def get_data_blocks(
     data = check_and_return_dt_index_df(data)
     cols = ensure_list(cols) or list(data.columns)
 
-    # Handle threshold order and adjust select_inner if needed
-    lower_th, upper_th = lower_td_threshold, upper_td_threshold
-    select_inner = False
-    if lower_th is not None and upper_th is not None:
-        if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
-            lower_th, upper_th = upper_th, lower_th
-            select_inner = True
-
     # Process each column
     idx_dict = {
-        col: get_series_bloc(
+        col: _get_series_bloc(
             data[col],
             is_null,
             select_inner,
-            lower_th,
-            upper_th,
+            lower_td_threshold,
+            upper_td_threshold,
             lower_threshold_inclusive,
             upper_threshold_inclusive,
         )
@@ -405,12 +486,12 @@ def get_data_blocks(
     }
 
     if return_combination:
-        idx_dict["combination"] = get_series_bloc(
+        idx_dict["combination"] = _get_series_bloc(
             ~data.isnull().any(axis=1),
             is_null,
             select_inner,
-            lower_th,
-            upper_th,
+            lower_td_threshold,
+            upper_td_threshold,
             lower_threshold_inclusive,
             upper_threshold_inclusive,
         )
@@ -490,57 +571,3 @@ def ensure_list(item):
     if item is None:
         return []
     return item if isinstance(item, list) else [item]
-
-
-def get_gaps_mask_from_blocks(
-    data: pd.Series | pd.DataFrame,
-    is_null: bool = False,
-    lower_td_threshold: str | dt.timedelta = None,
-    upper_td_threshold: str | dt.timedelta = None,
-    lower_threshold_inclusive: bool = True,
-    upper_threshold_inclusive: bool = True,
-) -> pd.DataFrame:
-    """
-    Creates a boolean mask DataFrame indicating the location of data blocks or gaps.
-
-    Parameters
-    ----------
-    data : pd.Series or pd.DataFrame
-        The input time series data with a DateTime index
-    is_null : bool, default False
-        Whether to find NaN blocks (True) or valid data blocks (False)
-    lower_td_threshold : str or timedelta, optional
-        The minimum duration threshold
-    upper_td_threshold : str or timedelta, optional
-        The maximum duration threshold
-    lower_threshold_inclusive : bool, default True
-        Include the blocks of exactly lower_td_threshold duration
-    upper_threshold_inclusive : bool, default True
-        Include the blocks of exactly upper_td_threshold duration
-
-    Returns
-    -------
-    pd.DataFrame
-        Boolean mask DataFrame with same index as input data and columns
-        corresponding to the input data columns. True values indicate
-        the presence of a block matching the criteria.
-    """
-    gaps_dict = get_data_blocks(
-        data,
-        is_null=is_null,
-        lower_td_threshold=lower_td_threshold,
-        upper_td_threshold=upper_td_threshold,
-        lower_threshold_inclusive=lower_threshold_inclusive,
-        upper_threshold_inclusive=upper_threshold_inclusive,
-        return_combination=False,
-    )
-
-    mask_data = {}
-    for col, idx_list in gaps_dict.items():
-        if idx_list:
-            combined_idx = pd.concat([idx.to_series() for idx in idx_list]).index
-            mask_data[col] = data.index.isin(combined_idx)
-        else:
-            mask_data[col] = np.zeros(data.shape[0], dtype=bool)
-
-    return pd.DataFrame(mask_data, index=data.index)

From b5926c051107e867d9db48fd947358bf745f895b Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 17:05:00 +0100
Subject: [PATCH 05/12] =?UTF-8?q?=F0=9F=9A=A7=20untest=20get=5Fgaps=5Fdesc?=
 =?UTF-8?q?ription?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tide/plumbing.py | 106 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 75 insertions(+), 31 deletions(-)

diff --git a/tide/plumbing.py b/tide/plumbing.py
index 6929668..d899801 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -14,7 +14,8 @@
     get_data_level_values,
     get_tree_depth_from_level,
     NamedList,
-    get_data_blocks,
+    get_blocks_lte_and_gte,
+    get_blocks_mask_lte_and_gte,
 )
 from tide.plot import (
     plot_gaps_heatmap,
@@ -159,42 +160,85 @@ def get_gaps_description(
         gaps_lte: str | pd.Timedelta | dt.timedelta = None,
         gaps_gte: str | pd.Timedelta | dt.timedelta = None,
         return_combination: bool = True,
-    ):
+    ) -> pd.DataFrame:
+        """
+        Get statistical description of gaps durations in the data.
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Data selection using tide's tag system
+        steps : None or str or list[str] or slice, default slice(None)
+            Pipeline steps to apply before analyzing gaps
+        verbose : bool, default False
+            Whether to print information about pipeline steps
+        gaps_lte : str or pd.Timedelta or dt.timedelta, optional
+            Upper threshold for gap duration
+        gaps_gte : str or pd.Timedelta or dt.timedelta, optional
+            Lower threshold for gap duration
+        return_combination : bool, default True
+            Whether to include statistics for gaps present in any column
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame containing statistics about gap durations for each column.
+            Statistics include:
+            - data_presence_%: percentage of non-gap data points
+            - count: number of gaps
+            - mean: average gap duration
+            - std: standard deviation of gap durations
+            - min: shortest gap
+            - 25%: first quartile
+            - 50%: median
+            - 75%: third quartile
+            - max: longest gap
+            Empty DataFrame if no gaps are found.
+        """
         data = self.get_corrected_data(select, steps=steps, verbose=verbose)
-
-        lower_th, upper_th = gaps_lte, gaps_gte
-        select_inner = False
-        if lower_th is not None and upper_th is not None:
-            if pd.to_timedelta(lower_th) > pd.to_timedelta(upper_th):
-                lower_th, upper_th = upper_th, lower_th
-                select_inner = True
-
-        nan_blocks = get_data_blocks(
+        
+        # Get gaps and calculate durations
+        gaps_dict = get_blocks_lte_and_gte(
             data=data,
+            lte=gaps_lte,
+            gte=gaps_gte,
             is_null=True,
-            lower_td_threshold=lower_th,
-            upper_td_threshold=upper_th,
             return_combination=return_combination,
         )
 
-        ser_list = []
-        for col, gaps_list in nan_blocks.items():
-            gaps_ser = []
-            if gaps_list:
-                for gap in gaps_list:
-                    if gap.shape[0] > 1:
-                        gaps_ser.append(gap[-1] - gap[0])
-                    elif gap.shape[0] == 1:
-                        gaps_ser.append(pd.to_timedelta(gap.freq))
-
-                ser_list.append(pd.Series(gaps_ser, name=col).describe())
-
-        try:
-            res = pd.concat(ser_list, axis=1)
-        except ValueError:
-            res = pd.DataFrame()
-
-        pass
+        gap_durations = {}
+        for col, gaps_list in gaps_dict.items():
+            if not gaps_list:
+                continue
+                
+            durations = []
+            for gap in gaps_list:
+                if len(gap) > 1:
+                    durations.append(gap[-1] - gap[0])
+                else:
+                    durations.append(pd.to_timedelta(gap.freq))
+            
+            if durations:
+                gap_durations[col] = pd.Series(durations, name=col)
+
+        if not gap_durations:
+            return pd.DataFrame()
+
+        stats_df = pd.concat([ser.describe() for ser in gap_durations.values()], axis=1)
+        
+        gaps_mask = get_blocks_mask_lte_and_gte(
+            data=data,
+            lte=gaps_lte,
+            gte=gaps_gte,
+            is_null=True,
+            return_combination=return_combination,
+        )
+        
+        presence_percentages = (1 - gaps_mask.mean()) * 100
+        
+        stats_df.loc["data_presence_%"] = presence_percentages[stats_df.columns]
+        row_order = ["data_presence_%"] + [idx for idx in stats_df.index if idx != "data_presence_%"]
+        return stats_df.reindex(row_order)
 
     def set_data(self, data: pd.Series | pd.DataFrame):
         self.data = check_and_return_dt_index_df(data)

From 447266bce90e80fe24b5f2e718d0f924bf9c72ff Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 17:05:25 +0100
Subject: [PATCH 06/12] =?UTF-8?q?=E2=9C=85=20TestGapsDescription?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_plumbing.py | 125 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py
index ac54e9b..9633fd3 100644
--- a/tests/test_plumbing.py
+++ b/tests/test_plumbing.py
@@ -10,6 +10,7 @@
 )
 
 import plotly.io as pio
+import pytest
 
 pio.renderers.default = "browser"
 
@@ -187,3 +188,127 @@ def test_plumber(self):
         plumber.plot()
         plumber.get_gaps_description()
         assert True
+
+
+class TestGapsDescription:
+    @pytest.fixture
+    def sample_data(self):
+        # Create sample data with known gaps
+        idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC")
+        data = pd.DataFrame({
+            "temp__°C__Building": np.ones(24),
+            "humidity__%__Building": np.ones(24),
+            "power__W__Building": np.ones(24)
+        }, index=idx)
+        
+        # Create gaps of different durations
+        data.loc["2023-01-01 02:00":"2023-01-01 04:00", "temp__°C__Building"] = np.nan  # 3h gap
+        data.loc["2023-01-01 08:00", "temp__°C__Building"] = np.nan  # 1h gap
+        data.loc["2023-01-01 12:00":"2023-01-01 14:00", "humidity__%__Building"] = np.nan  # 3h gap
+        data.loc["2023-01-01 06:00":"2023-01-01 18:00", "power__W__Building"] = np.nan  # 13h gap
+        
+        return data
+
+    def test_basic_gaps_description(self, sample_data):
+        """Test basic functionality with default parameters"""
+        plumber = Plumber(sample_data)
+        result = plumber.get_gaps_description()
+        
+        # Check presence of all columns
+        assert all(col in result.columns for col in sample_data.columns)
+        
+        # Check presence of all statistics
+        expected_stats = ["data_presence_%", "count", "mean", "std", "min", "25%", "50%", "75%", "max"]
+        assert all(stat in result.index for stat in expected_stats)
+        
+        # Check specific values for temp column
+        temp_col = "temp__°C__Building"
+        assert result[temp_col]["count"] == 2  # Two gaps
+        assert result[temp_col]["data_presence_%"] == pytest.approx(83.33, rel=1e-2)  # 20/24 hours present
+
+    def test_with_duration_thresholds(self, sample_data):
+        """Test with gap duration thresholds"""
+        plumber = Plumber(sample_data)
+        
+        # Only gaps >= 3h
+        result = plumber.get_gaps_description(gaps_gte="3h")
+        assert result["temp__°C__Building"]["count"] == 1  # Only one 3h gap
+        assert result["power__W__Building"]["count"] == 1  # One 13h gap
+        
+        # Only gaps <= 2h
+        result = plumber.get_gaps_description(gaps_lte="2h")
+        assert result["temp__°C__Building"]["count"] == 1  # Only one 1h gap
+        assert "power__W__Building" not in result.columns  # No gaps <= 2h
+
+    def test_with_data_selection(self, sample_data):
+        """Test with data selection using tags"""
+        plumber = Plumber(sample_data)
+        
+        # Select by unit
+        result = plumber.get_gaps_description(select="°C")
+        assert list(result.columns) == ["temp__°C__Building"]
+        
+        # Select by bloc
+        result = plumber.get_gaps_description(select="Building")
+        assert len(result.columns) == 3
+
+    def test_empty_cases(self):
+        """Test cases that should return empty DataFrame"""
+        # Data with no gaps
+        idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC")
+        clean_data = pd.DataFrame({
+            "temp__°C__Building": np.ones(24)
+        }, index=idx)
+        plumber = Plumber(clean_data)
+        
+        result = plumber.get_gaps_description()
+        assert result.empty
+        
+        # Data selection that returns no columns
+        plumber = Plumber(clean_data)
+        result = plumber.get_gaps_description(select="nonexistent")
+        assert result.empty
+
+    def test_combination_flag(self, sample_data):
+        """Test with and without return_combination flag"""
+        plumber = Plumber(sample_data)
+        
+        # With combination
+        result = plumber.get_gaps_description(return_combination=True)
+        assert "combination" in result.columns
+        
+        # Without combination
+        result = plumber.get_gaps_description(return_combination=False)
+        assert "combination" not in result.columns
+
+    def test_single_point_gaps(self):
+        """Test handling of single-point gaps"""
+        idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC")
+        data = pd.DataFrame({
+            "temp__°C__Building": np.ones(24)
+        }, index=idx)
+        
+        # Create single point gap
+        data.loc["2023-01-01 12:00", "temp__°C__Building"] = np.nan
+        
+        plumber = Plumber(data)
+        result = plumber.get_gaps_description()
+        
+        assert result["temp__°C__Building"]["count"] == 1
+        assert pd.Timedelta(result["temp__°C__Building"]["mean"]) == pd.Timedelta("1h")
+
+    def test_pipeline_steps(self, sample_data):
+        """Test with pipeline steps"""
+        plumber = Plumber(sample_data)
+        plumber.pipe_dict = {
+            "step1": [["Identity"]],  # Simple identity transformation
+            "step2": [["Identity"]]
+        }
+        
+        # Test with specific steps
+        result = plumber.get_gaps_description(steps=["step1"])
+        assert not result.empty
+        
+        # Test with no steps
+        result = plumber.get_gaps_description(steps=None)
+        assert not result.empty

From 68d725209bd7cf0347df26678c62a95e21f59795 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 17:19:21 +0100
Subject: [PATCH 07/12] =?UTF-8?q?=E2=9C=85=20update=20plumbing=20tests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_plumbing.py | 468 +++++++++++++++++++++--------------------
 1 file changed, 238 insertions(+), 230 deletions(-)

diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py
index 9633fd3..850e53b 100644
--- a/tests/test_plumbing.py
+++ b/tests/test_plumbing.py
@@ -1,7 +1,6 @@
 import pandas as pd
-
 import numpy as np
-
+import pytest
 from tide.plumbing import (
     _get_pipe_from_proc_list,
     _get_column_wise_transformer,
@@ -10,131 +9,149 @@
 )
 
 import plotly.io as pio
-import pytest
 
 pio.renderers.default = "browser"
 
-TEST_DF = pd.DataFrame(
-    {
-        "Tin__°C__building": [10.0, 20.0, 30.0],
-        "Text__°C__outdoor": [-1.0, 5.0, 4.0],
-        "radiation__W/m2__outdoor": [50, 100, 400],
-        "Humidity__%HR": [10, 15, 13],
-        "Humidity__%HR__room1": [20, 30, 50],
-        "Humidity_2": [10, 15, 13],
-        "light__DIMENSIONLESS__building": [100, 200, 300],
-        "mass_flwr__m3/h__hvac": [300, 500, 600],
-    },
-    index=pd.date_range("2009", freq="h", periods=3, tz="UTC"),
-)
-
-TEST_DF_2 = pd.DataFrame(
-    {
-        "a__°C__zone_1": np.random.randn(24),
-        "b__°C__zone_1": np.random.randn(24),
-        "c__Wh__zone_2": np.random.randn(24) * 100,
-    },
-    index=pd.date_range("2009", freq="h", periods=24, tz="UTC"),
-)
-
-TEST_DF_2["c__Wh__zone_2"] = abs(TEST_DF_2).cumsum()["c__Wh__zone_2"]
-
-TEST_DF_2.loc["2009-01-01 05:00:00":"2009-01-01 09:00:00", "a__°C__zone_1"] = np.nan
-TEST_DF_2.loc["2009-01-01 15:00:00", "b__°C__zone_1"] = np.nan
-TEST_DF_2.loc["2009-01-01 17:00:00", "b__°C__zone_1"] = np.nan
-TEST_DF_2.loc["2009-01-01 20:00:00", "c__Wh__zone_2"] = np.nan
-
-PIPE_DICT = {
-    "pre_processing": {
-        "°C": [["ReplaceThreshold", {"upper": 25}]],
-        "W/m2__outdoor": [["DropTimeGradient", {"upper_rate": -100}]],
-    },
-    "common": [["Interpolate", ["linear"]], ["Ffill"], ["Bfill", {"limit": 3}]],
-    "resampling": [["Resample", ["3h", "mean", {"W/m2": "sum"}]]],
-    "compute_energy": [
-        [
-            "ExpressionCombine",
+@pytest.fixture
+def time_index():
+    """Create a standard time index for test data."""
+    return pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC")
+
+@pytest.fixture
+def basic_data(time_index):
+    """Create basic test data with various units and tags."""
+    return pd.DataFrame(
+        {
+            "Tin__°C__building": np.random.randn(24) * 5 + 20,
+            "Text__°C__outdoor": np.random.randn(24) * 3 + 10,
+            "radiation__W/m2__outdoor": np.abs(np.random.randn(24)) * 100,
+            "Humidity__%HR": np.random.randn(24) * 5 + 50,
+            "Humidity__%HR__room1": np.random.randn(24) * 5 + 45,
+            "Humidity_2": np.random.randn(24) * 5 + 55,
+            "light__DIMENSIONLESS__building": np.abs(np.random.randn(24)) * 200,
+            "mass_flwr__m3/h__hvac": np.abs(np.random.randn(24)) * 400 + 500,
+        },
+        index=time_index,
+    )
+
+@pytest.fixture
+def gapped_data(time_index):
+    """Create test data with specific gaps for testing gap-related functionality."""
+    data = pd.DataFrame(
+        {
+            "a__°C__zone_1": np.random.randn(24),
+            "b__°C__zone_1": np.random.randn(24),
+            "c__Wh__zone_2": np.abs(np.random.randn(24) * 100),
+        },
+        index=time_index,
+    )
+    
+    # Add cumulative sum to energy data
+    data["c__Wh__zone_2"] = data["c__Wh__zone_2"].cumsum()
+    
+    # Add specific gaps
+    data.loc["2009-01-01 05:00":"2009-01-01 09:00", "a__°C__zone_1"] = np.nan  # 5h gap
+    data.loc["2009-01-01 15:00", "b__°C__zone_1"] = np.nan  # 1h gap
+    data.loc["2009-01-01 17:00", "b__°C__zone_1"] = np.nan  # 1h gap
+    data.loc["2009-01-01 20:00", "c__Wh__zone_2"] = np.nan  # 1h gap
+    
+    return data
+
+@pytest.fixture
+def pipe_dict():
+    """Create a standard pipeline dictionary for testing."""
+    return {
+        "pre_processing": {
+            "°C": [["ReplaceThreshold", {"upper": 25}]],
+            "W/m2__outdoor": [["DropTimeGradient", {"upper_rate": -100}]],
+        },
+        "common": [
+            ["Interpolate", ["linear"]],
+            ["Ffill"],
+            ["Bfill", {"limit": 3}]
+        ],
+        "resampling": [
+            ["Resample", ["3h", "mean", {"W/m2": "sum"}]]
+        ],
+        "compute_energy": [
             [
-                {
-                    "T1": "Tin__°C__building",
-                    "T2": "Text__°C__outdoor",
-                    "m": "mass_flwr__m3/h__hvac",
-                },
-                "(T1 - T2) * m * 1004 * 1.204",
-                "Air_flow_energy__hvac__J",
-                True,
-            ],
-        ]
-    ],
-}
-
-
-class TestPlumbing:
-    def test__get_all_data_step(self):
-        test_df = TEST_DF.copy()
+                "ExpressionCombine",
+                [
+                    {
+                        "T1": "Tin__°C__building",
+                        "T2": "Text__°C__outdoor",
+                        "m": "mass_flwr__m3/h__hvac",
+                    },
+                    "(T1 - T2) * m * 1004 * 1.204",
+                    "Air_flow_energy__hvac__J",
+                    True,
+                ],
+            ]
+        ],
+    }
+
+class TestPipelineComponents:
+    """Tests for individual pipeline components and transformers."""
+    
+    def test_pipe_from_proc_list(self, basic_data, pipe_dict):
+        """Test creation and application of processing pipeline from list."""
+        # Create gap in data
+        test_df = basic_data.copy()
         test_df.iloc[1, 0] = np.nan
         test_df.iloc[0, 1] = np.nan
-        pipe = _get_pipe_from_proc_list(test_df.columns, PIPE_DICT["common"], tz="UTC")
-
-        res = pipe.fit_transform(test_df)
-
+        
+        # Create and apply pipeline
+        pipe = _get_pipe_from_proc_list(test_df.columns, pipe_dict["common"], tz="UTC")
+        result = pipe.fit_transform(test_df)
+        
+        # Check original data preserved where no gaps
         pd.testing.assert_series_equal(
-            res["Tin__°C__building"], TEST_DF["Tin__°C__building"]
+            result["Tin__°C__building"],
+            basic_data["Tin__°C__building"]
         )
-        assert float(res.iloc[0, 1]) == 5.0
-
-    def test__get_column_wise_transformer(self):
-        col_trans = _get_column_wise_transformer(
-            proc_dict=PIPE_DICT["pre_processing"],
-            data_columns=TEST_DF.columns,
+        
+        # Check gap filling worked
+        assert not result.isna().any().any()
+        assert result.iloc[0, 1] == pytest.approx(test_df.iloc[1, 1])
+
+    def test_column_wise_transformer(self, basic_data, pipe_dict):
+        """Test column-wise transformer creation and application."""
+        # Test with all columns
+        transformer = _get_column_wise_transformer(
+            proc_dict=pipe_dict["pre_processing"],
+            data_columns=basic_data.columns,
             tz="UTC",
             process_name="test",
         )
-
-        res = col_trans.fit_transform(TEST_DF.copy())
-
-        np.testing.assert_array_equal(res.iloc[:, 0].to_list(), [10.0, 20.0, np.nan])
-        np.testing.assert_array_equal(res.iloc[:, 2].to_list(), [50.0, 100.0, np.nan])
-
-        col_trans = _get_column_wise_transformer(
-            proc_dict=PIPE_DICT["pre_processing"],
-            data_columns=TEST_DF[
-                [col for col in TEST_DF.columns if col != "radiation__W/m2__outdoor"]
-            ].columns,
+        result = transformer.fit_transform(basic_data.copy())
+        
+        # Check temperature threshold applied
+        assert (result["Tin__°C__building"] <= 25).all()
+        
+        # Test with subset of columns
+        temp_cols = [col for col in basic_data.columns if "°C" in col]
+        transformer = _get_column_wise_transformer(
+            proc_dict=pipe_dict["pre_processing"],
+            data_columns=temp_cols,
             tz="UTC",
             process_name="test",
         )
-
-        res = col_trans.fit_transform(
-            TEST_DF[
-                [col for col in TEST_DF.columns if col != "radiation__W/m2__outdoor"]
-            ].copy()
-        )
-
-        np.testing.assert_array_equal(res.iloc[:, 0].to_list(), [10.0, 20.0, np.nan])
-        assert len(col_trans.transformers_) == 2
-
-        cols_none = [
-            "Humidity__%HR",
-            "Humidity__%HR__room1",
-            "Humidity_2",
-            "light__DIMENSIONLESS__building",
-            "mass_flwr__m3/h__hvac",
-        ]
-
-        col_trans = _get_column_wise_transformer(
-            proc_dict=PIPE_DICT["pre_processing"],
-            data_columns=cols_none,
+        assert len(transformer.transformers_) == 1
+        
+        # Test with no matching columns
+        humidity_cols = [col for col in basic_data.columns if "%HR" in col]
+        transformer = _get_column_wise_transformer(
+            proc_dict=pipe_dict["pre_processing"],
+            data_columns=humidity_cols,
             tz="UTC",
             process_name="test",
         )
+        assert transformer is None
 
-        assert col_trans is None
-
-    def test_get_pipeline_from_dict(self):
+    def test_pipeline_from_dict(self, gapped_data):
+        """Test creation of full pipeline from dictionary configuration."""
         pipe_dict = {
             "fill_1": {"a__°C__zone_1": [["Interpolate"]]},
-            # "fill_2": {"b": [["Interpolate"]]},
             "combine": [
                 [
                     "ExpressionCombine",
@@ -149,166 +166,157 @@ def test_get_pipeline_from_dict(self):
                     ],
                 ]
             ],
-            "fill_3": [["Interpolate"]],
-        }
-
-        pipe = get_pipeline_from_dict(TEST_DF_2.columns, pipe_dict, verbose=True)
-        pipe.fit_transform(TEST_DF_2.copy())
-
-        assert True
-
-    def test_plumber(self):
-        pipe = {
-            "fill_1": {"a__°C__zone_1": [["Interpolate"]]},
-            "fill_2": {"b": [["Interpolate"]]},
-            "combine": {
-                "zone_1": [
-                    [
-                        "ExpressionCombine",
-                        [
-                            {
-                                "T1": "a__°C__zone_1",
-                                "T2": "b__°C__zone_1",
-                            },
-                            "T1 * T2",
-                            "new_unit__°C²__zone_1",
-                            True,
-                        ],
-                    ]
-                ],
-            },
-            "fill_3": [["Interpolate"]],
+            "fill_final": [["Interpolate"]],
         }
-
-        plumber = Plumber()
-        plumber.set_data(TEST_DF_2)
-        plumber.pipe_dict = pipe
-        plumber.get_pipeline()
-        plumber.get_pipeline(steps=["fill_3", "combine"])
-        plumber.plot()
-        plumber.get_gaps_description()
-        assert True
-
+        
+        pipe = get_pipeline_from_dict(gapped_data.columns, pipe_dict, verbose=True)
+        result = pipe.fit_transform(gapped_data.copy())
+        
+        # Check new column created
+        assert "new_unit__°C²__zone_1" in result.columns
+        
+        # Check gaps filled
+        assert not result.isna().any().any()
+
+class TestPlumber:
+    """Tests for the Plumber class functionality."""
+    
+    def test_initialization(self, gapped_data, pipe_dict):
+        """Test Plumber initialization and basic attributes."""
+        plumber = Plumber(gapped_data, pipe_dict)
+        assert plumber.data is not None
+        assert plumber.root is not None
+        assert plumber.pipe_dict == pipe_dict
+
+    def test_data_selection(self, gapped_data):
+        """Test data selection using tags."""
+        plumber = Plumber(gapped_data)
+        
+        # Test unit selection
+        temp_cols = plumber.select("°C")
+        assert len(temp_cols) == 2
+        assert all("°C" in col for col in temp_cols)
+        
+        # Test zone selection
+        zone_1_cols = plumber.select("zone_1")
+        assert len(zone_1_cols) == 2
+        assert all("zone_1" in col for col in zone_1_cols)
+
+    def test_pipeline_execution(self, gapped_data, pipe_dict):
+        """Test pipeline execution with different step selections."""
+        plumber = Plumber(gapped_data, pipe_dict)
+        
+        # Test full pipeline
+        full_pipe = plumber.get_pipeline()
+        assert len(full_pipe.steps) > 0
+        
+        # Test partial pipeline
+        partial_pipe = plumber.get_pipeline(steps=["pre_processing"])
+        assert len(partial_pipe.steps) == 1
+        
+        # Test with no pipeline
+        identity_pipe = plumber.get_pipeline(steps=None)
+        assert len(identity_pipe.steps) == 1
+        assert identity_pipe.steps[0][0] == "Identity"
+
+    def test_corrected_data(self, gapped_data, pipe_dict):
+        """Test data correction through pipeline."""
+        plumber = Plumber(gapped_data, pipe_dict)
+        
+        # Test with time slice
+        result = plumber.get_corrected_data(
+            start="2009-01-01 05:00",
+            stop="2009-01-01 10:00"
+        )
+        assert len(result) == 6
+        
+        # Test with column selection
+        result = plumber.get_corrected_data(select="°C")
+        assert len(result.columns) == 2
+        assert all("°C" in col for col in result.columns)
 
 class TestGapsDescription:
+    """Tests for gap analysis functionality."""
+    
     @pytest.fixture
-    def sample_data(self):
-        # Create sample data with known gaps
-        idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC")
+    def gaps_data(self, time_index):
+        """Create data with specific gaps for testing gap analysis."""
         data = pd.DataFrame({
             "temp__°C__Building": np.ones(24),
             "humidity__%__Building": np.ones(24),
             "power__W__Building": np.ones(24)
-        }, index=idx)
+        }, index=time_index)
         
         # Create gaps of different durations
-        data.loc["2023-01-01 02:00":"2023-01-01 04:00", "temp__°C__Building"] = np.nan  # 3h gap
-        data.loc["2023-01-01 08:00", "temp__°C__Building"] = np.nan  # 1h gap
-        data.loc["2023-01-01 12:00":"2023-01-01 14:00", "humidity__%__Building"] = np.nan  # 3h gap
-        data.loc["2023-01-01 06:00":"2023-01-01 18:00", "power__W__Building"] = np.nan  # 13h gap
+        data.loc["2009-01-01 02:00":"2009-01-01 04:00", "temp__°C__Building"] = np.nan
+        data.loc["2009-01-01 08:00", "temp__°C__Building"] = np.nan
+        data.loc["2009-01-01 12:00":"2009-01-01 14:00", "humidity__%__Building"] = np.nan
+        data.loc["2009-01-01 06:00":"2009-01-01 18:00", "power__W__Building"] = np.nan
         
         return data
 
-    def test_basic_gaps_description(self, sample_data):
-        """Test basic functionality with default parameters"""
-        plumber = Plumber(sample_data)
+    def test_basic_gaps_description(self, gaps_data):
+        """Test basic gap analysis functionality."""
+        plumber = Plumber(gaps_data)
         result = plumber.get_gaps_description()
         
-        # Check presence of all columns
-        assert all(col in result.columns for col in sample_data.columns)
-        
-        # Check presence of all statistics
+        # Check structure
+        assert all(col in result.columns for col in gaps_data.columns)
         expected_stats = ["data_presence_%", "count", "mean", "std", "min", "25%", "50%", "75%", "max"]
         assert all(stat in result.index for stat in expected_stats)
         
-        # Check specific values for temp column
+        # Check specific values
         temp_col = "temp__°C__Building"
-        assert result[temp_col]["count"] == 2  # Two gaps
-        assert result[temp_col]["data_presence_%"] == pytest.approx(83.33, rel=1e-2)  # 20/24 hours present
+        assert result[temp_col]["count"] == 2
+        assert result[temp_col]["data_presence_%"] == pytest.approx(83.33, rel=1e-2)
 
-    def test_with_duration_thresholds(self, sample_data):
-        """Test with gap duration thresholds"""
-        plumber = Plumber(sample_data)
+    def test_gap_thresholds(self, gaps_data):
+        """Test gap analysis with duration thresholds."""
+        plumber = Plumber(gaps_data)
         
-        # Only gaps >= 3h
+        # Test minimum duration threshold
         result = plumber.get_gaps_description(gaps_gte="3h")
-        assert result["temp__°C__Building"]["count"] == 1  # Only one 3h gap
-        assert result["power__W__Building"]["count"] == 1  # One 13h gap
+        assert result["temp__°C__Building"]["count"] == 1
+        assert result["power__W__Building"]["count"] == 1
         
-        # Only gaps <= 2h
+        # Test maximum duration threshold
         result = plumber.get_gaps_description(gaps_lte="2h")
-        assert result["temp__°C__Building"]["count"] == 1  # Only one 1h gap
-        assert "power__W__Building" not in result.columns  # No gaps <= 2h
-
-    def test_with_data_selection(self, sample_data):
-        """Test with data selection using tags"""
-        plumber = Plumber(sample_data)
-        
-        # Select by unit
-        result = plumber.get_gaps_description(select="°C")
-        assert list(result.columns) == ["temp__°C__Building"]
-        
-        # Select by bloc
-        result = plumber.get_gaps_description(select="Building")
-        assert len(result.columns) == 3
+        assert result["temp__°C__Building"]["count"] == 1
+        assert "power__W__Building" not in result.columns
 
-    def test_empty_cases(self):
-        """Test cases that should return empty DataFrame"""
-        # Data with no gaps
-        idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC")
+    def test_gap_analysis_edge_cases(self, time_index):
+        """Test gap analysis edge cases."""
+        # Test with no gaps
         clean_data = pd.DataFrame({
             "temp__°C__Building": np.ones(24)
-        }, index=idx)
+        }, index=time_index)
         plumber = Plumber(clean_data)
-        
         result = plumber.get_gaps_description()
         assert result.empty
         
-        # Data selection that returns no columns
-        plumber = Plumber(clean_data)
+        # Test with invalid selection
         result = plumber.get_gaps_description(select="nonexistent")
         assert result.empty
-
-    def test_combination_flag(self, sample_data):
-        """Test with and without return_combination flag"""
-        plumber = Plumber(sample_data)
-        
-        # With combination
-        result = plumber.get_gaps_description(return_combination=True)
-        assert "combination" in result.columns
-        
-        # Without combination
-        result = plumber.get_gaps_description(return_combination=False)
-        assert "combination" not in result.columns
-
-    def test_single_point_gaps(self):
-        """Test handling of single-point gaps"""
-        idx = pd.date_range("2023-01-01", periods=24, freq="1h", tz="UTC")
-        data = pd.DataFrame({
-            "temp__°C__Building": np.ones(24)
-        }, index=idx)
-        
-        # Create single point gap
-        data.loc["2023-01-01 12:00", "temp__°C__Building"] = np.nan
         
+        # Test single point gap
+        data = clean_data.copy()
+        data.loc[data.index[12], "temp__°C__Building"] = np.nan
         plumber = Plumber(data)
         result = plumber.get_gaps_description()
-        
         assert result["temp__°C__Building"]["count"] == 1
         assert pd.Timedelta(result["temp__°C__Building"]["mean"]) == pd.Timedelta("1h")
 
-    def test_pipeline_steps(self, sample_data):
-        """Test with pipeline steps"""
-        plumber = Plumber(sample_data)
-        plumber.pipe_dict = {
-            "step1": [["Identity"]],  # Simple identity transformation
-            "step2": [["Identity"]]
-        }
-        
-        # Test with specific steps
-        result = plumber.get_gaps_description(steps=["step1"])
-        assert not result.empty
+class TestPlotting:
+    """Tests for plotting functionality."""
+
+    def test_basic_plot(self, gapped_data):
+        """Test basic plotting functionality."""
+        plumber = Plumber(gapped_data)
+        fig = plumber.plot()
         
-        # Test with no steps
-        result = plumber.get_gaps_description(steps=None)
-        assert not result.empty
+        # Check figure was created
+        assert fig is not None
+        # Check data is present in figure
+        assert len(fig.data) > 0
+        # Check all columns are plotted
+        assert all(col in [trace.name for trace in fig.data] for col in gapped_data.columns)

From 0c2f73d09cd1e72143dd1aac5195493d584357b9 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 18:09:40 +0100
Subject: [PATCH 08/12] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20get=5Fblocks=5Flte?=
 =?UTF-8?q?=5Fand=5Fgte,=20get=5Fblocks=5Fmask=5Flte=5Fand=5Fgte,=20return?=
 =?UTF-8?q?=5Fcombination=20added?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tide/utils.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tide/utils.py b/tide/utils.py
index cbbfdb4..9f5ba1b 100644
--- a/tide/utils.py
+++ b/tide/utils.py
@@ -319,6 +319,7 @@ def get_blocks_lte_and_gte(
     lte: str | dt.timedelta = None,
     gte: str | dt.timedelta = None,
     is_null: bool = False,
+    return_combination: bool = False
 ):
     """
     Get blocks of data ore gaps (nan) based on duration thresholds.
@@ -343,6 +344,10 @@ def get_blocks_lte_and_gte(
     - If both `lte` and `gte` are provided, and `lte` is smaller than `gte`, they
     will be swapped. The function determines whether to select data within or outside
     the boundaries based on the order of thresholds.
+    return_combination : bool, default False
+        If True, a combination column is created that checks for NaNs
+        across all columns in the DataFrame. Gaps in this combination column represent
+        rows where NaNs are present in any of the columns.
     """
 
     lower_th, upper_th = lte, gte
@@ -358,7 +363,7 @@ def get_blocks_lte_and_gte(
         lower_td_threshold=lower_th,
         upper_td_threshold=upper_th,
         select_inner=select_inner,
-        return_combination=False,
+        return_combination=return_combination,
     )
 
 
@@ -367,6 +372,7 @@ def get_blocks_mask_lte_and_gte(
     lte: str | dt.timedelta = None,
     gte: str | dt.timedelta = None,
     is_null: bool = False,
+    return_combination: bool = False
 ) -> pd.DataFrame:
     """
     Creates a boolean mask DataFrame indicating the location of data blocks or gaps.
@@ -381,6 +387,10 @@ def get_blocks_mask_lte_and_gte(
         The maximum duration threshold
     is_null : bool, default False
         Whether to find NaN blocks (True) or valid data blocks (False)
+    return_combination : bool, default False
+        If True, a combination column is created that checks for NaNs
+        across all columns in the DataFrame. Gaps in this combination column represent
+        rows where NaNs are present in any of the columns.
 
     Returns
     -------
@@ -389,7 +399,7 @@ def get_blocks_mask_lte_and_gte(
         corresponding to the input data columns. True values indicate
         the presence of a block matching the criteria.
     """
-    gaps_dict = get_blocks_lte_and_gte(data, lte, gte, is_null)
+    gaps_dict = get_blocks_lte_and_gte(data, lte, gte, is_null, return_combination)
 
     mask_data = {}
     for col, idx_list in gaps_dict.items():

From 19fd13ecdd33ff850ac5aa5f4f28041b2a713b95 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 18:10:46 +0100
Subject: [PATCH 09/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20big=20plumber=20test?=
 =?UTF-8?q?=20refactoring?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_plumbing.py | 104 +++++++++++++++++++++++++----------------
 1 file changed, 65 insertions(+), 39 deletions(-)

diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py
index 850e53b..1d80f96 100644
--- a/tests/test_plumbing.py
+++ b/tests/test_plumbing.py
@@ -93,60 +93,91 @@ def pipe_dict():
 class TestPipelineComponents:
     """Tests for individual pipeline components and transformers."""
     
-    def test_pipe_from_proc_list(self, basic_data, pipe_dict):
+    def test_pipe_from_proc_list(self, pipe_dict):
         """Test creation and application of processing pipeline from list."""
-        # Create gap in data
-        test_df = basic_data.copy()
-        test_df.iloc[1, 0] = np.nan
-        test_df.iloc[0, 1] = np.nan
-        
-        # Create and apply pipeline
+        test_df = pd.DataFrame({
+            "temp__°C__building": [10.0, np.nan, 20.0, 30.0],
+            "humid__%HR__building": [50.0, 60.0, np.nan, 80.0]
+        }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC"))
+
         pipe = _get_pipe_from_proc_list(test_df.columns, pipe_dict["common"], tz="UTC")
         result = pipe.fit_transform(test_df)
-        
-        # Check original data preserved where no gaps
-        pd.testing.assert_series_equal(
-            result["Tin__°C__building"],
-            basic_data["Tin__°C__building"]
-        )
-        
-        # Check gap filling worked
+
+        # Check that gaps were filled with interpolation
         assert not result.isna().any().any()
-        assert result.iloc[0, 1] == pytest.approx(test_df.iloc[1, 1])
+        # For temp: 10 -> [15] -> 20 -> 30 (linear interpolation)
+        assert result.iloc[1]["temp__°C__building"] == pytest.approx(15.0)
+        # For humid: 50 -> 60 -> [70] -> 80 (linear interpolation)
+        assert result.iloc[2]["humid__%HR__building"] == pytest.approx(70.0)
+
+        # Check that non-gap values remain unchanged
+        assert result.iloc[0]["temp__°C__building"] == 10.0
+        assert result.iloc[3]["temp__°C__building"] == 30.0
+        assert result.iloc[0]["humid__%HR__building"] == 50.0
+        assert result.iloc[1]["humid__%HR__building"] == 60.0
 
-    def test_column_wise_transformer(self, basic_data, pipe_dict):
+    def test_column_wise_transformer(self, pipe_dict):
         """Test column-wise transformer creation and application."""
+        # Create controlled test data with known values
+        test_df = pd.DataFrame({
+            "temp1__°C__zone1": [24.0, 26.0, np.nan, 28.0],
+            # Two values above threshold
+            "temp2__°C__zone2": [23.0, 25.0, 27.0, np.nan],
+            # One value above threshold
+            "radiation__W/m2__outdoor": [100, 200, 50, 150],  # For gradient test
+            "humid__%HR__zone1": [50.0, 60.0, 70.0, 80.0]  # Should be unaffected
+        }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC"))
+
         # Test with all columns
         transformer = _get_column_wise_transformer(
             proc_dict=pipe_dict["pre_processing"],
-            data_columns=basic_data.columns,
+            data_columns=test_df.columns,
             tz="UTC",
             process_name="test",
         )
-        result = transformer.fit_transform(basic_data.copy())
-        
-        # Check temperature threshold applied
-        assert (result["Tin__°C__building"] <= 25).all()
-        
-        # Test with subset of columns
-        temp_cols = [col for col in basic_data.columns if "°C" in col]
+        result = transformer.fit_transform(test_df.copy())
+
+        # Check temperature threshold applied (excluding NaN)
+        temp1_mask = ~pd.isna(result["temp1__°C__zone1"])
+        temp2_mask = ~pd.isna(result["temp2__°C__zone2"])
+        assert (result["temp1__°C__zone1"][temp1_mask] <= 25).all()
+        assert (result["temp2__°C__zone2"][temp2_mask] <= 25).all()
+
+        # Verify specific values
+        assert result.iloc[0]["temp1__°C__zone1"] == 24.0  # Unchanged
+        assert result.iloc[1]["temp2__°C__zone2"] == 25.0  # Capped
+        assert pd.isna(result.iloc[2]["temp1__°C__zone1"])  # NaN preserved
+        assert pd.isna(result.iloc[3]["temp1__°C__zone1"])  # Capped
+
+        # Check radiation gradient (should drop when rate < -100)
+        assert pd.isna(result.iloc[2][
+                           "radiation__W/m2__outdoor"])  # Dropped due to steep negative gradient
+
+        # Check humidity unaffected
+        pd.testing.assert_series_equal(
+            result["humid__%HR__zone1"],
+            test_df["humid__%HR__zone1"]
+        )
+
+        # Test with subset of columns (temperature only)
+        temp_cols = [col for col in test_df.columns if "°C" in col]
         transformer = _get_column_wise_transformer(
             proc_dict=pipe_dict["pre_processing"],
             data_columns=temp_cols,
             tz="UTC",
             process_name="test",
         )
-        assert len(transformer.transformers_) == 1
-        
+        assert len(transformer.transformers_) == 1  # Only temperature transformer
+
         # Test with no matching columns
-        humidity_cols = [col for col in basic_data.columns if "%HR" in col]
+        humidity_cols = [col for col in test_df.columns if "%HR" in col]
         transformer = _get_column_wise_transformer(
             proc_dict=pipe_dict["pre_processing"],
             data_columns=humidity_cols,
             tz="UTC",
             process_name="test",
         )
-        assert transformer is None
+        assert transformer is None  # No transformers needed
 
     def test_pipeline_from_dict(self, gapped_data):
         """Test creation of full pipeline from dictionary configuration."""
@@ -202,9 +233,9 @@ def test_data_selection(self, gapped_data):
         assert len(zone_1_cols) == 2
         assert all("zone_1" in col for col in zone_1_cols)
 
-    def test_pipeline_execution(self, gapped_data, pipe_dict):
+    def test_pipeline_execution(self, basic_data, pipe_dict):
         """Test pipeline execution with different step selections."""
-        plumber = Plumber(gapped_data, pipe_dict)
+        plumber = Plumber(basic_data, pipe_dict)
         
         # Test full pipeline
         full_pipe = plumber.get_pipeline()
@@ -219,21 +250,16 @@ def test_pipeline_execution(self, gapped_data, pipe_dict):
         assert len(identity_pipe.steps) == 1
         assert identity_pipe.steps[0][0] == "Identity"
 
-    def test_corrected_data(self, gapped_data, pipe_dict):
+    def test_corrected_data(self, basic_data, pipe_dict):
         """Test data correction through pipeline."""
-        plumber = Plumber(gapped_data, pipe_dict)
+        plumber = Plumber(basic_data, pipe_dict)
         
         # Test with time slice
         result = plumber.get_corrected_data(
             start="2009-01-01 05:00",
             stop="2009-01-01 10:00"
         )
-        assert len(result) == 6
-        
-        # Test with column selection
-        result = plumber.get_corrected_data(select="°C")
-        assert len(result.columns) == 2
-        assert all("°C" in col for col in result.columns)
+        assert len(result) == 3
 
 class TestGapsDescription:
     """Tests for gap analysis functionality."""

From e4c5ea03a795a4ab72077a77ff0ea2ad3643f103 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 18:12:05 +0100
Subject: [PATCH 10/12] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20big=20plumber=20test?=
 =?UTF-8?q?=20refactoring?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/test_plumbing.py | 144 ++++++++++++++++++++++++-----------------
 tide/plumbing.py       |  16 +++--
 tide/utils.py          |   4 +-
 3 files changed, 94 insertions(+), 70 deletions(-)

diff --git a/tests/test_plumbing.py b/tests/test_plumbing.py
index 1d80f96..079e818 100644
--- a/tests/test_plumbing.py
+++ b/tests/test_plumbing.py
@@ -12,11 +12,13 @@
 
 pio.renderers.default = "browser"
 
+
 @pytest.fixture
 def time_index():
     """Create a standard time index for test data."""
     return pd.date_range("2009-01-01", freq="h", periods=24, tz="UTC")
 
+
 @pytest.fixture
 def basic_data(time_index):
     """Create basic test data with various units and tags."""
@@ -34,6 +36,7 @@ def basic_data(time_index):
         index=time_index,
     )
 
+
 @pytest.fixture
 def gapped_data(time_index):
     """Create test data with specific gaps for testing gap-related functionality."""
@@ -45,18 +48,19 @@ def gapped_data(time_index):
         },
         index=time_index,
     )
-    
+
     # Add cumulative sum to energy data
     data["c__Wh__zone_2"] = data["c__Wh__zone_2"].cumsum()
-    
+
     # Add specific gaps
     data.loc["2009-01-01 05:00":"2009-01-01 09:00", "a__°C__zone_1"] = np.nan  # 5h gap
     data.loc["2009-01-01 15:00", "b__°C__zone_1"] = np.nan  # 1h gap
     data.loc["2009-01-01 17:00", "b__°C__zone_1"] = np.nan  # 1h gap
     data.loc["2009-01-01 20:00", "c__Wh__zone_2"] = np.nan  # 1h gap
-    
+
     return data
 
+
 @pytest.fixture
 def pipe_dict():
     """Create a standard pipeline dictionary for testing."""
@@ -65,14 +69,8 @@ def pipe_dict():
             "°C": [["ReplaceThreshold", {"upper": 25}]],
             "W/m2__outdoor": [["DropTimeGradient", {"upper_rate": -100}]],
         },
-        "common": [
-            ["Interpolate", ["linear"]],
-            ["Ffill"],
-            ["Bfill", {"limit": 3}]
-        ],
-        "resampling": [
-            ["Resample", ["3h", "mean", {"W/m2": "sum"}]]
-        ],
+        "common": [["Interpolate", ["linear"]], ["Ffill"], ["Bfill", {"limit": 3}]],
+        "resampling": [["Resample", ["3h", "mean", {"W/m2": "sum"}]]],
         "compute_energy": [
             [
                 "ExpressionCombine",
@@ -90,15 +88,19 @@ def pipe_dict():
         ],
     }
 
+
 class TestPipelineComponents:
     """Tests for individual pipeline components and transformers."""
-    
+
     def test_pipe_from_proc_list(self, pipe_dict):
         """Test creation and application of processing pipeline from list."""
-        test_df = pd.DataFrame({
-            "temp__°C__building": [10.0, np.nan, 20.0, 30.0],
-            "humid__%HR__building": [50.0, 60.0, np.nan, 80.0]
-        }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC"))
+        test_df = pd.DataFrame(
+            {
+                "temp__°C__building": [10.0, np.nan, 20.0, 30.0],
+                "humid__%HR__building": [50.0, 60.0, np.nan, 80.0],
+            },
+            index=pd.date_range("2009", freq="h", periods=4, tz="UTC"),
+        )
 
         pipe = _get_pipe_from_proc_list(test_df.columns, pipe_dict["common"], tz="UTC")
         result = pipe.fit_transform(test_df)
@@ -119,14 +121,17 @@ def test_pipe_from_proc_list(self, pipe_dict):
     def test_column_wise_transformer(self, pipe_dict):
         """Test column-wise transformer creation and application."""
         # Create controlled test data with known values
-        test_df = pd.DataFrame({
-            "temp1__°C__zone1": [24.0, 26.0, np.nan, 28.0],
-            # Two values above threshold
-            "temp2__°C__zone2": [23.0, 25.0, 27.0, np.nan],
-            # One value above threshold
-            "radiation__W/m2__outdoor": [100, 200, 50, 150],  # For gradient test
-            "humid__%HR__zone1": [50.0, 60.0, 70.0, 80.0]  # Should be unaffected
-        }, index=pd.date_range("2009", freq="h", periods=4, tz="UTC"))
+        test_df = pd.DataFrame(
+            {
+                "temp1__°C__zone1": [24.0, 26.0, np.nan, 28.0],
+                # Two values above threshold
+                "temp2__°C__zone2": [23.0, 25.0, 27.0, np.nan],
+                # One value above threshold
+                "radiation__W/m2__outdoor": [100, 200, 50, 150],  # For gradient test
+                "humid__%HR__zone1": [50.0, 60.0, 70.0, 80.0],  # Should be unaffected
+            },
+            index=pd.date_range("2009", freq="h", periods=4, tz="UTC"),
+        )
 
         # Test with all columns
         transformer = _get_column_wise_transformer(
@@ -150,13 +155,13 @@ def test_column_wise_transformer(self, pipe_dict):
         assert pd.isna(result.iloc[3]["temp1__°C__zone1"])  # Capped
 
         # Check radiation gradient (should drop when rate < -100)
-        assert pd.isna(result.iloc[2][
-                           "radiation__W/m2__outdoor"])  # Dropped due to steep negative gradient
+        assert pd.isna(
+            result.iloc[2]["radiation__W/m2__outdoor"]
+        )  # Dropped due to steep negative gradient
 
         # Check humidity unaffected
         pd.testing.assert_series_equal(
-            result["humid__%HR__zone1"],
-            test_df["humid__%HR__zone1"]
+            result["humid__%HR__zone1"], test_df["humid__%HR__zone1"]
         )
 
         # Test with subset of columns (temperature only)
@@ -199,19 +204,20 @@ def test_pipeline_from_dict(self, gapped_data):
             ],
             "fill_final": [["Interpolate"]],
         }
-        
+
         pipe = get_pipeline_from_dict(gapped_data.columns, pipe_dict, verbose=True)
         result = pipe.fit_transform(gapped_data.copy())
-        
+
         # Check new column created
         assert "new_unit__°C²__zone_1" in result.columns
-        
+
         # Check gaps filled
         assert not result.isna().any().any()
 
+
 class TestPlumber:
     """Tests for the Plumber class functionality."""
-    
+
     def test_initialization(self, gapped_data, pipe_dict):
         """Test Plumber initialization and basic attributes."""
         plumber = Plumber(gapped_data, pipe_dict)
@@ -222,12 +228,12 @@ def test_initialization(self, gapped_data, pipe_dict):
     def test_data_selection(self, gapped_data):
         """Test data selection using tags."""
         plumber = Plumber(gapped_data)
-        
+
         # Test unit selection
         temp_cols = plumber.select("°C")
         assert len(temp_cols) == 2
         assert all("°C" in col for col in temp_cols)
-        
+
         # Test zone selection
         zone_1_cols = plumber.select("zone_1")
         assert len(zone_1_cols) == 2
@@ -236,15 +242,15 @@ def test_data_selection(self, gapped_data):
     def test_pipeline_execution(self, basic_data, pipe_dict):
         """Test pipeline execution with different step selections."""
         plumber = Plumber(basic_data, pipe_dict)
-        
+
         # Test full pipeline
         full_pipe = plumber.get_pipeline()
         assert len(full_pipe.steps) > 0
-        
+
         # Test partial pipeline
         partial_pipe = plumber.get_pipeline(steps=["pre_processing"])
         assert len(partial_pipe.steps) == 1
-        
+
         # Test with no pipeline
         identity_pipe = plumber.get_pipeline(steps=None)
         assert len(identity_pipe.steps) == 1
@@ -253,44 +259,59 @@ def test_pipeline_execution(self, basic_data, pipe_dict):
     def test_corrected_data(self, basic_data, pipe_dict):
         """Test data correction through pipeline."""
         plumber = Plumber(basic_data, pipe_dict)
-        
+
         # Test with time slice
         result = plumber.get_corrected_data(
-            start="2009-01-01 05:00",
-            stop="2009-01-01 10:00"
+            start="2009-01-01 05:00", stop="2009-01-01 10:00"
         )
         assert len(result) == 3
 
+
 class TestGapsDescription:
     """Tests for gap analysis functionality."""
-    
+
     @pytest.fixture
     def gaps_data(self, time_index):
         """Create data with specific gaps for testing gap analysis."""
-        data = pd.DataFrame({
-            "temp__°C__Building": np.ones(24),
-            "humidity__%__Building": np.ones(24),
-            "power__W__Building": np.ones(24)
-        }, index=time_index)
-        
+        data = pd.DataFrame(
+            {
+                "temp__°C__Building": np.ones(24),
+                "humidity__%__Building": np.ones(24),
+                "power__W__Building": np.ones(24),
+            },
+            index=time_index,
+        )
+
         # Create gaps of different durations
         data.loc["2009-01-01 02:00":"2009-01-01 04:00", "temp__°C__Building"] = np.nan
         data.loc["2009-01-01 08:00", "temp__°C__Building"] = np.nan
-        data.loc["2009-01-01 12:00":"2009-01-01 14:00", "humidity__%__Building"] = np.nan
+        data.loc["2009-01-01 12:00":"2009-01-01 14:00", "humidity__%__Building"] = (
+            np.nan
+        )
         data.loc["2009-01-01 06:00":"2009-01-01 18:00", "power__W__Building"] = np.nan
-        
+
         return data
 
     def test_basic_gaps_description(self, gaps_data):
         """Test basic gap analysis functionality."""
         plumber = Plumber(gaps_data)
         result = plumber.get_gaps_description()
-        
+
         # Check structure
         assert all(col in result.columns for col in gaps_data.columns)
-        expected_stats = ["data_presence_%", "count", "mean", "std", "min", "25%", "50%", "75%", "max"]
+        expected_stats = [
+            "data_presence_%",
+            "count",
+            "mean",
+            "std",
+            "min",
+            "25%",
+            "50%",
+            "75%",
+            "max",
+        ]
         assert all(stat in result.index for stat in expected_stats)
-        
+
         # Check specific values
         temp_col = "temp__°C__Building"
         assert result[temp_col]["count"] == 2
@@ -299,12 +320,12 @@ def test_basic_gaps_description(self, gaps_data):
     def test_gap_thresholds(self, gaps_data):
         """Test gap analysis with duration thresholds."""
         plumber = Plumber(gaps_data)
-        
+
         # Test minimum duration threshold
         result = plumber.get_gaps_description(gaps_gte="3h")
         assert result["temp__°C__Building"]["count"] == 1
         assert result["power__W__Building"]["count"] == 1
-        
+
         # Test maximum duration threshold
         result = plumber.get_gaps_description(gaps_lte="2h")
         assert result["temp__°C__Building"]["count"] == 1
@@ -313,17 +334,15 @@ def test_gap_thresholds(self, gaps_data):
     def test_gap_analysis_edge_cases(self, time_index):
         """Test gap analysis edge cases."""
         # Test with no gaps
-        clean_data = pd.DataFrame({
-            "temp__°C__Building": np.ones(24)
-        }, index=time_index)
+        clean_data = pd.DataFrame({"temp__°C__Building": np.ones(24)}, index=time_index)
         plumber = Plumber(clean_data)
         result = plumber.get_gaps_description()
         assert result.empty
-        
+
         # Test with invalid selection
         result = plumber.get_gaps_description(select="nonexistent")
         assert result.empty
-        
+
         # Test single point gap
         data = clean_data.copy()
         data.loc[data.index[12], "temp__°C__Building"] = np.nan
@@ -332,6 +351,7 @@ def test_gap_analysis_edge_cases(self, time_index):
         assert result["temp__°C__Building"]["count"] == 1
         assert pd.Timedelta(result["temp__°C__Building"]["mean"]) == pd.Timedelta("1h")
 
+
 class TestPlotting:
     """Tests for plotting functionality."""
 
@@ -339,10 +359,12 @@ def test_basic_plot(self, gapped_data):
         """Test basic plotting functionality."""
         plumber = Plumber(gapped_data)
         fig = plumber.plot()
-        
+
         # Check figure was created
         assert fig is not None
         # Check data is present in figure
         assert len(fig.data) > 0
         # Check all columns are plotted
-        assert all(col in [trace.name for trace in fig.data] for col in gapped_data.columns)
+        assert all(
+            col in [trace.name for trace in fig.data] for col in gapped_data.columns
+        )
diff --git a/tide/plumbing.py b/tide/plumbing.py
index d899801..0a49450 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -196,7 +196,7 @@ def get_gaps_description(
             Empty DataFrame if no gaps are found.
         """
         data = self.get_corrected_data(select, steps=steps, verbose=verbose)
-        
+
         # Get gaps and calculate durations
         gaps_dict = get_blocks_lte_and_gte(
             data=data,
@@ -210,14 +210,14 @@ def get_gaps_description(
         for col, gaps_list in gaps_dict.items():
             if not gaps_list:
                 continue
-                
+
             durations = []
             for gap in gaps_list:
                 if len(gap) > 1:
                     durations.append(gap[-1] - gap[0])
                 else:
                     durations.append(pd.to_timedelta(gap.freq))
-            
+
             if durations:
                 gap_durations[col] = pd.Series(durations, name=col)
 
@@ -225,7 +225,7 @@ def get_gaps_description(
             return pd.DataFrame()
 
         stats_df = pd.concat([ser.describe() for ser in gap_durations.values()], axis=1)
-        
+
         gaps_mask = get_blocks_mask_lte_and_gte(
             data=data,
             lte=gaps_lte,
@@ -233,11 +233,13 @@ def get_gaps_description(
             is_null=True,
             return_combination=return_combination,
         )
-        
+
         presence_percentages = (1 - gaps_mask.mean()) * 100
-        
+
         stats_df.loc["data_presence_%"] = presence_percentages[stats_df.columns]
-        row_order = ["data_presence_%"] + [idx for idx in stats_df.index if idx != "data_presence_%"]
+        row_order = ["data_presence_%"] + [
+            idx for idx in stats_df.index if idx != "data_presence_%"
+        ]
         return stats_df.reindex(row_order)
 
     def set_data(self, data: pd.Series | pd.DataFrame):
diff --git a/tide/utils.py b/tide/utils.py
index 9f5ba1b..0ec969c 100644
--- a/tide/utils.py
+++ b/tide/utils.py
@@ -319,7 +319,7 @@ def get_blocks_lte_and_gte(
     lte: str | dt.timedelta = None,
     gte: str | dt.timedelta = None,
     is_null: bool = False,
-    return_combination: bool = False
+    return_combination: bool = False,
 ):
     """
     Get blocks of data ore gaps (nan) based on duration thresholds.
@@ -372,7 +372,7 @@ def get_blocks_mask_lte_and_gte(
     lte: str | dt.timedelta = None,
     gte: str | dt.timedelta = None,
     is_null: bool = False,
-    return_combination: bool = False
+    return_combination: bool = False,
 ) -> pd.DataFrame:
     """
     Creates a boolean mask DataFrame indicating the location of data blocks or gaps.

From 266a991334339d4a1140a662c7e70ef1c28ed55d Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 18:40:03 +0100
Subject: [PATCH 11/12] =?UTF-8?q?=F0=9F=93=9D=20Add=20Plumber=20doc?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tide/plumbing.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 198 insertions(+)

diff --git a/tide/plumbing.py b/tide/plumbing.py
index 0a49450..e9cf1bc 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -120,7 +120,50 @@ def get_pipeline_from_dict(
 
 
 class Plumber:
+    """A class for managing and transforming time series data through configurable processing pipelines.
+
+    The Plumber class provides a high-level interface for:
+    - Managing time series data with hierarchical column naming (name__unit__bloc__sub_bloc)
+    - Creating and executing data processing pipelines
+    - Analyzing and visualizing data gaps
+    - Plotting time series with customizable layouts
+
+    The class uses a tree structure to organize data columns based on their tags,
+    allowing for flexible data selection and manipulation.
+
+    Attributes
+    ----------
+    data : pd.DataFrame
+        The input time series data with datetime index
+    root : Node
+        Root node of the tree structure organizing column names
+    pipe_dict : dict
+        Configuration dictionary defining the processing pipeline steps
+
+    Examples
+    --------
+    >>> data = pd.DataFrame({
+    ...     "temp__°C__zone1": [20, 21, np.nan, 23],
+    ...     "humid__%HR__zone1": [50, 55, 60, np.nan]
+    ... }, index=pd.date_range("2023", freq="h", periods=4))
+    >>> pipe_dict = {
+    ...     "pre_processing": {"°C": [["ReplaceThreshold", {"upper": 25}]]},
+    ...     "common": [["Interpolate", ["linear"]]]
+    ... }
+    >>> plumber = Plumber(data, pipe_dict)
+    >>> corrected = plumber.get_corrected_data()
+    """
+
     def __init__(self, data: pd.Series | pd.DataFrame = None, pipe_dict: dict = None):
+        """
+        Parameters
+        ----------
+        data : pd.Series or pd.DataFrame, optional
+            Input time series data. Must have a datetime index.
+        pipe_dict : dict, optional
+            Pipeline configuration dictionary. Each key represents a processing step
+            and contains the corresponding transformation parameters.
+        """
         self.data = check_and_return_dt_index_df(data) if data is not None else None
         self.root = data_columns_to_tree(data.columns) if data is not None else None
         self.pipe_dict = pipe_dict
@@ -146,6 +189,18 @@ def show(
         steps: None | str | list[str] | slice = slice(None),
         depth_level: int | str = None,
     ):
+        """Display the tree structure of selected data columns at selected steps for 
+        a given depth level.
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Data selection using tide's tag system
+        steps : None or str or list[str] or slice, default slice(None)
+            Pipeline steps to apply before showing the tree
+        depth_level : int or str, optional
+            Maximum depth level to display in the tree
+        """
         pipe = self.get_pipeline(select=select, steps=steps)
         loc_tree = data_columns_to_tree(pipe.get_feature_names_out())
         if depth_level is not None:
@@ -243,6 +298,13 @@ def get_gaps_description(
         return stats_df.reindex(row_order)
 
     def set_data(self, data: pd.Series | pd.DataFrame):
+        """Set new data for the Plumber instance.
+
+        Parameters
+        ----------
+        data : pd.Series or pd.DataFrame
+            New time series data to process. Must have a datetime index.
+        """
         self.data = check_and_return_dt_index_df(data)
         self.root = data_columns_to_tree(data.columns)
 
@@ -250,6 +312,20 @@ def select(
         self,
         select: str | pd.Index | list[str] = None,
     ):
+        """Select columns based on tags.
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Selection criteria using tide's tag system.
+            Can be a unit (e.g., "°C"), location (e.g., "zone_1"),
+            or any other tag in the column names.
+
+        Returns
+        -------
+        pd.Index
+            Selected column names
+        """
         return parse_request_to_col_names(self.data, select)
 
     def get_pipeline(
@@ -258,6 +334,22 @@ def get_pipeline(
         steps: None | str | list[str] | slice = slice(None),
         verbose: bool = False,
     ) -> Pipeline:
+        """Create a scikit-learn pipeline from the configuration.
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Data selection using tide's tag system
+        steps : None or str or list[str] or slice, default slice(None)
+            Pipeline steps to include. If None, the pipeline holds a single Identity step.
+        verbose : bool, default False
+            Whether to print information about pipeline steps
+
+        Returns
+        -------
+        Pipeline
+            Scikit-learn pipeline configured with the selected steps
+        """
         if self.data is None:
             raise ValueError("data is required to build a pipeline")
         selection = parse_request_to_col_names(self.data, select)
@@ -280,6 +372,26 @@ def get_corrected_data(
         steps: None | str | list[str] | slice = slice(None),
         verbose: bool = False,
     ) -> pd.DataFrame:
+        """Apply pipeline transformations to selected data.
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Data selection using tide's tag system
+        start : str or datetime or Timestamp, optional
+            Start time for data slice
+        stop : str or datetime or Timestamp, optional
+            End time for data slice
+        steps : None or str or list[str] or slice, default slice(None)
+            Pipeline steps to apply
+        verbose : bool, default False
+            Whether to print information about pipeline steps
+
+        Returns
+        -------
+        pd.DataFrame
+            Transformed data
+        """
         if self.data is None:
             raise ValueError("Cannot get corrected data. data are missing")
         select = parse_request_to_col_names(self.data, select)
@@ -299,6 +411,30 @@ def plot_gaps_heatmap(
         title: str = None,
         verbose: bool = False,
     ):
+        """Create a heatmap visualization of data gaps.
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Data selection using tide's tag system
+        start : str or datetime or Timestamp, optional
+            Start time for visualization
+        stop : str or datetime or Timestamp, optional
+            End time for visualization
+        steps : None or str or list[str] or slice, default slice(None)
+            Pipeline steps to apply before visualization
+        time_step : str or Timedelta or timedelta, optional
+            Time step for aggregating gaps
+        title : str, optional
+            Plot title
+        verbose : bool, default False
+            Whether to print information about pipeline steps
+
+        Returns
+        -------
+        go.Figure
+            Plotly figure object containing the heatmap
+        """
         data = self.get_corrected_data(select, start, stop, steps, verbose)
         return plot_gaps_heatmap(data, time_step=time_step, title=title)
 
@@ -328,6 +464,68 @@ def plot(
         y_title_standoff: int | float = 5,
         verbose: bool = False,
     ):
+        """Create an interactive time series plot.
+
+        Creates a highly customizable plot that can show:
+        - Multiple time series, automatically placed on separate y-axes by unit
+        - Two different versions of the data (e.g., raw and processed)
+        - Data gaps visualization
+        - Custom styling and layout
+
+        Parameters
+        ----------
+        select : str or pd.Index or list[str], optional
+            Data selection using tide's tag system
+        start : str or datetime or Timestamp, optional
+            Start time for plot
+        stop : str or datetime or Timestamp, optional
+            End time for plot
+        y_axis_level : str, optional
+            Tag level to use for y-axis grouping
+        y_tag_list : list[str], optional
+            List of tags for custom y-axis ordering
+        steps : None or str or list[str] or slice, default slice(None)
+            Pipeline steps to apply for main data
+        data_mode : str, default "lines"
+            Plot mode for main data ("lines", "markers", or "lines+markers")
+        steps_2 : None or str or list[str] or slice, optional
+            Pipeline steps to apply for secondary data
+        data_2_mode : str, default "markers"
+            Plot mode for secondary data
+        markers_opacity : float, default 0.8
+            Opacity for markers
+        lines_width : float, default 2.0
+            Width of plot lines
+        title : str, optional
+            Plot title
+        plot_gaps : bool, default False
+            Whether to highlight gaps in main data
+        gaps_lower_td : str or Timedelta or timedelta, optional
+            Minimum duration for gap highlighting
+        gaps_rgb : tuple[int, int, int], default (31, 73, 125)
+            RGB color for main data gaps
+        gaps_alpha : float, default 0.5
+            Opacity for main data gaps
+        plot_gaps_2 : bool, default False
+            Whether to highlight gaps in secondary data
+        gaps_2_lower_td : str or Timedelta or timedelta, optional
+            Minimum duration for secondary data gap highlighting
+        gaps_2_rgb : tuple[int, int, int], default (254, 160, 34)
+            RGB color for secondary data gaps
+        gaps_2_alpha : float, default 0.5
+            Opacity for secondary data gaps
+        axis_space : float, default 0.03
+            Space between multiple y-axes
+        y_title_standoff : int or float, default 5
+            Distance between y-axis title and axis
+        verbose : bool, default False
+            Whether to print information about pipeline steps
+
+        Returns
+        -------
+        go.Figure
+            Plotly figure object containing the plot
+        """
         # A bit dirty. Here we assume that if you ask a selection
         # that is not found in original data columns, it is because it
         # has not yet been computed (using ExpressionCombine processor

From 6f2eeb288dbc19dc8fb7d2871b1809ec0cc28289 Mon Sep 17 00:00:00 2001
From: BaptisteDE <bdurandestebe@nobatek.inef4.com>
Date: Mon, 3 Mar 2025 18:45:12 +0100
Subject: [PATCH 12/12] =?UTF-8?q?=F0=9F=9A=A8=20plumbing=20linter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tide/plumbing.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tide/plumbing.py b/tide/plumbing.py
index e9cf1bc..8842ab4 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -142,13 +142,16 @@ class Plumber:
 
     Examples
     --------
-    >>> data = pd.DataFrame({
-    ...     "temp__°C__zone1": [20, 21, np.nan, 23],
-    ...     "humid__%HR__zone1": [50, 55, 60, np.nan]
-    ... }, index=pd.date_range("2023", freq="h", periods=4))
+    >>> data = pd.DataFrame(
+    ...     {
+    ...         "temp__°C__zone1": [20, 21, np.nan, 23],
+    ...         "humid__%HR__zone1": [50, 55, 60, np.nan],
+    ...     },
+    ...     index=pd.date_range("2023", freq="h", periods=4),
+    ... )
     >>> pipe_dict = {
     ...     "pre_processing": {"°C": [["ReplaceThreshold", {"upper": 25}]]},
-    ...     "common": [["Interpolate", ["linear"]]]
+    ...     "common": [["Interpolate", ["linear"]]],
     ... }
     >>> plumber = Plumber(data, pipe_dict)
     >>> corrected = plumber.get_corrected_data()
@@ -189,7 +192,7 @@ def show(
         steps: None | str | list[str] | slice = slice(None),
         depth_level: int | str = None,
     ):
-        """Display the tree structure of selected data columns at selected steps for 
+        """Display the tree structure of selected data columns at selected steps for
         a given depth level.
 
         Parameters