Skip to content

Commit f38af64

Browse files
authored
Merge pull request #35 from BuildingEnergySimulationTools/25-improve-select-function
25 improve select function
2 parents 5584ab2 + f62b3ec commit f38af64

File tree

8 files changed

+167
-54
lines changed

8 files changed

+167
-54
lines changed

docs/api_reference/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ API Reference
44
.. toctree::
55
:maxdepth: 2
66

7+
utils
78
plumbing
89
processing
910
regressor

docs/api_reference/utils.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Utils Modules
2+
===============
3+
4+
Tide's utility functions and classes.
5+
Mostly for handling tags, generating trees, or finding and selecting data gaps.
6+
7+
.. autofunction:: tide.utils.tide_request

tests/test_processing.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -993,7 +993,7 @@ def test_replace_tag(self):
993993
def test_add_fourier_pairs(self):
994994
test_df = pd.DataFrame(
995995
data=np.arange(24).astype("float64"),
996-
index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"),
996+
index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"),
997997
columns=["feat_1"],
998998
)
999999

@@ -1036,14 +1036,14 @@ def test_add_fourier_pairs(self):
10361036
"1 days 00:00:00_order_2_Sine",
10371037
"1 days 00:00:00_order_2_Cosine",
10381038
],
1039-
index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"),
1039+
index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"),
10401040
)
10411041

10421042
pd.testing.assert_frame_equal(res, ref_df)
10431043

10441044
test_df_phi = pd.DataFrame(
10451045
data=np.arange(24),
1046-
index=pd.date_range("2009-01-01 06:00:00", freq="H", periods=24),
1046+
index=pd.date_range("2009-01-01 06:00:00", freq="h", periods=24),
10471047
columns=["feat_1"],
10481048
)
10491049
test_df_phi = test_df_phi.tz_localize("UTC")
@@ -1053,7 +1053,7 @@ def test_add_fourier_pairs(self):
10531053

10541054
test_df = pd.DataFrame(
10551055
data=np.arange(24).astype("float64"),
1056-
index=pd.date_range("2009-01-01 00:00:00", freq="H", periods=24, tz="UTC"),
1056+
index=pd.date_range("2009-01-01 00:00:00", freq="h", periods=24, tz="UTC"),
10571057
columns=["feat_1__°C__building__room"],
10581058
)
10591059

tests/test_utils.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
data_columns_to_tree,
1111
get_data_col_names_from_root,
1212
get_data_level_values,
13-
parse_request_to_col_names,
13+
tide_request,
1414
timedelta_to_int,
1515
NamedList,
1616
_get_series_bloc,
@@ -58,7 +58,7 @@ def test_columns_parser(self):
5858
assert all(col in DF_COLUMNS.columns for col in col_names)
5959

6060
def test_parse_request_to_col_names(self):
61-
res = parse_request_to_col_names(DF_COLUMNS)
61+
res = tide_request(DF_COLUMNS)
6262
assert res == [
6363
"name_1__°C__bloc1",
6464
"name_1__°C__bloc2",
@@ -69,10 +69,13 @@ def test_parse_request_to_col_names(self):
6969
"name4__DIMENSIONLESS__bloc4",
7070
]
7171

72-
res = parse_request_to_col_names(DF_COLUMNS, "name_1__°C__bloc1")
72+
res = tide_request(DF_COLUMNS, "name_1__°C__bloc1")
7373
assert res == ["name_1__°C__bloc1"]
7474

75-
res = parse_request_to_col_names(
75+
res = tide_request(DF_COLUMNS, ["name_1__°C__bloc1"])
76+
assert res == ["name_1__°C__bloc1"]
77+
78+
res = tide_request(
7679
DF_COLUMNS,
7780
[
7881
"name_1__°C__bloc1",
@@ -84,18 +87,28 @@ def test_parse_request_to_col_names(self):
8487
"name_1__°C__bloc2",
8588
]
8689

87-
res = parse_request_to_col_names(DF_COLUMNS, "°C")
90+
res = tide_request(DF_COLUMNS, "°C")
8891
assert res == ["name_1__°C__bloc1", "name_1__°C__bloc2"]
8992

90-
res = parse_request_to_col_names(DF_COLUMNS, "OTHER")
93+
res = tide_request(DF_COLUMNS, "OTHER")
9194
assert res == ["name_2", "name_3__kWh/m²", "name_5__kWh"]
9295

93-
res = parse_request_to_col_names(DF_COLUMNS, "DIMENSIONLESS__bloc2")
96+
res = tide_request(DF_COLUMNS, "DIMENSIONLESS__bloc2")
9497
assert res == ["name_2__DIMENSIONLESS__bloc2"]
9598

96-
res = parse_request_to_col_names(DF_COLUMNS, "kWh")
99+
res = tide_request(DF_COLUMNS, "kWh")
97100
assert res == ["name_5__kWh"]
98101

102+
res = tide_request(DF_COLUMNS, "kWh|°C")
103+
assert res == ["name_5__kWh", "name_1__°C__bloc1", "name_1__°C__bloc2"]
104+
105+
res = tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"])
106+
assert res == [
107+
"name_5__kWh",
108+
"name_1__°C__bloc1",
109+
"name_1__°C__bloc2",
110+
]
111+
99112
def test_get_data_level_names(self):
100113
root = data_columns_to_tree(DF_COLUMNS.columns)
101114
res = get_data_level_values(root, "name")

tide/plot.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from tide.utils import (
88
check_and_return_dt_index_df,
9-
parse_request_to_col_names,
9+
tide_request,
1010
data_columns_to_tree,
1111
get_data_level_values,
1212
get_data_blocks,
@@ -63,7 +63,7 @@ def get_cols_axis_maps_and_labels(
6363
col_axes_map = {}
6464
axes_col_map = {}
6565
for i, tag in enumerate(y_tags):
66-
selected_cols = parse_request_to_col_names(columns, tag)
66+
selected_cols = tide_request(columns, tag)
6767
axes_col_map["y" if i == 0 else f"y{i + 1}"] = selected_cols
6868
for col in selected_cols:
6969
col_axes_map[col] = {"yaxis": "y"} if i == 0 else {"yaxis": f"y{i + 1}"}

tide/plumbing.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from sklearn.compose import ColumnTransformer
99

1010
from tide.utils import (
11-
parse_request_to_col_names,
11+
tide_request,
1212
check_and_return_dt_index_df,
1313
data_columns_to_tree,
1414
get_data_level_values,
@@ -62,7 +62,7 @@ def _get_column_wise_transformer(
6262
) -> ColumnTransformer | None:
6363
col_trans_list = []
6464
for req, proc_list in proc_dict.items():
65-
requested_col = parse_request_to_col_names(data_columns, req)
65+
requested_col = tide_request(data_columns, req)
6666
if not requested_col:
6767
pass
6868
else:
@@ -358,7 +358,7 @@ def select(
358358
pd.Index
359359
Selected column names
360360
"""
361-
return parse_request_to_col_names(self.data, select)
361+
return tide_request(self.data, select)
362362

363363
def get_pipeline(
364364
self,
@@ -438,7 +438,7 @@ def get_pipeline(
438438
"""
439439
if self.data is None:
440440
raise ValueError("data is required to build a pipeline")
441-
selection = parse_request_to_col_names(self.data, select)
441+
selection = tide_request(self.data, select)
442442
if steps is None or self.pipe_dict is None:
443443
dict_to_pipe = None
444444
else:
@@ -541,7 +541,7 @@ def get_corrected_data(
541541
"""
542542
if self.data is None:
543543
raise ValueError("Cannot get corrected data. data are missing")
544-
select = parse_request_to_col_names(self.data, select)
544+
select = tide_request(self.data, select)
545545
data = self.data.loc[
546546
start or self.data.index[0] : stop or self.data.index[-1], select
547547
].copy()
@@ -834,9 +834,7 @@ def plot(
834834
# for example) So we just process the whole data hoping to find the result
835835
# after.
836836
select_corr = (
837-
self.data.columns
838-
if not parse_request_to_col_names(self.data, select)
839-
else select
837+
self.data.columns if not tide_request(self.data, select) else select
840838
)
841839

842840
data_1 = self.get_corrected_data(select_corr, start, stop, steps, verbose)

tide/processing.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
get_data_blocks,
1414
get_outer_timestamps,
1515
check_and_return_dt_index_df,
16-
parse_request_to_col_names,
16+
tide_request,
1717
ensure_list,
1818
)
1919
from tide.regressors import SkSTLForecast, SkProphet
@@ -1269,9 +1269,7 @@ def _fit_implementation(self, X: pd.Series | pd.DataFrame, y=None):
12691269
if self.tide_format_methods:
12701270
self.columns_methods = []
12711271
for req, method in self.tide_format_methods.items():
1272-
self.columns_methods.append(
1273-
(parse_request_to_col_names(X.columns, req), method)
1274-
)
1272+
self.columns_methods.append((tide_request(X.columns, req), method))
12751273

12761274
return self
12771275

tide/utils.py

Lines changed: 124 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -143,40 +143,136 @@ def get_data_col_names_from_root(data_root):
143143
][-1]
144144

145145

146-
def parse_request_to_col_names(
146+
def find_cols_with_tide_tags(
147+
data_columns: pd.Index | list[str], request: str
148+
) -> list[str]:
149+
request_parts = request.split("__")
150+
151+
if not (1 <= len(request_parts) <= 4):
152+
raise ValueError(
153+
f"Request '{request}' is malformed. "
154+
f"Use 'name__unit__bloc__sub_bloc' format or a "
155+
f"combination of these tags."
156+
)
157+
158+
full_tag_col_map = {
159+
col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col
160+
for col in data_columns
161+
}
162+
163+
def find_exact_match(search_str, target):
164+
pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)"
165+
match = re.search(pattern, target)
166+
return match is not None
167+
168+
return [
169+
full_tag_col_map[augmented_col]
170+
for augmented_col in full_tag_col_map.keys()
171+
if all(find_exact_match(part, augmented_col) for part in request_parts)
172+
]
173+
174+
175+
def find_cols_multiple_tag_groups(
176+
data_columns: pd.Index | list[str], request: str
177+
) -> list[str]:
178+
request_parts = request.split("|")
179+
list_to_return = []
180+
for req in request_parts:
181+
list_to_return.extend(find_cols_with_tide_tags(data_columns, req))
182+
return list_to_return
183+
184+
185+
def tide_request(
147186
data_columns: pd.Index | list[str], request: str | pd.Index | list[str] = None
148187
) -> list[str]:
188+
"""
189+
Select columns by matching structured TIDE-style tags.
190+
191+
Filters column names based on a TIDE-style structured tag syntax. Columns are
192+
expected to use a naming convention with double underscores (`__`) separating
193+
tags.
194+
195+
A column name can include up to four hierarchical parts:
196+
'name__unit__bloc__sub_bloc' where each part is optional, but must be separated
197+
with double underscores.
198+
199+
The `request` argument allows searching for columns matching one or more
200+
of these parts using full or partial tag patterns. Multiple tag patterns
201+
can be combined using the pipe `|` character to form OR conditions.
202+
203+
Parameters
204+
----------
205+
data_columns : pandas.Index or list of str
206+
A collection of column names to filter. Each column name should follow
207+
the TIDE format (e.g., "sensor__°C__bloc1").
208+
209+
request : str or list of str or pandas.Index, optional
210+
Tag(s) to match against the column names. Each tag string may be:
211+
212+
- A full structured tag (e.g., "name__°C__bloc2")
213+
- A partial tag (e.g., "°C", "bloc1")
214+
- A group of tags separated by "|" (e.g., "kWh|°C")
215+
216+
If None, all columns from `data_columns` are returned.
217+
218+
Returns
219+
-------
220+
list of str
221+
The list of column names that match any of the provided tag queries.
222+
223+
Notes
224+
-----
225+
- Matching is done per tag part, not substrings. For instance, the query
226+
"bloc1" will match "name__°C__bloc1" but not "bloc11".
227+
- If multiple requests are given, columns are returned if they match
228+
at least one of them (logical OR).
229+
- Tags can include between 1 and 4 parts, split by `__`.
230+
231+
Examples
232+
--------
233+
>>> DF_COLUMNS = [
234+
... "name_1__°C__bloc1",
235+
... "name_1__°C__bloc2",
236+
... "name_2",
237+
... "name_2__DIMENSIONLESS__bloc2",
238+
... "name_3__kWh/m²",
239+
... "name_5__kWh",
240+
... "name4__DIMENSIONLESS__bloc4",
241+
... ]
242+
243+
>>> tide_request(DF_COLUMNS)
244+
['name_1__°C__bloc1', 'name_1__°C__bloc2', 'name_2',
245+
'name_2__DIMENSIONLESS__bloc2', 'name_3__kWh/m²',
246+
'name_5__kWh', 'name4__DIMENSIONLESS__bloc4']
247+
248+
>>> tide_request(DF_COLUMNS, "°C")
249+
['name_1__°C__bloc1', 'name_1__°C__bloc2']
250+
251+
>>> tide_request(DF_COLUMNS, "kWh|°C")
252+
['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2']
253+
254+
>>> # Columns are not selected twice
255+
>>> tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"])
256+
['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2']
257+
"""
258+
149259
if request is None:
150260
return list(data_columns)
151261

152-
elif isinstance(request, pd.Index) or isinstance(request, list):
153-
return [col for col in request if col in data_columns]
262+
elif isinstance(request, str):
263+
request = [request]
154264

155-
else:
156-
request_parts = request.split("__")
157-
158-
if not (1 <= len(request_parts) <= 4):
159-
raise ValueError(
160-
f"Request '{request}' is malformed. "
161-
f"Use 'name__unit__bloc__sub_bloc' format or a "
162-
f"combination of these tags."
163-
)
164-
165-
full_tag_col_map = {
166-
col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col
167-
for col in data_columns
168-
}
169-
170-
def find_exact_match(search_str, target):
171-
pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)"
172-
match = re.search(pattern, target)
173-
return match is not None
174-
175-
return [
176-
full_tag_col_map[augmented_col]
177-
for augmented_col in full_tag_col_map.keys()
178-
if all(find_exact_match(part, augmented_col) for part in request_parts)
179-
]
265+
if not (isinstance(request, pd.Index) or isinstance(request, list)):
266+
raise ValueError(
267+
"Invalid request. Was expected an instance of str, pd.Index or List[str]"
268+
f"got {type(request)} instead"
269+
)
270+
271+
list_to_return = []
272+
for req in request:
273+
list_to_return.extend(find_cols_multiple_tag_groups(data_columns, req))
274+
275+
return list(dict.fromkeys(list_to_return))
180276

181277

182278
def data_columns_to_tree(columns: pd.Index | list[str]) -> T:

0 commit comments

Comments
 (0)