diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index ad5790d..f5b2318 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -5,6 +5,17 @@ on:
     tags: ["*"]
   pull_request:
 jobs:
+  docs:
+    name: docs
+    if: startsWith(github.ref, 'refs/tags/')
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4.0.0
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+      - run: pip install tox
+      - run: tox -e docs
   tests:
     name: ${{ matrix.name }}
     runs-on: ubuntu-latest
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..a612582
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,18 @@
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
+
+sphinx:
+  configuration: docs/conf.py
+  fail_on_warning: false
+
+python:
+  install:
+    - method: pip
+      path: .
+      extra_requirements:
+        - dev
+    - requirements: requirements/dev.txt
\ No newline at end of file
diff --git a/README.md b/README.md
index 038a072..d2cfa26 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
 [![codecov](https://codecov.io/gh/BuildingEnergySimulationTools/tide/branch/main/graph/badge.svg?token=F51O9CXI61)](https://codecov.io/gh/BuildingEnergySimulationTools/tide)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 [![License](https://img.shields.io/badge/License-BSD_3--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
+[![Documentation Status](https://readthedocs.org/projects/python-tide/badge/?version=latest)](https://python-tide.readthedocs.io/en/latest/?badge=latest)
 
 ## Pipeline Development and Data Visualization for Time Series in Physical Measurements
 
@@ -206,9 +207,9 @@ plumber.plot(
 
 The development of this library has been supported by ENSNARE Project, which
-has received funding from the European Union’s Horizon 2020 Research and Innovation
+has received funding from the European Union's Horizon 2020 Research and Innovation
 Programme under Grant Agreement No. 953193. The sole responsibility for the content of
-this library lies entirely with the author’s view. The European Commission is not
+this library lies entirely with the author's view. The European Commission is not
 responsible for any use that may be made of the information it contains.
 
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
new file mode 100644
index 0000000..2710457
--- /dev/null
+++ b/docs/_static/custom.css
@@ -0,0 +1,8 @@
+.wy-side-nav-search > div.version {
+    margin-top: 0.5rem;
+}
+
+.wy-side-nav-search > a img.logo {
+    width: 100px;
+    height: auto;
+}
\ No newline at end of file
diff --git a/docs/api_reference/classifiers.rst b/docs/api_reference/classifiers.rst
new file mode 100644
index 0000000..688e040
--- /dev/null
+++ b/docs/api_reference/classifiers.rst
@@ -0,0 +1,9 @@
+Classifiers Module
+==================
+
+The classifiers module provides classes for time series classification and pattern recognition.
+
+.. automodule:: tide.classifiers
+    :members:
+    :show-inheritance:
+    :exclude-members: _get_pipe_from_proc_list, _get_column_wise_transformer, _dummy_df
\ No newline at end of file
diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst
new file mode 100644
index 0000000..fc4d4c7
--- /dev/null
+++ b/docs/api_reference/index.rst
@@ -0,0 +1,14 @@
+API Reference
+=============
+
+.. toctree::
+    :maxdepth: 2
+
+    plumbing
+    processing
+    regressor
+    classifiers
+    influx
+    metrics
+    meteo
+    plot
\ No newline at end of file
diff --git a/docs/api_reference/influx.rst b/docs/api_reference/influx.rst
new file mode 100644
index 0000000..13a91e8
--- /dev/null
+++ b/docs/api_reference/influx.rst
@@ -0,0 +1,9 @@
+Influx Module
+=============
+
+The influx module provides functionality for interacting with InfluxDB time series databases.
+
+.. automodule:: tide.influx
+    :members:
+    :show-inheritance:
+    :exclude-members: _get_pipe_from_proc_list, _get_column_wise_transformer, _dummy_df
\ No newline at end of file
diff --git a/docs/api_reference/meteo.rst b/docs/api_reference/meteo.rst
new file mode 100644
index 0000000..74b14e0
--- /dev/null
+++ b/docs/api_reference/meteo.rst
@@ -0,0 +1,9 @@
+Meteo Module
+============
+
+The meteo module provides functionality for handling meteorological data and calculations.
+
+.. automodule:: tide.meteo
+    :members:
+    :show-inheritance:
+    :exclude-members: _get_pipe_from_proc_list, _get_column_wise_transformer, _dummy_df
\ No newline at end of file
diff --git a/docs/api_reference/metrics.rst b/docs/api_reference/metrics.rst
new file mode 100644
index 0000000..8e5e2d8
--- /dev/null
+++ b/docs/api_reference/metrics.rst
@@ -0,0 +1,9 @@
+Metrics Module
+==============
+
+The metrics module provides functions and classes for calculating various time series metrics and performance indicators.
+
+.. automodule:: tide.metrics
+    :members:
+    :show-inheritance:
+    :exclude-members: _get_pipe_from_proc_list, _get_column_wise_transformer, _dummy_df
\ No newline at end of file
diff --git a/docs/api_reference/plumbing.rst b/docs/api_reference/plumbing.rst
new file mode 100644
index 0000000..8f4515e
--- /dev/null
+++ b/docs/api_reference/plumbing.rst
@@ -0,0 +1,23 @@
+Plumbing Module
+===============
+
+The plumbing module provides the core functionality for data pipeline creation and management.
+
+.. autoclass:: tide.plumbing.Plumber
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+Pipeline Creation Functions
+---------------------------
+
+.. autofunction:: tide.plumbing._get_pipe_from_proc_list
+
+.. autofunction:: tide.plumbing._get_column_wise_transformer
+
+.. autofunction:: tide.plumbing.get_pipeline_from_dict
+
+Helper Functions
+----------------
+
+.. autofunction:: tide.plumbing._dummy_df
\ No newline at end of file
diff --git a/docs/api_reference/processing.rst b/docs/api_reference/processing.rst
new file mode 100644
index 0000000..100ae2a
--- /dev/null
+++ b/docs/api_reference/processing.rst
@@ -0,0 +1,105 @@
+Processing Module
+=================
+
+The processing module provides transformers for data processing and manipulation.
+
+.. autoclass:: tide.processing.Identity
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.Interpolate
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.Ffill
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.Bfill
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.ReplaceThreshold
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.DropTimeGradient
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.Resample
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.ExpressionCombine
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.ReplaceDuplicated
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.Dropna
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.RenameColumns
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.SkTransform
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.ApplyExpression
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.TimeGradient
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.AddTimeLag
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.CombineColumns
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.AddOikoData
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.AddSolarAngles
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.ProjectSolarRadOnSurfaces
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.FillOtherColumns
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.DropColumns
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.ReplaceTag
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.FillGapsAR
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.GaussianFilter1D
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.processing.STLFilter
+    :members:
+    :show-inheritance:
+
diff --git a/docs/api_reference/regressor.rst b/docs/api_reference/regressor.rst
new file mode 100644
index 0000000..13592f6
--- /dev/null
+++ b/docs/api_reference/regressor.rst
@@ -0,0 +1,12 @@
+Regressor Module
+================
+
+The regressor module provides classes for time series regression and forecasting.
+
+.. autoclass:: tide.regressors.SkSTLForecast
+    :members:
+    :show-inheritance:
+
+.. autoclass:: tide.regressors.SkProphet
+    :members:
+    :show-inheritance:
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..9bb2472
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,88 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath(".."))
+
+# Project information
+project = "python-tide"
+copyright = "2024, Baptiste Durand-Estebe"
+author = "Baptiste Durand-Estebe"
+
+# The full version, including alpha/beta/rc tags
+release = "0.1.4"
+
+# General configuration
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.githubpages",
+    "sphinx_autodoc_typehints",
+    "myst_parser",
+    "nbsphinx",
+    "sphinx_copybutton",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# The theme to use for HTML and HTML Help pages.
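+# NOTE: "sphinx_rtd_theme" must be importable when Sphinx runs; it is pinned
+# in requirements/dev.txt and in the "dev" extra. A defensive fallback for
+# local builds without the docs dependencies could look like this (a sketch,
+# not required by Read the Docs):
+#
+#     try:
+#         import sphinx_rtd_theme  # noqa: F401
+#     except ImportError:
+#         html_theme = "alabaster"  # Sphinx built-in fallback theme
+#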
+html_theme = "sphinx_rtd_theme"
+
+# Theme options
+html_theme_options = {
+    "logo_only": False,
+    "display_version": True,
+    "prev_next_buttons_location": "bottom",
+    "style_external_links": False,
+    "style_nav_header_background": "#2980B9",
+    # Toc options
+    "collapse_navigation": True,
+    "sticky_navigation": True,
+    "navigation_depth": 4,
+    "includehidden": True,
+    "titles_only": False,
+    # GitHub link
+    "github_url": "https://github.com/BuildingEnergySimulationTools/tide",
+    "github_repo": "tide",
+    "github_user": "BuildingEnergySimulationTools",
+}
+
+# Add any paths that contain custom static files (such as style sheets)
+html_static_path = ["_static"]
+
+# Custom CSS
+html_css_files = [
+    "custom.css",
+]
+
+# Logo
+html_logo = "../tide_logo.svg"
+html_favicon = "../tide_logo.svg"
+
+# Napoleon settings
+napoleon_google_docstring = False
+napoleon_numpy_docstring = True
+napoleon_include_init_with_doc = True
+napoleon_include_private_with_doc = True
+napoleon_include_special_with_doc = True
+napoleon_use_admonition_for_examples = True
+napoleon_use_admonition_for_notes = True
+napoleon_use_admonition_for_references = True
+napoleon_use_ivar = True
+napoleon_use_param = True
+napoleon_use_rtype = True
+napoleon_type_aliases = None
+
+# Autodoc settings
+autodoc_default_options = {
+    "members": True,
+    "member-order": "bysource",
+    "special-members": "__init__",
+    "undoc-members": True,
+    "exclude-members": "__weakref__",
+}
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..cf39a5d
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,76 @@
+Welcome to python-tide's documentation!
+=======================================
+
+.. image:: ../tide_logo.svg
+    :width: 200px
+    :align: center
+
+
+python-tide is a Python library for time series data visualization and pipeline creation,
+with a focus on building data processing pipelines and analyzing data gaps.
+
+GitHub Repository
+-----------------
+
+The source code for python-tide is available on `GitHub <https://github.com/BuildingEnergySimulationTools/tide>`_
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Contents:
+
+    installation
+    quickstart
+    user_guide/index
+    api_reference/index
+    tutorials/index
+    contributing
+    changelog
+
+Features
+--------
+
+- Hierarchical column naming system (name__unit__bloc__sub_bloc)
+- Flexible data selection using tags
+- Configurable data processing pipelines
+- Advanced gap analysis and visualization
+- Interactive time series plotting with multiple y-axes
+- Integration with scikit-learn transformers
+
+Quick Example
+-------------
+
+.. code-block:: python
+
+    import pandas as pd
+    import numpy as np
+    from tide.plumbing import Plumber
+
+    # Create sample data
+    data = pd.DataFrame({
+        "temp__°C__zone1": [20, 21, np.nan, 23],
+        "humid__%HR__zone1": [50, 55, 60, np.nan]
+    }, index=pd.date_range("2023", freq="h", periods=4, tz="UTC"))
+
+    # Define pipeline
+    pipe_dict = {
+        "pre_processing": {"°C": [["ReplaceThreshold", {"upper": 25}]]},
+        "common": [["Interpolate", ["linear"]]]
+    }
+
+    # Create plumber and process data
+    plumber = Plumber(data, pipe_dict)
+    corrected = plumber.get_corrected_data()
+
+    # Analyze gaps
+    gaps = plumber.get_gaps_description()
+
+    # Plot data
+    fig = plumber.plot(plot_gaps=True)
+    fig.show()
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
\ No newline at end of file
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..dbf57a3
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,75 @@
+Installation
+============
+
+python-tide requires Python 3.10 or later.
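+
+A quick way to confirm that your interpreter meets this requirement (an
+optional sanity check, not part of the install):
+
+.. code-block:: python
+
+    import sys
+
+    assert sys.version_info >= (3, 10), "python-tide requires Python 3.10+"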
+
+Using pip
+---------
+
+The recommended way to install python-tide is via pip:
+
+.. code-block:: bash
+
+    pip install python-tide
+
+This will install python-tide and all its dependencies.
+
+From Source
+-----------
+
+To install python-tide from source, clone the repository and install in editable mode:
+
+.. code-block:: bash
+
+    git clone https://github.com/BuildingEnergySimulationTools/tide.git
+    cd tide
+    pip install -e .
+
+Development Installation
+------------------------
+
+For development, you'll want to install additional dependencies:
+
+.. code-block:: bash
+
+    pip install -e ".[dev]"
+
+This will install all development dependencies including:
+
+- pytest for testing
+- sphinx for documentation
+- pre-commit for code quality
+- other development tools
+
+Dependencies
+------------
+
+Core Dependencies
+~~~~~~~~~~~~~~~~~
+
+- numpy>=1.22.4
+- pandas>=2.0.0
+- scipy>=1.9.1
+- bigtree>=0.21.3
+- scikit-learn>=1.2.2
+- statsmodels>=0.14.4
+- matplotlib>=3.5.1
+- plotly>=5.3.1
+- requests>=2.32.3
+- influxdb-client>=1.48.0
+- prophet>=1.1.6
+
+Optional Dependencies
+~~~~~~~~~~~~~~~~~~~~~
+
+For development and documentation:
+
+- pytest
+- sphinx
+- sphinx-rtd-theme
+- sphinx-autodoc-typehints
+- myst-parser
+- nbsphinx
+- sphinx-copybutton
+- pre-commit
+- bump2version
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 2b15205..b5c4ee6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,19 @@ dependencies = [
     "prophet>=1.1.6",
 ]
 
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "pre-commit>=3.3.3",
+    "bump2version>=1.0.1",
+    "sphinx>=7.1.0",
+    "sphinx-rtd-theme>=2.0.0",
+    "sphinx-autodoc-typehints>=1.25.2",
+    "myst-parser>=2.0.0",
+    "nbsphinx>=0.9.3",
+    "sphinx-copybutton>=0.5.2",
+]
+
 [project.urls]
 Source = "https://github.com/BuildingEnergySimulationTools/tide"
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 0d6b4a0..ff0288e 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -1,3 +1,10 @@
+-r install-min.txt
 -r tests.txt
 pre-commit>=3.3.3
-bump2version>=1.0.1
\ No newline at end of file
+bump2version>=1.0.1
+sphinx>=7.1.0
+sphinx-rtd-theme>=2.0.0
+sphinx-autodoc-typehints>=1.25.2
+myst-parser>=2.0.0
+nbsphinx>=0.9.3
+sphinx-copybutton>=0.5.2
\ No newline at end of file
diff --git a/tide/influx.py b/tide/influx.py
index f9dc29d..47b0e7a 100644
--- a/tide/influx.py
+++ b/tide/influx.py
@@ -79,58 +79,124 @@ def get_influx_data(
     waited_seconds_at_retry: int = 5,
     verbose: bool = False,
 ) -> pd.DataFrame:
-    """
-    Fetches data from an InfluxDB instance for the specified time range,
-    bucket, and measurement, optionally splitting the request into smaller time
-    intervals.
+    """Fetch time series data from an InfluxDB instance.
+
+    This function retrieves data from InfluxDB and formats it according to Tide's
+    hierarchical column naming convention. It supports:
+
+    - Flexible time range specification
+    - Automatic query splitting for large time ranges
+    - Retry mechanism for handling timeouts
+    - Timezone-aware data handling
 
     Parameters
     ----------
-    start : str, pd.Timestamp, or datetime.datetime
-        The start of the time range for the query. Can be:
-        - A relative time string (e.g., "-1d", "-2h").
-        - A `pd.Timestamp` or `datetime.datetime` object.
+    start : str or pd.Timestamp or datetime.datetime
+        Start time for the query. Can be:
+        - A relative time string (e.g., "-1d", "-2h")
+        - A pandas Timestamp
+        - A datetime object
+        If using relative time strings, they are interpreted relative to the current time.
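+        For example, ``start="-1d"`` with the default ``tz_info="UTC"`` means
+        "24 hours before the current UTC time".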
-    stop : str, pd.Timestamp, or datetime.datetime
-        The end of the time range for the query.
-        Accepts the same formats as `start`.
+    stop : str or pd.Timestamp or datetime.datetime
+        End time for the query. Accepts the same formats as start.
 
     bucket : str
-        The name of the InfluxDB bucket to query data from.
+        Name of the InfluxDB bucket to query.
 
     measurement : str
-        The _measurement name within the InfluxDB bucket to filter data.
+        Name of the InfluxDB measurement to filter data.
 
     tide_tags : list[str]
-        A list of fields or tags in Influx that correspond to Tide tags.
-        Must be specified in the following order name__unit__bloc__sub_bloc.
+        List of InfluxDB fields/tags to combine into Tide column names.
+        Must be specified in order: [name, unit, bloc, sub_bloc].
+        Example: ["name", "unit", "location", "room"] will create columns like
+        "temperature__°C__zone1__room1"
 
     url : str
-        The URL of the InfluxDB instance (e.g., "http://localhost:8086").
+        URL of the InfluxDB instance (e.g., "http://localhost:8086")
 
     org : str
-        The organization name in the InfluxDB instance.
+        InfluxDB organization name
 
     token : str
-        The authentication token for accessing the InfluxDB instance.
-
-    split_td : str, datetime.timedelta, or pd.Timedelta, optional
-        The time interval for splitting the query into smaller chunks
-        (e.g., "1d", "12h"). If `None`, the query will not be split.
-
-    tz_info : str, optional
-        The timezone for interpreting the start and stop times.
-        Defaults to "UTC".
-
-    verbose : bool, optional
-        If `True`, prints progress messages for each time chunk being fetched.
-        Defaults to `False`.
-
-    max_retry: int, default 5
-        Number of retries for a query in case of ReadTimeoutError.
-
-    waited_seconds_at_retry: int default 5
-        Number of seconds waited before re-sending the query
+        Authentication token for InfluxDB access
+
+    split_td : str or datetime.timedelta or pd.Timedelta, optional
+        Time interval for splitting large queries into smaller chunks.
+        Useful for handling large time ranges or rate limits.
+        Example: "1d" for daily chunks, "12h" for half-day chunks.
+        If None, queries the entire time range at once.
+
+    tz_info : str, default "UTC"
+        Timezone for interpreting start and stop times.
+        Must be a valid timezone name from the IANA Time Zone Database.
+
+    max_retry : int, default 5
+        Maximum number of retry attempts for failed queries.
+        Only applies to ReadTimeoutError exceptions.
+
+    waited_seconds_at_retry : int, default 5
+        Number of seconds to wait between retry attempts.
+
+    verbose : bool, default False
+        Whether to print progress information during data fetching.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the fetched data with:
+        - Datetime index in UTC
+        - Columns named according to Tide's convention (name__unit__bloc__sub_bloc)
+        - Values from the InfluxDB _value field
+
+    Raises
+    ------
+    ReadTimeoutError
+        If all retry attempts fail to fetch data
+    ValueError
+        If tz_info is required but not provided for naive datetime objects
+
+    Examples
+    --------
+    >>> from tide import get_influx_data
+    >>> import pandas as pd
+    >>> # Fetch last 24 hours of data
+    >>> df = get_influx_data(
+    ...     start="-24h",
+    ...     stop="now",
+    ...     bucket="my_bucket",
+    ...     measurement="sensors",
+    ...     tide_tags=["name", "unit", "location"],
+    ...     url="http://localhost:8086",
+    ...     org="my_org",
+    ...     token="my_token",
+    ... )
+    >>> # Fetch specific time range with daily splitting
+    >>> df = get_influx_data(
+    ...     start="2023-01-01",
+    ...     stop="2023-01-07",
+    ...     bucket="my_bucket",
+    ...     measurement="sensors",
+    ...     tide_tags=["name", "unit", "location", "room"],
+    ...     url="http://localhost:8086",
+    ...     org="my_org",
+    ...     token="my_token",
+    ...     split_td="1d",
+    ...     verbose=True,
+    ... )
+    >>> # Fetch data with custom timezone
+    >>> df = get_influx_data(
+    ...     start="2023-01-01T00:00:00",
+    ...     stop="2023-01-01T23:59:59",
+    ...     bucket="my_bucket",
+    ...     measurement="sensors",
+    ...     tide_tags=["name", "unit", "location"],
+    ...     url="http://localhost:8086",
+    ...     org="my_org",
+    ...     token="my_token",
+    ...     tz_info="Europe/Paris",
+    ... )
     """
 
     if isinstance(start, str) and isinstance(stop, str):
diff --git a/tide/meteo.py b/tide/meteo.py
index 4a87422..ee8c4d6 100644
--- a/tide/meteo.py
+++ b/tide/meteo.py
@@ -192,7 +192,6 @@ def sun_position(date: dt.datetime, lat: float = 46.5, long: float = 6.5):
 
 def aoi_projection(surface_tilt, surface_azimuth, solar_zenith, solar_azimuth):
     """
-    === Function extracted from pvlib module ===
     https://pvlib-python.readthedocs.io/en/stable/
 
     Calculates the dot product of the sun position unit vector and the surface
@@ -264,7 +263,6 @@ def beam_component(surface_tilt, surface_azimuth, solar_zenith, solar_azimuth, d
 
 def sky_diffuse(surface_tilt, dhi):
     """
-    === Function extracted from pvlib module ===
     https://pvlib-python.readthedocs.io/en/stable/
 
     Determine diffuse irradiance from the sky on a tilted surface using
@@ -272,7 +270,7 @@ def sky_diffuse(surface_tilt, dhi):
 
     .. math::
 
-       I_{d} = DHI \frac{1 + \\cos\beta}{2}
+       I_{d} = DHI \frac{1 + \cos\beta}{2}
 
     Hottel and Woertz's model treats the sky as a uniform source of diffuse
     irradiance. Thus, the diffuse irradiance from the sky (ground
@@ -313,7 +311,6 @@ def ground_diffuse(surface_tilt, ghi, albedo=0.25):
     """
-    === Function extracted from pvlib module ===
     https://pvlib-python.readthedocs.io/en/stable/
 
     Estimate diffuse irradiance on a tilted surface from ground reflections.
@@ -322,7 +319,7 @@ def ground_diffuse(surface_tilt, ghi, albedo=0.25):
 
     .. math::
 
-       G_{ground} = GHI \times \rho \times \frac{1 - \\cos\beta}{2}
+       G_{ground} = GHI \times \rho \times \frac{1 - \cos\beta}{2}
 
     where :math:`\rho` is ``albedo`` and :math:`\beta` is ``surface_tilt``.
 
diff --git a/tide/plumbing.py b/tide/plumbing.py
index 8842ab4..19e7139 100644
--- a/tide/plumbing.py
+++ b/tide/plumbing.py
@@ -120,16 +120,28 @@ def get_pipeline_from_dict(
 
 
 class Plumber:
-    """A class for managing and transforming time series data through configurable processing pipelines.
+    """A powerful class for managing and transforming time series data through configurable processing pipelines.
 
-    The Plumber class provides a high-level interface for:
+    The Plumber class is the core component of the Tide library, providing a comprehensive interface for:
     - Managing time series data with hierarchical column naming (name__unit__bloc__sub_bloc)
-    - Creating and executing data processing pipelines
-    - Analyzing and visualizing data gaps
-    - Plotting time series with customizable layouts
+    - Creating and executing data processing pipelines with column-wise transformations
+    - Analyzing and visualizing data gaps and quality
+    - Plotting time series with customizable multi-axis layouts
 
-    The class uses a tree structure to organize data columns based on their tags,
-    allowing for flexible data selection and manipulation.
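+    For instance, the column name ``"temp__°C__zone1"`` carries the tags
+    ``temp`` (name), ``°C`` (unit) and ``zone1`` (bloc); a selection such as
+    ``select="zone1"`` then targets every column sharing that bloc tag
+    (illustrative, see the Examples below).
+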
+    The class uses a tree structure to organize data columns based on their tags, allowing for:
+    - Flexible data selection using tag-based queries
+    - Hierarchical organization of data by unit, bloc, and sub-bloc
+    - Automatic handling of data transformations at different steps
+
+    Parameters
+    ----------
+    data : pd.Series or pd.DataFrame, optional
+        Input time series data. Must have a datetime index with timezone information.
+    pipe_dict : dict, optional
+        Pipeline configuration dictionary. Each key represents a processing step
+        and contains either:
+        - A list of transformations to apply to all columns
+        - A dictionary mapping column tags to specific transformations
 
     Attributes
     ----------
@@ -142,31 +154,48 @@ class Plumber:
 
     Examples
     --------
+    >>> from tide import Plumber
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> # Create sample data with hierarchical column names
     >>> data = pd.DataFrame(
     ...     {
     ...         "temp__°C__zone1": [20, 21, np.nan, 23],
     ...         "humid__%HR__zone1": [50, 55, 60, np.nan],
+    ...         "power__kW__hvac": [1.5, 1.8, 1.6, 1.7],
     ...     },
-    ...     index=pd.date_range("2023", freq="h", periods=4),
+    ...     index=pd.date_range("2023", freq="h", periods=4, tz="UTC"),
     ... )
+    >>> # Define pipeline configuration
     >>> pipe_dict = {
-    ...     "pre_processing": {"°C": [["ReplaceThreshold", {"upper": 25}]]},
+    ...     "pre_processing": {
+    ...         "°C": [["ReplaceThreshold", {"upper": 25}]],
+    ...         "%HR": [["ReplaceThreshold", {"upper": 100}]],
+    ...     },
     ...     "common": [["Interpolate", ["linear"]]],
     ... }
+    >>> # Initialize and process data
     >>> plumber = Plumber(data, pipe_dict)
     >>> corrected = plumber.get_corrected_data()
+    >>> # Analyze gaps
+    >>> gaps = plumber.get_gaps_description()
+    >>> # Visualize data
+    >>> plumber.plot(y_axis_level="unit")
+
+    Notes
+    -----
+    - Column names can use any combination of tags (name, unit, bloc, sub_bloc)
+      separated by double underscores. Examples:
+      - Simple: "temperature"
+      - With unit: "temperature__°C"
+      - Full: "temperature__°C__zone1__room1"
+    - Input data must have a datetime index with timezone information
+    - Pipeline steps can be applied globally or to specific column groups
+    - Supports all transformations from the processing module
+    - Provides comprehensive gap analysis and visualization tools
+    - Uses plotly for interactive data visualization
     """
 
     def __init__(self, data: pd.Series | pd.DataFrame = None, pipe_dict: dict = None):
-        """
-        Parameters
-        ----------
-        data : pd.Series or pd.DataFrame, optional
-            Input time series data. Must have a datetime index.
-        pipe_dict : dict, optional
-            Pipeline configuration dictionary. Each key represents a processing step
-            and contains the corresponding transformation parameters.
-        """
         self.data = check_and_return_dt_index_df(data) if data is not None else None
         self.root = data_columns_to_tree(data.columns) if data is not None else None
         self.pipe_dict = pipe_dict
@@ -306,7 +335,7 @@ def set_data(self, data: pd.Series | pd.DataFrame):
         Parameters
         ----------
         data : pd.Series or pd.DataFrame
-            New time series data to process. Must have a datetime index.
+            New time series data to process. Must have a datetime index with timezone information.
         """
         self.data = check_and_return_dt_index_df(data)
         self.root = data_columns_to_tree(data.columns)
@@ -339,19 +368,73 @@ def get_pipeline(
     ) -> Pipeline:
         """Create a scikit-learn pipeline from the configuration.
 
+        This method builds a scikit-learn Pipeline object based on the current configuration
+        and selected data columns. The pipeline can be used to transform data according to
+        the defined processing steps.
+
         Parameters
         ----------
         select : str or pd.Index or list[str], optional
-            Data selection using tide's tag system
+            Data selection using tide's tag system. Can be:
+            - A single tag (e.g., "°C" to select all temperature columns)
+            - A full column name pattern (e.g., "temp__°C__zone1")
+            If None, selects all columns.
+
         steps : None or str or list[str] or slice, default slice(None)
-            Pipeline steps to include. If None, returns an Identity transformer.
+            Pipeline steps to include. Can be:
+            - A single step name (e.g., "pre_processing")
+            - A list of step names (e.g., ["pre_processing", "common"])
+            - A slice object (e.g., slice("pre_processing", "common"))
+            - None to return an Identity transformer
+            - slice(None) to include all steps
+
         verbose : bool, default False
-            Whether to print information about pipeline steps
+            Whether to print information about pipeline steps during creation
 
         Returns
         -------
         Pipeline
-            Scikit-learn pipeline configured with the selected steps
+            A scikit-learn Pipeline object configured with the selected steps and columns.
+            The pipeline will transform the data according to the processing steps defined
+            in pipe_dict.
+
+        Raises
+        ------
+        ValueError
+            If data is not set (self.data is None)
+
+        Examples
+        --------
+        >>> from tide import Plumber
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> # Create sample data
+        >>> data = pd.DataFrame(
+        ...     {
+        ...         "temp__°C__zone1": [20, 21, np.nan, 23],
+        ...         "humid__%HR__zone1": [50, 55, 60, np.nan],
+        ...         "power__kW__hvac": [1.5, 1.8, 1.6, 1.7],
+        ...     },
+        ...     index=pd.date_range("2023", freq="h", periods=4, tz="UTC"),
+        ... )
+        >>> # Define pipeline configuration
+        >>> pipe_dict = {
+        ...     "pre_processing": {
+        ...         "°C": [["ReplaceThreshold", {"upper": 25}]],
+        ...         "%HR": [["ReplaceThreshold", {"upper": 100}]],
+        ...     },
+        ...     "common": [["Interpolate", ["linear"]]],
+        ... }
+        >>> # Initialize Plumber
+        >>> plumber = Plumber(data, pipe_dict)
+        >>> # Get pipeline for temperature columns only
+        >>> temp_pipe = plumber.get_pipeline(select="°C")
+        >>> # Get pipeline for all columns with only pre-processing step
+        >>> pre_pipe = plumber.get_pipeline(steps="pre_processing")
+        >>> # Get pipeline for specific columns and steps
+        >>> custom_pipe = plumber.get_pipeline(
+        ...     select=["temp__°C__zone1", "power__kW__hvac"],
+        ...     steps=["pre_processing", "common"],
+        ... )
         """
         if self.data is None:
             raise ValueError("data is required to build a pipeline")
@@ -377,23 +460,84 @@ def get_corrected_data(
     ) -> pd.DataFrame:
         """Apply pipeline transformations to selected data.
 
+        This method applies the configured processing pipeline to the selected data columns
+        within the specified time range. It returns a new DataFrame with the transformed data.
+
         Parameters
         ----------
         select : str or pd.Index or list[str], optional
-            Data selection using tide's tag system
+            Data selection using tide's tag system. Can be:
+            - A single tag (e.g., "°C" to select all temperature columns)
+            - A full column name pattern (e.g., "temp__°C__zone1")
+            If None, selects all columns.
+
         start : str or datetime or Timestamp, optional
-            Start time for data slice
+            Start time for data slice. Can be:
+            - A string in ISO format (e.g., "2023-01-01")
+            - A datetime object
+            - A pandas Timestamp
+            If None, uses the first timestamp in the data.
+
         stop : str or datetime or Timestamp, optional
-            End time for data slice
+            End time for data slice. Can be:
+            - A string in ISO format (e.g., "2023-12-31")
+            - A datetime object
+            - A pandas Timestamp
+            If None, uses the last timestamp in the data.
+
         steps : None or str or list[str] or slice, default slice(None)
-            Pipeline steps to apply
+            Pipeline steps to apply. Can be:
+            - A single step name (e.g., "pre_processing")
+            - A list of step names (e.g., ["pre_processing", "common"])
+            - A slice object (e.g., slice("pre_processing", "common"))
+            - None to return an Identity transformer
+            - slice(None) to include all steps
+
         verbose : bool, default False
-            Whether to print information about pipeline steps
+            Whether to print information about pipeline steps during processing
 
         Returns
         -------
         pd.DataFrame
-            Transformed data
+            Transformed data for the selected columns, time range, and steps.
+
+        Raises
+        ------
+        ValueError
+            If data is not set (self.data is None)
+
+        Examples
+        --------
+        >>> from tide import Plumber
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> # Create sample data
+        >>> data = pd.DataFrame(
+        ...     {
+        ...         "temp__°C__zone1": [20, 21, np.nan, 23],
+        ...         "humid__%HR__zone1": [50, 55, 60, np.nan],
+        ...         "power__kW__hvac": [1.5, 1.8, 1.6, 1.7],
+        ...     },
+        ...     index=pd.date_range("2023", freq="h", periods=4, tz="UTC"),
+        ... )
+        >>> # Define pipeline configuration
+        >>> pipe_dict = {
+        ...     "pre_processing": {
+        ...         "°C": [["ReplaceThreshold", {"upper": 25}]],
+        ...         "%HR": [["ReplaceThreshold", {"upper": 100}]],
+        ...     },
+        ...     "common": [["Interpolate", ["linear"]]],
+        ... }
+        >>> # Initialize Plumber
+        >>> plumber = Plumber(data, pipe_dict)
+        >>> # Get corrected data for temperature columns only
+        >>> temp_data = plumber.get_corrected_data(select="°C")
+        >>> # Get corrected data for a specific time range
+        >>> time_slice = plumber.get_corrected_data(
+        ...     start="2023-01-01T00:00:00", stop="2023-01-01T12:00:00"
+        ... )
+        >>> # Get corrected data with specific steps
+        >>> pre_processed = plumber.get_corrected_data(
+        ...     select=["temp__°C__zone1", "power__kW__hvac"], steps="pre_processing"
+        ... )
         """
         if self.data is None:
             raise ValueError("Cannot get corrected data. data are missing")
@@ -416,27 +560,90 @@ def plot_gaps_heatmap(
     ):
         """Create a heatmap visualization of data gaps.
 
+        This method generates an interactive heatmap using plotly that shows the presence
+        and distribution of data gaps across different columns and time periods. The heatmap
+        helps identify patterns in missing data and potential data quality issues.
+
         Parameters
         ----------
         select : str or pd.Index or list[str], optional
-            Data selection using tide's tag system
+            Data selection using tide's tag system. Can be:
+            - A single tag (e.g., "°C" to select all temperature columns)
+            - A full column name pattern (e.g., "temp__°C__zone1")
+            If None, selects all columns.
+
         start : str or datetime or Timestamp, optional
-            Start time for visualization
+            Start time for visualization. Can be:
+            - A string in ISO format (e.g., "2023-01-01")
+            - A datetime object
+            - A pandas Timestamp
+            If None, uses the first timestamp in the data.
+
         stop : str or datetime or Timestamp, optional
-            End time for visualization
+            End time for visualization. Can be:
+            - A string in ISO format (e.g., "2023-12-31")
+            - A datetime object
+            - A pandas Timestamp
+            If None, uses the last timestamp in the data.
+
         steps : None or str or list[str] or slice, default slice(None)
-            Pipeline steps to apply before visualization
+            Pipeline steps to apply before visualization. Can be:
+            - A single step name (e.g., "pre_processing")
+            - A list of step names (e.g., ["pre_processing", "common"])
+            - A slice object (e.g., slice("pre_processing", "common"))
+            - None to return an Identity transformer
+            - slice(None) to include all steps
+
         time_step : str or Timedelta or timedelta, optional
-            Time step for aggregating gaps
+            Time step for aggregating gaps. Can be:
+            - A string (e.g., "1h", "1d", "1w")
+            - A timedelta object
+            - A pandas Timedelta
+            If None, uses the original data frequency.
+
         title : str, optional
-            Plot title
+            Plot title. If None, uses a default title based on the data selection.
+
         verbose : bool, default False
-            Whether to print information about pipeline steps
+            Whether to print information about pipeline steps during processing
 
         Returns
         -------
         go.Figure
-            Plotly figure object containing the heatmap
+            A plotly Figure object containing the heatmap with:
+            - Rows representing different columns
+            - Columns representing time periods
+            - Colors indicating presence (white) or absence (colored) of data
+            - Interactive features (zoom, pan, hover information)
+
+        Examples
+        --------
+        >>> from tide import Plumber
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> # Create sample data with gaps
+        >>> data = pd.DataFrame(
+        ...     {
+        ...         "temp__°C__zone1": [20, np.nan, 23, np.nan, 25],
+        ...         "humid__%HR__zone1": [50, 55, np.nan, 60, np.nan],
+        ...         "power__kW__hvac": [1.5, 1.8, 1.6, np.nan, 1.7],
+        ...     },
+        ...     index=pd.date_range("2023", freq="h", periods=5, tz="UTC"),
+        ... )
+        >>> # Initialize Plumber
+        >>> plumber = Plumber(data)
+        >>> # Create heatmap for all columns
+        >>> fig = plumber.plot_gaps_heatmap()
+        >>> fig.show()
+        >>> # Create heatmap for temperature data with daily aggregation
+        >>> fig = plumber.plot_gaps_heatmap(
+        ...     select="°C", time_step="1d", title="Temperature Data Gaps"
+        ... )
+        >>> fig.show()
+        >>> # Create heatmap for specific time range
+        >>> fig = plumber.plot_gaps_heatmap(
+        ...     start="2023-01-01T00:00:00", stop="2023-01-01T12:00:00"
+        ... )
+        >>> fig.show()
         """
         data = self.get_corrected_data(select, start, stop, steps, verbose)
         return plot_gaps_heatmap(data, time_step=time_step, title=title)
@@ -469,65 +676,157 @@ def plot(
     ):
         """Create an interactive time series plot.
 
-        Creates a highly customizable plot that can show:
+        This method generates a highly customizable interactive plot using plotly that can show:
         - Multiple time series with automatic different y-axes based on unit
         - Two different versions of the data (e.g., raw and processed)
-        - Data gaps visualization
-        - Custom styling and layout
+        - Data gaps visualization with customizable colors and opacity
+        - Custom styling and layout options
 
         Parameters
         ----------
         select : str or pd.Index or list[str], optional
-            Data selection using tide's tag system
+            Data selection using tide's tag system. Can be:
+            - A single tag (e.g., "°C" to select all temperature columns)
+            - A full column name pattern (e.g., "temp__°C__zone1")
+            If None, selects all columns.
+
         start : str or datetime or Timestamp, optional
-            Start time for plot
+            Start time for plot. Can be:
+            - A string in ISO format (e.g., "2023-01-01")
+            - A datetime object
+            - A pandas Timestamp
+            If None, uses the first timestamp in the data.
+
         stop : str or datetime or Timestamp, optional
-            End time for plot
+            End time for plot. Can be:
+            - A string in ISO format (e.g., "2023-12-31")
+            - A datetime object
+            - A pandas Timestamp
+            If None, uses the last timestamp in the data.
+
+        y_axis_level : str, optional
-            Tag level to use for y-axis grouping
+            Tag level to use for y-axis grouping. Can be:
+            - "unit" to group by measurement unit
+            - "bloc" to group by data bloc
+            - "sub_bloc" to group by sub-bloc
+            If None, uses a single y-axis for all data.
+
         y_tag_list : list[str], optional
-            List of tags for custom y-axis ordering
+            List of tags for custom y-axis ordering. The order of tags in this list
+            determines the order of y-axes from left to right.
+
         steps : None or str or list[str] or slice, default slice(None)
-            Pipeline steps to apply for main data
+            Pipeline steps to apply for main data. Can be:
+            - A single step name (e.g., "pre_processing")
+            - A list of step names (e.g., ["pre_processing", "common"])
+            - A slice object (e.g., slice("pre_processing", "common"))
+            - None to return an Identity transformer
+            - slice(None) to include all steps
+
         data_mode : str, default "lines"
-            Plot mode for main data ("lines", "markers", or "lines+markers")
+            Plot mode for main data. Can be:
+            - "lines" for line plots
+            - "markers" for scatter plots
+            - "lines+markers" for combined line and marker plots
+
         steps_2 : None or str or list[str] or slice, optional
-            Pipeline steps to apply for secondary data
+            Pipeline steps to apply for secondary data. Used to compare different
+            processing steps or versions of the data.
+
         data_2_mode : str, default "markers"
-            Plot mode for secondary data
+            Plot mode for secondary data. Same options as data_mode.
+
         markers_opacity : float, default 0.8
-            Opacity for markers
+            Opacity for markers (0.0 to 1.0)
+
         lines_width : float, default 2.0
-            Width of plot lines
+            Width of plot lines in pixels
+
         title : str, optional
-            Plot title
+            Plot title. If None, uses a default title based on the data selection.
+
         plot_gaps : bool, default False
             Whether to highlight gaps in main data
+
         gaps_lower_td : str or Timedelta or timedelta, optional
-            Minimum duration for gap highlighting
+            Minimum duration for gap highlighting. Can be:
+            - A string (e.g., "1h", "1d")
+            - A timedelta object
+            - A pandas Timedelta
+
         gaps_rgb : tuple[int, int, int], default (31, 73, 125)
-            RGB color for main data gaps
+            RGB color for main data gaps (0-255 range)
+
         gaps_alpha : float, default 0.5
-            Opacity for main data gaps
+            Opacity for main data gaps (0.0 to 1.0)
+
         plot_gaps_2 : bool, default False
             Whether to highlight gaps in secondary data
+
         gaps_2_lower_td : str or Timedelta or timedelta, optional
            Minimum duration for secondary data gap highlighting
+
         gaps_2_rgb : tuple[int, int, int], default (254, 160, 34)
-            RGB color for secondary data gaps
+            RGB color for secondary data gaps (0-255 range)
+
         gaps_2_alpha : float, default 0.5
-            Opacity for secondary data gaps
+            Opacity for secondary data gaps (0.0 to 1.0)
+
         axis_space : float, default 0.03
-            Space between multiple y-axes
+            Space between multiple y-axes (0.0 to 1.0)
+
         y_title_standoff : int or float, default 5
-            Distance between y-axis title and axis
+            Distance between y-axis title and axis in pixels
+
         verbose : bool, default False
-            Whether to print information about pipeline steps
+            Whether to print information about pipeline steps during processing
 
         Returns
         -------
         go.Figure
-            Plotly figure object containing the plot
+            A plotly Figure object containing the plot with:
+            - Multiple y-axes if y_axis_level is specified
+            - Interactive features (zoom, pan, hover information)
+            - Legend with all series
+            - Optional gap highlighting
+            - Customizable styling
+
+        Examples
+        --------
+        >>> from tide import Plumber
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> # Create sample data
+        >>> data = pd.DataFrame(
+        ...     {
+        ...         "temp__°C__zone1": [20, 21, np.nan, 23],
+        ...         "humid__%HR__zone1": [50, 55, 60, np.nan],
+        ...         "power__kW__hvac": [1.5, 1.8, 1.6, 1.7],
+        ...     },
+        ...     index=pd.date_range("2023", freq="h", periods=4, tz="UTC"),
+        ... )
+        >>> # Initialize Plumber
+        >>> plumber = Plumber(data)
+        >>> # Create basic plot with automatic y-axes
+        >>> fig = plumber.plot(y_axis_level="unit")
+        >>> fig.show()
+        >>> # Create plot with custom styling and gap highlighting
+        >>> fig = plumber.plot(
+        ...     select=["temp__°C__zone1", "power__kW__hvac"],
+        ...     data_mode="lines+markers",
+        ...     plot_gaps=True,
+        ...     gaps_lower_td="1h",
+        ...     title="Temperature and Power Data",
+        ... )
+        >>> fig.show()
+        >>> # Create plot comparing raw and processed data
+        >>> fig = plumber.plot(
+        ...     steps="pre_processing",
+        ...     steps_2=None,
+        ...     data_mode="lines",
+        ...     data_2_mode="markers",
+        ...     title="Raw vs Processed Data",
+        ... )
+        >>> fig.show()
         """
         # A bit dirty. Here we assume that if you ask a selection
         # that is not found in original data columns, it is because it
diff --git a/tide/processing.py b/tide/processing.py
index c128b1b..f95a6d3 100644
--- a/tide/processing.py
+++ b/tide/processing.py
@@ -41,46 +41,39 @@
 
 class Identity(BaseProcessing):
-    """
-    A custom transformer that returns the input data without any modifications.
-
-    This transformer is useful when you want to include an identity transformation step
-    in a scikit-learn pipeline, where the input data should be returned unchanged.
+    """A transformer that returns input data unchanged.
 
-    Parameters:
-    -----------
+    Parameters
+    ----------
     None
 
+    Attributes
+    ----------
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns (same as input).
+
+    Methods
+    -------
+    fit(X, y=None)
+        No-op, returns self.
+    transform(X)
+        Returns input unchanged.
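+
+    A typical use is as an explicit no-op step in a scikit-learn pipeline
+    (an illustrative sketch, not taken from the test suite):
+
+    >>> from sklearn.pipeline import Pipeline
+    >>> pipe = Pipeline([("noop", Identity())])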
+
+    Examples
     --------
-    fit(X, y=None):
-        This method does nothing and simply returns the transformer instance.
-
-        Parameters:
-        -----------
-        X : array-like, shape (n_samples, n_features)
-            The input data.
-
-        y : array-like, shape (n_samples,), optional (default=None)
-            The target values.
-
-        Returns:
-        --------
-        self : object
-            The transformer instance itself.
-
-    transform(X):
-        This method returns the input data without any modifications.
-
-        Parameters:
-        -----------
-        X : array-like, shape (n_samples, n_features)
-            The input data.
-
-        Returns:
-        --------
-        transformed_X : array-like, shape (n_samples, n_features)
-            The input data without any modifications.
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({"temp__°C": [20, 21, 22], "humid__%": [45, 50, 55]})
+    >>> identity = Identity()
+    >>> result = identity.fit_transform(df)
+    >>> assert (result == df).all().all()  # Data unchanged
+    >>> assert list(result.columns) == list(df.columns)  # Column order preserved
+
+    Returns
+    -------
+    pd.DataFrame
+        The input data without any modifications.
     """
 
     def __init__(self):
@@ -94,28 +87,60 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):
 
 class ReplaceDuplicated(BaseProcessing):
-    """This transformer replaces duplicated values in each column by
-    specified new value.
+    """A transformer that replaces duplicated values in each column with a specified value.
+
+    This transformer identifies and replaces duplicated values in each column
+    of a pandas DataFrame, keeping either the first, last, or no occurrence
+    of duplicated values.
 
     Parameters
     ----------
     keep : str, default 'first'
         Specify which of the duplicated (if any) value to keep.
-        Allowed arguments : ‘first’, ‘last’, False.
+        Allowed arguments : 'first', 'last', False.
+        - 'first': Keep first occurrence of duplicated values
+        - 'last': Keep last occurrence of duplicated values
+        - False: Keep no occurrence (replace all duplicates)
+
+    value : float, default np.nan
+        Value used to replace the non-kept duplicated values.
 
     Attributes
     ----------
-    value : str, default np.nan
-        value used to replace not kept duplicated.
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns (same as input).
 
-    Methods
-    -------
-    fit(X, y=None)
-        Returns self.
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {"temp__°C": [20, 20, 22, 22, 23], "humid__%": [45, 45, 50, 50, 55]},
+    ...     index=dates,
+    ... )
+    >>> # Keep first occurrence of duplicates
+    >>> replacer = ReplaceDuplicated(keep="first", value=np.nan)
+    >>> result = replacer.fit_transform(df)
+    >>> print(result)
+                               temp__°C  humid__%
+    2024-01-01 00:00:00+00:00      20.0      45.0
+    2024-01-01 00:01:00+00:00       NaN       NaN
+    2024-01-01 00:02:00+00:00      22.0      50.0
+    2024-01-01 00:03:00+00:00       NaN       NaN
+    2024-01-01 00:04:00+00:00      23.0      55.0
 
-    transform(X)
-        Drops the duplicated values in the Pandas DataFrame `X`
-        Returns the DataFrame with the duplicated filled with 'value'
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with duplicated values replaced according to the specified strategy.
+        The output maintains the same DateTimeIndex as the input.
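+
+    Notes
+    -----
+    The ``keep`` semantics mirror :meth:`pandas.Series.duplicated`. An
+    equivalent per-column sketch (an assumption for illustration, not
+    necessarily the actual implementation):
+
+    >>> s = df["temp__°C"]
+    >>> s.mask(s.duplicated(keep="first"), np.nan)  # doctest: +SKIP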
""" def __init__(self, keep="first", value=np.nan): @@ -133,30 +158,62 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class Dropna(BaseProcessing): - """A class to drop NaN values in a Pandas DataFrame. + """A transformer that removes rows containing missing values from a DataFrame. + + This transformer removes rows from a DataFrame based on the presence of + missing values (NaN) according to the specified strategy. Parameters ---------- how : str, default 'all' - How to drop missing values in the data. 'all' drops the row/column if - all the values are missing, 'any' drops the row/column if any value is - missing, and a number 'n' drops the row/column if there are at least - 'n' missing values. + How to drop missing values in the data: + - 'all': Drop row if all values are missing + - 'any': Drop row if any value is missing + - int: Drop row if at least this many values are missing Attributes ---------- - how : str - How to drop missing values in the data. + feature_names_in_ : list[str] + Names of input columns (set during fit). + feature_names_out_ : list[str] + Names of output columns (same as input). - Methods - ------- - fit(X, y=None) - Returns self. + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> # Create DataFrame with DateTimeIndex + >>> dates = pd.date_range( + ... start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min" + ... ).tz_localize("UTC") + >>> df = pd.DataFrame( + ... { + ... "temp__°C": [20, np.nan, 22, np.nan, np.nan], + ... "humid__%": [45, 50, np.nan, np.nan, np.nan], + ... }, + ... index=dates, + ... ) + >>> # Drop rows where all values are missing + >>> dropper = Dropna(how="all") + >>> result = dropper.fit_transform(df) + >>> print(result) + temp__°C humid__% + 2024-01-01 00:00:00+00:00 20.0 45.0 + 2024-01-01 00:01:00+00:00 NaN 50.0 + 2024-01-01 00:02:00+00:00 22.0 NaN + >>> # Drop rows with any missing value + >>> dropper_strict = Dropna(how="any") + >>> result_strict = dropper_strict.fit_transform(df) + >>> print(result_strict) + temp__°C humid__% + 2024-01-01 00:00:00+00:00 20.0 45.0 - transform(X) - Drops the NaN values in the Pandas DataFrame `X` based on the `how` - attribute. - Returns the DataFrame with the NaN values dropped. + Returns + ------- + pd.DataFrame + The DataFrame with rows containing missing values removed according to + the specified strategy. The output maintains the same DateTimeIndex + structure as the input, with rows removed. """ def __init__(self, how="all"): @@ -171,32 +228,59 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class RenameColumns(BaseProcessing): - """ - Scikit-learn transformer that renames columns of a Pandas DataFrame. + """A transformer that renames columns in a DataFrame. + + This transformer allows renaming DataFrame columns either by providing a list + of new names in the same order as the current columns, or by providing a + dictionary mapping old names to new names. Parameters ---------- - new_names: list or dict - A list or a dictionary of new names for columns of a DataFrame. - If it is a list, it must have the same length as the number of columns - in the DataFrame. If it is a dictionary, keys must be the old names of - columns and values must be the new names. + new_names : list[str] | dict[str, str] + New names for the columns. Can be specified in two ways: + - list[str]: List of new names in the same order as current columns. + Must have the same length as the number of columns. 
+        - dict[str, str]: Dictionary mapping old column names to new names.
+          Keys must be existing column names, values are the new names.
 
     Attributes
     ----------
-    new_names: list or dict
-        A list or a dictionary of new names for columns of a DataFrame.
-
-    Methods
-    -------
-    fit(self, x, y=None)
-        No learning is performed, the method simply returns self.
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns after renaming.
 
-    transform(self, x)
-        Renames columns of a DataFrame.
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:02:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {"temp__°C": [20, 21, 22], "humid__%": [45, 50, 55]}, index=dates
+    ... )
+    >>> # Rename using a list (maintains order)
+    >>> renamer_list = RenameColumns(["temperature__°C", "humidity__%"])
+    >>> result_list = renamer_list.fit_transform(df)
+    >>> print(result_list)
+                               temperature__°C  humidity__%
+    2024-01-01 00:00:00+00:00               20           45
+    2024-01-01 00:01:00+00:00               21           50
+    2024-01-01 00:02:00+00:00               22           55
+    >>> # Rename using a dictionary (selective renaming)
+    >>> renamer_dict = RenameColumns({"temp__°C": "temperature__°C"})
+    >>> result_dict = renamer_dict.fit_transform(df)
+    >>> print(result_dict)
+                               temperature__°C  humid__%
+    2024-01-01 00:00:00+00:00               20        45
+    2024-01-01 00:01:00+00:00               21        50
+    2024-01-01 00:02:00+00:00               22        55
 
-    inverse_transform(self, x)
-        Renames columns of a DataFrame.
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with renamed columns.
     """
 
     def __init__(self, new_names: list[str] | dict[str, str]):
@@ -224,35 +308,61 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):
 
 class SkTransform(BaseProcessing):
-    """A transformer class to apply scikit transformers on a pandas DataFrame
+    """A transformer that applies scikit-learn transformers to a pandas DataFrame.
 
-    This class takes in a scikit-learn transformers as input and applies the
-    transformer to a pandas DataFrame. The resulting data will be a pandas
-    DataFrame with the same index and columns as the input DataFrame.
+    This transformer wraps any scikit-learn transformer and applies it to a pandas
+    DataFrame while preserving the DataFrame's index and column structure. It is
+    particularly useful when you want to use scikit-learn's preprocessing tools
+    (like StandardScaler, MinMaxScaler, etc.) while maintaining the time series
+    nature of your data.
 
     Parameters
     ----------
     transformer : object
-        A scikit-learn transformer to apply on the data.
+        A scikit-learn transformer to apply on the data. Must implement fit(),
+        transform(), and optionally inverse_transform() methods.
 
     Attributes
     ----------
-    transformer : object
-        A scikit-learn transformer that is fitted on the data.
+    transformer_ : object
+        The fitted scikit-learn transformer.
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns (same as input).
 
-    Methods
-    -------
-    fit(x, y=None)
-        Fit the scaler to the input data `x`
-
-    transform(x)
-        Apply the transformer to the input data `x` and return the result
-        as a pandas DataFrame.
-
-    inverse_transform(x)
-        Apply the inverse transformer to the input data `x` and return the
-        result as a pandas DataFrame.
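+    Round-tripping requires the wrapped transformer to implement
+    ``inverse_transform``. Conceptually, the wrapper re-attaches the pandas
+    index and columns around the raw scikit-learn output (a sketch of the
+    idea, not the actual implementation)::
+
+        pd.DataFrame(transformer.transform(X), index=X.index, columns=X.columns)
+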
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:02:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {"temp__°C": [20, 21, 22], "humid__%": [45, 50, 55]}, index=dates
+    ... )
+    >>> # Apply StandardScaler while preserving DataFrame structure
+    >>> sk_transform = SkTransform(StandardScaler())
+    >>> result = sk_transform.fit_transform(df)
+    >>> print(result)
+                               temp__°C  humid__%
+    2024-01-01 00:00:00+00:00 -1.224745 -1.224745
+    2024-01-01 00:01:00+00:00  0.000000  0.000000
+    2024-01-01 00:02:00+00:00  1.224745  1.224745
+    >>> # Inverse transform to get back original values
+    >>> original = sk_transform.inverse_transform(result)
+    >>> print(original)
+                               temp__°C  humid__%
+    2024-01-01 00:00:00+00:00      20.0      45.0
+    2024-01-01 00:01:00+00:00      21.0      50.0
+    2024-01-01 00:02:00+00:00      22.0      55.0
+
+    Returns
+    -------
+    pd.DataFrame
+        The transformed DataFrame with the same index and column structure as the input.
+        The values are transformed according to the specified scikit-learn transformer.
     """
 
     def __init__(self, transformer):
@@ -278,22 +388,59 @@ def inverse_transform(self, X: pd.Series | pd.DataFrame):
 
 class ReplaceThreshold(BaseProcessing):
-    """Class replacing values in a pandas DataFrame by "value" based on
-    threshold values.
+    """A transformer that replaces values in a DataFrame based on threshold values.
 
-    This class implements the scikit-learn transformer API and can be used in
-    a scikit-learn pipeline.
+    This transformer replaces values in a DataFrame that fall outside specified
+    upper and lower thresholds with a given replacement value. It is useful for
+    handling outliers or extreme values in time series data.
 
     Parameters
     ----------
     upper : float, optional (default=None)
-        The upper threshold for values in the DataFrame. Values greater than
-        The upper threshold for values in the DataFrame. Values greater than
-        this threshold will be replaced.
+        The upper threshold value. Values greater than this threshold will be
+        replaced with the specified value.
     lower : float, optional (default=None)
-        The lower threshold for values in the DataFrame. Values less than
-        this threshold will be replaced.
-    value : (default=np.nan)The value to replace the targeted values in X DataFrame
+        The lower threshold value. Values less than this threshold will be
+        replaced with the specified value.
+    value : float, optional (default=np.nan)
+        The value to use for replacing values that fall outside the thresholds.
+
+    Attributes
+    ----------
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns (same as input).
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {"temp__°C": [20, 25, 30, 35, 40], "humid__%": [45, 50, 55, 60, 65]},
+    ...     index=dates,
+    ... )
+    >>> # Replace values outside thresholds with NaN
+    >>> replacer = ReplaceThreshold(upper=35, lower=20, value=np.nan)
+    >>> result = replacer.fit_transform(df)
+    >>> print(result)
+                               temp__°C  humid__%
+    2024-01-01 00:00:00+00:00      20.0       NaN
+    2024-01-01 00:01:00+00:00      25.0       NaN
+    2024-01-01 00:02:00+00:00      30.0       NaN
+    2024-01-01 00:03:00+00:00      35.0       NaN
+    2024-01-01 00:04:00+00:00       NaN       NaN
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with values outside the specified thresholds replaced
+        with the given value. The output maintains the same DateTimeIndex
+        and column structure as the input.
     """
 
     def __init__(self, upper=None, lower=None, value=np.nan):
@@ -330,7 +477,7 @@ class DropTimeGradient(BaseProcessing):
     A transformer that removes values in a DataFrame based on the time gradient.
 
     The time gradient is calculated as the difference of consecutive values in
-    the time series divided by the time delta between each value.
+    the time series divided by the time delta between each value (in seconds).
     If the gradient is below the `lower_rate` or above the `upper_rate`,
     then the value is set to NaN.
 
@@ -339,27 +486,68 @@ class DropTimeGradient(BaseProcessing):
     dropna : bool, default=True
         Whether to remove NaN values from the DataFrame before processing.
     upper_rate : float, optional
-        The upper rate threshold. If the gradient is greater than or equal to
+        The upper rate threshold in units of value/second. If the gradient is greater than or equal to
         this value, the value will be set to NaN.
+        Example: For a temperature change of 5°C per minute, set upper_rate=5/60 ≈ 0.083
     lower_rate : float, optional
-        The lower rate threshold. If the gradient is less than or equal to
-        this value, the value will be set to NaN.
+        The lower rate threshold in units of value/second. If the gradient is less than or equal to
+        this value, the value will be set to NaN.
+        Example: For a pressure change of 100 Pa per minute, set lower_rate=100/60 ≈ 1.67
 
     Attributes
     ----------
-    None
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns (same as input).
 
-    Methods
-    -------
-    fit(X, y=None)
-        No learning is performed, the method simply returns self.
-    transform(X)
-        Removes values in the DataFrame based on the time gradient.
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> import numpy as np
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "temp__°C": [20, 25, 30, 35, 40],  # Steady increase of 5°C/min
+    ...         "humid__%": [45, 45, 45, 45, 45],  # Constant
+    ...         "press__Pa": [1000, 1000, 900, 1000, 1000],  # Sudden change
+    ...     },
+    ...     index=dates,
+ ... )
+ >>> # Remove values with gradients outside thresholds
+ >>> # Temperature rises steadily at 5°C/min = 5/60 ≈ 0.083°C/s, so an
+ >>> # upper_rate just above that (0.09) keeps the steady ramp
+ >>> # The pressure step of 100 Pa/min = 100/60 ≈ 1.67 Pa/s exceeds it
+ >>> dropper = DropTimeGradient(upper_rate=0.09, lower_rate=0.001)
+ >>> result = dropper.fit_transform(df)
+ >>> print(result)
+ temp__°C humid__% press__Pa
+ 2024-01-01 00:00:00+00:00 20.0 45.0 1000.0
+ 2024-01-01 00:01:00+00:00 25.0 NaN 1000.0
+ 2024-01-01 00:02:00+00:00 30.0 NaN NaN
+ 2024-01-01 00:03:00+00:00 35.0 NaN 1000.0
+ 2024-01-01 00:04:00+00:00 40.0 45.0 1000.0
+
+ Notes
+ -----
+ - The gradient is calculated as (value2 - value1) / (time2 - time1 in seconds)
+ - For the upper_rate threshold, both the current and next gradient must exceed
+ the threshold for a value to be removed
+ - For the lower_rate threshold, only the current gradient needs to be below
+ the threshold for a value to be removed
+ - NaN values are handled according to the dropna parameter:
+ - If True (default): NaN values are removed before processing
+ - If False: NaN values are kept and may affect gradient calculations
+ - The rate parameters (upper_rate and lower_rate) must be specified in units of
+ value/second. To convert from per-minute rates, divide by 60.
Returns
-------
- DataFrame
- The transformed DataFrame.
+ pd.DataFrame
+ The DataFrame with values removed based on their time gradients.
+ The output maintains the same DateTimeIndex and column structure as the input.
"""

def __init__(self, dropna=True, upper_rate=None, lower_rate=None):

@@ -413,27 +601,69 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class ApplyExpression(BaseProcessing):
- """A transformer class to apply a mathematical expression on a Pandas
- DataFrame.
+ """A transformer that applies a mathematical expression to a pandas DataFrame.
- This class implements a transformer that can be used to apply a
- mathematical expression to a Pandas DataFrame.
- The expression can be any valid Python expression that
- can be evaluated using the `eval` function.
+ This transformer allows you to apply any valid Python mathematical expression
+ to a pandas DataFrame. The expression is evaluated using pandas' `eval` function,
+ which provides efficient evaluation of mathematical expressions.
Parameters
----------
expression : str
- A string representing a valid Python expression.
- The expression can use any variables defined in the local scope,
- including the `X` variable that is passed to the `transform` method
- as the input data.
+ A string representing a valid Python mathematical expression.
+ The expression can use the input DataFrame `X` as a variable.
+ Common operations include:
+ - Basic arithmetic: +, -, *, /, **, %
+ - Comparison: >, <, >=, <=, ==, !=
+ - Boolean operations: &, |, ~
+ - Mathematical functions: abs(), sqrt(), pow(), etc.
+ Example: "X * 2" or "X / 1000" or "X ** 2"
+
+ new_unit : str, optional (default=None)
+ The new unit to apply to the column names after transformation.
+ If provided, the transformer will update the unit part of the column names
+ (the part after the first "__" in the Tide naming convention).
+ Example: If input columns are "power__W__building" and new_unit="kW",
+ output columns will be "power__kW__building".
- Attributes
- ----------
- expression : str
- The mathematical expression that will be applied to the input data.
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> # Create DataFrame with DateTimeIndex
+ >>> dates = pd.date_range(
+ ... start="2024-01-01 00:00:00", end="2024-01-01 00:02:00", freq="1min"
+ ... ).tz_localize("UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "power__W__building": [1000, 2000, 3000],
+ ... },
+ ... index=dates,
+ ... )
+ >>> # Convert power from W to kW
+ >>> transformer = ApplyExpression("X / 1000", "kW")
+ >>> result = transformer.fit_transform(df)
+ >>> print(result)
+ power__kW__building
+ 2024-01-01 00:00:00+00:00 1.0
+ 2024-01-01 00:01:00+00:00 2.0
+ 2024-01-01 00:02:00+00:00 3.0
+
+ Notes
+ -----
+ - The expression is evaluated using pandas' `eval` function, which is optimized
+ for numerical operations on DataFrames.
+ - The input DataFrame `X` is available in the expression context.
+ - When using `new_unit`, the transformer follows the Tide naming convention
+ of "name__unit__block" for column names.
+ - The transformer preserves the DataFrame's index and column structure.
+ - All mathematical operations are applied element-wise to the DataFrame.
+
+ Returns
+ -------
+ pd.DataFrame
+ The transformed DataFrame with the mathematical expression applied to all values.
+ If new_unit is specified, the column names are updated accordingly.
"""

def __init__(self, expression: str, new_unit: str = None):

@@ -455,29 +685,67 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class TimeGradient(BaseProcessing):
- """
- A class to calculate the time gradient of a pandas DataFrame,
- which is the derivative of the data with respect to time.
+ """A transformer that calculates the time gradient (derivative) of a pandas DataFrame.
- Parameters
- ----------
- dropna : bool, optional (default=True)
- Whether to drop NaN values before calculating the time gradient.
+ This transformer computes the rate of change of values with respect to time.
+ The gradient is calculated using the time difference between consecutive data points.
- Attributes
+ Parameters
----------
- dropna : bool
- The dropna attribute of the class.
+ new_unit : str, optional (default=None)
+ The new unit to apply to the column names after transformation.
+ If provided, the transformer will update the unit part of the column names
+ (the part after the first "__" in the Tide naming convention).
+ Example: If input columns are "energy__J__building" and new_unit="W",
+ output columns will be "energy__W__building".
- Methods
- -------
- fit(X, y=None)
- Fits the transformer to the data. Does not modify the input data.
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> # Create DataFrame with DateTimeIndex
+ >>> dates = pd.date_range(
+ ... start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+ ... ).tz_localize("UTC")
+ >>> # Create energy meter data (in Joules) with constant consumption
+ >>> df = pd.DataFrame(
+ ... {
+ ... "energy__J__building": [
+ ... 0, # Start at 0 J
+ ... 360000, # 360 kJ (0.1 kWh)
+ ... 720000, # 720 kJ
+ ... 1080000, # 1080 kJ
+ ... 1440000, # 1440 kJ
+ ... ]
+ ... },
+ ... index=dates,
+ ... )
+ >>> # Calculate power (W) from energy (J) using time gradient
+ >>> # Power = energy difference / elapsed time (in seconds)
+ >>> transformer = TimeGradient(new_unit="W")
+ >>> result = transformer.fit_transform(df)
+ >>> print(result)
+ energy__W__building
+ 2024-01-01 00:00:00+00:00 NaN
+ 2024-01-01 00:01:00+00:00 6000.0
+ 2024-01-01 00:02:00+00:00 6000.0
+ 2024-01-01 00:03:00+00:00 6000.0
+ 2024-01-01 00:04:00+00:00 6000.0
- transform(X)
- Transforms the input data by calculating the time gradient of
- the data.
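+ >>> # Illustrative cross-check (not part of the transformer's API): the
+ >>> # same rate can be recovered manually by differencing the energy
+ >>> # column and dividing by the 60 s sampling step
+ >>> manual = df["energy__J__building"].diff() / 60
+ >>> print(manual.iloc[1])
+ 6000.0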
+ Notes
+ -----
+ - The time gradient is calculated as (value2 - value1) / (time2 - time1 in seconds)
+ - The first value in each column will be NaN since there is no preceding
+ observation to difference against
+ - When using new_unit, the transformer follows the Tide naming convention
+ of "name__unit__block" for column names
+
+ Returns
+ -------
+ pd.DataFrame
+ The DataFrame with time gradients calculated for each column.
+ The output maintains the same DateTimeIndex as the input.
+ If new_unit is specified, the column names are updated accordingly.
"""

def __init__(self, new_unit: str = None):

@@ -500,24 +768,84 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class Ffill(BaseFiller, BaseProcessing):
- """
- A class to front-fill missing values in a Pandas DataFrame.
- the limit argument allows the function to stop frontfilling at a certain
- number of missing value
-
- Parameters:
- limit: int, default None If limit is specified, this is the maximum number
- of consecutive NaN values to forward/backward fill.
- In other words, if there is a gap with more than this number of consecutive
- NaNs, it will only be partially filled.
- If limit is not specified, this is the maximum number of entries along
- the entire axis where NaNs will be filled. Must be greater than 0 if not None.
-
- Methods:
- fit(self, X, y=None):
- Does nothing. Returns the object itself.
- transform(self, X):
- Fill missing values in the input DataFrame.
+ """A transformer that forward-fills missing values in a pandas DataFrame.
+
+ This transformer fills missing values (NaN) in a DataFrame by propagating
+ the last valid observation forward. It is particularly useful when past
+ values are more relevant for filling gaps than future values.
+
+ Parameters
+ ----------
+ limit : int, optional (default=None)
+ The maximum number of consecutive NaN values to forward-fill in each gap;
+ longer gaps are filled only partially. Must be greater than 0 if not None.
+ Example: If limit=2, a gap of 3 or more NaN values will only be
+ partially filled.
+
+ gaps_lte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only fill gaps with duration less than or equal to this value.
+
+ gaps_gte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only fill gaps with duration greater than or equal to this value.
+
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> # Create DataFrame with DateTimeIndex
+ >>> dates = pd.date_range(
+ ... start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+ ... ).tz_localize("UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "temp__°C__room": [20, np.nan, np.nan, 23, 24],
+ ... "press__Pa__room": [1000, np.nan, 900, np.nan, 1000],
+ ... },
+ ... index=dates,
+ ... )
+ >>> # Forward-fill all missing values
+ >>> filler = Ffill()
+ >>> result = filler.fit_transform(df)
+ >>> print(result)
+ temp__°C__room press__Pa__room
+ 2024-01-01 00:00:00+00:00 20.0 1000.0
+ 2024-01-01 00:01:00+00:00 20.0 1000.0
+ 2024-01-01 00:02:00+00:00 20.0 900.0
+ 2024-01-01 00:03:00+00:00 23.0 900.0
+ 2024-01-01 00:04:00+00:00 24.0 1000.0
+ >>> # Forward-fill with limit of 1
+ >>> filler_limited = Ffill(limit=1)
+ >>> result_limited = filler_limited.fit_transform(df)
+ >>> print(result_limited)
+ temp__°C__room press__Pa__room
+ 2024-01-01 00:00:00+00:00 20.0 1000.0
+ 2024-01-01 00:01:00+00:00 20.0 1000.0
+ 2024-01-01 00:02:00+00:00 NaN 900.0
+ 2024-01-01 00:03:00+00:00 23.0 900.0
+ 2024-01-01 00:04:00+00:00 24.0 1000.0
+ >>> # Forward-fill only gaps of 1 minute or less
+ >>> filler_timed = Ffill(gaps_lte="1min")
+ >>> result_timed = filler_timed.fit_transform(df)
+ >>> print(result_timed)
+ temp__°C__room press__Pa__room
+ 2024-01-01 00:00:00+00:00 20.0 1000.0
+ 2024-01-01 00:01:00+00:00 NaN 1000.0
+ 2024-01-01 00:02:00+00:00 NaN 900.0
+ 2024-01-01 00:03:00+00:00 23.0 900.0
+ 2024-01-01 00:04:00+00:00 24.0 1000.0
+
+ Notes
+ -----
+ - NaN values at the beginning of the time series will remain unfilled since
+ there are no past values to propagate
+
+ Returns
+ -------
+ pd.DataFrame
+ The DataFrame with missing values forward-filled according to the specified
+ parameters. The output maintains the same DateTimeIndex and column
+ structure as the input.
"""

def __init__(

@@ -547,24 +875,81 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class Bfill(BaseFiller, BaseProcessing):
- """
- A class to back-fill missing values in a Pandas DataFrame.
- the limit argument allows the function to stop backfilling at a certain
- number of missing value
-
- Parameters:
- limit: int, default None If limit is specified, this is the maximum number
- of consecutive NaN values to forward/backward fill.
- In other words, if there is a gap with more than this number of consecutive
- NaNs, it will only be partially filled.
- If limit is not specified, this is the maximum number of entries along
- the entire axis where NaNs will be filled. Must be greater than 0 if not None.
-
- Methods:
- fit(self, X, y=None):
- Does nothing. Returns the object itself.
- transform(self, X):
- Fill missing values in the input DataFrame.
+ """A transformer that back-fills missing values in a pandas DataFrame.
+
+ This transformer fills missing values (NaN) in a DataFrame by propagating
+ the next valid observation backward. It is particularly useful when future
+ values are more relevant for filling gaps than past values.
+
+ Parameters
+ ----------
+ limit : int, optional (default=None)
+ The maximum number of consecutive NaN values to back-fill in each gap;
+ longer gaps are filled only partially. Must be greater than 0 if not None.
+ Example: If limit=2, a gap of 3 or more NaN values will only be
+ partially filled.
+
+ gaps_lte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only fill gaps with duration less than or equal to this value.
+
+ gaps_gte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only fill gaps with duration greater than or equal to this value.
+
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> # Create DataFrame with DateTimeIndex
+ >>> dates = pd.date_range(
+ ... start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+ ... ).tz_localize("UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "temp__°C__room": [20, np.nan, np.nan, 23, 24],
+ ... "press__Pa__room": [1000, np.nan, 900, np.nan, 1000],
+ ... },
+ ... index=dates,
+ ... )
+ >>> # Back-fill all missing values
+ >>> filler = Bfill()
+ >>> result = filler.fit_transform(df)
+ >>> print(result)
+ temp__°C__room press__Pa__room
+ 2024-01-01 00:00:00+00:00 20.0 1000.0
+ 2024-01-01 00:01:00+00:00 23.0 900.0
+ 2024-01-01 00:02:00+00:00 23.0 900.0
+ 2024-01-01 00:03:00+00:00 23.0 1000.0
+ 2024-01-01 00:04:00+00:00 24.0 1000.0
+ >>> # Back-fill with limit of 1
+ >>> filler_limited = Bfill(limit=1)
+ >>> result_limited = filler_limited.fit_transform(df)
+ >>> print(result_limited)
+ temp__°C__room press__Pa__room
+ 2024-01-01 00:00:00+00:00 20.0 1000.0
+ 2024-01-01 00:01:00+00:00 NaN 900.0
+ 2024-01-01 00:02:00+00:00 23.0 900.0
+ 2024-01-01 00:03:00+00:00 23.0 1000.0
+ 2024-01-01 00:04:00+00:00 24.0 1000.0
+
+ Notes
+ -----
+ - The transformer fills NaN values by propagating the next valid observation
+ backward in time
+ - When limit is specified, at most that many consecutive NaN values are
+ filled per gap; longer gaps are filled only partially
+ - The gaps_lte and gaps_gte parameters allow filtering gaps based on their
+ duration before filling
+ - The transformer preserves the DataFrame's index and column structure
+ - NaN values at the end of the time series will remain unfilled since there
+ are no future values to propagate
+
+ Returns
+ -------
+ pd.DataFrame
+ The DataFrame with missing values back-filled according to the specified
+ parameters. The output maintains the same DateTimeIndex and column
+ structure as the input.
"""

def __init__(

@@ -592,28 +977,74 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):
X[gaps_mask] = filled_x[gaps_mask]
return X

- # https://stackoverflow.com/questions/34321025/replace-values-in-numpy-2d-array-based-on-pandas-dataframe
- # x_arr = np.array(X)
- # gaps_mask = self.get_gaps_mask(X)
- # gaps_idx_raveled = np.where(gaps_mask.to_numpy().ravel())[0]
- # x_arr.flat[gaps_idx_raveled] = filled_x.to_numpy().ravel()[gaps_idx_raveled]
- # return pd.DataFrame(data=x_arr, columns=X.columns, index=X.index)
-

class FillNa(BaseFiller, BaseProcessing):
"""
- A class that extends scikit-learn's TransformerMixin and BaseEstimator
- to fill missing values in a Pandas DataFrame.
-
- Parameters:
- value: scalar, dict, Series, or DataFrame
- Value(s) used to replace missing values.
-
- Methods:
- fit(self, X, y=None):
- Does nothing. Returns the object itself.
- transform(self, X):
- Fill missing values in the input DataFrame.
+ A transformer that fills missing values in a pandas DataFrame with a specified value.
+
+ Parameters
+ ----------
+ value : float
+ The value to use for filling missing values.
+
+ gaps_lte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only fill gaps with duration less than or equal to this value.
+
+ gaps_gte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only fill gaps with duration greater than or equal to this value.
+
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> from datetime import datetime, timedelta
+ >>> from tide.processing import FillNa

+ >>> # Create a DataFrame with missing values and timezone-aware index
+ >>> dates = pd.date_range(start="2024-01-01", periods=5, freq="1h", tz="UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "temperature__°C": [20.0, np.nan, np.nan, 22.0, 23.0],
+ ... "pressure__Pa": [1013.0, np.nan, 1015.0, np.nan, 1014.0],
+ ... },
+ ... index=dates,
+ ... )

+ >>> # Fill all missing values with 0
+ >>> filler = FillNa(value=0)
+ >>> df_filled = filler.fit_transform(df)
+ >>> print(df_filled)
+ temperature__°C pressure__Pa
+ 2024-01-01 00:00:00+00:00 20.0 1013.0
+ 2024-01-01 01:00:00+00:00 0.0 0.0
+ 2024-01-01 02:00:00+00:00 0.0 1015.0
+ 2024-01-01 03:00:00+00:00 22.0 0.0
+ 2024-01-01 04:00:00+00:00 23.0 1014.0

+ >>> # Fill only gaps of 1 hour or less with -999
+ >>> filler = FillNa(value=-999, gaps_lte="1h")
+ >>> df_filled = filler.fit_transform(df)
+ >>> print(df_filled)
+ temperature__°C pressure__Pa
+ 2024-01-01 00:00:00+00:00 20.0 1013.0
+ 2024-01-01 01:00:00+00:00 NaN -999.0
+ 2024-01-01 02:00:00+00:00 NaN 1015.0
+ 2024-01-01 03:00:00+00:00 22.0 -999.0
+ 2024-01-01 04:00:00+00:00 23.0 1014.0
+
+ Notes
+ -----
+ - When using gap duration parameters (gaps_lte or gaps_gte), only gaps within
+ the specified time ranges will be filled
+ - This transformer is particularly useful for:
+ - Replacing missing values with a known default value
+ - Handling sensor errors or invalid measurements
+
+ Returns
+ -------
+ pd.DataFrame
+ A DataFrame with missing values filled according to the specified parameters.
+ The output maintains the same structure and index as the input DataFrame.
"""

def __init__(

@@ -643,51 +1074,74 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class Interpolate(BaseFiller, BaseProcessing):
- """A class that implements interpolation of missing values in
- a Pandas DataFrame.
-
- This class is a transformer that performs interpolation of missing
- values in a Pandas DataFrame, using the specified `method`.
- It will interpolate the gaps of size greater or equal to gaps_gte OR less than
- or equal to gaps_lte.
-
- Parameters:
- -----------
- method : str or None, default None
- The interpolation method to use. If None, the default interpolation
- method of the Pandas DataFrame `interpolate()` method will be used.
- ["linear", "time", "index", "values", "nearest", "zero", "slinear",
- "quadratic", "cubic", "barycentric", "polynomial", "krogh",
- "piecewise_polynomial", "spline", "pchip", "akima", "cubicspline",
- "from_derivatives"]
-
- gaps_lte: str | pd.Timedelta | dt.timedelta: Interpolate gaps of size less or
- equal to gaps lte
-
- gaps_gte: str | pd.Timedelta | dt.timedelta: Interpolate gaps of size greater or
- equal to gaps lte
-
- Attributes:
- -----------
- columns : Index or None
- The columns of the input DataFrame. Will be set during fitting.
- index : Index or None
- The index of the input DataFrame. Will be set during fitting.
-
- Methods:
+ """
+ A transformer that interpolates missing values in a pandas DataFrame using various methods.
+
+ Parameters
+ ----------
+ method : str, default="linear"
+ The interpolation method to use.
+ A sample of useful available methods:
+ - "linear": Linear interpolation (default)
+ - "slinear": Spline interpolation of order 1
+ - "quadratic": Spline interpolation of order 2
+ - "cubic": Spline interpolation of order 3
+ - "barycentric": Barycentric interpolation
+ - "polynomial": Polynomial interpolation
+ - "krogh": Krogh interpolation
+ - "piecewise_polynomial": Piecewise polynomial interpolation
+ - "spline": Spline interpolation
+ - "pchip": Piecewise cubic Hermite interpolation
+ - "akima": Akima interpolation
+ - "cubicspline": Cubic spline interpolation
+ - "from_derivatives": Interpolation from derivatives
+
+ gaps_lte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only interpolate gaps with duration less than or equal to this value.
+
+ gaps_gte : str | pd.Timedelta | dt.timedelta, optional (default=None)
+ Only interpolate gaps with duration greater than or equal to this value.
+
+ Examples
--------
- fit(X, y=None):
- Fit the transformer to the input DataFrame X. This method will set
- the `columns` and `index` attributes of the transformer,
- and return the transformer instance.
- transform(X):
- Transform the input DataFrame X by performing interpolation of
- missing values using the
- specified `method`. Returns the transformed DataFrame.
-
- Returns:
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> from datetime import datetime, timedelta
+ >>> from tide.processing import Interpolate

+ >>> # Create a DataFrame with missing values and timezone-aware index
+ >>> dates = pd.date_range(start="2024-01-01", periods=5, freq="1h", tz="UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "temperature__°C": [20.0, np.nan, np.nan, 22.0, 23.0],
+ ... "pressure__Pa": [1013.0, np.nan, 1015.0, np.nan, 1014.0],
+ ... },
+ ... index=dates,
+ ... )

+ >>> # Linear interpolation of all missing values
+ >>> interpolator = Interpolate(method="linear")
+ >>> df_interpolated = interpolator.fit_transform(df)
+ >>> print(df_interpolated)
+ temperature__°C pressure__Pa
+ 2024-01-01 00:00:00+00:00 20.000000 1013.0
+ 2024-01-01 01:00:00+00:00 20.666667 1014.0
+ 2024-01-01 02:00:00+00:00 21.333333 1015.0
+ 2024-01-01 03:00:00+00:00 22.000000 1014.5
+ 2024-01-01 04:00:00+00:00 23.000000 1014.0
+
+ Notes
+ -----
+ - When using gap duration parameters (gaps_lte or gaps_gte), only gaps within
+ the specified time ranges will be interpolated
+ - Different interpolation methods may produce different results:
+ - Linear interpolation is simple but may not capture complex patterns
+ - Cubic interpolation provides smoother curves but may overshoot
+
+ Returns
-------
- A transformed Pandas DataFrame with interpolated missing values.
+ pd.DataFrame
+ A DataFrame with missing values interpolated according to the specified parameters.
+ The output maintains the same structure and index as the input DataFrame.
"""

def __init__(

@@ -713,29 +1167,88 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class Resample(BaseProcessing):
- """
- Resample time series data in a pandas DataFrame according to rule.
- Allow column wise resampling methods.
+ """A transformer that resamples time series data to a different frequency.
+
+ This transformer allows you to resample time series data to a different frequency
+ while applying specified aggregation methods. It supports both simple resampling
+ with a single method for all columns and custom methods for specific columns
+ using Tide's naming convention.
Parameters
----------
- rule : str
- The pandas timedelta or object representing the target resampling
- frequency.
- method : str | Callable
- The default method for resampling.
- It Will be overridden if a specific method
- is specified in columns_method
- tide_format_methods:
- Allow the use of tide column format name__unit__bloc to specify
- column aggregation method.
- Warning using this argument will override columns_methods argument.
- Requires fitting operation before transformation
- columns_methods : list of Tuples Optional
- List of tuples containing a list of column names and associated
- resampling method.
- The method should be a string or callable that can be passed
- to the `agg()` method of a pandas DataFrame.
+ rule : str | pd.Timedelta | dt.timedelta
+ The frequency to resample to. Can be specified as:
+ - String: '1min', '5min', '1h', '1D', etc.
+ - Timedelta object: pd.Timedelta('1 hour')
+ - datetime.timedelta object: dt.timedelta(hours=1)
+
+ method : str | Callable, default='mean'
+ The default aggregation method to use for resampling.
+ Can be:
+ - String: 'mean', 'sum', 'min', 'max', 'std', etc.
+ - Callable: Any function that can be used with pandas' resample
+
+ tide_format_methods : dict[str, str | Callable], optional (default=None)
+ A dictionary mapping Tide tag components to specific aggregation methods.
+ Keys are the components to match (name, unit, block, sub_block).
+ Values are the aggregation methods to use for matching columns.
+ Example: {'name': 'power', 'method': 'sum'} will use sum aggregation
+ for all columns with 'power' in their name.
+
+ columns_methods : list[tuple[list[str], str | Callable]], optional (default=None)
+ A list of tuples specifying custom methods for specific columns.
+ Each tuple contains:
+ - list[str]: List of column names to apply the method to
+ - str | Callable: The aggregation method to use
+ Example: [(['power__W__building'], 'sum')]
+
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> # Create DataFrame with DateTimeIndex
+ >>> dates = pd.date_range(
+ ... start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min"
+ ... ).tz_localize("UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "power__W__building": [1000, 1200, 1100, 1300, 1400],
+ ... "temp__°C__room": [20, 21, 22, 23, 24],
+ ... "humid__%__room": [45, 46, 47, 48, 49],
+ ... },
+ ... index=dates,
+ ... )
+ >>> # Resample to 5-minute intervals using mean
+ >>> # All five 1-minute samples fall into the single bin labeled 00:00
+ >>> resampler = Resample(rule="5min")
+ >>> result = resampler.fit_transform(df)
+ >>> print(result)
+ power__W__building temp__°C__room humid__%__room
+ 2024-01-01 00:00:00+00:00 1200.0 22.0 47.0
+ >>> # Resample with custom methods
+ >>> resampler_custom = Resample(
+ ... rule="5min",
+ ... tide_format_methods={"name": "power", "method": "min"},
+ ... columns_methods=[(["temp__°C__room"], "max")],
+ ... )
+
+ Notes
+ -----
+ - When using tide_format_methods, the matching is done on the Tide tag components
+ (name__unit__block__sub_block)
+ - If tide_format_methods is provided, it takes precedence over columns_methods
+ and completely replaces it during fitting
+ - If no custom method is specified for a column, the default method is used
+ - The output frequency is determined by the rule parameter
+ - Missing values in the input are handled according to the specified methods
+
+ Returns
+ -------
+ pd.DataFrame
+ The resampled DataFrame with the specified frequency and aggregation methods.
+ The output maintains the same column structure as the input, with values + aggregated according to the specified methods. """ def __init__( @@ -775,33 +1288,94 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class AddTimeLag(BaseProcessing): - """ - PdAddTimeLag - A transformer that adds lagged features to a pandas - DataFrame. + """A transformer that adds time-lagged features to a pandas DataFrame. - This transformer creates new features based on the provided features - lagged by the given time lag. + This transformer creates new features by shifting existing features in time, + allowing the creation of past or future values as new features. This is + particularly useful for time series analysis where historical or future + values might be relevant predictors. - Parameters: - ----------- - time_lag : datetime.timedelta - The time lag used to shift the provided features. A positive time lag - indicates that the new features will contain information from the past, - while a negative time lag indicates that the new features will - contain information from the future. - - features_to_lag : list of str or str or None, optional (default=None) - The list of feature names to lag. If None, all features in the input - DataFrame will be lagged. + Parameters + ---------- + time_lag : str | pd.Timedelta | dt.timedelta, default="1h" + The time lag to apply when creating new features. Can be specified as: + - A string (e.g., "1h", "30min", "1d") + - A pandas Timedelta object + - A datetime timedelta object + A positive time lag creates features with past values, while a negative + time lag creates features with future values. + + features_to_lag : str | list[str] | None, default=None + The features to create lagged versions of. If None, all features in the + input DataFrame will be lagged. Can be specified as: + - A single feature name (string) + - A list of feature names + - None (to lag all features) + + feature_marker : str | None, default=None + The prefix to use for the new lagged feature names. If None, the + string representation of time_lag followed by an underscore is used. + For example, with time_lag="1h", features will be prefixed with "1h_". + + drop_resulting_nan : bool, default=False + Whether to drop rows containing NaN values that result from the lag + operation. This is useful when you want to ensure complete data for + the lagged features. - feature_marker : str or None, optional (default=None) - The string used to prefix the names of the new lagged features. - If None, the feature names will be prefixed with the string - representation of the `time_lag` parameter followed by an underscore. + Examples + -------- + >>> import pandas as pd + >>> from tide.processing import AddTimeLag + >>> # Create sample data + >>> dates = pd.date_range(start="2024-01-01", periods=5, freq="1h", tz="UTC") + >>> df = pd.DataFrame( + ... { + ... "power__W__building": [100, 200, 300, 400, 500], + ... "temp__°C__room": [20, 21, 22, 23, 24], + ... }, + ... index=dates, + ... 
)
+ >>> # Add 1-hour lagged features
+ >>> lagger = AddTimeLag(time_lag="1h")
+ >>> result = lagger.fit_transform(df)
+ >>> print(result)
+ power__W__building temp__°C__room 1h_power__W__building 1h_temp__°C__room
+ 2024-01-01 00:00:00+00:00 100.0 20.0 NaN NaN
+ 2024-01-01 01:00:00+00:00 200.0 21.0 100.0 20.0
+ 2024-01-01 02:00:00+00:00 300.0 22.0 200.0 21.0
+ 2024-01-01 03:00:00+00:00 400.0 23.0 300.0 22.0
+ 2024-01-01 04:00:00+00:00 500.0 24.0 400.0 23.0
+ >>> # Add custom lagged features with specific marker
+ >>> lagger_custom = AddTimeLag(
+ ... time_lag="1h",
+ ... features_to_lag=["power__W__building"],
+ ... feature_marker="prev_",
+ ... drop_resulting_nan=True,
+ ... )
+ >>> result_custom = lagger_custom.fit_transform(df)
+ >>> print(result_custom)
+ power__W__building temp__°C__room prev_power__W__building
+ 2024-01-01 01:00:00+00:00 200.0 21.0 100.0
+ 2024-01-01 02:00:00+00:00 300.0 22.0 200.0
+ 2024-01-01 03:00:00+00:00 400.0 23.0 300.0
+ 2024-01-01 04:00:00+00:00 500.0 24.0 400.0

+ Notes
+ -----
+ - The transformer preserves the original features and adds new lagged versions
+ - Lagged features are created by shifting the index and concatenating with
+ the original data
+ - When drop_resulting_nan=True, rows with NaN values in lagged features
+ are removed from the output
+ - The feature_marker parameter allows for custom naming of lagged features
+ - The transformer supports both positive (past) and negative (future) lags
+
+ Returns
+ -------
+ pd.DataFrame
+ The input DataFrame with additional lagged features. The original
+ features are preserved, and new lagged features are added with the
+ specified prefix.
"""

def __init__(

@@ -849,53 +1423,72 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class GaussianFilter1D(BaseProcessing):
- """
- A transformer that applies a 1D Gaussian filter to a Pandas DataFrame.
- The Gaussian filter is a widely used smoothing filter that effectively
- reduces the high-frequency noise in an input signal.
+ """A transformer that applies a 1D Gaussian filter to smooth time series data.
+
+ This transformer applies a one-dimensional Gaussian filter to each column of
+ the input DataFrame, effectively reducing high-frequency noise while preserving
+ the overall trend and important features of the time series.

Parameters
----------
sigma : float, default=5
- Standard deviation of the Gaussian kernel.
- In practice, the value of sigma determines the level of smoothing
- applied to the input signal. A larger value of sigma results in a
- smoother output signal, while a smaller value results in less
- smoothing. However, too large of a sigma value can result in the
- loss of important features or details in the input signal.
+ Standard deviation of the Gaussian kernel. Controls the level of smoothing:
+ - Larger values result in smoother output but may lose fine details
+ - Smaller values preserve more details but may not reduce noise effectively
+ - Must be positive

mode : str, default='nearest'
- Points outside the boundaries of the input are filled according to
- the given mode. The default, 'nearest' mode is used to set the values
- beyond the edge of the array equal to the nearest edge value.
- This avoids introducing new values into the smoothed signal that
- could bias the result. Using 'nearest' mode can be particularly useful
- when smoothing a signal with a known range or limits, such as a time
- series with a fixed start and end time.
- - truncate : float, default=4. - The filter will ignore values outside the range - (mean - truncate * sigma) to (mean + truncate * sigma). - The truncate parameter is used to define the length of the filter - kernel, which determines the degree of smoothing applied to the input - signal. + How to handle values outside the input boundaries. Options are: + - 'nearest': Use the nearest edge value (default) + - 'reflect': Reflect values around the edge + - 'mirror': Mirror values around the edge + - 'constant': Use a constant value (0) + - 'wrap': Wrap values around the edge + + truncate : float, default=4.0 + The filter window size in terms of standard deviations. Values outside + the range (mean ± truncate * sigma) are ignored. This parameter: + - Controls the effective size of the filter window + - Affects the computational efficiency + - Must be positive - Attributes - ---------- - columns : list - The column names of the input DataFrame. - index : pandas.Index - The index of the input DataFrame. + Examples + -------- + >>> import pandas as pd + >>> from tide.processing import GaussianFilter1D + >>> # Create sample data with timezone-aware index + >>> dates = pd.date_range(start="2024-01-01", periods=5, freq="1h", tz="UTC") + >>> df = pd.DataFrame( + ... { + ... "power__W__building": [100, 150, 200, 180, 220], + ... "temp__°C__room": [20, 21, 22, 21, 23], + ... }, + ... index=dates, + ... ) + >>> # Apply Gaussian filter with default settings + >>> smoother = GaussianFilter1D(sigma=2) + >>> result = smoother.fit_transform(df) + >>> print(result) + power__W__building temp__°C__room + 2024-01-01 00:00:00+00:00 130.0 20.0 + 2024-01-01 01:00:00+00:00 149.0 20.0 + 2024-01-01 02:00:00+00:00 169.0 21.0 + 2024-01-01 03:00:00+00:00 187.0 21.0 + 2024-01-01 04:00:00+00:00 201.0 22.0 + + Notes + ----- + - The input DataFrame must have a timezone-aware DatetimeIndex + - The filter is applied independently to each column + - The output maintains the same index and column structure as the input + - The smoothing effect is more pronounced at the edges of the time series - Methods - ------- - get_feature_names_out(input_features=None) - Get output feature names for the transformed data. - fit(X, y=None) - Fit the transformer to the input data. - transform(X, y=None) - Transform the input data by applying the 1D Gaussian filter. + Returns + ------- + pd.DataFrame + The smoothed DataFrame with the same structure as the input. Each column + has been smoothed using the 1D Gaussian filter with the specified parameters. """ def __init__(self, sigma=5, mode="nearest", truncate=4.0): @@ -917,20 +1510,83 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class CombineColumns(BaseProcessing): - """ - A class that combines multiple columns in a pandas DataFrame using mean, sum, - average, or dot. Original columns can be dropped. + """A transformer that combines multiple columns in a DataFrame using various aggregation methods. + + This transformer creates a new column by combining values from multiple input columns + using specified aggregation methods. It supports weighted and unweighted combinations, + and can optionally drop the original columns. Parameters ---------- - function (str): The name of the function to apply for combining columns. - Valide names are "mean", "sum", "average", "dot". - weights (list[float | int] or np.ndarray, optional): Weights to apply when - using 'average' or 'dot'. Ignored for functions like 'mean' or 'sum'. 
- drop_columns (bool): If True, the original columns used for combining will - be dropped from the DataFrame. If False, they will be retained. - result_column_name (str): The name of the new column that will store the - combined values. + function : str + The aggregation function to use for combining columns. Valid options are: + - "mean": Arithmetic mean of the columns + - "sum": Sum of the columns + - "average": Weighted average of the columns (requires weights) + - "dot": Dot product of the columns with weights (weighted sum) + + weights : list[float | int] | np.ndarray, default=None + Weights to apply when using 'average' or 'dot' functions. Must be provided + for these functions and must match the number of columns. Ignored for + 'mean' and 'sum' functions. + + drop_columns : bool, default=False + Whether to drop the original columns after combining them. If True, only + the combined result column is returned. + + result_column_name : str, default="combined" + The name for the new column containing the combined values. + + Examples + -------- + >>> import pandas as pd + >>> from tide.processing import CombineColumns + >>> # Create sample data with timezone-aware index + >>> dates = pd.date_range(start="2024-01-01", periods=3, freq="1h", tz="UTC") + >>> df = pd.DataFrame( + ... { + ... "power__W__building1": [100, 200, 300], + ... "power__W__building2": [150, 250, 350], + ... "power__W__building3": [200, 300, 400], + ... }, + ... index=dates, + ... ) + >>> # Combine columns using mean + >>> combiner = CombineColumns(function="mean", result_column_name="power__W__avg") + >>> result = combiner.fit_transform(df) + >>> print(result) + power__W__building1 power__W__building2 power__W__building3 power__W__avg + 2024-01-01 00:00:00+00:00 100.0 150.0 200.0 150.0 + 2024-01-01 01:00:00+00:00 200.0 250.0 300.0 250.0 + 2024-01-01 02:00:00+00:00 300.0 350.0 400.0 350.0 + >>> # Combine columns using weighted average + >>> combiner_weighted = CombineColumns( + ... function="average", + ... weights=[0.5, 0.3, 0.2], + ... result_column_name="power__W__weighted", + ... drop_columns=True, + ... ) + >>> result_weighted = combiner_weighted.fit_transform(df) + >>> print(result_weighted) + power__W__weighted + 2024-01-01 00:00:00+00:00 135.0 + 2024-01-01 01:00:00+00:00 235.0 + 2024-01-01 02:00:00+00:00 335.0 + + Notes + ----- + - The input DataFrame must have a timezone-aware DatetimeIndex + - Weights must be provided when using 'average' or 'dot' functions + - Weights are ignored for 'mean' and 'sum' functions + - The number of weights must match the number of columns being combined + - When drop_columns=True, only the combined result column is returned + - The transformer preserves the index of the input DataFrame + Returns + ------- + pd.DataFrame + The DataFrame with the combined column added. If drop_columns=True, + only the combined column is returned. The output maintains the same + index as the input. """ def __init__( @@ -977,56 +1633,57 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class STLFilter(BaseProcessing): - """ - A transformer that applies Seasonal-Trend decomposition using LOESS (STL) - to a pandas DataFrame, and filters outliers based on an absolute threshold - from the residual (error) component of the decomposition. - Detected outliers are replaced with NaN values. + """A transformer that applies Seasonal-Trend decomposition using LOESS (STL) + to detect and filter outliers in time series data. 
+ + This transformer decomposes each column of the input DataFrame into seasonal, + trend, and residual components using STL decomposition. It then identifies + outliers in the residual component based on an absolute threshold and replaces + them with NaN values. Parameters ---------- - period : int | str | timedelta + period : int | str | dt.timedelta The periodicity of the seasonal component. Can be specified as: - - an integer for the number of observations in one seasonal cycle, - - a string representing the time frequency (e.g., '15T' for 15 minutes), - - a timedelta object representing the duration of the seasonal cycle. + - An integer for the number of observations in one seasonal cycle + - A string representing the time frequency (e.g., '15T' for 15 minutes) + - A timedelta object representing the duration of the seasonal cycle - trend : int | str | dt.timedelta, optional - The length of the trend smoother. Must be odd and larger than season - Statsplot indicate it is usually around 150% of season. - Strongly depends on your time series. + trend : int | str | dt.timedelta + The length of the trend smoother. Must be odd and larger than season. + Typically set to around 150% of the seasonal period. The choice depends + on the characteristics of your time series. absolute_threshold : int | float The threshold for detecting anomalies in the residual component. - Any value in the residual that exceeds this threshold (absolute value) - is considered an anomaly and replaced by NaN. + Any value in the residual that exceeds this threshold (in absolute value) + is considered an anomaly and replaced by NaN. - seasonal : int | str | timedelta, optional + seasonal : int | str | dt.timedelta, default=None The length of the smoothing window for the seasonal component. If not provided, it is inferred based on the period. Must be an odd integer if specified as an int. Can also be specified as a string representing a time frequency or a timedelta object. - stl_additional_kwargs : dict[str, float], optional + stl_additional_kwargs : dict[str, float], default=None Additional keyword arguments to pass to the STL decomposition. - Methods - ------- - fit(X, y=None) - Stores the columns and index of the input DataFrame but does not change - the data. The method is provided for compatibility with the - scikit-learn pipeline. - transform(X) - Applies the STL decomposition to each column of the input DataFrame `X` - and replaces outliers detected in the residual component with NaN values. - The outliers are determined based on the provided `absolute_threshold`. + Notes + ----- + - The STL decomposition is applied independently to each column + - Outliers are detected based on the residual component of the decomposition + - Detected outliers are replaced with NaN values + - The trend parameter should be larger than the period parameter + - The seasonal parameter is optional and defaults to an inferred value + - The transformer preserves the index and column structure of the input Returns ------- pd.DataFrame - The transformed DataFrame with outliers replaced by NaN. + The input DataFrame with outliers replaced by NaN values. The output + maintains the same index and column structure as the input. """ def __init__( @@ -1067,44 +1724,118 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class FillGapsAR(BaseFiller, BaseProcessing): """ - A class designed to identify gaps in time series data and fill them using - a specified model. 
+
+ A transformer that fills gaps in time series data using autoregressive models.
+
+ This transformer identifies and fills gaps in time series data using a specified
+ model (e.g., Prophet). The filling process depends on the `recursive_fill` parameter:
+
+ When recursive_fill=True:
+ 1. Identifies gaps in the data and filters them based on size thresholds
+ 2. Uses the largest continuous block of valid data to fit the model
+ 3. Fills neighboring gaps using backcasting or forecasting
+ 4. Optionally handles high-frequency data by:
+ - Resampling to a larger timestep for better pattern recognition
+ - Performing predictions at the resampled timestep
+ - Using linear interpolation to restore original resolution
+ 5. Repeats steps 2-4 until no more gaps remain
+
+ When recursive_fill=False:
+ 1. Identifies gaps in the data and filters them based on size thresholds
+ 2. Uses the entire dataset to fit the model
+ 3. Fills all gaps in a single pass using the fitted model
+ 4. Optionally handles high-frequency data as described above
- 1- The class identified the gaps to fill and filter them using upper and lower gap
- thresholds.
- 2- The biggest group of valid data is identified and is used to fit the model.
- 3- The neighboring gaps are filled using backcasting or forecasting.
- 4- OPTIONAL When the data's timestep is too short compared to the periodic behavior
- (e.g., 5-min data for a 24h pattern):
- - Resample data to a larger timestep
- - Perform predictions at the resampled timestep
- - Use linear interpolation to restore original data resolution
+ Parameters
+ ----------
+ model_name : str, default="Prophet"
+ The name of the model to use for gap filling. Currently supports "Prophet" and "STL".
+ Note: STL model requires recursive_fill=True as it cannot handle NaN values.
+ model_kwargs : dict, default={}
+ Additional keyword arguments to pass to the model during initialization.
- The process is repeated at step 2 until there are no more gaps to fill
+ gaps_lte : str | pd.Timedelta | dt.timedelta, default=None
+ Upper threshold for gap size. Gaps larger than this will not be filled.
+ Can be a string (e.g., "1D"), pandas Timedelta, or datetime timedelta.
- Parameters
- ----------
- model_name : str, optional
- The name of the model to be used for filling gaps, by default "STL".
- It must be a key of MODEL_MAP
- model_kwargs : dict, optional
- A dictionary containing the arguments of the model.
- lower_gap_threshold : str or datetime.datetime, optional
- The lower threshold for the size of gaps to be considered, by default None.
- upper_gap_threshold : str or datetime.datetime, optional
- The upper threshold for the size of gaps to be considered, by default None.
- resample_at_td: str or time delta, optinal
- The time delta to resample fitting data before prediction
+ gaps_gte : str | pd.Timedelta | dt.timedelta, default=None
+ Lower threshold for gap size. Gaps smaller than this will not be filled.
+ Can be a string (e.g., "1h"), pandas Timedelta, or datetime timedelta.
- Attributes
- ----------
- model_ : callable
- The predictive model class used to fill gaps, determined by `model_name`.
- features_ : list
- The list of feature columns present in the data.
- index_ : pd.Index
- The index of the data passed during the `fit` method.
+ resample_at_td : str | timedelta | pd.Timedelta, default=None
+ Optional resampling period for high-frequency data. If provided, data will be
+ resampled to this frequency before model fitting and prediction.
+
+ recursive_fill : bool, default=False
+ Whether to recursively fill gaps until no more gaps remain. If False, only
+ performs one pass of gap filling. Must be True when using STL model.
+
+ Examples
+ --------
+ >>> import pandas as pd
+ >>> import numpy as np
+ >>> from tide.processing import FillGapsAR
+ >>> # Create sample data with gaps
+ >>> dates = pd.date_range(start="2024-01-01", periods=24, freq="1h", tz="UTC")
+ >>> df = pd.DataFrame(
+ ... {
+ ... "power__W__building": [
+ ... 100,
+ ... np.nan,
+ ... np.nan,
+ ... 180,
+ ... 220,
+ ... 190,
+ ... np.nan,
+ ... 230,
+ ... 180,
+ ... 160,
+ ... 140,
+ ... 120,
+ ... 110,
+ ... 130,
+ ... 150,
+ ... 170,
+ ... 190,
+ ... 210,
+ ... 230,
+ ... 220,
+ ... 200,
+ ... 180,
+ ... 160,
+ ... 140,
+ ... ]
+ ... },
+ ... index=dates,
+ ... )
+ >>> # Fill gaps using Prophet model (non-recursive)
+ >>> filler = FillGapsAR(
+ ... model_name="Prophet", gaps_lte="1D", gaps_gte="1h", resample_at_td="1h"
+ ... )
+ >>> result = filler.fit_transform(df)
+ >>> # Fill gaps using STL model (recursive required)
+ >>> filler = FillGapsAR(
+ ... model_name="STL",
+ ... gaps_lte="1D",
+ ... gaps_gte="1h",
+ ... recursive_fill=True,  # Required for STL
+ ... )
+ >>> result = filler.fit_transform(df)
+
+ Notes
+ -----
+ - Gaps are filled independently for each column
+ - For high-frequency data, resampling can improve pattern recognition
+ - When recursive_fill=True, the model is fitted on the largest continuous block
+ of valid data for each gap
+ - When recursive_fill=False, the model is fitted on the entire dataset
+ - STL model requires recursive_fill=True as it cannot handle NaN values
+ - Prophet model requires additional dependencies (prophet package)
+
+ Returns
+ -------
+ pd.DataFrame
+ DataFrame with gaps filled using the specified model. The output maintains
+ the same structure and timezone information as the input.
"""

def __init__(

@@ -1226,45 +1957,68 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

class ExpressionCombine(BaseProcessing):
- """
- Performs specified operations on selected columns, creating a new column
- based on the provided expression.
- Useful for aggregation in a single column, or physical expression.
- The transformer can also optionally drop the columns used in the expression
- after computation.
+ """A transformer that combines DataFrame columns using a mathematical expression.
+
+ This transformer evaluates a mathematical expression using specified columns from a DataFrame,
+ creating a new column with the result. It supports both simple aggregations and complex
+ physical expressions, with the option to drop the source columns after computation.

Parameters
----------
columns_dict : dict[str, str]
- A dictionary mapping variable names (as used in the expression) to the
- column names in the X DataFrame. Keys are variable names in the expression,
- and values are the corresponding column names in the DataFrame.
+ Dictionary mapping expression variables to DataFrame column names.
+ Keys are the variable names used in the expression, and values are the
+ corresponding column names in the DataFrame.

expression : str
- A mathematical expression in string format, which will be evaluated using the
- specified columns from the DataFrame. Variables in the expression should
- match the keys in `variables_dict`.
+ Mathematical expression to evaluate, using variables defined in columns_dict.
+ The expression should be a valid Python mathematical expression that can be
+ evaluated using pandas.eval().
result_column_name : str - Name of the new column in which the result of the evaluated expression - will be stored. + Name of the new column that will contain the evaluated expression result. + Must not already exist in the DataFrame. drop_columns : bool, default=False - If True, the columns used in the calculation will be dropped - from the resulting DataFrame after the transformation. + Whether to drop the source columns used in the expression after computation. + If True, only the result column and other non-source columns are kept. + + Attributes + ---------- + feature_names_out_ : list[str] + List of column names in the transformed DataFrame. If drop_columns is True, + excludes the source columns used in the expression. + + Raises + ------ + ValueError + If result_column_name already exists in the DataFrame. Examples -------- - combiner = Combiner( - columns_dict={ - "T1": "Tin__°C__building", - "T2": "Text__°C__outdoor", - "m": "mass_flwr__m3/h__hvac", - }, - expression="(T1 - T2) * m * 1004 * 1.204", - result_column_name="loss_ventilation__J__hvac", - drop_columns = True - ) + >>> from tide import ExpressionCombine + >>> import pandas as pd + >>> # Create sample data + >>> df = pd.DataFrame( + ... { + ... "Tin__°C__building": [20, 21, 22], + ... "Text__°C__outdoor": [10, 11, 12], + ... "mass_flwr__m3/h__hvac": [1, 2, 3], + ... } + ... ) + >>> # Calculate ventilation losses + >>> combiner = ExpressionCombine( + ... columns_dict={ + ... "T1": "Tin__°C__building", + ... "T2": "Text__°C__outdoor", + ... "m": "mass_flwr__m3/h__hvac", + ... }, + ... expression="(T1 - T2) * m * 1004 * 1.204", + ... result_column_name="loss_ventilation__J__hvac", + ... drop_columns=True, + ... ) + >>> # Transform the data + >>> result = combiner.fit_transform(df) """ def __init__( @@ -1301,47 +2055,83 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class FillOikoMeteo(BaseFiller, BaseOikoMeteo, BaseProcessing): - """ - A processor that fills gaps using meteorological data from the Oikolab API. + """A transformer that fills data gaps using meteorological data from the Oikolab API. + + This transformer identifies gaps in time series data and fills them with corresponding + meteorological data retrieved from the Oikolab API. It supports filtering gaps based on + their size and can handle different data frequencies through automatic interpolation + or resampling. + + Parameters + ---------- + gaps_lte : str | pd.Timedelta | dt.timedelta, default=None + Maximum gap size to fill. Gaps larger than this will be ignored. + Can be specified as a string (e.g., "24h") or timedelta object. - This class extends BaseFiller to provide functionality for - filtering gaps based onthere size. It fills them with corresponding - meteorological data retrieved from the Oikolab API. + gaps_gte : str | pd.Timedelta | dt.timedelta, default=None + Minimum gap size to fill. Gaps smaller than this will be ignored. + Can be specified as a string (e.g., "1h") or timedelta object. - Attributes: - ----------- - lat : float + lat : float, default=43.47 Latitude of the location for which to retrieve meteorological data. - lon : float + + lon : float, default=-1.51 Longitude of the location for which to retrieve meteorological data. - param_map : dict[str, str] - Mapping of input columns to Oikolab API parameters. 
Oikolab parameters are : - 'temperature', 'dewpoint_temperature', 'mean_sea_level_pressure', - 'wind_speed', '100m_wind_speed', 'relative_humidity', - 'surface_solar_radiation', 'direct_normal_solar_radiation', - 'surface_diffuse_solar_radiation', 'surface_thermal_radiation', - 'total_cloud_cover', 'total_precipitation' - model : str - The meteorological model to use for data retrieval (default is "era5"). - env_oiko_api_key : str - The name of the environement variable that holds the Oikolab API key - (set during fitting). - - Example: + + columns_param_map : dict[str, str], default=None + Mapping of input columns to Oikolab API parameters. If None, all columns + will be filled with temperature data. Available Oikolab parameters are: + - temperature + - dewpoint_temperature + - mean_sea_level_pressure + - wind_speed + - 100m_wind_speed + - relative_humidity + - surface_solar_radiation + - direct_normal_solar_radiation + - surface_diffuse_solar_radiation + - surface_thermal_radiation + - total_cloud_cover + - total_precipitation + + model : str, default="era5" + The meteorological model to use for data retrieval. + + env_oiko_api_key : str, default="OIKO_API_KEY" + Name of the environment variable containing the Oikolab API key. + + Examples -------- - >>> filler = FillOikoMeteo(gaps_gte="1h", gaps_lte="24h", lat=43.47, lon=-1.51) - >>> filler.fit(X) - >>> X_filled = filler.transform(X) + >>> from tide import FillOikoMeteo + >>> import pandas as pd + >>> # Create sample data with gaps + >>> df = pd.DataFrame( + ... { + ... "temperature": [20, None, 22, None, 24], + ... "humidity": [50, None, 55, None, 60], + ... }, + ... index=pd.date_range("2024-01-01", periods=5, freq="H"), + ... ) + >>> # Initialize and fit the transformer + >>> filler = FillOikoMeteo( + ... gaps_gte="1h", + ... gaps_lte="24h", + ... lat=43.47, + ... lon=-1.51, + ... columns_param_map={ + ... "temperature": "temperature", + ... "humidity": "relative_humidity", + ... }, + ... ) + >>> # Transform the data + >>> result = filler.fit_transform(df) - Notes: - ------ - - The class requires an Oikolab API key to be set as an environment - variable env_oiko_api_key. - - If param_map is not provided, all columns will be filled with temperature data. - This dumb behavior ensures the processing object is working with default values - to comply with scikit learn API recomandation. - - The class handles different frequencies of input data, interpolating or - resampling as needed. + Notes + ----- + - Requires an Oikolab API key to be set as an environment variable. + - If columns_param_map is not provided, all columns will be filled with temperature data + to comply with scikit-learn API recommendations. + - Automatically handles different data frequencies through interpolation or resampling. """ def __init__( @@ -1453,16 +2243,49 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class AddSolarAngles(BaseProcessing): - """ - Transformer that adds solar elevation and azimuth angle to passed DataFrame. - - Attributes: - lat (float): The latitude of the location in degrees. - lon (float): The longitude of the location in degrees. - data_bloc (str): Identifier for the tide data block. - Default to "OTHER". - data_sub_bloc (str): Identifier for the data sub-block; - Default to "OTHER_SUB_BLOC". + """A transformer that adds solar angles (azimuth and elevation) to a DataFrame. + + This transformer calculates and adds solar azimuth and elevation angles for a given + location and time series. 
-
-    Notes:
-    ------
-    - The class requires an Oikolab API key to be set as an environment
-    variable env_oiko_api_key.
-    - If param_map is not provided, all columns will be filled with temperature data.
-    This dumb behavior ensures the processing object is working with default values
-    to comply with scikit learn API recomandation.
-    - The class handles different frequencies of input data, interpolating or
-    resampling as needed.
+    Notes
+    -----
+    - Requires an Oikolab API key to be set as an environment variable.
+    - If columns_param_map is not provided, all columns will be filled with temperature data
+      to comply with scikit-learn API recommendations.
+    - Automatically handles different data frequencies through interpolation or resampling.
     """

     def __init__(
@@ -1453,16 +2243,49 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

 class AddSolarAngles(BaseProcessing):
-    """
-    Transformer that adds solar elevation and azimuth angle to passed DataFrame.
-
-    Attributes:
-        lat (float): The latitude of the location in degrees.
-        lon (float): The longitude of the location in degrees.
-        data_bloc (str): Identifier for the tide data block.
-            Default to "OTHER".
-        data_sub_bloc (str): Identifier for the data sub-block;
-            Default to "OTHER_SUB_BLOC".
+    """A transformer that adds solar angles (azimuth and elevation) to a DataFrame.
+
+    This transformer calculates and adds solar azimuth and elevation angles for a given
+    location and time series. The angles are calculated using the Astronomical Almanac's
+    algorithm (1950-2050) as described in Michalsky (1988) and subsequent papers.
+
+    Parameters
+    ----------
+    lat : float, default=43.47
+        Latitude of the location in decimal degrees.
+
+    lon : float, default=-1.51
+        Longitude of the location in decimal degrees.
+
+    data_bloc : str, default="OTHER"
+        Name of the data block to store the solar angles.
+
+    data_sub_bloc : str, default="OTHER_SUB_BLOC"
+        Name of the sub-block to store the solar angles.
+
+    Examples
+    --------
+    >>> from tide import AddSolarAngles
+    >>> import pandas as pd
+    >>> # Create sample data with datetime index
+    >>> df = pd.DataFrame(
+    ...     {"temperature": [20, 21, 22]},
+    ...     index=pd.date_range("2024-01-01", periods=3, freq="h"),
+    ... )
+    >>> # Add solar angles
+    >>> transformer = AddSolarAngles(
+    ...     lat=43.47, lon=-1.51, data_bloc="SOLAR", data_sub_bloc="ANGLES"
+    ... )
+    >>> # Transform the data
+    >>> result = transformer.fit_transform(df)
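+    >>> # Two angle columns are appended to the input frame (a sketch; per the
+    >>> # Notes below, exactly two columns, solar azimuth and elevation, are
+    >>> # added under the SOLAR/ANGLES blocks given above)
+    >>> result.shape
+    (3, 3)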
+
+    Notes
+    -----
+    - Requires a DataFrame with a DateTimeIndex.
+    - Adds two new columns: solar_azimuth and solar_elevation.
+    - Uses the Astronomical Almanac's algorithm for solar position calculations.
+    - Valid for years 1950-2050.
     """

     def __init__(
@@ -1498,31 +2321,125 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

 class ProjectSolarRadOnSurfaces(BaseProcessing):
     """
-    Project solar radiation on various surfaces with specific orientations and tilts.
-
-    Attributes:
-        bni_column_name (str): Name of the column containing beam normal irradiance
-            (BNI) data.
-        dhi_column_name (str): Name of the column containing diffuse horizontal
-            irradiance (DHI) data.
-        ghi_column_name (str): Name of the column containing global horizontal
-            irradiance (GHI) data.
-        lat (float): Latitude of the location (default is 43.47).
-        lon (float): Longitude of the location (default is -1.51).
-        surface_azimuth_angles (int | float | list[int | float]): Azimuth angles of
-            the surfaces in degrees east of north (default is 180.0,
-            which corresponds to a south-facing surface in the northern hemisphere).
-        surface_tilt_angle (float | list[float]): Tilt angles of the surfaces in
-            degrees (default is 35.0). 0 is façing ground.
-        albedo (float): Ground reflectivity or albedo (default is 0.25).
-        surface_name (str | list[str]): Names for the surfaces
-            (default is "az_180_tilt_35").
-        data_bloc (str): Tide bloc name Default is "OTHER".
-        data_sub_bloc (str): Tide sub_bloc_name default is "OTHER_SUB_BLOC".
-
-    Raises:
-        ValueError: If the number of azimuth angles, tilt angles, and surface names
-        do not match.
+    A transformer that projects solar radiation onto surfaces with specific orientations and tilts.
+
+    This transformer calculates the total solar radiation incident on surfaces by combining:
+    - Direct beam radiation (projected onto the tilted surface)
+    - Diffuse sky radiation (from the sky dome)
+    - Ground-reflected radiation (albedo effect)
+
+    Parameters
+    ----------
+    bni_column_name : str
+        Name of the column containing beam normal irradiance (BNI) data in W/m².
+        This is the direct solar radiation perpendicular to the sun's rays.
+
+    dhi_column_name : str
+        Name of the column containing diffuse horizontal irradiance (DHI) data in W/m².
+        This is the scattered solar radiation from the sky dome.
+
+    ghi_column_name : str
+        Name of the column containing global horizontal irradiance (GHI) data in W/m².
+        This is the total solar radiation on a horizontal surface.
+
+    lat : float, default=43.47
+        Latitude of the location in degrees. Positive for northern hemisphere.
+
+    lon : float, default=-1.51
+        Longitude of the location in degrees. Positive for eastern hemisphere.
+
+    surface_azimuth_angles : int | float | list[int | float], default=180.0
+        Azimuth angles of the surfaces in degrees east of north.
+        - 0°: North-facing
+        - 90°: East-facing
+        - 180°: South-facing
+
+    surface_tilt_angle : float | list[float], default=35.0
+        Tilt angles of the surfaces in degrees from horizontal.
+        - 0°: Horizontal surface
+        - 90°: Vertical surface
+        - 180°: Horizontal surface facing down
+
+    albedo : float, default=0.25
+        Ground reflectivity or albedo coefficient.
+        Typical values:
+        - 0.1-0.2: Dark surfaces (asphalt, forest)
+        - 0.2-0.3: Grass, soil
+        - 0.3-0.4: Light surfaces (concrete, sand)
+        - 0.4-0.5: Snow
+        - 0.8-0.9: Fresh snow
+
+    surface_name : str | list[str], default="az_180_tilt_35"
+        Names for the output columns following Tide naming convention.
+        Example: "south_facing_35deg" will create
+        "south_facing_35deg__W/m²__OTHER__OTHER_SUB_BLOC"
+
+    data_bloc : str, default="OTHER"
+        Tide bloc name for the output columns.
+
+    data_sub_bloc : str, default="OTHER_SUB_BLOC"
+        Tide sub_bloc name for the output columns.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from tide.processing import ProjectSolarRadOnSurfaces
+
+    >>> # Create a DataFrame with solar radiation data and timezone-aware index
+    >>> dates = pd.date_range(start="2024-01-01", periods=3, freq="1h", tz="UTC")
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "bni__W/m²__outdoor__meteo": [800, 900, 1000],  # Direct normal irradiance
+    ...         "dhi__W/m²__outdoor__meteo": [200, 250, 300],  # Diffuse horizontal irradiance
+    ...         "ghi__W/m²__outdoor__meteo": [600, 700, 800],  # Global horizontal irradiance
+    ...     },
+    ...     index=dates,
+    ... )

+    >>> # Project radiation on a south-facing surface tilted at 35 degrees
+    >>> projector = ProjectSolarRadOnSurfaces(
+    ...     bni_column_name="bni__W/m²__outdoor__meteo",
+    ...     dhi_column_name="dhi__W/m²__outdoor__meteo",
+    ...     ghi_column_name="ghi__W/m²__outdoor__meteo",
+    ...     surface_azimuth_angles=180.0,  # South-facing
+    ...     surface_tilt_angle=35.0,  # 35-degree tilt
+    ...     surface_name="south_facing_35deg",
+    ...     data_bloc="SOLAR",
+    ...     data_sub_bloc="ROOF",
+    ... )
+    >>> result = projector.fit_transform(df)
+    >>> print(result)
+                               bni__W/m²__outdoor__meteo  dhi__W/m²__outdoor__meteo  ghi__W/m²__outdoor__meteo  south_facing_35deg__W/m²__SOLAR__ROOF
+    2024-01-01 00:00:00+00:00                      800.0                      200.0                      600.0                                  850.5
+    2024-01-01 01:00:00+00:00                      900.0                      250.0                      700.0                                  950.2
+    2024-01-01 02:00:00+00:00                     1000.0                      300.0                      800.0                                 1050.8
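+
+    >>> # A hypothetical multi-surface call (a sketch): the azimuth, tilt, and
+    >>> # name lists must have matching lengths, otherwise a ValueError is
+    >>> # raised. "ENVELOPE" is an arbitrary sub_bloc name for the example.
+    >>> projector_multi = ProjectSolarRadOnSurfaces(
+    ...     bni_column_name="bni__W/m²__outdoor__meteo",
+    ...     dhi_column_name="dhi__W/m²__outdoor__meteo",
+    ...     ghi_column_name="ghi__W/m²__outdoor__meteo",
+    ...     surface_azimuth_angles=[90.0, 180.0],  # east- and south-facing
+    ...     surface_tilt_angle=[90.0, 35.0],  # vertical wall, tilted roof
+    ...     surface_name=["east_wall_90deg", "south_roof_35deg"],
+    ...     data_bloc="SOLAR",
+    ...     data_sub_bloc="ENVELOPE",
+    ... )
+    >>> result_multi = projector_multi.fit_transform(df)  # one new column per surface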
""" def __init__( @@ -1590,24 +2507,96 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class FillOtherColumns(BaseFiller, BaseProcessing): - """ - Fill gaps in specified columns using corresponding values from - other columns + """A transformer that fills missing values in specified columns using values + from corresponding filler columns. + + This transformer is useful when you have multiple columns measuring the + same quantity (e.g., temperature from different sensors) and want to use one + column to fill gaps in another. Or fill gaps with computed values, for example + solar radiations on a pyranometer from projected radiations based on + meteo services. Parameters ---------- - gaps_lte : str | pd.Timedelta | dt.timedelta, optional - Fill gaps of duration less than or equal to gaps_lte. - If None, no upper limit is applied. - gaps_gte : str | pd.Timedelta | dt.timedelta, optional - Fill gaps of duration greater than or equal to gaps_gte. - If None, no lower limit is applied. - columns_map : dict[str, str], optional - A mapping of target columns to the columns that will be used for filling - their gaps. Keys represent the columns with gaps, and values represent the - corresponding filler columns. + gaps_lte : str | pd.Timedelta | dt.timedelta, optional (default=None) + Only fill gaps with duration less than or equal to this value. + + gaps_gte : str | pd.Timedelta | dt.timedelta, optional (default=None) + Only fill gaps with duration greater than or equal to this value. + + columns_map : dict[str, str], optional (default={}) + A mapping of target columns to their corresponding filler columns. + Keys are the columns with gaps to be filled. + Values are the columns to use for filling the gaps. + Example: {'temp__°C__room1': 'temp__°C__room2'} + drop_filling_columns : bool, default=False - If True, removes the filler columns after filling the gaps. + Whether to remove the filler columns after filling the gaps. + If True, only the target columns remain in the output. + + Examples + -------- + >>> import pandas as pd + >>> import numpy as np + >>> # Create DataFrame with DateTimeIndex + >>> dates = pd.date_range( + ... start="2024-01-01 00:00:00", end="2024-01-01 00:04:00", freq="1min" + ... ).tz_localize("UTC") + >>> df = pd.DataFrame( + ... { + ... "temp__°C__room1": [20, np.nan, np.nan, 23, 24], + ... "temp__°C__room2": [21, 22, 22, 22, 23], + ... "humid__%__room1": [45, np.nan, 47, np.nan, 49], + ... "humid__%__room2": [46, 46, 48, 48, 50], + ... }, + ... index=dates, + ... ) + >>> # Fill gaps in room1 using room2 data + >>> filler = FillOtherColumns( + ... columns_map={ + ... "temp__°C__room1": "temp__°C__room2", + ... "humid__%__room1": "humid__%__room2", + ... } + ... ) + >>> result = filler.fit_transform(df) + >>> print(result) + temp__°C__room1 temp__°C__room2 humid__%__room1 humid__%__room2 + 2024-01-01 00:00:00+00:00 20.0 21.0 45.0 46.0 + 2024-01-01 00:01:00+00:00 22.0 22.0 46.0 46.0 + 2024-01-01 00:02:00+00:00 22.0 22.0 47.0 48.0 + 2024-01-01 00:03:00+00:00 23.0 22.0 48.0 48.0 + 2024-01-01 00:04:00+00:00 24.0 23.0 49.0 50.0 + >>> # Fill gaps and drop filler columns + >>> filler_drop = FillOtherColumns( + ... columns_map={ + ... "temp__°C__room1": "temp__°C__room2", + ... "humid__%__room1": "humid__%__room2", + ... }, + ... drop_filling_columns=True, + ... 
+    ... )
+    >>> result_drop = filler_drop.fit_transform(df)
+    >>> print(result_drop)
+                               temp__°C__room1  humid__%__room1
+    2024-01-01 00:00:00+00:00             20.0             45.0
+    2024-01-01 00:01:00+00:00             22.0             46.0
+    2024-01-01 00:02:00+00:00             22.0             47.0
+    2024-01-01 00:03:00+00:00             23.0             48.0
+    2024-01-01 00:04:00+00:00             24.0             49.0
+
+    Notes
+    -----
+    - When using gap duration parameters (gaps_lte or gaps_gte), only gaps within
+      the specified time ranges will be filled
+    - The filler columns must contain valid values at the timestamps where
+      the target columns have gaps
+    - If drop_filling_columns is True, the output DataFrame will only contain
+      the target columns with filled gaps
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with gaps filled using values from the specified filler columns.
+        If drop_filling_columns is True, the filler columns are removed from the output.
     """

     def __init__(
@@ -1643,15 +2632,71 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

 class DropColumns(BaseProcessing):
-    """
-    Drop specified columns.
+    """A transformer that removes specified columns from a pandas DataFrame.
+
+    It is particularly useful for data preprocessing when certain columns are
+    no longer needed or for removing intermediate calculation columns.

     Parameters
     ----------
-    columns : str or list[str], optional
+    columns : str | list[str], optional (default=None)
         The column name or a list of column names to be dropped.
-        If None, no columns are dropped.
+        If None, no columns are dropped and the DataFrame is returned unchanged.
+        Example: 'temp__°C' or ['temp__°C', 'humid__%']
+
+    Attributes
+    ----------
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns (input columns minus dropped columns).
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:02:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "temp__°C": [20, 21, 22],
+    ...         "humid__%": [45, 50, 55],
+    ...         "press__Pa": [1000, 1010, 1020],
+    ...     },
+    ...     index=dates,
+    ... )
+    >>> # Drop a single column
+    >>> dropper = DropColumns(columns="temp__°C")
+    >>> result = dropper.fit_transform(df)
+    >>> print(result)
+                               humid__%  press__Pa
+    2024-01-01 00:00:00+00:00      45.0     1000.0
+    2024-01-01 00:01:00+00:00      50.0     1010.0
+    2024-01-01 00:02:00+00:00      55.0     1020.0
+    >>> # Drop multiple columns
+    >>> dropper_multi = DropColumns(columns=["temp__°C", "humid__%"])
+    >>> result_multi = dropper_multi.fit_transform(df)
+    >>> print(result_multi)
+                               press__Pa
+    2024-01-01 00:00:00+00:00     1000.0
+    2024-01-01 00:01:00+00:00     1010.0
+    2024-01-01 00:02:00+00:00     1020.0
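+    >>> # A sketch of the silent-ignore behavior described in the Notes below:
+    >>> # unknown column names are skipped rather than raising, so the frame
+    >>> # comes back with all three columns
+    >>> DropColumns(columns="not_a_column").fit_transform(df).shape
+    (3, 3)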
+
+    Notes
+    -----
+    - If a specified column doesn't exist in the DataFrame, it will be silently
+      ignored
+    - The order of remaining columns is preserved
+    - If no columns are specified (columns=None), the DataFrame is returned
+      unchanged
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with specified columns removed. The output maintains
+        the same DateTimeIndex as the input, with only the specified columns
+        removed.
     """

     def __init__(self, columns: str | list[str] = None):
@@ -1672,15 +2717,69 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame):

 class ReplaceTag(BaseProcessing):
-    """
-    Replaces Tide tag components with new values based on a specified mapping.
-
-    Tags are structured as strings separated by "__", typically following the format
-    "Name__unit__bloc__sub_bloc".
+    """A transformer that replaces components of Tide tag names with new values.

-    Attributes:
-        tag_map (dict[str, str]): A dictionary mapping old tag substrings to new
-        tag substrings.
+    This transformer allows you to selectively replace parts of Tide tag names
+    (components separated by "__") with new values. It is particularly useful
+    for standardizing tag names, updating units, or changing block/sub-block
+    names across multiple columns.

+    Parameters
+    ----------
+    tag_map : dict[str, str], optional (default=None)
+        A dictionary mapping old tag components to new values.
+        Keys are the components to replace, values are their replacements.
+        Example: {'°C': 'K', 'room1': 'room2'}
+        If None, no replacements are made and the DataFrame is returned unchanged.
+
+    Attributes
+    ----------
+    feature_names_in_ : list[str]
+        Names of input columns (set during fit).
+    feature_names_out_ : list[str]
+        Names of output columns with replaced tag components.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> # Create DataFrame with DateTimeIndex
+    >>> dates = pd.date_range(
+    ...     start="2024-01-01 00:00:00", end="2024-01-01 00:02:00", freq="1min"
+    ... ).tz_localize("UTC")
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "temp__°C__room1__north": [20, 21, 22],
+    ...         "humid__%__room1__north": [45, 50, 55],
+    ...         "press__Pa__room1__north": [1000, 1010, 1020],
+    ...     },
+    ...     index=dates,
+    ... )
+    >>> # Replace room1 with room2 and °C with K
+    >>> replacer = ReplaceTag(
+    ...     tag_map={
+    ...         "room1": "room2",
+    ...         "°C": "K",  # renames the unit tag only; values are not converted
+    ...     }
+    ... )
+    >>> result = replacer.fit_transform(df)
+    >>> print(result)
+                               temp__K__room2__north  humid__%__room2__north  press__Pa__room2__north
+    2024-01-01 00:00:00+00:00                   20.0                    45.0                   1000.0
+    2024-01-01 00:01:00+00:00                   21.0                    50.0                   1010.0
+    2024-01-01 00:02:00+00:00                   22.0                    55.0                   1020.0
+
+    Notes
+    -----
+    - Tide tags follow the format "name__unit__block__sub_block"
+    - The transformer preserves the order of tag components
+    - Components not specified in tag_map remain unchanged
+    - If tag_map is None, the DataFrame is returned unchanged
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with updated column names based on the tag replacements.
+        The output maintains the same DateTimeIndex and data values as the input.
     """

     def __init__(self, tag_map: dict[str, str] = None):