diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e1b539d..2c6d39b 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -23,7 +23,8 @@ jobs: - name: Install dependencies run: | - pip install sphinx sphinx-rtd-theme sphinx-autodoc-typehints sphinx-copybutton + pip install -r requirements-docs.txt + pip install -r requirements.txt - name: Create VERSION file run: echo "${GITHUB_REF##*/}" > "${PKG_NAME}/VERSION" diff --git a/README.md b/README.md index be2e1f0..4d04c94 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,15 @@ For an example using one of the CARE2Compare datasets, run: For more information, have a look at the notebook [Quick Fault Detection](./notebooks/Example%20-%20Quick%20Fault%20Detection.ipynb) -## Fault detection in 4 lines of code +## Fault detection quickstart ```python from energy_fault_detector import FaultDetector, Config +from energy_fault_detector.config import generate_quickstart_config +# 1) Generate and save a base config (YAML) +generate_quickstart_config(output_path="base_config.yaml") +# 2) Train and predict using the generated config fault_detector = FaultDetector(config=Config('base_config.yaml')) model_data = fault_detector.train(sensor_data=sensor_data, normal_index=normal_index) results = fault_detector.predict(sensor_data=test_sensor_data) @@ -84,10 +88,8 @@ All contributions, bug reports, bug fixes, documentation improvements, enhanceme 2. Unification, standardisation and generic improvements 1. Additional options for all autoencoders (e.g. drop out, regularization) 2. Data preparation (e.g. extend imputation strategies). - 3. Download method for the Care2Compare class. - 3. Unify default value settings. - 4. No or low configuration - 5. Upgrade to Keras 3.0 + 3. No or low configuration need (e.g. use defaults where possible). + 4. Upgrade to Keras 3.0 3. Root cause analysis expansion 1. integrate SHAP and possibly other XAI-methods. diff --git a/docs/advanced_config.yaml b/docs/advanced_config.yaml new file mode 100644 index 0000000..ac25ad0 --- /dev/null +++ b/docs/advanced_config.yaml @@ -0,0 +1,111 @@ +train: + # clip training data to remove outliers (only applied for training) + data_clipping: # (optional) if not specified, not applied. 
+ lower_percentile: 0.01 + upper_percentile: 0.99 + # Choose one of: + # features_to_exclude: + # - do_not_clip_this_feature + # features_to_clip: + # - clip_only_this_feature + + data_preprocessor: + steps: + # Replace consecutive duplicate 0-values with NaN + - name: duplicate_to_nan + params: + value_to_replace: 0 + n_max_duplicates: 6 + features_to_exclude: + - do_not_replace_value_with_nan_for_this_feature + # Normalize counters to differences (configure your counter columns) + # If needed, you can create multiple counter_diff_transformer steps with different settings for different counters + - name: counter_diff_transformer + step_name: counter_diff_energy + params: + counters: + - energy_total_kwh + compute_rate: false + reset_strategy: zero + fill_first: nan + # Column selection: drop columns where > 20% is missing and exclude specific features + - name: column_selector + params: + max_nan_frac_per_col: 0.20 + features_to_exclude: + - feature1 + - feature2 + # Alternatively, keep only selected features: + # features_to_select: + # - temp_outdoor + # - flow + # - power + # Filter low unique value features or high-zero-fraction columns + - name: low_unique_value_filter + params: + min_unique_value_count: 2 + max_col_zero_frac: 0.99 + # Transform angles to sin/cos + - name: angle_transformer + params: + angles: + - angle1 + - angle2 + # Imputer (explicit; will be auto-inserted if omitted) + - name: simple_imputer + params: + strategy: mean + # Scaler (choose one; StandardScaler is auto-added by default if omitted) + - name: standard_scaler + params: + with_mean: true + with_std: true + + data_splitter: + # How to split data in train and validation sets for the autoencoder + type: sklearn + validation_split: 0.2 + shuffle: true # false by default (last part of the data is taken as validation data in this case) + # or block splitting, 4 weeks training, 1 week validation + # type: DataSplitter + # train_block_size: 4032 + # val_block_size: 1008 + + autoencoder: + name: MultilayerAutoencoder + params: + batch_size: 128 + # Use a ExponentialDecay schedule for the learning rate: + learning_rate: 0.001 # starting point + decay_rate: 0.99 + decay_steps: 100000 + # Set early stopping with max 1000 epochs, minimal improvement of 1e-4 and patience of 5 epochs + early_stopping: True + min_delta: 0.0001 + patience: 5 + epochs: 1000 + # architecture settings + layers: [200, 100, 50] + code_size: 20 + act: prelu # activation to use for hidden layers + last_act: linear # output layer activation + + anomaly_score: + name: rmse + params: + scale: false + + threshold_selector: + name: fbeta + params: + beta: 0.5 + +root_cause_analysis: + alpha: 0.5 + init_x_bias: recon + num_iter: 1000 + verbose: true + +predict: + criticality: + max_criticality: 144 diff --git a/docs/basic_config.yaml b/docs/basic_config.yaml new file mode 100644 index 0000000..d927187 --- /dev/null +++ b/docs/basic_config.yaml @@ -0,0 +1,45 @@ +train: + # clip training data to remove outliers (only applied for training) + data_clipping: # (optional) if not specified, not applied. 
+ # Use features_to_exclude or features_to_clip: [feature] to skip or to apply to specific features + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + steps: + # This drops features where > 20% is missing + - name: column_selector + params: + max_nan_frac_per_col: 0.2 + # This drops constants by default (controlled by `min_unique_value_count`) + - name: low_unique_value_filter + # SimpleImputer and StandardScaler are always added + + data_splitter: + # How to split data in train and validation sets for the autoencoder + type: sklearn + validation_split: 0.2 + shuffle: true + + autoencoder: + name: default + params: + layers: # Symmetric autoencoder: inputs - 200 - 100 - 50 - 20 - 50 - 100 - 200 - outputs + - 200 + - 100 + - 50 + code_size: 20 # Size of the bottleneck layer + + anomaly_score: + name: rmse + + threshold_selector: + fit_on_val: true + name: quantile + params: + quantile: 0.95 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/docs/conf.py b/docs/conf.py index c4de4e8..7aba3a6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -127,6 +127,11 @@ napoleon_use_param = True napoleon_use_rtype = True +napoleon_type_aliases = { + "Config": "energy_fault_detector.Config", + "FaultDetector": "energy_fault_detector.FaultDetector", +} + # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/configuration.rst b/docs/configuration.rst new file mode 100644 index 0000000..530b5bd --- /dev/null +++ b/docs/configuration.rst @@ -0,0 +1,136 @@ +.. _configuration_guide: + +Configuration +================================ +This page explains how to configure training, prediction, and optional root cause analysis (ARCANA). + +.. contents:: Table of Contents + :depth: 3 + :local: + +Quick start: minimal configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A minimal configuration that clips outliers, imputes missing values, and scales features: + +.. include:: basic_config.yaml + :literal: + +This setup: + +- Applies DataClipper if specified. +- Builds a DataPreprocessor with: + + - ColumnSelector that drops columns with more than 20% NaNs (configurable). + - LowUniqueValueFilter that removes constant features by default (configurable). + - SimpleImputer (mean) and a scaler (StandardScaler by default). If you do not add an imputer/scaler explicitly, + the pipeline ensures mean-imputation and StandardScaler are added. + +- Trains a default autoencoder (with provided architecture, otherwise default values), with an RMSE anomaly score and a + quantile threshold selector. +- Runs ARCANA with provided parameters when calling :py:obj:`FaultDetector.predict(..., root_cause_analysis=True) `. + If not provided, default ARCANA parameters are used (see :py:obj:`ARCANA docs `). + +If you leave out the data_preprocessor configuration (i.e., ``data_preprocessor: {}``), a default preprocessing pipeline +is generated, which drops constant features, features where >5% of the data is missing, imputes remaining missing values +with the mean value and scales the data to zero mean and unit standard deviation. + +Detailed configuration +^^^^^^^^^^^^^^^^^^^^^^ +Below is a more thorough configuration. It shows how to specify preprocessing steps and more model parameters. + +.. 
include:: advanced_config.yaml + :literal: + +DataPreprocessor specification +"""""""""""""""""""""""""""""" +A steps-based preprocessing pipeline can be configured under ``train.data_preprocessor.steps``. Each step is a dict +with the following keys: + +- ``name`` (str): the registered step name (see table below). +- ``enabled`` (bool, optional): default ``True``; set to ``False`` to skip a step. +- ``params`` (dict, optional): constructor arguments for the step. +- ``step_name`` (str, optional): custom key for the sklearn pipeline; useful if a step is repeated. + +Allowed step names and aliases: + ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| Step name | Purpose | Aliases | ++=========================+===============================================+================================================+ +| column_selector | Drop columns with too many NaNs | \- | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| low_unique_value_filter | Drop columns with low variance/many zeros | \- | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| angle_transformer | Convert angles to sin/cos pairs | angle_transform | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| counter_diff_transformer| Convert counters to differences/rates | counter_diff, counter_diff_transform | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| simple_imputer | Impute missing values | imputer | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| standard_scaler | Standardize features (z-score) | standardize, standardscaler, standard | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| minmax_scaler | Scale to [0, 1] | minmax | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| duplicate_to_nan | Replace consecutive duplicate values with NaN | duplicate_value_to_nan, duplicate_values_to_nan| ++-------------------------+-----------------------------------------------+------------------------------------------------+ + +For detailed documentation of the data preprocessor pipeline, refer to the +:py:obj:`DataPreprocessor ` docs. + +Other training configuration sections +""""""""""""""""""""""""""""""""""""" + +- Data clipping: + :py:obj:`DataClipper ` supports + ``features_to_exclude`` and ``features_to_clip`` for fine-grained control. + + +- Data splitter (``train.data_splitter``): + + - ``type``: one of ``BlockDataSplitter`` (aliases: ``blocks``, ``DataSplitter``), or ``sklearn`` (alias ``train_test_split``). + - For sklearn: ``validation_split`` (float in (0, 1)) and ``shuffle`` (bool). + - For :py:obj:`BlockDataSplitter `: ``train_block_size`` and ``val_block_size``. + - Early stopping guard: if ``train.autoencoder.params.early_stopping`` is true, you must either set a + valid ``validation_split`` in (0, 1), or use :py:obj:`BlockDataSplitter ` + with a positive ``val_block_size``. + + +- Autoencoder (``train.autoencoder``): + + - ``name``: class name in the registry. 
+ - ``params``: architecture and training args (e.g., ``layers``, ``epochs``, ``learning_rate``, ``early_stopping``). + Refer to the autoencoder class docs (:py:obj:`autoencoders `) for specific params and their defaults. + +- Anomaly score (``train.anomaly_score``): + + - ``name``: score name (e.g., ``rmse``, ``mahalanobis``). + - ``params``: score-specific parameters. Refer to the :py:obj:`anomaly_scores ` docs. + +- Threshold selector (``train.threshold_selector``): + + - ``name``: e.g., ``quantile``, ``fbeta``, etc. + - ``fit_on_val``: fit the threshold on validation only. + - ``params``: selector-specific parameters (e.g., ``quantile`` for the quantile selector). + See the :py:obj:`threshold_selectors ` docs for more info on the settings. + +Prediction options +^^^^^^^^^^^^^^^^^^ +Under ``predict``, you can set: + +- ``criticality.max_criticality``: cap the calculated criticality (anomaly counter) to this value. + + +Root cause analysis (ARCANA) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If ``root_cause_analysis`` is provided, ARCANA will attempt to attribute anomalies to specific features using the +provided settings. If not provided, default settings are used. For detailed documentation refer to +:py:obj:`ARCANA docs `. + + +Old params data preprocessing configuration (for older versions) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Older configurations use params under ``train.data_preprocessor.params``. +These remain supported but are deprecated in favor of steps mode. +When both ``steps`` and legacy params are present, ``steps`` take precedence and legacy params are ignored with a warning. + +.. include:: old_config.yaml + :literal: diff --git a/docs/index.rst b/docs/index.rst index dda8d7b..c13d39e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,10 +1,10 @@ Energy Fault Detector - Autoencoder-based Fault Detection for the Future Energy System -============================================================ +====================================================================================== **Energy Fault Detector** is an open-source Python package designed for the automated detection of anomalies in operational data from renewable energy systems as well as power grids. It uses autoencoder-based normal behaviour models to identify irregularities in operational data. In addition to the classic anomaly detection, the package -includes the unique “ARCANA” approach for root cause analysis and thus allows interpretable early fault detection. +includes the unique ''ARCANA'' approach for root cause analysis and thus allows interpretable early fault detection. In addition to the pure ML models, the package also contains a range of preprocessing methods, which are particularly useful for analyzing systems in the energy sector. A holistic `EnergyFaultDetector` framework is provided for easy use of all these methods, which can be adapted to the respective use case via a single configuration file. @@ -27,11 +27,10 @@ To install the `energy-fault-detector` package, run: :glob: :maxdepth: 2 - The Energy Fault Detector package usage_examples + configuration logging - changelog - + The EnergyFaultDetector package Module index ================== diff --git a/docs/logging.rst b/docs/logging.rst index 485fcd0..7c943b3 100644 --- a/docs/logging.rst +++ b/docs/logging.rst @@ -1,45 +1,19 @@ Logging Configuration ===================== -The framework uses Python's built-in logging module to provide logging capabilities. By default, the logging -configuration is defined in a YAML file. 
You can customize this configuration to suit your needs. +The framework uses Python's built-in logging module for logging. +You can customize this configuration to suit your needs. Default Configuration --------------------- -The framework uses a default logging configuration file named ``logging.yaml``. +The framework uses a default logging configuration file ``energy_fault_detector/logging.yaml``. The logger used throughout the code is called ``energy_fault_detector``. -The default logging configuration is as follows. - -.. code-block:: yaml - - version: 1 - disable_existing_loggers: False - formatters: - simple: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - - handlers: - console: - class: logging.StreamHandler - level: DEBUG - formatter: simple - stream: ext://sys.stdout - - loggers: - energy_fault_detector: - level: INFO - handlers: [console] - propagate: no - - root: - level: INFO - handlers: [console] - You can silence the logger as follows: .. code-block:: python + import logging from energy_fault_detector.fault_detector import FaultDetector, Config diff --git a/docs/config_example.yaml b/docs/old_config.yaml similarity index 52% rename from docs/config_example.yaml rename to docs/old_config.yaml index 3dd8c39..4e0f454 100644 --- a/docs/config_example.yaml +++ b/docs/old_config.yaml @@ -1,10 +1,5 @@ train: - data_clipping: # (optional) if not specified, not applied. - # clip training data to remove outliers - lower_percentile: 0.01 - upper_percentile: 0.99 - features_to_exclude: - - do_not_clip_this_feature + # ... data_preprocessor: # only imputation and scaling are done by default, other steps can be skipped. @@ -28,39 +23,4 @@ train: duplicate_features_to_exclude: # DuplicateValuesToNan option - list of feature to not transform with DuplicateValuesToNan - do_not_replace_value_with_nan - data_splitter: # (optional) Define block size of train and validation blocks. Optional, if not specified, the defaults are used - # defaults: - type: DataSplitter # or sklearn - train_block_size: 5040 - val_block_size: 1680 # set val_block_size = 0 to use all data for training - - autoencoder: - name: 'MultilayerAutoencoder' - params: - batch_size: 128 - decay_rate: 0.001 # remove decay_rate+decay_steps for a fixed learning rate - decay_steps: 10000 - epochs: 10 - layers: - - 200 # Size of the first and last hidden layer - - 100 # Size of the second and second to last hidden layer - - 50 # Size of the third and third to last hidden layer - code_size: 20 # Size of the bottleneck - learning_rate: 0.001 - loss_name: 'mean_squared_error' - - anomaly_score: - name: 'rmse' - params: - scale: false - - threshold_selector: - name: 'fbeta' - params: - beta: 0.5 - -root_cause_analysis: # (optional) if not specified, no root_cause_analysis (ARCANA) is run - alpha: 0.8 - init_x_bias: recon - num_iter: 200 - + # ... diff --git a/docs/usage_examples.rst b/docs/usage_examples.rst index ed70e09..2dbe77e 100644 --- a/docs/usage_examples.rst +++ b/docs/usage_examples.rst @@ -14,65 +14,63 @@ refer to the example notebooks in the repository's notebooks folder. Energy Fault Detection -^^^^^^^^^^^^^^^ -The main interface for the `energy-fault-detector` package is the :py:obj:`FaultDetector ` class, which -needs a configuration object :py:obj:`Config `. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The main interface for the `energy-fault-detector` package is the :py:obj:`FaultDetector ` class, which +needs a configuration object :py:obj:`Config `. 
-To create a new :py:obj:`FaultDetector ` model, +To create a new :py:obj:`FaultDetector ` model, create a configuration, as described below in the :ref:`configuration` section, and run: .. code-block:: python - from energy_fault_detector.fault_detector import FaultDetector - from energy_fault_detector.config import Config + from energy_fault_detector import FaultDetector, Config - config = Config('configs/base_config.yaml') + config = Config('configs/basic_config.yaml') fault_detector = FaultDetector(config=config, model_directory='model_directory') - -To train new models, you need to provide the input data and call the ``fit`` method: +To train new models, you need to provide the input data and call the :py:obj:`FaultDetector.fit ` method: .. code-block:: python # get data from database / csv / API ... sensor_data = ... # a pandas DataFrame with timestamp as index and numerical sensor values as columns normal_index = ... # a pandas Series with timestamp as index and booleans indicating normal behaviour - # NOTE: The normal_index is optional, it is used to select training data for the autoencoder. - # If not provided, we assume all data represents normal behaviour. The other data points are used to set a - # threshold for the fault detection. + # NOTE: The normal_index is optional; it is used to select training data for the autoencoder. + # If not provided, we assume all data represents normal behaviour. + # If you do not have any labels, you cannot use th F-beta-score- and FDR-based thresholds. - # If you do not use the models for time series, the index can also be a standard RangeIndex, as long as the - # sensor_data dataframe and the normal_index series have the same index. + # If you do not use the models for time series, the index can also be a standard RangeIndex, + # as long as the sensor_data DataFrame and the normal_index Series share the same index. model_data = fault_detector.fit(sensor_data=sensor_data, normal_index=normal_index, save_models=True) # to save model manually: # fault_detector.save_models('model_name') # model_name is optional -The trained models are saved locally in the provided ``model_directory``. The ``fit`` method returns a +The trained models are saved locally in the provided ``model_directory``. The :py:obj:`FaultDetector.fit ` method returns a :py:obj:`ModelMetadata ` object with the model metadata such as the model date and the model path. -To predict using the trained model, use the ``predict`` method: +To predict using the trained model, use the :py:obj:`FaultDetector.predict ` method: .. code-block:: python results = fault_detector.predict(sensor_data=test_sensor_data) The result is a :py:obj:`FaultDetectionResult ` object -with with the following information: +with the following information: -* predicted_anomalies: DataFrame with a column 'anomaly' (bool). -* reconstruction: DataFrame with reconstruction of the sensor data with timestamp as index. -* deviations: DataFrame with reconstruction errors. -* anomaly_score: DataFrame with anomaly scores for each timestamp. -* bias_data: DataFrame with ARCANA results with timestamp as index. None if ARCANA was not run. -* arcana_losses: DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run. -* tracked_bias: List of DataFrames. None if ARCANA was not run. +* predicted_anomalies: pandas Series with the predicted anomalies (bool). +* reconstruction: pandas DataFrame with reconstruction of the sensor data with timestamp as index. 
+* deviations: pandas DataFrame with reconstruction errors. +* anomaly_score: pandas Series with anomaly scores for each timestamp. +* bias_data: pandas DataFrame with ARCANA results with timestamp as index. None if ARCANA was not run. +* arcana_losses: pandas DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run. +* tracked_bias: List of pandas DataFrames. None if ARCANA was not run. You can also create a :py:obj:`FaultDetector ` object and load -trained models using the ``load_models`` method. In this case, you do not need to provide a ``model_path`` -in the ``predict`` method. +trained models using the :py:obj:`FaultDetector.load_models ` method. In this case, you do not need to provide a ``model_path`` +in the :py:obj:`predict ` method. .. code-block:: python @@ -95,11 +93,17 @@ The training configuration is set with a ``yaml`` file which contains ``train`` train new models and ``root_cause_analysis`` specification if you want to analyse the model predictions with the `ARCANA` algorithm. An example: -.. include:: config_example.yaml +.. include:: basic_config.yaml :literal: +If you leave out the data_preprocessor configuration (i.e., ``data_preprocessor: None``), as default preprocessing +pipeline is generated, which drops constant features, features where >5% of the data is missing, imputes remaining +missing values with the mean value and scales the data to zero mean and unit standard deviation. + +See the :ref:`Configuration guide ` for more details on the configuration file and options. + To update the configuration 'on the fly' (for example for hyperparameter optimization), you provide a new -configuration dictionary via the ``update_config`` method: +configuration dictionary via the :py:obj:`Config.update_config ` method: .. code-block:: python @@ -150,7 +154,7 @@ you can import the data preprocessor, autoencoder, anomaly score and threshold s This allows you to add additional steps or use different data preprocessing pipelines. -An example training pipeline (similar to the :py:obj:`FaultDetector ` class ) +An example training pipeline (similar to the :py:obj:`FaultDetector ` class) would be: .. 
code-block:: python diff --git a/energy_fault_detector/__init__.py b/energy_fault_detector/__init__.py index 33c5e19..e77c74a 100644 --- a/energy_fault_detector/__init__.py +++ b/energy_fault_detector/__init__.py @@ -46,4 +46,3 @@ version = f.readlines()[0].strip() __version__ = version -__all__ = ['FaultDetector', 'Config', 'registry', 'quick_fault_detector'] diff --git a/energy_fault_detector/base_config.yaml b/energy_fault_detector/base_config.yaml index f708a7e..aebcc2b 100644 --- a/energy_fault_detector/base_config.yaml +++ b/energy_fault_detector/base_config.yaml @@ -4,14 +4,13 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - imputer_strategy: mean - max_nan_frac_per_col: 0.2 - min_unique_value_count: 2 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.2 + - name: low_unique_value_filter + - name: simple_imputer + - name: standard_scaler autoencoder: name: default diff --git a/energy_fault_detector/config/__init__.py b/energy_fault_detector/config/__init__.py index 1184dd5..45c6372 100644 --- a/energy_fault_detector/config/__init__.py +++ b/energy_fault_detector/config/__init__.py @@ -2,3 +2,4 @@ from energy_fault_detector.config.config import Config from energy_fault_detector.config.base_config import InvalidConfigFile +from energy_fault_detector.config.quickstart_config import generate_quickstart_config diff --git a/energy_fault_detector/config/config.py b/energy_fault_detector/config/config.py index 9722615..de26319 100644 --- a/energy_fault_detector/config/config.py +++ b/energy_fault_detector/config/config.py @@ -34,9 +34,19 @@ 'data_preprocessor': { 'type': 'dict', 'required': True, - 'allow_unknown': True, + 'allow_unknown': False, + 'nullable': True, # if not specfied, create default pipeline 'schema': { - 'params': {'type': 'dict', 'required': False}, + 'params': {'type': 'dict', 'required': False, 'nullable': True,}, + 'steps': { + 'type': 'list', + 'required': False, + 'nullable': True, + 'schema': { + 'type': 'dict', + 'allow_unknown': True + } + }, } }, 'threshold_selector': { @@ -60,7 +70,7 @@ 'type': 'dict', 'required': False, # defaults if not specified 'schema': { - 'type': {'type': 'string', 'required': False, 'default': 'DataSplitter', + 'type': {'type': 'string', 'required': False, 'default': 'BlockDataSplitter', 'allowed': ['DataSplitter', 'BlockDataSplitter', 'blocks', 'sklearn', 'train_test_split']}, 'train_block_size': {'type': 'integer', 'required': False, 'dependencies': {'type': ['DataSplitter', 'BlockDataSplitter', 'blocks']}}, 'val_block_size': {'type': 'integer', 'required': False, 'dependencies': {'type': ['DataSplitter', 'BlockDataSplitter', 'blocks']}}, @@ -179,15 +189,10 @@ def data_clipping_params(self) -> Dict[str, Any]: """Data clipping parameters.""" return self.config_dict.get('train', {}).get('data_clipping', {}) - @property - def angle_columns(self) -> List[str]: - """List of angle columns.""" - return self.config_dict.get('train', {}).get('data_preprocessor', {}).get('params', {}).get('angles', []) - @property def max_criticality(self) -> Optional[int]: """Max criticality value.""" - return self.config_dict.get('prediction', {}).get('criticality', {}).get('max_criticality', 144) + return self.config_dict.get('predict', {}).get('criticality', {}).get('max_criticality', 144) @property def fit_threshold_on_val(self) -> bool: diff --git 
a/energy_fault_detector/config/quickstart_config.py b/energy_fault_detector/config/quickstart_config.py new file mode 100644 index 0000000..208187b --- /dev/null +++ b/energy_fault_detector/config/quickstart_config.py @@ -0,0 +1,246 @@ +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import yaml + + +def _build_preprocessor_steps( + *, + max_nan_frac: float, + min_unique_value_count: int, + max_col_zero_frac: float, + angle_columns: Optional[List[str]], + counter_columns: Optional[List[str]], + imputer_strategy: str, + scaler: str, +) -> List[Dict[str, Any]]: + """ + Build the steps specification for the DataPreprocessor pipeline. + + This helper focuses solely on the steps list for the preprocessing pipeline + and keeps the public function small, readable, and testable. + + Args: + max_nan_frac (float): Maximum fraction of missing values allowed per column. + min_unique_value_count (int): Minimal number of unique values required for a column to remain. + max_col_zero_frac (float): Maximum allowed fraction of zeros in a column (used in the unique-value filter). + angle_columns (Optional[List[str]]): Optional list of column names to be angle-transformed. + counter_columns (Optional[List[str]]): Optional list of counter columns to be transformed to differences. + imputer_strategy (str): SimpleImputer strategy, e.g., "mean", "median", "most_frequent", or "constant". + scaler (str): Scaler type; supports "standard" (and aliases) or "minmax" (and aliases). + + Returns: + List[Dict[str, Any]]: A steps list suitable for DataPreprocessor(steps=[...]). + + Notes: + + - The order is kept minimal here; DataPreprocessor enforces proper ordering internally. + + """ + steps: List[Dict[str, Any]] = [] + + # Optional counter-diff transformation (DataPreprocessor will place it early). + if counter_columns: + steps.append( + { + "name": "counter_diff_transformer", + "params": { + "counters": counter_columns, + "compute_rate": False, + "fill_first": "nan", + }, + } + ) + + # Column selection: drop columns with too many NaNs. + steps.append( + { + "name": "column_selector", + "params": {"max_nan_frac_per_col": max_nan_frac}, + } + ) + + # Filter for columns with very few unique values or many zeros. + steps.append( + { + "name": "low_unique_value_filter", + "params": { + "min_unique_value_count": min_unique_value_count, + "max_col_zero_frac": max_col_zero_frac, + }, + } + ) + + # Optional angle transformer (e.g., degrees => sin/cos). + if angle_columns: + steps.append( + { + "name": "angle_transformer", + "params": {"angles": angle_columns}, + } + ) + + # Explicit imputer; adding it avoids relying on DataPreprocessor defaults. + steps.append( + { + "name": "simple_imputer", + "params": {"strategy": imputer_strategy}, + } + ) + + # Final scaler with aliases supported for convenience. + scaler_key = scaler.lower() + if scaler_key in ("standard", "standardize", "standard_scaler"): + steps.append({"name": "standard_scaler"}) + elif scaler_key in ("minmax", "minmax_scaler", "normalize"): + steps.append({"name": "minmax_scaler"}) + else: + raise ValueError( + f"Unknown scaler '{scaler}'. Use 'standard' (aka 'standardize') or 'minmax'." + ) + + return steps + + +def _dump_yaml_if_requested( + config: Dict[str, Any], + output_path: Optional[Union[str, Path]], +) -> None: + """ + Write the configuration dictionary to a YAML file if a path is provided. + + Args: + config (Dict[str, Any]): The configuration dictionary to serialize. 
+ output_path (Optional[Union[str, Path]]): Destination path. If None, nothing is written. + + Raises: + RuntimeError: If PyYAML is not installed but output_path is not None. + """ + if output_path is None: + return + + if yaml is None: # pragma: no cover - optional dependency + raise RuntimeError( + "PyYAML is not installed; install 'pyyaml' or set output_path=None." + ) + + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + + with path.open("w", encoding="utf-8") as f: + yaml.safe_dump(config, f, sort_keys=False) + + +def generate_quickstart_config( + output_path: Optional[Union[str, Path]] = "base_config.yaml", + *, + # Preprocessor configuration + max_nan_frac: float = 0.05, + min_unique_value_count: int = 2, + max_col_zero_frac: float = 1.0, + angle_columns: Optional[List[str]] = None, + counter_columns: Optional[List[str]] = None, + imputer_strategy: str = "mean", + scaler: str = "standard", + # Early stopping + early_stopping: bool = False, + validation_split: float = 0.2, + # Thresholding + threshold_quantile: float = 0.99, + # Autoencoder defaults + batch_size: int = 128, + code_size: int = 20, + epochs: int = 10, + layers: Optional[List[int]] = None, + learning_rate: float = 1e-3, +) -> Dict[str, Any]: + """ + Generate a minimal, valid configuration for EnergyFaultDetector. + + This function returns a configuration dictionary that uses the steps-based + DataPreprocessor and sensible defaults for training. It can also write the + configuration to YAML if an output path is supplied. + + Example: + from energy_fault_detector import FaultDetector, Config + cfg = generate_quickstart_config(output_path=None) + fault_detector = FaultDetector(config=Config(config_dict=cfg)) + + Args: + output_path (Optional[Union[str, Path]]): YAML output path; set None to return only the dict. + max_nan_frac (float): Max fraction of missing values per column for selection. Default: 0.05 + min_unique_value_count (int): Minimal unique values required to keep a column. Default: 2 + max_col_zero_frac (float): Max fraction of zeros allowed in a column. Default: 1.0 + angle_columns (Optional[List[str]]): Optional columns to transform as angles (sin/cos). Default: None + counter_columns (Optional[List[str]]): Optional counter columns to convert to differences. Default: None + imputer_strategy (str): Strategy for SimpleImputer ("mean", "median", etc.). Default: mean + scaler (str): Scaler selection ("standard" or "minmax"; common aliases allowed). Default: standard + early_stopping (bool): Enable early stopping in the autoencoder training. Default: False + validation_split (float): Fraction for validation in sklearn splitter (0 < val < 1). + threshold_quantile (float): Quantile for the "quantile" threshold selector. Default: 0.99 + batch_size (int): Autoencoder batch size. Default: 128 + code_size (int): Bottleneck code size. Default: 20 + epochs (int): Number of training epochs. Default: 10 + layers (Optional[List[int]]): Autoencoder layer sizes; defaults to [200, 100, 50] if None. + learning_rate (float): Optimizer learning rate. + + Returns: + Dict[str, Any]: Configuration dictionary ready for Config(config_dict=...). + + Raises: + ValueError: If early_stopping is True but validation_split is not in (0, 1). 
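+
+    Example (illustrative; the column names below are placeholders, not part of the package):
+        # Write a customised quickstart config to YAML and train from it
+        generate_quickstart_config(
+            output_path="quickstart_config.yaml",
+            angle_columns=["wind_direction"],
+            counter_columns=["energy_total_kwh"],
+            scaler="minmax",
+        )
+        fault_detector = FaultDetector(config=Config("quickstart_config.yaml"))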
+ """ + if not (0 < validation_split < 1.0): + raise ValueError("validation_split must be in (0, 1).") + + # Fallback layers if none provided by user + if layers is None: + layers = [200, 100, 50] + + # Build the preprocessor steps list + steps = _build_preprocessor_steps( + max_nan_frac=max_nan_frac, + min_unique_value_count=min_unique_value_count, + max_col_zero_frac=max_col_zero_frac, + angle_columns=angle_columns, + counter_columns=counter_columns, + imputer_strategy=imputer_strategy, + scaler=scaler, + ) + + # Assemble training configuration + train_config: Dict[str, Any] = { + "data_preprocessor": {"steps": steps}, + "data_splitter": { + "type": "sklearn", + "validation_split": validation_split, + "shuffle": True, + }, + "autoencoder": { + "name": "default", + "params": { + "batch_size": batch_size, + "code_size": code_size, + "early_stopping": early_stopping, + "epochs": epochs, + "layers": layers, + "learning_rate": learning_rate, + }, + "verbose": 1, + }, + "anomaly_score": {"name": "rmse"}, + "threshold_selector": { + "fit_on_val": False, + "name": "quantile", + "params": {"quantile": threshold_quantile}, + }, + # Optional clipping (disabled by default; uncomment to enable): + # "data_clipping": {"lower_percentile": 0.001, "upper_percentile": 0.999}, + } + + config: Dict[str, Any] = {"train": train_config} + + # Optionally write YAML + _dump_yaml_if_requested(config=config, output_path=output_path) + + return config diff --git a/energy_fault_detector/core/fault_detection_model.py b/energy_fault_detector/core/fault_detection_model.py index 47df523..0aed66b 100644 --- a/energy_fault_detector/core/fault_detection_model.py +++ b/energy_fault_detector/core/fault_detection_model.py @@ -65,6 +65,11 @@ def __init__(self, config: Optional[Config] = None, model_directory: str = 'mode # build models self._model_factory: Optional[ModelFactory] = ModelFactory(config) if config else None + if config is None: + logger.debug('No configuration set. Load models and config from path with the `FaultDetector.load_models`' + ' method.') + else: + self._init_models() def _init_models(self): """Initialize models.""" @@ -79,24 +84,34 @@ def _init_models(self): self.data_preprocessor = self._model_factory.data_preprocessor @abstractmethod - def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, asset_id: Union[int, str] = None, - **kwargs) -> ModelMetadata: + def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, save_models: bool = True, + overwrite_models: bool = False, **kwargs) -> ModelMetadata: """Fit models on the given sensor_data and save them locally and return the metadata. Args: - asset_id: asset ID of the asset for which the model should be trained. sensor_data: pandas DataFrame with the sensor data to use. The time stamp should be the index and the sensor values as columns. normal_index: a pandas Series indicating normal behaviour as boolean with the timestamp as index. + save_models (bool, optional): Whether to save models. Defaults to True. + overwrite_models (bool, optional): If True, existing model directories can be overwritten. Defaults to + False. Returns: ModelMetadata object. 
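+
+        Example (illustrative; ``detector`` is an instance of a concrete subclass such as ``FaultDetector``)::
+
+            metadata = detector.fit(sensor_data=sensor_data, normal_index=normal_index,
+                                    save_models=True, overwrite_models=False)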
""" - def train(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, asset_id: Union[int, str] = None, - **kwargs) -> ModelMetadata: - """Same as the `fit`-method.""" - return self.fit(sensor_data=sensor_data, normal_index=normal_index, asset_id=asset_id, **kwargs) + def train(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, **kwargs) -> ModelMetadata: + """Same as the `fit`-method. + + Args: + sensor_data: pandas DataFrame with the sensor data to use. + The time stamp should be the index and the sensor values as columns. + normal_index: a pandas Series indicating normal behaviour as boolean with the timestamp as index. + + Returns: + ModelMetadata object. + """ + return self.fit(sensor_data=sensor_data, normal_index=normal_index, **kwargs) @abstractmethod def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, asset_id: Union[int, str] = None diff --git a/energy_fault_detector/core/model_factory.py b/energy_fault_detector/core/model_factory.py index 4aef0bf..1aca17f 100644 --- a/energy_fault_detector/core/model_factory.py +++ b/energy_fault_detector/core/model_factory.py @@ -32,9 +32,11 @@ def _initialize_models(self) -> None: # Retrieve training configuration train_dict = self.config['train'] - # data preprocessor + # data preprocessor - not specified leads to a default pipeline + data_prep_conf = (train_dict.get('data_preprocessor', {}) or {}) self._models['data_preprocessor'] = DataPreprocessor( - **train_dict.get('data_preprocessor', {}).get('params', {}) + steps=data_prep_conf.get('steps'), + **data_prep_conf.get('params', {}) ) # Loop through each model type and initialize the corresponding model diff --git a/energy_fault_detector/data_preprocessing/column_selector.py b/energy_fault_detector/data_preprocessing/column_selector.py index 3274cbf..2c5ced3 100644 --- a/energy_fault_detector/data_preprocessing/column_selector.py +++ b/energy_fault_detector/data_preprocessing/column_selector.py @@ -1,4 +1,3 @@ - from typing import Optional, List import numpy as np @@ -14,7 +13,8 @@ class ColumnSelector(DataTransformer): Args: max_nan_frac_per_col: maximum fraction of NaN values allowed per column. Defaults to 0.05. If the fraction exceeds max_nan_frac_per_col, the column is dropped. - features_to_exclude: list of features that should be dropped. Defaults to None. + features_to_exclude: columns to drop (case-insensitive). + features_to_select: columns to keep (case-insensitive). Mutually exclusive with features_to_exclude. Attributes: feature_names_in_: list of column names in input. @@ -23,12 +23,20 @@ class ColumnSelector(DataTransformer): columns_dropped_: list of columns that were dropped. 
""" - def __init__(self, max_nan_frac_per_col: float = 0.05, features_to_exclude: List[str] = None): - + def __init__( + self, + max_nan_frac_per_col: float = 0.05, + features_to_exclude: Optional[List[str]] = None, + features_to_select: Optional[List[str]] = None, + ): super().__init__() - + if features_to_exclude is not None and features_to_select is not None: + raise ValueError("Only one of features_to_exclude or features_to_select can be specified.") + if not (0.0 <= max_nan_frac_per_col <= 1.0): + raise ValueError("max_nan_frac_per_col must be within [0, 1].") self.max_nan_frac_per_col: float = max_nan_frac_per_col self.features_to_exclude: List[str] = features_to_exclude if features_to_exclude is not None else [] + self.features_to_select: Optional[List[str]] = features_to_select # pylint: disable=attribute-defined-outside-init # noinspection PyAttributeOutsideInit @@ -43,11 +51,17 @@ def fit(self, x: pd.DataFrame, y: Optional[np.array] = None) -> 'ColumnSelector' self.feature_names_in_ = x.columns.to_list() self.n_features_in_ = len(x.columns) - # drop features to exclude - ignore upper/lower case - to_drop = [col for col in x.columns if col.lower() in - [excluded_feature.lower() for excluded_feature in self.features_to_exclude] - ] - x_transformed = x.drop(to_drop, axis=1, errors='ignore') + # If features_to_select is provided - ignore upper/lower case + if self.features_to_select is not None: + select_lower = [f.lower() for f in self.features_to_select] + keep_cols = [col for col in x.columns if col.lower() in select_lower] + x_transformed = x[keep_cols] + else: + # drop features to exclude - ignore upper/lower case + to_drop = [col for col in x.columns if col.lower() in + [excluded_feature.lower() for excluded_feature in self.features_to_exclude] + ] + x_transformed = x.drop(to_drop, axis=1, errors='ignore') # drop columns which have more than max_nan_frac_per_col relative NaN frequency empty_percentage = x_transformed.isnull().mean(axis=0) @@ -69,8 +83,11 @@ def transform(self, x: pd.DataFrame) -> pd.DataFrame: # transformation is not possible. missing_columns = set(self.feature_names_out_) - set(x.columns) if len(missing_columns) > 0: - raise ValueError('ColumnSelector: There are columns missing in the prediction data, which were present in' - ' the training data. New models need to be trained!') + raise ValueError( + 'ColumnSelector: There are columns missing in the prediction data, which were present in' + ' the training data. Missing columns: ' + f"{', '.join(sorted(missing_columns))}. New models need to be trained!" + ) x = x[self.feature_names_out_] # ensure ordering return x diff --git a/energy_fault_detector/data_preprocessing/counter_diff_transformer.py b/energy_fault_detector/data_preprocessing/counter_diff_transformer.py new file mode 100644 index 0000000..c56fb6e --- /dev/null +++ b/energy_fault_detector/data_preprocessing/counter_diff_transformer.py @@ -0,0 +1,322 @@ + +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +from sklearn.utils.validation import check_is_fitted + +from energy_fault_detector.core.data_transformer import DataTransformer + + +class CounterDiffTransformer(DataTransformer): + """ + Transform monotonic counter columns into per-sample increments (default) or per-second rates (if compute_rate=True), + handling resets/rollovers and masking long time gaps. + + It handles counter resets/rollovers and optionally masks values after large time gaps, which helps avoid misleading + diffs/rates caused by missing data. 
+ + Args: + counters: List of counter column names to transform. + compute_rate: If True, output per-second rates (increment / dt). If False (default), + output per-sample increments. + reset_strategy: One of {'zero', 'rollover', 'nan', 'auto'}: + + - 'zero' (default): if diff < 0, treat as reset-to-zero; increment = current_value. + - 'rollover': if diff < 0, increment = current_value + (rollover_value - previous_value). + - 'nan': if diff < 0, set increment to NaN. + - 'auto': use 'rollover' if rollover_values contains the counter; otherwise 'zero'. + + rollover_values: Optional mapping counter -> known max value (used by 'rollover' or 'auto'). + small_negative_tolerance: Treat small negative diffs (``abs(diff) <= tol``) as 0 (noise). Default: 0.0. + fill_first: One of {'nan', 'zero'}. How to fill the first sample where diff is undefined. + keep_original: If True, keep original counters alongside new outputs. If False, drop them. + gap_policy: One of {'mask', 'ignore'}: + + - 'mask' (default): set output to NaN for rows where time delta > threshold. + - 'ignore': do nothing special for large gaps. + + max_gap_seconds: Explicit threshold (in seconds) for gap masking. If provided, overrides + max_gap_factor. + max_gap_factor: If max_gap_seconds is None, use threshold = factor * median(dt). + Default is 3.0. + + Notes: + - A DatetimeIndex is required if compute_rate=True or gap_policy='mask'. + - The inverse_transform is a no-op and returns the input unchanged. + + Examples: + - Diffs: [0, 1, 3, 0 (reset), 2] -> [NaN|0, 1, 2, 0|NaN, 2] + - Rates: increment / dt (in seconds), with large-gap rows optionally masked to NaN. + """ + + def __init__( + self, + counters: List[str], + compute_rate: bool = False, + reset_strategy: str = "zero", + rollover_values: Optional[Dict[str, float]] = None, + small_negative_tolerance: float = 0.0, + fill_first: str = "nan", + keep_original: bool = False, + gap_policy: str = "mask", + max_gap_seconds: Optional[float] = None, + max_gap_factor: float = 3.0, + ) -> None: + super().__init__() + self.counters = counters or [] + self.compute_rate = compute_rate + self.reset_strategy = reset_strategy + self.rollover_values = rollover_values or {} + self.small_negative_tolerance = float(small_negative_tolerance) + self.fill_first = fill_first + self.keep_original = keep_original + self.gap_policy = gap_policy + self.max_gap_seconds = max_gap_seconds + self.max_gap_factor = float(max_gap_factor) + + def fit(self, x: pd.DataFrame, y: Optional[pd.Series] = None) -> "CounterDiffTransformer": + """Validate inputs and compute output schema. + + This method validates the time index (when needed), stores the list of counters that are + present in the input, and computes the output column layout such that transform() can + reproduce the same order deterministically. + + Args: + x: Input DataFrame. Requires a DatetimeIndex if compute_rate=True or gap_policy='mask'. + y: Unused. Present for estimator interface compatibility. + + Returns: + self + + Raises: + ValueError: If a DatetimeIndex is required but missing or non-monotonic. + """ + self.feature_names_in_ = x.columns.to_list() + self.n_features_in_ = len(x.columns) + + # DatetimeIndex is required for rates or for gap masking + if self.compute_rate or self.gap_policy == "mask": + if not isinstance(x.index, pd.DatetimeIndex): + raise ValueError( + "CounterDiffTransformer: DatetimeIndex required (rate or gap masking)." 
+ ) + if not x.index.is_monotonic_increasing: + raise ValueError("CounterDiffTransformer: index must be monotonic increasing.") + + # Keep only counters present in the DataFrame + self.counters_ = [c for c in self.counters if c in self.feature_names_in_] + + # Determine output suffix + self.output_suffix_ = "_rate" if self.compute_rate else "_diff" + + # Compose output feature order + new_cols = [f"{c}{self.output_suffix_}" for c in self.counters_] + if self.keep_original: + # Append new output columns after all original features + self.feature_names_out_ = list(self.feature_names_in_) + new_cols + else: + # Keep non-counter features first, then the new output columns + others = [col for col in self.feature_names_in_ if col not in self.counters_] + self.feature_names_out_ = others + new_cols + + # Track columns dropped when keep_original is False (for introspection/testing) + self.columns_dropped_ = [] if self.keep_original else [c for c in self.counters_] + return self + + def _time_deltas_seconds(self, x: pd.DataFrame) -> Optional[pd.Series]: + """Compute per-row time delta in seconds, or None if not needed. + + Returns NaN for the first row and when dt is 0 seconds (zero dt is masked to NaN to avoid + division by zero for rate calculations). + + Args: + x: Input DataFrame. + + Returns: + A Series of dt seconds aligned to x.index, or None if neither rate nor masking is used. + + Raises: + ValueError: If a DatetimeIndex is required but missing or non-monotonic. + """ + if not (self.compute_rate or self.gap_policy == "mask"): + return None + if not isinstance(x.index, pd.DatetimeIndex): + raise ValueError("CounterDiffTransformer: DatetimeIndex required for rate or gap masking.") + if not x.index.is_monotonic_increasing: + raise ValueError("CounterDiffTransformer: index must be monotonic increasing.") + + # Create a series of timestamps to keep the original index for alignment + dt = pd.Series(x.index, index=x.index).diff().dt.total_seconds() + # Prevent division by zero when computing rates + dt = dt.mask(dt == 0, np.nan) + return dt + + def _gap_threshold(self, dt: pd.Series) -> Optional[float]: + """Compute the gap masking threshold in seconds, or None if masking disabled. + + Args: + dt: Series of time deltas in seconds. + + Returns: + Threshold in seconds or None if masking is not applicable. If max_gap_seconds is given, + it is used; otherwise threshold = max_gap_factor * median(dt). If median is not finite + or <= 0, returns None and masking is effectively disabled. + """ + if self.gap_policy != "mask" or dt is None: + return None + if self.max_gap_seconds is not None: + return float(self.max_gap_seconds) + + med = float(np.nanmedian(dt.values)) if len(dt) else np.nan + if not np.isfinite(med) or med <= 0: + return None + return self.max_gap_factor * med + + def _compute_increment( + self, + s: pd.Series, + strategy: str, + rollover_value: Optional[float], + ) -> pd.Series: + """Compute per-sample increment for a counter series with reset handling. + + This applies the selected reset strategy to negative diffs and also clamps small negative + diffs (within small_negative_tolerance) to zero to mitigate minor noise/clock skew. + + Args: + s: Input counter Series. + strategy: Reset strategy ('zero', 'rollover', 'nan', 'auto'). + rollover_value: Known rollover maximum (used by 'rollover' or 'auto'). + + Returns: + Series of increments aligned to s.index, with the first element filled according to + fill_first ('zero' or 'nan'). 
+ + Raises: + ValueError: If series contains non-numeric values (excluding existing NaNs), + or if strategy is 'rollover' but rollover_value is None, + or if an unknown reset strategy is provided. + """ + # Try to coerce to numeric; if this introduces new NaNs (beyond existing ones), error out + v = pd.to_numeric(s, errors="coerce") + if v.isna().sum() > s.isna().sum(): + raise ValueError( + "CounterDiffTransformer: non-numeric values found in counter series. " + "Ensure all counter values are numeric or NaN." + ) + prev = v.shift(1) + diff = v.diff() + + # Clamp small negative diffs to zero (treat as noise) + if self.small_negative_tolerance > 0: + small_neg = (diff < 0) & ((-diff) <= self.small_negative_tolerance) + diff = diff.mask(small_neg, 0.0) + + neg_mask = diff < 0 + + # Map 'auto' to a concrete strategy + if strategy == "auto": + strategy = "rollover" if rollover_value is not None else "zero" + + if strategy == "nan": + inc = diff.mask(neg_mask, np.nan) + elif strategy == "zero": + # Treat reset-to-zero as increment equals current value + inc = diff.where(~neg_mask, v) + elif strategy == "rollover": + if rollover_value is None: + # Explicit 'rollover' requires a value. + raise ValueError( + "CounterDiffTransformer: rollover strategy requires a rollover_value for the " + f"counter '{s.name}'. Use reset_strategy='auto' to fallback to 'zero' when not provided." + ) + # Add the wrapped amount: current + (rollover - previous) + inc = diff.where(~neg_mask, v + (rollover_value - prev)) + else: + raise ValueError(f"CounterDiffTransformer: unknown reset_strategy '{strategy}'") + + return inc + + def transform(self, x: pd.DataFrame) -> pd.DataFrame: + """Transform counters into diffs or rates, with optional gap masking. + + For each configured counter: + 1) Compute per-sample increment with reset handling. + 2) If compute_rate=True, divide by dt seconds. + 3) If gap_policy='mask', set values to NaN where dt > gap_threshold. + + Args: + x: Input DataFrame. Requires a DatetimeIndex if compute_rate=True or gap_policy='mask'. + + Returns: + A DataFrame with transformed columns appended (if keep_original=True) or replacing the + original counters (if keep_original=False). Column order matches fit()'s schema. + + Raises: + ValueError: If DatetimeIndex is required but missing or non-monotonic. + """ + check_is_fitted(self) + x_ = x.copy() + + dt = self._time_deltas_seconds(x_) + gap_thr = self._gap_threshold(dt) if dt is not None else None + + new_cols = {} + for c in self.counters_: + increment = self._compute_increment( + x_[c], strategy=self.reset_strategy, rollover_value=self.rollover_values.get(c) + ) + series = (increment / dt) if self.compute_rate and dt is not None else increment + + # Ensure first sample respects fill_first setting + series.iloc[0] = 0.0 if self.fill_first == "zero" else np.nan + + # Mask large gaps for both diffs and rates to avoid misleading values + if gap_thr is not None: + series = series.mask(dt > gap_thr) + + new_cols[f"{c}{self.output_suffix_}"] = series + + # Attach new columns + for name, col in new_cols.items(): + x_[name] = col + + # Optionally remove original counter columns + if not self.keep_original: + x_ = x_.drop(columns=self.counters_, errors='ignore') + + # Reorder to the schema established during fit + x_ = x_[self.feature_names_out_] + return x_ + + def inverse_transform(self, x: pd.DataFrame) -> pd.DataFrame: + """If original counter columns are present, drop the derived columns and restore original feature order. 
+ Otherwise, returns the input as is. + + Args: + x: Input DataFrame. + + Returns: + The input DataFrame unchanged. + """ + check_is_fitted(self) + x_ = x.copy() + orig_counters_present = all(c in x_.columns for c in self.counters_) + if orig_counters_present: + if all(col in x_.columns for col in self.feature_names_in_): + x_ = x_[self.feature_names_in_] + return x_ + return x + + def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> List[str]: + """Return the output feature names determined in fit(). + + Args: + input_features: Unused. Present for compatibility with sklearn API. + + Returns: + List of output column names. + """ + check_is_fitted(self) + return self.feature_names_out_ diff --git a/energy_fault_detector/data_preprocessing/data_clipper.py b/energy_fault_detector/data_preprocessing/data_clipper.py index 3adf558..965c457 100644 --- a/energy_fault_detector/data_preprocessing/data_clipper.py +++ b/energy_fault_detector/data_preprocessing/data_clipper.py @@ -1,7 +1,7 @@ """Clip data before standardization or normalization""" import logging -from typing import Optional, List, Union +from typing import Optional, List import numpy as np import pandas as pd @@ -19,19 +19,42 @@ class DataClipper(DataTransformer): Args: lower_percentile (float): The lower percentile for clipping (default: 0.01). upper_percentile (float): The upper percentile for clipping (default: 0.99). - features_to_exclude (List[str]): A list of column names representing feature that should not be clipped. + features_to_exclude (List[str] | None): Column names that should not be clipped. + features_to_clip (List[str] | None): Column names that should be clipped (mutually exclusive with + features_to_exclude). + Configuration example: + + .. code-block:: text + + train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + features_to_exclude: + - do_not_clip_this_feature """ def __init__(self, lower_percentile: float = 0.01, upper_percentile: float = 0.99, - features_to_exclude: List[str] = None): + features_to_exclude: Optional[List[str]] = None, features_to_clip: Optional[List[str]] = None) -> None: + super().__init__() + if features_to_clip is not None and features_to_exclude is not None: + raise ValueError('Only one of features_to_clip or features_to_exclude can be specified.') + if not (0.0 <= lower_percentile <= 1.0) or not (0.0 <= upper_percentile <= 1.0): + raise ValueError('Percentiles must be within [0, 1].') + if lower_percentile >= upper_percentile: + raise ValueError('lower_percentile must be strictly less than upper_percentile.') + self.lower_percentile = lower_percentile self.upper_percentile = upper_percentile - self.feature_to_exclude: List[str] = features_to_exclude if features_to_exclude is not None else [] + self.feature_to_exclude: Optional[List[str]] = features_to_exclude + self.features_to_clip: Optional[List[str]] = features_to_clip - def fit(self, x: Union[np.array, pd.DataFrame], y: Optional[np.array] = None) -> 'DataClipper': + def fit(self, x: pd.DataFrame, y: Optional[np.array] = None) -> 'DataClipper': """Set feature names in and out.""" + if not isinstance(x, pd.DataFrame): + raise TypeError('DataClipper.fit expects a pandas DataFrame.') self.feature_names_in_ = x.columns.to_list() self.feature_names_out_ = x.columns.to_list() return self @@ -48,11 +71,23 @@ def transform(self, x: pd.DataFrame) -> pd.DataFrame: """ check_is_fitted(self) - # Exclude columns representing angles + + # Select feature to clip x_ = x.copy() - 
x_without_feature_to_exclude = x_[[col for col in x_.columns if col not in self.feature_to_exclude]] + if self.feature_to_exclude is not None: + selected_features = [col for col in x_.columns if col not in self.feature_to_exclude] + elif self.features_to_clip is not None: + selected_features = [col for col in x_.columns if col in self.features_to_clip] + else: + # Clip all numeric columns + selected_features = x_.columns.tolist() + # Exclude non-numeric columns - x_numeric = x_without_feature_to_exclude.select_dtypes(include=np.number) + x_numeric = x_[selected_features].select_dtypes(include=np.number) + + if x_numeric.shape[1] == 0: + logger.debug('DataClipper.transform: no numeric columns selected; returning input unchanged.') + return x_ # Clip the data using the specified percentiles x_clipped = x_numeric.clip( diff --git a/energy_fault_detector/data_preprocessing/data_preprocessor.py b/energy_fault_detector/data_preprocessing/data_preprocessor.py index 5d0c809..8883b46 100644 --- a/energy_fault_detector/data_preprocessing/data_preprocessor.py +++ b/energy_fault_detector/data_preprocessing/data_preprocessor.py @@ -1,6 +1,8 @@ """Generic class for building a preprocessing pipeline.""" -from typing import List, Optional +from collections import Counter, defaultdict +from typing import List, Optional, Dict, Any, Tuple +import warnings import pandas as pd from sklearn.pipeline import Pipeline @@ -12,179 +14,498 @@ from energy_fault_detector.data_preprocessing.low_unique_value_filter import LowUniqueValueFilter from energy_fault_detector.data_preprocessing.angle_transformer import AngleTransformer from energy_fault_detector.data_preprocessing.duplicate_value_to_nan import DuplicateValuesToNan +from energy_fault_detector.data_preprocessing.counter_diff_transformer import CounterDiffTransformer class DataPreprocessor(Pipeline, SaveLoadMixin): - """A data preprocessing pipeline that allows for configurable steps based on the extended pipeline. - - 0. (optional) Replace any consecutive duplicate zero-values (or another value) with NaN. This step should be - used if 0 can also represent missing values in the data. - 1. (optional) Column selection: A ColumnSelector object filters out columns/features with too many NaN values. - 2. (optional) Features containing angles are transformed to sine/cosine values. - 3. (optional) Low unique value filter: Remove columns/features with a low number of unique values or - high fraction of zeroes. The high fraction of zeros setting should be used if 0 can also represent missing - values in the data. - 4. Imputation with sklearn's SimpleImputer - 5. Scaling: Apply either sklearn's StandardScaler or MinMaxScaler. - - Args: - angles: List of angle features for transformation. Defaults to None. - If none provided (or empty list), this step is skipped. - imputer_strategy: Strategy for imputation ('mean', 'median', 'most_frequent', 'constant'). Defaults to 'mean'. - imputer_fill_value: Value to fill for imputation (if imputer_strategy=='constant'). - scale: Type of scaling ('standardize' or 'normalize'). Defaults to 'standardize'. - include_column_selector: Whether to include the column selector step. Defaults to True. - features_to_exclude: ColumnSelector option, list of features to exclude from processing. - max_nan_frac_per_col: ColumnSelector option, max fraction of NaN values allowed per column. Defaults to 0.05. - include_low_unique_value_filter: Whether to include the low unique value filter step. Defaults to True. 
- min_unique_value_count: Minimum number of unique values for low unique value filter. Defaults to 2. - max_col_zero_frac: Maximum fraction of zeroes for low unique value filter. Defaults to 1.0. - include_duplicate_value_to_nan: Whether to include the duplicate value replacement step. Defaults to False. - value_to_replace: Value to replace with NaN (if using duplicate value replacement). Defaults to None. - n_max_duplicates: Max number of consecutive duplicates to replace with NaN. Defaults to 144. - - Configuration example: - - .. code-block:: text - - train: - data_preprocessor: - params: - scale: normalize - imputer_strategy: mean - max_nan_frac_per_col: 0.05 - include_low_unique_value_filter: true - min_unique_value_count: 2 - max_col_zero_frac: 0.99 - angles: - - angle1 - - angle2 - features_to_exclude: - - feature1 - - feature2 - """ - - def __init__(self, - angles: Optional[List[str]] = None, - imputer_strategy: str = 'mean', - imputer_fill_value: Optional[int] = None, - scale: str = 'standardize', - include_column_selector: bool = True, - features_to_exclude: Optional[List[str]] = None, - max_nan_frac_per_col: float = 0.05, - include_low_unique_value_filter: bool = True, - min_unique_value_count: int = 2, - max_col_zero_frac: float = 1., - include_duplicate_value_to_nan: bool = False, - value_to_replace: float = 0, - n_max_duplicates: int = 144, - duplicate_features_to_exclude: Optional[List[str]] = None - ): - - self.include_column_selector = include_column_selector - self.features_to_exclude = features_to_exclude - self.max_nan_frac_per_col = max_nan_frac_per_col - - self.angles = angles - - self.include_low_unique_value_filter = include_low_unique_value_filter - self.min_unique_value_count = min_unique_value_count - self.max_col_zero_frac = max_col_zero_frac - - self.imputer_strategy = imputer_strategy - self.imputer_fill_value = imputer_fill_value - - self.scale = scale - - self.include_duplicate_value_to_nan = include_duplicate_value_to_nan - self.value_to_replace = value_to_replace - self.n_max_duplicates = n_max_duplicates - self.duplicate_features_to_exclude = duplicate_features_to_exclude - - # Define the scaler based on the chosen scale type - scaler = (StandardScaler(with_mean=True, with_std=True) - if scale in ['standardize', 'standard', 'standardscaler'] - else MinMaxScaler(feature_range=(0, 1))) - - # Configure the pipeline steps - steps = [] - - if include_duplicate_value_to_nan: - steps.append( - ('value_to_nan', - # Do not open source, very specific to our data problems - DuplicateValuesToNan(value_to_replace=value_to_replace, n_max_duplicates=n_max_duplicates, - features_to_exclude=duplicate_features_to_exclude)) - ) - if include_column_selector: - steps.append( - ('column_selector', - ColumnSelector(max_nan_frac_per_col=max_nan_frac_per_col, features_to_exclude=features_to_exclude)) - ) - if include_low_unique_value_filter: - steps.append( - ('low_unique_value_filter', - LowUniqueValueFilter(min_unique_value_count=min_unique_value_count, max_col_zero_frac=max_col_zero_frac)) - ) - if angles is not None and len(angles) > 0: - steps.append(('angle_transform', AngleTransformer(angles=angles))) + STEP_REGISTRY = { + 'duplicate_to_nan': DuplicateValuesToNan, + 'column_selector': ColumnSelector, + 'low_unique_value_filter': LowUniqueValueFilter, + 'angle_transformer': AngleTransformer, + 'counter_diff_transformer': CounterDiffTransformer, + 'simple_imputer': SimpleImputer, + 'standard_scaler': StandardScaler, + 'minmax_scaler': MinMaxScaler, + } + + NAME_ALIASES: 
Dict[str, str] = { + "angle_transform": "angle_transformer", + "counter_diff": "counter_diff_transformer", + "counter_diff_transform": "counter_diff_transformer", + "standardize": "standard_scaler", + "standard": "standard_scaler", + "standardscaler": "standard_scaler", + "minmax": "minmax_scaler", + "imputer": "simple_imputer", + "duplicate_value_to_nan": "duplicate_to_nan", + "duplicate_values_to_nan": "duplicate_to_nan", + } + + def __init__(self, steps: Optional[List[Dict[str, Any]]] = None, **params: Any) -> None: + """A data preprocessing pipeline that allows for configurable steps based on the extended pipeline. + + If both steps and legacy params are provided, steps take precedence and a warning is emitted. + When neither steps nor legacy params are provided, a default "old-style" pipeline is created which removes + features that are constant or just binary and contain more 5% missing values. Afterward, remaining missing + values are imputed with the mean and the features are scaled with the StandardScaler. - # default steps: - steps.append(('imputer', SimpleImputer(strategy=imputer_strategy, - fill_value=imputer_fill_value).set_output(transform='pandas'))) - steps.append(('scaler', scaler)) - - super().__init__(steps=steps) - self.set_output(transform="pandas") # set output of all transformers to pandas + Args: + steps: Optional list of step specifications. Each item is a dict with: + + - name: registered step name (see STEP_REGISTRY). + - enabled: optional bool (default True). + - params: dict of constructor arguments for the step. + - step_name: optional explicit pipeline name (defaults to name). + + **params: Legacy parameters used when steps is None (see _legacy_keys()). + + Notes: + Enforced ordering in steps mode: + + 1) NaN introducing steps first (DuplicateValuesToNan, CounterDiffTransformer), + 2) ColumnSelector (if present), + 3) Other steps + 4) SimpleImputer placed before scaler (always present; mean strategy by default), + 5) Scaler always last (StandardScaler by default). + + Configuration example: + + .. code-block:: text + + train: + data_preprocessor: + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + features_to_exclude: ['exclude_this_feature'] + - name: counter_diff_transformer + step_name: counter_flow + params: + counters: ['flow_total_m3'] + compute_rate: True + fill_first: 'zero' + - name: counter_diff_transformer + step_name: counter_energy + params: + counters: ['energy_total_kwh'] + compute_rate: False + fill_first: 'zero' + reset_strategy: 'rollover', + rollover_values: + 'energy_total_kwh': 100000.0 + """ - def inverse_transform(self, x: pd.DataFrame, **kwargs) -> pd.DataFrame: - """Reverses the scaler and angle transforms applied to the data. - Other transformations are not reversed. + self.steps_spec_: Optional[List[Dict[str, Any]]] = steps + self.params_: Dict[str, Any] = dict(params) + + if steps is not None and len(steps) > 0: + # Warn if legacy params are present alongside steps. + legacy_keys = set(self._legacy_keys()) + legacy_used = [k for k in self.params_.keys() if k in legacy_keys] + if legacy_used: + warnings.warn( + f"DataPreprocessor: 'steps' provided; legacy params are ignored: {legacy_used}", + UserWarning + ) + built_steps = self._build_from_steps_spec() + else: + # Build the default or legacy pipeline. If params is empty, defaults are applied. + built_steps = self._build_from_legacy() + + super().__init__(steps=built_steps) + # Ensure pandas output for supported transformers. 
+ self.set_output(transform="pandas") + + def inverse_transform(self, x: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """Inverse-transform scaler and angles (other transforms are not reversed). Args: x: The transformed data. Returns: - A DataFrame with the inverse transformed data. + DataFrame with inverse scaling and angle back-transformation. """ + # Find scaler by type + scaler_key, _ = self._find_step_by_type((StandardScaler, MinMaxScaler)) + x_ = self.named_steps[scaler_key].inverse_transform(x.copy()) + x_ = pd.DataFrame(data=x_, columns=self.named_steps[scaler_key].get_feature_names_out()) - x_ = self.named_steps['scaler'].inverse_transform(x.copy()) - x_ = pd.DataFrame(data=x_, columns=self.named_steps['scaler'].get_feature_names_out()) - if 'angle_transform' in self.named_steps: - x_ = self.named_steps['angle_transform'].inverse_transform(x_) + # AngleTransformer supports inverse_transform; apply if present. + angle_key, _ = self._find_step_by_type((AngleTransformer,)) + if angle_key is not None: + x_ = self.named_steps[angle_key].inverse_transform(x_) + # Keep original index (important for time series). if isinstance(x, pd.DataFrame): - # ensure the index is kept x_.index = x.index - return x_ # pylint: disable=arguments-renamed - def transform(self, x: pd.DataFrame, **kwargs) -> pd.DataFrame: - """Transforms the input DataFrame using the pipeline. + def transform(self, x: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """Apply pipeline steps to the input DataFrame. Args: x: Input DataFrame. Returns: - a dataframe with the same index as the input dataframe. + DataFrame with the same index as input. """ - x_ = super().transform(X=x.copy()) - return pd.DataFrame(data=x_, - columns=self.get_feature_names_out(), - index=x.index) + return pd.DataFrame(data=x_, columns=self.get_feature_names_out(), index=x.index) # pylint: disable=arguments-renamed - def fit_transform(self, x: pd.DataFrame, **kwargs) -> pd.DataFrame: - """Fit the model and transform with the final estimator. + def fit_transform(self, x: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """Fit and transform in one step. Args: x: Input DataFrame. Returns: - Transformed DataFrame with the same index as the input dataframe. + Transformed DataFrame with the same index as input. 
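As a usage sketch of the steps mode documented above: the import path is assumed from the module location in this diff, the sensor columns are synthetic, and the simple_imputer is deliberately omitted to show that it is inserted automatically before the scaler.

```python
import numpy as np
import pandas as pd

from energy_fault_detector.data_preprocessing.data_preprocessor import DataPreprocessor

rng = np.random.default_rng(0)
data = pd.DataFrame({
    "temp_outdoor": rng.normal(10, 5, 200),
    "power": rng.normal(500, 50, 200),
    "angle1": rng.uniform(0, 360, 200),
})

steps = [
    {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.05}},
    {"name": "angle_transformer", "params": {"angles": ["angle1"]}},
    # No simple_imputer here: a mean SimpleImputer is added automatically before the scaler.
    {"name": "standard_scaler"},
]

preprocessor = DataPreprocessor(steps=steps)
prepared = preprocessor.fit_transform(data)

print(list(preprocessor.named_steps))  # step names after ordering and auto-insertion
print(prepared.head())
```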
""" - super().fit(X=x) return self.transform(x) + + def _find_step_by_type(self, types: Tuple[type, ...]) -> Tuple[Optional[str], Optional[object]]: + """Return the (step name, estimator) of the first step matching any of the given types.""" + for name, est in self.named_steps.items(): + if isinstance(est, types): + return name, est + return None, None + + @staticmethod + def _legacy_keys() -> List[str]: + """Return the list of supported legacy parameter keys.""" + return [ + "angles", + "imputer_strategy", + "imputer_fill_value", + "scale", + "include_column_selector", + "features_to_exclude", + "max_nan_frac_per_col", + "include_low_unique_value_filter", + "min_unique_value_count", + "max_col_zero_frac", + "include_duplicate_value_to_nan", + "value_to_replace", + "n_max_duplicates", + "duplicate_features_to_exclude", + "counter_columns_to_transform", + ] + + def _normalize_name(self, name: str) -> str: + """Normalize a user-provided step name to a canonical registry key.""" + return self.NAME_ALIASES.get(name, name) + + @staticmethod + def _validate_singletons(steps_spec: List[Dict[str, Any]]) -> None: + """Ensure only one instance of selected steps is present (enabled ones).""" + singleton_names = { + "angle_transformer", + "column_selector", + "low_unique_value_filter", + "simple_imputer", + # scaler handled separately (standard_scaler/minmax_scaler) in your code + } + counts: List[Tuple[str, int]] = [] + for name in singleton_names: + n = sum(1 for s in steps_spec if s.get("enabled", True) and s.get("name") == name) + if n > 1: + counts.append((name, n)) + if counts: + raise ValueError( + "Each of these steps may appear at most once: " + f"{[n for n, _ in counts]}. Found duplicates: {counts}" + ) + + def _build_from_legacy(self) -> List: + """Build pipeline from legacy parameters (old behavior + enforced ordering). + + Steps: + 0. (optional) Replace any consecutive duplicate zero-values (or another value) with NaN. This step should be + used if 0 can also represent missing values in the data. + 1. (optional) Normalize counters to differences. + 2. (optional) Column selection: A ColumnSelector object filters out columns/features with too many NaN values. + 3. (optional) Low unique value filter: Remove columns/features with a low number of unique values or + high fraction of zeroes. The high fraction of zeros setting should be used if 0 can also represent missing + values in the data. + 4. (optional) Features containing angles are transformed to sine/cosine values. + 5. Imputation with sklearn's SimpleImputer + 6. Scaling: Apply either sklearn's StandardScaler or MinMaxScaler. + + Use legacy parameters passed via **params. If empty, defaults are used. + - angles: List of angle features for transformation. Default: None (skipped). + - imputer_strategy: Strategy for imputation ('mean', 'median', 'most_frequent', 'constant'). Default: 'mean'. + - imputer_fill_value: Value to fill for imputation (if imputer_strategy=='constant'). + - scale: Type of scaling ('standardize' or 'normalize'). Default: 'standardize'. + - include_column_selector: Whether to include the column selector step. Default: True. + - features_to_exclude: ColumnSelector option, list of features to exclude from processing. + - max_nan_frac_per_col: ColumnSelector option, max fraction of NaN values allowed per column. Default: 0.05. + - include_low_unique_value_filter: Whether to include the low unique value filter step. Default: True. + - min_unique_value_count: Minimum number of unique values for low unique value filter. 
Default: 2. + - max_col_zero_frac: Maximum fraction of zeroes for low unique value filter. Default: 1.0. + - include_duplicate_value_to_nan: Whether to include the duplicate value replacement step. Default: False. + - value_to_replace: Value to replace with NaN (if using duplicate value replacement). Default: None. + - n_max_duplicates: Max number of consecutive duplicates to replace with NaN. Default: 144. + - counter_columns_to_transform: List of counters to normalize to differences. Default: None (skipped). + + Returns: + List of (name, estimator) tuples for the pipeline. + """ + steps: List = [] + params = self.params_ + + # 0. Replace any consecutive duplicate zero-values (or another value) with NaN. + if params.get("include_duplicate_value_to_nan", False): + steps.append( + ( + "value_to_nan", + DuplicateValuesToNan(value_to_replace=params.get("value_to_replace", 0), + n_max_duplicates=params.get("n_max_duplicates", 144), + features_to_exclude=params.get("duplicate_features_to_exclude")), + ) + ) + # 1. (optional) Normalize counters to differences. + counter_cols = params.get("counter_columns_to_transform", []) + if len(counter_cols) > 0: + steps.append( + ( + "counter_diff", + CounterDiffTransformer( + counters=counter_cols, + compute_rate=False, + reset_strategy="zero", + rollover_values=None, + small_negative_tolerance=0.0, + fill_first="nan", + keep_original=False, + gap_policy="mask", + max_gap_seconds=None, + max_gap_factor=3.0, + ), + ) + ) + # 2. ColumnSelector (default enabled) + if params.get("include_column_selector", True): + steps.append( + ( + "column_selector", + ColumnSelector(max_nan_frac_per_col=params.get("max_nan_frac_per_col", 0.05), + features_to_exclude=params.get("features_to_exclude")), + ) + ) + # 3. Optional value filters and angle transforms (before imputer) + if params.get("include_low_unique_value_filter", True): + steps.append( + ( + "low_unique_value_filter", + LowUniqueValueFilter( + min_unique_value_count=params.get("min_unique_value_count", 2), + max_col_zero_frac=params.get("max_col_zero_frac", 1.0), + ), + ) + ) + # 4. Apply optional angle transformations + angles = params.get("angles", []) + if len(angles) > 0: + steps.append(("angle_transform", AngleTransformer(angles=angles))) + # 5. Impute missing values with SimpleImputer + steps.append( + ( + "simple_imputer", + SimpleImputer( + strategy=params.get("imputer_strategy", "mean"), + fill_value=params.get("imputer_fill_value", None), + ).set_output(transform="pandas"), + ) + ) + # 6. Scale data + scale = params.get("scale", "standardize") + scaler = ( + StandardScaler(with_mean=True, with_std=True) + if scale in ["standardize", "standard", "standardscaler"] + else MinMaxScaler(feature_range=(0, 1)) + ) + steps.append(("scaler", scaler)) + return steps + + def _build_from_steps_spec(self) -> List: + """Build pipeline from steps specification (preferred mode) with enforced ordering. + + Each step has the following keys: + - name: registered step name (see STEP_REGISTRY). + - enabled: optional, defaults to True. + - params: dict of constructor parameters for the step. + - step_name: optional explicit pipeline key (defaults to name). + + Returns: + List of (name, estimator) tuples for the pipeline. + + Raises: + ValueError: If a step lacks 'name' or references an unknown step. + """ + self._validate_step_spec_keys(self.steps_spec_) + # Filter disabled steps first to simplify ordering. 
+ enabled_spec = [s for s in self.steps_spec_ if s.get("enabled", True)] + self._validate_singletons(enabled_spec) + ordered_spec = self._order_steps_spec(enabled_spec) + # Assign unique step names for duplicates or missing step_name + ordered_spec = self._assign_unique_step_names(ordered_spec) + + steps: List = [] + scaler_defined = False + scaler_names = {"standard_scaler", "minmax_scaler"} + scaler_idx = None + for step_idx, spec in enumerate(ordered_spec): + name = spec.get("name") + if name is None: + raise ValueError("Each step spec requires a 'name'.") + if name in scaler_names: + scaler_defined = True + scaler_idx = step_idx + params = spec.get("params", {}) + cls = self.STEP_REGISTRY.get(name) + if cls is None: + raise ValueError(f"Unknown step name '{name}'. Register it in STEP_REGISTRY.") + estimator = cls(**params) + step_name = spec.get("step_name", name) + steps.append((step_name, estimator)) + + # Ensure an Imputer exists and is placed before the scaler. + if not any(n == "simple_imputer" for n, _ in steps): + default_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas") + # Insert before scaler if scaler already present; else append. + if scaler_idx is not None: + steps.insert(scaler_idx, ("simple_imputer", default_imputer)) + else: + steps.append(("simple_imputer", default_imputer)) + + # Ensure a scaler exists and is last. If missing, add StandardScaler by default. + if not scaler_defined: + steps.append(("scaler", StandardScaler(with_mean=True, with_std=True))) + + return steps + + def _order_steps_spec(self, steps_spec: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Normalize ordering rules for a steps specification. + + Rules: + - NaN introducing steps first (DuplicateValuesToNan and CounterDiffTransformer) + - ColumnSelector (if present). + - Other steps + - Any imputer placed at the end, before scaler. If no imputer was defined, the SimpleImputer with imputation + strategy 'mean' is added. + - Scaler last (if present). If no scaler is added, the StandardScaler with default values is added. + + Args: + steps_spec: List of step dictionaries. + + Returns: + Reordered list of step dictionaries. + """ + # Normalize names to canonical keys for grouping + for s in steps_spec: + s["name"] = self._normalize_name(s.get("name")) + + # Separate groups by type for easy reassembly. + column_selector = [s for s in steps_spec if s.get("name") == "column_selector"] + low_unique_value_filter = [s for s in steps_spec if s.get("name") == "low_unique_value_filter"] + duplicates = [s for s in steps_spec if s.get("name") == "duplicate_to_nan"] + counter = [s for s in steps_spec if s.get("name") == "counter_diff_transformer"] + imputer = [s for s in steps_spec if s.get("name") == "simple_imputer"] + scaler_names = {"standard_scaler", "minmax_scaler"} + scalers = [s for s in steps_spec if s.get("name") in scaler_names] + if len(scalers) > 1: + raise ValueError("Only one scaler can be used, two found in the steps specification: ." + f"{scalers}") + others = [ + s for s in steps_spec + if s.get("name") not in { + "column_selector", "duplicate_to_nan", "counter_diff_transformer", "simple_imputer", + "low_unique_value_filter", + } | scaler_names + ] + + # Keep 'others' in their original relative order. 
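The ordering and naming rules are easier to see with a deliberately shuffled spec. The sketch below only constructs the pipeline and prints the resulting step names; it assumes the constructor and registry introduced in this diff, and the counter column names are placeholders.

```python
from energy_fault_detector.data_preprocessing.data_preprocessor import DataPreprocessor

# Spec given in a "wrong" order, with two counter steps and no imputer.
steps = [
    {"name": "minmax_scaler"},
    {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}},
    {"name": "counter_diff_transformer", "params": {"counters": ["energy_total_kwh"]}},
    {"name": "counter_diff_transformer", "params": {"counters": ["flow_total_m3"]}},
]

preprocessor = DataPreprocessor(steps=steps)

# Expected per the rules above: the two counter steps first (deduplicated to
# counter_diff_transformer_1 / _2), then column_selector, then the auto-added
# simple_imputer, and minmax_scaler last.
print([name for name, _ in preprocessor.steps])
```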
+ ordered = [] + # can add NaN avalues or add new features that may be constant + ordered.extend(duplicates) + ordered.extend(counter) + # drop columns based on the values (NaNs, no variance) + ordered.extend(column_selector) + ordered.extend(low_unique_value_filter) + # other transformations + ordered.extend(others) + # end with imputation and scaling + ordered.extend(imputer) # may be empty; scaler gets default added later if missing + ordered.extend(scalers) # may be empty; scaler gets default added later if missing + return ordered + + @staticmethod + def _assign_unique_step_names(specs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Assign unique pipeline step names. If step_name is provided, use it; if it collides, append _2, _3, ... + If step_name is not provided, use the 'name' key. If this key occurs multiple times, assign name_1, name_2, ... + + This method mutates specs in place and also returns it. + + Returns: + Altered step specifications. + """ + total_counts = Counter(s["name"] for s in specs) + used: set[str] = set() + per_base_index = defaultdict(int) + + for s in specs: + explicit = s.get("step_name") + if explicit: + base = explicit + candidate = base + i = 1 + while candidate in used: + i += 1 + candidate = f"{base}_{i}" + s["step_name"] = candidate + used.add(candidate) + continue + + base = s["name"] + if total_counts[base] == 1 and base not in used: + candidate = base + else: + per_base_index[base] += 1 + candidate = f"{base}_{per_base_index[base]}" + while candidate in used: + per_base_index[base] += 1 + candidate = f"{base}_{per_base_index[base]}" + + s["step_name"] = candidate + used.add(candidate) + + return specs + + @staticmethod + def _validate_step_spec_keys(steps_spec: List[Dict[str, Any]]) -> None: + """Validate that each step spec uses only allowed keys and includes 'name'. + + Allowed keys: {'name', 'enabled', 'params', 'step_name'}. + + Args: + steps_spec: Raw steps specification provided by the user. + + Raises: + ValueError: If a step is missing 'name' or contains unknown keys. + """ + allowed = {"name", "enabled", "params", "step_name"} + + for i, spec in enumerate(steps_spec): + if "name" not in spec: + raise ValueError(f"Step #{i} is missing required key 'name'.") + unknown = set(spec.keys()) - allowed + if unknown: + raise ValueError( + f"Step #{i} has unknown keys: {sorted(unknown)}. " + f"Allowed keys are: {sorted(allowed)}." + ) diff --git a/energy_fault_detector/data_preprocessing/low_unique_value_filter.py b/energy_fault_detector/data_preprocessing/low_unique_value_filter.py index b910047..3502634 100644 --- a/energy_fault_detector/data_preprocessing/low_unique_value_filter.py +++ b/energy_fault_detector/data_preprocessing/low_unique_value_filter.py @@ -14,9 +14,9 @@ class LowUniqueValueFilter(DataTransformer): exceeds `max_col_zero_frac`. Args: - min_unique_value_count (int): Minimum number of unique values a feature should have. Defaults to 2. - If set to 2, only constant features are dropped. - max_col_zero_frac (float): Maximum fraction of zeroes a column may contain. + min_unique_value_count (int): Minimum number of unique values a feature should have. Default: 2. + If set to 2, only constant features are dropped. + max_col_zero_frac (float): Maximum fraction of zeroes a column may contain. Default: 1.0 Attributes: feature_names_in_ (list): List of column names in input. @@ -25,7 +25,7 @@ class LowUniqueValueFilter(DataTransformer): columns_dropped_ (list): List of columns that were dropped during filtering. 
""" - def __init__(self, min_unique_value_count: int = 1, max_col_zero_frac: float = 0.9): + def __init__(self, min_unique_value_count: int = 2, max_col_zero_frac: float = 1.0): super().__init__() self.min_unique_value_count: int = min_unique_value_count diff --git a/energy_fault_detector/evaluation/care2compare.py b/energy_fault_detector/evaluation/care2compare.py index 11b05cc..897a9ac 100644 --- a/energy_fault_detector/evaluation/care2compare.py +++ b/energy_fault_detector/evaluation/care2compare.py @@ -23,19 +23,20 @@ class Care2CompareDataset: By default, only the averages are read. See statistics argument of the data loading methods. - Methods: - get_event_info: Returns event info for a given event ID - iter_datasets: Reads datasets and yields the resulting training and test DataFrames while iterating over - event IDs. - format_event_dataset: Extracts normal_index from a loaded dataset and returns normal_index and sensor_data. - iter_formatted_datasets: Reads datasets, extracts normal_index and yields the resulting train and test - DataFrames as well as the normal_indexes while iterating over event IDs. - load_event_dataset: Reads dataset specified by event_id and returns training and test data. - load_and_format_event_dataset: Reads dataset specified by event_id and returns training and test data as well as - the corresponding normal indexes. - iter_train_datasets_per_asset: Reads datasets and yields the resulting training DataFrames while - iterating over asset IDs and aggregating event IDs for the same assets. - update_c2c_config: Updates a specified FaultDetector config based on provided feature descriptions. + Method overview: + + - get_event_info: Returns event info for a given event ID + - iter_datasets: Reads datasets and yields the resulting training and test DataFrames while iterating over + event IDs. + - format_event_dataset: Extracts normal_index from a loaded dataset and returns normal_index and sensor_data. + - iter_formatted_datasets: Reads datasets, extracts normal_index and yields the resulting train and test + DataFrames as well as the normal_indexes while iterating over event IDs. + - load_event_dataset: Reads dataset specified by event_id and returns training and test data. + - load_and_format_event_dataset: Reads dataset specified by event_id and returns training and test data as well as + the corresponding normal indexes. + - iter_train_datasets_per_asset: Reads datasets and yields the resulting training DataFrames while + iterating over asset IDs and aggregating event IDs for the same assets. + - update_c2c_config: Updates a specified FaultDetector config based on provided feature descriptions. Args: path (Path): The directory path where the dataset is located. 
@@ -127,12 +128,14 @@ def iter_formatted_datasets(self, wind_farm: str = None, test_only: bool = False statistics=statistics, index_column=index_column, use_readable_columns=use_readable_columns): if not test_only: - train_sensor_data, train_normal_index = self.format_event_dataset(tup[0]) - test_sensor_data, test_normal_index = self.format_event_dataset(tup[1]) - yield train_sensor_data, train_normal_index, test_sensor_data, test_normal_index, tup[2] + (x_train, x_test), event_id = tup + train_sensor_data, train_normal_index = self.format_event_dataset(x_train) + test_sensor_data, test_normal_index = self.format_event_dataset(x_test) + yield train_sensor_data, train_normal_index, test_sensor_data, test_normal_index, event_id else: - test_sensor_data, test_normal_index = self.format_event_dataset(tup[0]) - yield test_sensor_data, test_normal_index, tup[1] + x_test, event_id = tup + test_sensor_data, test_normal_index = self.format_event_dataset(x_test) + yield test_sensor_data, test_normal_index, event_id def load_event_dataset(self, event_id: int, test_only: bool = False, statistics: List[str] = None, index_column: str = 'id', use_readable_columns: bool = True @@ -179,9 +182,10 @@ def load_and_format_event_dataset(self, event_id: int, statistics: List[str] = N Returns: Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: - If test_only=False, yields a tuple of train_sensor_data, train_status, - test_sensor_data and test_status. - If test_only=True, yields a tuple of test_sensor_data and test_status. + + - If test_only=False, yields a tuple of train_sensor_data, train_status, test_sensor_data and test_status. + - If test_only=True, yields a tuple of test_sensor_data and test_status. + """ tup = self.load_event_dataset(event_id=event_id, test_only=test_only, statistics=statistics, index_column=index_column, use_readable_columns=use_readable_columns) @@ -265,6 +269,24 @@ def get_columns(feature_description_selection: pd.DataFrame) -> List[str]: columns.append(f'{row.sensor_name}_{stat}') return columns + def merge_unique(base: List[str], to_add: List[str]) -> List[str]: + """Merge two lists preserving order and removing duplicates.""" + seen = set() + out: List[str] = [] + for v in (base or []) + (to_add or []): + if v not in seen: + seen.add(v) + out.append(v) + return out + + def find_step(names: List[str]) -> dict | None: + """Find step by name specification.""" + for s in steps: + name = s.get('name') + if name in names: + return s + return None + feature_descriptions = self.feature_descriptions[wind_farm] angles = feature_descriptions.loc[feature_descriptions['is_angle']] to_exclude = feature_descriptions.loc[feature_descriptions['is_counter']] @@ -272,12 +294,28 @@ def get_columns(feature_description_selection: pd.DataFrame) -> List[str]: angle_columns = get_columns(angles) to_exclude_columns = get_columns(to_exclude) - config['train']['data_preprocessor']['params']['angles'] = ( - config['train']['data_preprocessor']['params'].get('angles', []) + angle_columns - ) - config['train']['data_preprocessor']['params']['features_to_exclude'] = ( - config['train']['data_preprocessor']['params'].get('features_to_exclude', []) + to_exclude_columns - ) + # old: + dp = config['train'].setdefault('data_preprocessor', {}) + params = dp.get('params') + steps = dp.get('steps') + if params: + params['angles'] = merge_unique(params.get('angles', []), angle_columns) + params['features_to_exclude'] = merge_unique(params.get('features_to_exclude', []), to_exclude_columns) + # new + else: + angle_step 
= find_step(['angle_transformer', 'angle_transform']) + if angle_step is None: + steps.append({'name': 'angle_transformer', 'params': {'angles': angle_columns}}) + else: + angle_params = angle_step.setdefault('params', {}) + angle_params['angles'] = merge_unique(angle_params.get('angles', []), angle_columns) + colsel_step = find_step(['column_selector']) + if colsel_step is None: + steps.append({'name': 'column_selector', 'params': {'features_to_exclude': to_exclude_columns}}) + else: + colsel_params = colsel_step.setdefault('params', {}) + colsel_params['features_to_exclude'] = merge_unique( + colsel_params.get('features_to_exclude', []), to_exclude_columns) config.update_config(config.config_dict) diff --git a/energy_fault_detector/evaluation/care_score.py b/energy_fault_detector/evaluation/care_score.py index 2357e5e..5d77172 100644 --- a/energy_fault_detector/evaluation/care_score.py +++ b/energy_fault_detector/evaluation/care_score.py @@ -27,7 +27,7 @@ class CAREScore: The CARE score combines Coverage, Accuracy, Reliability and Earliness to evaluate early fault-detection performance (see CARE to Compare: A Real-World Benchmark Dataset for Early Fault Detection in Wind Turbine Data, - https://doi.org/10.3390/data9120138). The goal of the CARE-Score is to evaluate the ability of a given model to + https://doi.org/10.3390/data9120138). The goal of the CARE-Score is to evaluate the ability of a given model to separate `normal behavior` from `actionable anomalies` (see glossary for definitions), that lead to a fault or indicate a fault. @@ -167,15 +167,17 @@ def evaluate_event(self, event_start: Union[int, pd.Timestamp], event_end: Union Returns: dict: Dictionary with computed metrics, e.g.: - { - 'event_id': int, - 'event_label': str, - 'weighted_score': float, - 'max_criticality': float, - 'f_beta_score': float or NaN, - 'accuracy': float, - 'tp': int, 'fp': int, 'tn': int, 'fn': int - } + .. code-block:: python + + { + 'event_id': int, + 'event_label': str, + 'weighted_score': float, + 'max_criticality': float, + 'f_beta_score': float or NaN, + 'accuracy': float, + 'tp': int, 'fp': int, 'tn': int, 'fn': int + } Raises: ValueError: If event_label is invalid, evaluate_until_event_end has an unknown value, @@ -184,7 +186,7 @@ def evaluate_event(self, event_start: Union[int, pd.Timestamp], event_end: Union Notes: - The function sorts inputs by index to ensure alignment. - If normal_index is provided, this also influences the criticality calculation: criticality does not change - if the expected behaviour is not normal. + if the expected behaviour is not normal. - If predicted_anomalies_event is empty, a ValueError is raised. - Use evaluate_until_event_end to control whether post-event predictions are considered. """ @@ -265,15 +267,17 @@ def get_final_score(self, event_selection: Optional[List[int]] = None, criticali score for anomaly events), average Accuracy (for normal events) and Reliability (eventwise F-score) using the configured weights. - If the average accuracy over all normal events < 0.5, CARE-score = average accuracy over all normal events - (worse than random guessing). - If no anomalies were detected, the CARE-score = 0. - Else, the CARE-score is calculated as: + - If the average accuracy over all normal events < 0.5, CARE-score = average accuracy over all normal events + (worse than random guessing). + - If no anomalies were detected, the CARE-score = 0. 
+ - Else, the CARE-score is calculated as: - ( (average F-score over all anomaly events) * coverage_w - + (average weighted score over all anomaly events) * weighted_score_w - + (average accuracy over all normal events) * accuracy_w - + event wise F-score * eventwise_f_score_w ) / sum_of_weights + .. code-block:: text + + ( (average F-score over all anomaly events) * coverage_w + + (average weighted score over all anomaly events) * weighted_score_w + + (average accuracy over all normal events) * accuracy_w + + event wise F-score * eventwise_f_score_w ) / sum_of_weights where `sum_of_weights` = coverage_w + weighted_score_w + accuracy_w + eventwise_f_score_w. diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 2e1432b..30891d5 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -52,11 +52,6 @@ def __init__(self, config: Optional[Config] = None, model_directory: str = 'faul ) super().__init__(config=config, model_directory=model_directory) - if config is None: - logger.debug('No configuration set. Load models and config from path with the `FaultDetector.load_models`' - ' method.') - else: - self._init_models() def preprocess_train_data(self, sensor_data: pd.DataFrame, normal_index: pd.Series, fit_preprocessor: bool = True ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]: diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index b2a0bf5..65180ee 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -22,9 +22,10 @@ class Arcana: """Anomaly root cause analysis. Tries to find which of the sensors/inputs caused - the reconstruction error of an autoencoder model. + the reconstruction error of an autoencoder model. Implementation details are found in + https://doi.org/10.1016/j.egyai.2021.100065. 
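Restated as plain Python, the CARE aggregation above looks roughly like the sketch below; the default weights and the example inputs are made-up numbers, not the CAREScore defaults.

```python
def care_final_score(avg_fbeta_anomaly: float,
                     avg_weighted_score_anomaly: float,
                     avg_accuracy_normal: float,
                     eventwise_f_score: float,
                     any_anomaly_detected: bool,
                     coverage_w: float = 1.0,
                     weighted_score_w: float = 1.0,
                     accuracy_w: float = 1.0,
                     eventwise_f_score_w: float = 1.0) -> float:
    """Illustrative restatement of the aggregation rule in CAREScore.get_final_score."""
    if avg_accuracy_normal < 0.5:
        # Worse than random guessing on normal events: the score is the accuracy itself.
        return avg_accuracy_normal
    if not any_anomaly_detected:
        return 0.0
    sum_of_weights = coverage_w + weighted_score_w + accuracy_w + eventwise_f_score_w
    return (avg_fbeta_anomaly * coverage_w
            + avg_weighted_score_anomaly * weighted_score_w
            + avg_accuracy_normal * accuracy_w
            + eventwise_f_score * eventwise_f_score_w) / sum_of_weights


# Example with made-up per-event averages.
print(care_final_score(0.7, 0.6, 0.9, 0.8, any_anomaly_detected=True))
```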
- This is done by minimizing the loss function: + This method minimizes the loss function: '(1 - alpha) L2(X_corr - autoencoder(X_corr)) + alpha * L1(X_corr - X_obs)' @@ -103,11 +104,12 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi loss 2 for each 50th iteration) track_bias: If True bias will be returned as a list arcana biases each 50th iteration) - Returns: - x_bias: pandas DataFrame - tracked_losses: A dataframe containing the combined loss, loss 1 (reconstruction) and - loss 2 (regularization) for each 50th iteration (if track_losses is False this list is empty) - tracked_bias: A List of dataframes representing x_bias + Returns: A tuple with the following three objects + + - x_bias: pandas DataFrame + - tracked_losses: A dataframe containing the combined loss, loss 1 (reconstruction) and + loss 2 (regularization) for each 50th iteration (if track_losses is False this list is empty) + - tracked_bias: A List of dataframes representing x_bias """ conditions = None diff --git a/energy_fault_detector/utils/visualisation.py b/energy_fault_detector/utils/visualisation.py index 6609b52..74e3003 100644 --- a/energy_fault_detector/utils/visualisation.py +++ b/energy_fault_detector/utils/visualisation.py @@ -13,6 +13,8 @@ from energy_fault_detector.fault_detector import FaultDetector from energy_fault_detector.utils.analysis import calculate_criticality +MAX_PLOTS = 20 + def plot_learning_curve(model: Union[Autoencoder, FaultDetector], ax: plt.Axes = None, label: str = '', **subplot_kwargs) -> Tuple[plt.Figure, plt.Axes]: @@ -71,7 +73,7 @@ def plot_reconstruction(data: pd.DataFrame, reconstruction: pd.DataFrame, featur missing = set(to_plot) - set(data.columns) raise ValueError(f'The columns {missing} are not present in the dataset.') - if len(to_plot) > 30: # You can adjust this threshold + if len(to_plot) > MAX_PLOTS: warnings.warn(f"You are attempting to plot a large number of features ({len(to_plot)}). " "This may result in a cluttered figure. Consider selecting fewer features to plot.") @@ -93,6 +95,67 @@ def plot_reconstruction(data: pd.DataFrame, reconstruction: pd.DataFrame, featur return fig, ax +def plot_reconstruction_with_model(model: FaultDetector, data: pd.DataFrame, + features_to_plot: Optional[List[str]] = None, + height_multiplier: float = 1.5, + original_scale: bool = True) -> Tuple[plt.Figure, plt.Axes]: + """Plots the data and its reconstruction using the provided model. Similar to plot_reconstruction, but uses the + 'model.predict' method to get the reconstruction. Counter values are plottet as diffs or rates with their + reconstruction. + + Args: + model (FaultDetector): Fitted model with data_preprocessor and autoencoder. + data (pd.DataFrame): Raw input data. + features_to_plot (Optional[List[str]], optional): Columns to plot. If None, uses reconstruction columns. + height_multiplier (float, optional): Vertical scaling for the figure. Defaults to 1.5. + original_scale (bool, optional): If True, y-limits are based on the observed plot-series + (min-std, max+std). Defaults to True. + + Returns: + Tuple[plt.Figure, plt.Axes]: The figure and axes. 
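To make the minimized Arcana objective above concrete, here is a small numpy sketch of the loss with a toy stand-in for the trained autoencoder. It assumes the corrected input is the observation plus the optimized bias and uses means for the L2/L1 terms; both are illustrative choices, not the exact Arcana implementation.

```python
import numpy as np

rng = np.random.default_rng(0)
x_obs = rng.normal(size=(32, 5))   # observed (potentially anomalous) samples
bias = np.zeros_like(x_obs)        # the bias that Arcana optimizes
alpha = 0.5                        # trade-off between the two loss terms


def toy_autoencoder(x: np.ndarray) -> np.ndarray:
    # Stand-in for the trained autoencoder's reconstruction.
    return 0.9 * x


def arcana_loss(bias: np.ndarray) -> float:
    x_corr = x_obs + bias
    recon_term = np.mean((x_corr - toy_autoencoder(x_corr)) ** 2)  # L2 reconstruction part
    deviation_term = np.mean(np.abs(x_corr - x_obs))               # L1 deviation from the observation
    return (1 - alpha) * recon_term + alpha * deviation_term


print(arcana_loss(bias))
```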
+ """ + + # Get model predictions + predictions = model.predict(sensor_data=data) + reconstruction = predictions.reconstruction + # model data preprocessor + dp = model.data_preprocessor + + # Discover counter mappings (original -> derived) from CounterDiffTransformer steps + from energy_fault_detector.data_preprocessing.counter_diff_transformer import CounterDiffTransformer + counter_map = {} # original counter -> derived column (e.g., energy_total_kwh -> energy_total_kwh_diff) + for name, est in dp.named_steps.items(): + if isinstance(est, CounterDiffTransformer): + # Need fitted attributes + try: + counters = getattr(est, "counters_", []) + suffix = getattr(est, "output_suffix_", "_diff") + except Exception: + counters = [] + suffix = "_diff" + for c in counters: + counter_map[c] = f"{c}{suffix}" + + # Determine features to plot + to_plot = list(reconstruction.columns) if features_to_plot is None else features_to_plot + + # If any counter are in features to plot, transform input data, so we can plot the diffs with their reconstructions + if any(col in counter_map.keys() for col in to_plot): + # replace counters with their _diff/_rate name + to_plot = [col if col not in counter_map.keys() else counter_map[col] for col in to_plot] + dataset_to_plot = dp.inverse_transform(dp.transform(data.copy()))[to_plot] + else: + dataset_to_plot = data[to_plot].copy() + + return plot_reconstruction( + dataset_to_plot, + reconstruction, + features_to_plot=to_plot, + height_multiplier=height_multiplier, + original_scale=original_scale + ) + + def plot_score_with_threshold(model: FaultDetector, data: pd.DataFrame, normal_index: pd.Series = None, ax: plt.Axes = None, figsize: Tuple[float, float] = (8, 3), show_predicted_anomaly: bool = False, show_threshold: bool = True, diff --git a/notebooks/Example - Hyperparameter Optimization.ipynb b/notebooks/Example - Hyperparameter Optimization.ipynb index 37abecf..344db94 100644 --- a/notebooks/Example - Hyperparameter Optimization.ipynb +++ b/notebooks/Example - Hyperparameter Optimization.ipynb @@ -10,119 +10,58 @@ "1. Optimizing the Autoencoder reconstruction using the MSE\n", "2. Optimizing the FaultDetector classification performance using the Fbeta score\n", "3. 
Optimizing the FaultDetector classification performance using the CARE-score\n", - "The optimization is done using the [CARE to Compare dataset](https://doi.org/10.5281/zenodo.14958989)\n", "\n", - "For this example you need to install Optuna, which is not contained in the standard requirements of the framework\n", - "Optuna [docs](https://optuna.readthedocs.io/en/stable/index.html) and [tutorials](https://optuna.readthedocs.io/en/stable/tutorial/index.html)\n", - "\n", - "-> Install additional requirements for this example using 'pip notebooks/example_requirements.txt'" + "The optimization is done using the [CARE to Compare dataset](https://doi.org/10.5281/zenodo.14958989)" ], - "id": "552412b97335ea1c" + "id": "acc177b6ece47b21" }, { + "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "from typing import List\n", + "from copy import deepcopy\n", "\n", "import optuna as op\n", - "import pandas as pd\n", "import numpy as np\n", "from sklearn.metrics import fbeta_score\n", "\n", "from energy_fault_detector import FaultDetector, Config\n", "from energy_fault_detector.evaluation import CAREScore, Care2CompareDataset" ], - "metadata": { - "collapsed": false - }, - "id": "217e454f48a9879b", - "outputs": [], - "execution_count": null - }, - { - "cell_type": "code", - "source": [ - "data_path = './Care_To_Compare'" - ], - "metadata": { - "collapsed": false - }, - "id": "1d2e20520349e34e", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "def update_config(config: Config, feature_descriptions: pd.DataFrame) -> None:\n", - " \"\"\"Update config based on provided feature descriptions.\"\"\"\n", - "\n", - " def get_columns(feature_description_selection: pd.DataFrame) -> List[str]:\n", - " col_suffix = {\n", - " 'average': 'avg',\n", - " 'minimum': 'min',\n", - " 'maximum': 'max',\n", - " 'std_dev': 'std'\n", - " }\n", - " columns = []\n", - " for _, row in feature_description_selection.iterrows():\n", - " if row.statistics_type == 'average':\n", - " # in this case the column can be either sensor_i or sensor_i_avg, so we add both\n", - " columns.append(row.sensor_name)\n", - " for stat in row.statistics_type.split(','):\n", - " columns.append(f'{row.sensor_name}_{col_suffix[stat]}')\n", - " return columns\n", - "\n", - " angles = feature_descriptions.loc[feature_descriptions['is_angle']]\n", - " to_exclude = feature_descriptions.loc[feature_descriptions['is_counter']]\n", - "\n", - " angle_columns = get_columns(angles)\n", - " to_exclude_columns = get_columns(to_exclude)\n", - " \n", - " config['train']['data_preprocessor']['params']['angles'] = (\n", - " config['train']['data_preprocessor']['params'].get('angles', []) + angle_columns\n", - " )\n", - " config['train']['data_preprocessor']['params']['features_to_exclude'] = (\n", - " config['train']['data_preprocessor']['params'].get('features_to_exclude', []) + to_exclude_columns\n", - " )\n", - " \n", - " config.update_config(config.config_dict)\n" - ], - "id": "723566d33fa10db6", - "outputs": [], - "execution_count": null + "id": "f498cdd89c2cf406" }, { "metadata": {}, "cell_type": "code", - "source": [ - "c2c = Care2CompareDataset(data_path)" - ], - "id": "ec8c21f059bba3c8", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "data_path = './Care_To_Compare'", + "id": "641b5084847234b8" }, { "metadata": {}, "cell_type": "markdown", - "source": [ - "### Optimize autoencoder reconstruction" - ], - "id": 
"99fcaa7054047666" + "source": "## Optimize autoencoder reconstruction", + "id": "d5ac0225fa5e566b" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "model_config = Config('c2c_configs/windfarm_C.yaml') # starting point\n", - "\n", - "# our test set\n", + "# Our test set (a specific event from the CARE2Compare dataset)\n", "c2c = Care2CompareDataset(data_path)\n", "event_id = 47\n", - "train_data, normal_index, _, _ = c2c.get_formatted_event_dataset(event_id=event_id, index_column='time_stamp')\n", + "train_data, normal_index, _, _ = c2c.load_and_format_event_dataset(event_id=event_id, index_column='time_stamp')\n", "\n", - "# speed up for testing\n", + "# Model configuration starting point\n", + "model_config = Config('c2c_configs/windfarm_C.yaml')\n", + "c2c.update_c2c_config(model_config, 'C')\n", + "\n", + "# speed up for testing (select a small part of the dataset)\n", "N = 10000\n", "normal_index = normal_index.iloc[:N]\n", "train_data = train_data.iloc[:N]\n", @@ -140,36 +79,23 @@ " Returns:\n", " MSE of the reconstruction.\n", " \"\"\"\n", + " # Use a fresh config dict per trial\n", + " cfg = deepcopy(model_config.config_dict)\n", "\n", - " autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", + " autoencoder_params = cfg['train']['autoencoder']['params']\n", "\n", " # sample new parameters\n", - " autoencoder_params['batch_size'] = int(trial.suggest_categorical(\n", - " name='batch_size', choices=[32, 64, 128]\n", - " ))\n", - " autoencoder_params['learning_rate'] = trial.suggest_float(\n", - " name='learning_rate', low=1e-5, high=0.01, log=True\n", - " )\n", - " autoencoder_params['decay_rate'] = trial.suggest_float(\n", - " name='decay_rate', low=0.8, high=0.99\n", - " )\n", + " autoencoder_params['batch_size'] = int(trial.suggest_categorical(name='batch_size', choices=[32, 64, 128]))\n", + " autoencoder_params['learning_rate'] = trial.suggest_float(name='learning_rate', low=1e-5, high=0.01, log=True)\n", + " autoencoder_params['decay_rate'] = trial.suggest_float(name='decay_rate', low=0.8, high=0.99)\n", "\n", " # architecture\n", - " autoencoder_params['layers'][0] = trial.suggest_int(\n", - " name='layers_0', low=100, high=400\n", - " )\n", - " autoencoder_params['layers'][1] = trial.suggest_int(\n", - " name='layers_1', low=50, high=100\n", - " )\n", - " autoencoder_params['code_size'] = trial.suggest_int(\n", - " name='code_size', low=10, high=30\n", - " )\n", - "\n", - " # update the configuration\n", - " model_config.update_config(model_config.config_dict)\n", + " autoencoder_params['layers'][0] = trial.suggest_int(name='layers_0', low=100, high=400)\n", + " autoencoder_params['layers'][1] = trial.suggest_int(name='layers_1', low=50, high=100)\n", + " autoencoder_params['code_size'] = trial.suggest_int(name='code_size', low=10, high=30)\n", "\n", " # create a new model using our new configuration and train the model\n", - " model = FaultDetector(model_config)\n", + " model = FaultDetector(Config(config_dict=cfg))\n", " # For autoencoder optimization, we do not need to fit a threshold\n", " training_result = model.fit(train_data, normal_index=normal_index, fit_autoencoder_only=True, save_model=False)\n", "\n", @@ -179,81 +105,69 @@ "\n", " return score" ], - "id": "eadbdf08b64a43e4", - "outputs": [], - "execution_count": null + "id": "6cc9b7bee0de0a25" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "study = 
op.create_study(sampler=op.samplers.TPESampler(),\n", - " study_name='autoencoder_optimization',\n", - " direction='minimize')\n", + "study = op.create_study(sampler=op.samplers.TPESampler(), study_name='autoencoder_optimization', direction='minimize')\n", "\n", "# if we want to ensure that the first trial is done with the hyperparameters of the configuration, we need to enqueue a trial:\n", "autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", "study.enqueue_trial(params={\n", " 'batch_size': autoencoder_params['batch_size'],\n", " 'learning_rate': autoencoder_params['learning_rate'],\n", - " 'decay_rate': autoencoder_params['decay_rate'],\n", " 'layers_0': autoencoder_params['layers'][0],\n", " 'layers_1': autoencoder_params['layers'][1],\n", " 'code_size': autoencoder_params['code_size'],\n", - "})" - ], - "id": "ff13f4aeb3d8b6d0", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ + "})\n", + "\n", + "# Run optimization for 5 trials\n", "study.optimize(reconstruction_mse, n_trials=5)" ], - "id": "edfa14b97d00a0b9", - "outputs": [], - "execution_count": null + "id": "1036f11c1b97e12" }, { "metadata": {}, "cell_type": "code", - "source": [ - "study.best_params" - ], - "id": "91747046042e809a", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "study.best_params", + "id": "7d452e30dd1a1e0e" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ "# analyze results\n", "study.trials_dataframe()" ], - "id": "4cf3e2b09291ba57", - "outputs": [], - "execution_count": null + "id": "5f061454dc9f7980" }, { "metadata": {}, "cell_type": "markdown", - "source": "# Optimize fault detection model", - "id": "cb3c07b2be2cb5ed" + "source": "## Optimize fault detection model - F-beta score", + "id": "80664d61f648132f" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ + "# Our test set (a specific event from the CARE2Compare dataset)\n", "c2c = Care2CompareDataset(data_path)\n", - "\n", "event_id = 47\n", - "event_info = c2c.event_info_all[c2c.event_info_all['event_id'] == event_id].iloc[0]\n", - "\n", - "train_data, normal_index, test_data, test_normal_index = c2c.get_formatted_event_dataset(event_id=event_id, index_column='time_stamp')\n", + "train_data, normal_index, test_data, test_normal_index = c2c.load_and_format_event_dataset(event_id=event_id, index_column='time_stamp')\n", "\n", + "# Create a ground truth for this event\n", + "event_info = c2c.event_info_all[c2c.event_info_all['event_id'] == event_id].iloc[0]\n", "ground_truth = CAREScore.create_ground_truth(\n", " event_label=event_info['event_label'],\n", " event_start=event_info['event_start'],\n", @@ -261,21 +175,42 @@ " normal_index=test_normal_index\n", ")" ], - "id": "fa98c752d06b0e35", - "outputs": [], - "execution_count": null + "id": "3d4b96de7af51a2a" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "model_config = Config('c2c_configs/windfarm_C.yaml') # starting point\n", + "# Model configuration starting point\n", + "model_config = Config('c2c_configs/windfarm_C.yaml')\n", + "c2c.update_c2c_config(model_config, 'C')\n", "\n", - "# speed up for testing\n", + "# speed up for testing (select a small part of the dataset)\n", "N = 10000\n", "normal_index = normal_index.iloc[:N]\n", "train_data = train_data.iloc[:N]\n", "\n", + "# helper function to (re)set the scaling step of the 
DataPreprocessor\n", + "def set_scaler_step(cfg: dict, choice: str) -> dict:\n", + " \"\"\"Update cfg to use the chosen scaler.\"\"\"\n", + " dp = cfg['train'].setdefault('data_preprocessor', {})\n", + " steps = dp.get('steps')\n", + "\n", + " # Remove any existing scaler step(s)\n", + " scaler_names = {'standard_scaler', 'minmax_scaler'}\n", + " steps = [s for s in steps if s.get('name') not in scaler_names]\n", + " # Add the chosen scaler step\n", + " if choice == 'minmax':\n", + " steps.append({'name': 'minmax_scaler'})\n", + " else:\n", + " # 'standardize'\n", + " steps.append({'name': 'standard_scaler'})\n", + "\n", + " dp['steps'] = steps\n", + " return cfg\n", + "\n", "\n", "def f_score(trial: op.Trial) -> float:\n", " \"\"\"Returns the F-score of the model (only useful for datasets with anomalies).\n", @@ -286,69 +221,50 @@ " Returns:\n", " Score of the FaultDetector model \n", " \"\"\"\n", + " # Use a fresh config dict per trial\n", + " cfg = deepcopy(model_config.config_dict)\n", "\n", - " dataprep_params = model_config.config_dict['train']['data_preprocessor']['params']\n", - " autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", - "\n", - " dataprep_params['scale'] = trial.suggest_categorical(\n", - " name='scale', choices=['minmax', 'standardize']\n", - " )\n", + " # Scale choice (new steps mode or legacy fallback)\n", + " scale_choice = trial.suggest_categorical('scale', ['minmax', 'standardize'])\n", + " cfg = set_scaler_step(cfg, scale_choice)\n", "\n", - " autoencoder_params['batch_size'] = int(trial.suggest_categorical(\n", - " name='batch_size', choices=[32, 64, 128]\n", - " ))\n", - " autoencoder_params['learning_rate'] = trial.suggest_float(\n", - " name='learning_rate', low=1e-5, high=0.01, log=True\n", - " )\n", - " autoencoder_params['decay_rate'] = trial.suggest_float(\n", - " name='decay_rate', low=0.8, high=0.99\n", - " )\n", + " # Autoencoder params\n", + " autoencoder_params = cfg['train']['autoencoder']['params']\n", + " autoencoder_params['batch_size'] = int(trial.suggest_categorical(name='batch_size', choices=[32, 64, 128]))\n", + " autoencoder_params['learning_rate'] = trial.suggest_float(name='learning_rate', low=1e-5, high=0.01, log=True)\n", + " autoencoder_params['decay_rate'] = trial.suggest_float(name='decay_rate', low=0.8, high=0.99)\n", "\n", " # architecture\n", - " autoencoder_params['layers'][0] = trial.suggest_int(\n", - " name='layers_0', low=100, high=400\n", - " )\n", - " autoencoder_params['layers'][1] = trial.suggest_int(\n", - " name='layers_1', low=50, high=100\n", - " )\n", - " autoencoder_params['code_size'] = trial.suggest_int(\n", - " name='code_size', low=10, high=30\n", - " )\n", - "\n", - " # update the configuration\n", - " model_config.update_config(model_config.config_dict)\n", + " autoencoder_params['layers'][0] = trial.suggest_int(name='layers_0', low=100, high=400)\n", + " autoencoder_params['layers'][1] = trial.suggest_int(name='layers_1', low=50, high=100)\n", + " autoencoder_params['code_size'] = trial.suggest_int(name='code_size', low=10, high=30)\n", "\n", " # create a new model using our new configuration and train the model\n", - " model = FaultDetector(model_config)\n", + " model = FaultDetector(Config(config_dict=cfg))\n", " _ = model.fit(train_data, normal_index=normal_index, save_models=False)\n", " predictions = model.predict(test_data)\n", "\n", - " score = fbeta_score(\n", + " return fbeta_score(\n", " y_true=ground_truth.sort_index(),\n", " 
y_pred=predictions.predicted_anomalies.sort_index(),\n", " beta=0.5\n", - " )\n", - "\n", - " return score" + " )" ], - "id": "1651af56e86b8c17", - "outputs": [], - "execution_count": null + "id": "2d93c80b0090b855" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "study = op.create_study(sampler=op.samplers.TPESampler(),\n", - " study_name='ad_optimization',\n", - " direction='maximize')\n", + "study = op.create_study(sampler=op.samplers.TPESampler(), study_name='ad_optimization', direction='maximize')\n", "\n", "# if we want to ensure that the first trial is done with the hyperparameters of the configuration, we need to enqueue a trial:\n", "autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", "study.enqueue_trial(params={\n", " 'batch_size': autoencoder_params['batch_size'],\n", " 'learning_rate': autoencoder_params['learning_rate'],\n", - " 'decay_rate': autoencoder_params['decay_rate'],\n", " 'layers_0': autoencoder_params['layers'][0],\n", " 'layers_1': autoencoder_params['layers'][1],\n", " 'code_size': autoencoder_params['code_size'],\n", @@ -356,74 +272,64 @@ "\n", "study.optimize(f_score, n_trials=5)" ], - "id": "713d5956170a993e", - "outputs": [], - "execution_count": null + "id": "d8a1cd60b4664efd" }, { "metadata": {}, "cell_type": "code", - "source": [ - "study.trials_dataframe()" - ], - "id": "7b1b25ddfb3e5b2d", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "study.trials_dataframe()", + "id": "49d60813c32812ed" }, { "metadata": {}, "cell_type": "markdown", "source": [ - "### Optimize CARE score\n", - "Optimize the CARE Score. Note that this is extremely slow, as we train a model for each subdataset." + "## Optimize fault detection model - CARE score\n", + "Optimize the CARE Score. Note that this takes a while, as we train a model for each subdataset." 
], - "id": "bbdfaaaf03341cbc" + "id": "bb8fc58e17ba1b64" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ + "# Our test set - Wind Farm B from the CARE2Compare dataset\n", + "c2c = Care2CompareDataset(data_path)\n", "wind_farm = 'B'\n", + "\n", + "# Model configuration starting point\n", "model_config = Config('c2c_configs/windfarm_B.yaml')\n", + "c2c.update_c2c_config(model_config, 'B')\n", "\n", "# speed up for testing\n", - "N = 100\n", + "N = 10000\n", + "max_datasets = 15\n", "\n", "def care_objective(trial: op.Trial) -> float:\n", - " \"\"\"Returns the F-score of the model (only useful for datasets with anomalies).\n", - "\n", - " Args:\n", - " trial: optuna Trial object\n", + " \"\"\"Returns the CARE score of the FaultDetector model.\"\"\"\n", "\n", - " Returns:\n", - " Score of the FaultDetector model.\n", - " \"\"\"\n", + " # Use a fresh config dict per trial\n", + " cfg = deepcopy(model_config.config_dict)\n", "\n", - " autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", - " threshold_params = model_config.config_dict['train']['threshold_selector']['params']\n", + " autoencoder_params = cfg['train']['autoencoder']['params']\n", + " threshold_params = cfg['train']['threshold_selector']['params']\n", "\n", - " autoencoder_params['batch_size'] = int(trial.suggest_categorical(\n", - " name='batch_size', choices=[32, 64, 128]\n", - " ))\n", - " autoencoder_params['learning_rate'] = trial.suggest_float(\n", - " name='learning_rate', low=1e-5, high=0.01, log=True\n", - " )\n", + " autoencoder_params['batch_size'] = int(trial.suggest_categorical(name='batch_size', choices=[32, 64, 128]))\n", + " autoencoder_params['learning_rate'] = trial.suggest_float(name='learning_rate', low=1e-5, high=0.01, log=True)\n", "\n", " # architecture\n", - " autoencoder_params['layers'][0] = trial.suggest_int(\n", - " name='layers_0', low=20, high=100\n", - " )\n", - " autoencoder_params['code_size'] = trial.suggest_int(\n", - " name='code_size', low=5, high=20\n", - " )\n", + " autoencoder_params['layers'][0] = trial.suggest_int(name='layers_0', low=20, high=100)\n", + " autoencoder_params['code_size'] = trial.suggest_int(name='code_size', low=5, high=20)\n", "\n", " # threshold\n", " threshold_params['gamma'] = trial.suggest_float(name='gamma', low=0.05, high=0.3)\n", " threshold_params['nn_size'] = trial.suggest_int(name='nn_size', low=20, high=50)\n", "\n", - " # update the configuration with the new hyperparameters\n", - " model_config.update_config(model_config.config_dict)\n", - "\n", + " # Create a CAREScore object and train+evaluate each dataset for this wind farm\n", " care_score = CAREScore(coverage_beta=0.5, eventwise_f_score_beta=0.5, anomaly_detection_method='criticality')\n", " i = 1\n", " for x_train, y_train, x_test, y_test, event_id in c2c.iter_formatted_datasets(wind_farm=wind_farm, index_column='time_stamp'):\n", @@ -435,7 +341,7 @@ " y_test = y_test.iloc[:N]\n", " \n", " # create a new model using our new configuration and train the model\n", - " model = FaultDetector(model_config)\n", + " model = FaultDetector(Config(config_dict=cfg))\n", " _ = model.fit(x_train, normal_index=y_train, save_models=False)\n", " prediction = model.predict(x_test)\n", " event_info = c2c.event_info_all[c2c.event_info_all['event_id'] == event_id].iloc[0]\n", @@ -449,22 +355,22 @@ " ignore_normal_index=False\n", " )\n", " i += 1\n", + " if i > max_datasets:\n", + " break\n", "\n", " score = care_score.get_final_score()\n", "\n", 
" return score" ], - "id": "7cee58426b72d9a4", - "outputs": [], - "execution_count": null + "id": "ce3000b2686a463f" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "study = op.create_study(sampler=op.samplers.TPESampler(),\n", - " study_name='care_optimization',\n", - " direction='maximize')\n", + "study = op.create_study(sampler=op.samplers.TPESampler(), study_name='care_optimization', direction='maximize')\n", "\n", "# Ensure that the first trial is done with the hyperparameters of the provided configuration\n", "autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", @@ -478,41 +384,37 @@ " 'nn_size': threshold_params['nn_size'],\n", "})\n", "\n", - "# since we loop through many datasets, train many models, we run the garbage collector after each trial\n", + "# Since we loop through many datasets, train many models, we run the garbage collector after each trial\n", "study.optimize(care_objective, n_trials=5, gc_after_trial=True)" ], - "id": "6621d7a2bf3ac717", - "outputs": [], - "execution_count": null + "id": "2063c7f60b979cc5" }, { "metadata": {}, "cell_type": "code", - "source": [ - "study.trials_dataframe()" - ], - "id": "9aa3d678f4fa22e3", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "study.trials_dataframe()", + "id": "95c8498e85467561" } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.8" } }, "nbformat": 4, diff --git a/notebooks/c2c_configs/windfarm_A.yaml b/notebooks/c2c_configs/windfarm_A.yaml index da7133a..5ef3264 100644 --- a/notebooks/c2c_configs/windfarm_A.yaml +++ b/notebooks/c2c_configs/windfarm_A.yaml @@ -4,14 +4,15 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - max_col_zero_frac: 0.99 - max_nan_frac_per_col: 0.05 - min_unique_value_count: 10 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + - name: low_unique_value_filter + params: + min_unique_value_count: 10 + max_col_zero_frac: 0.99 + - name: minmax_scaler data_splitter: shuffle: true diff --git a/notebooks/c2c_configs/windfarm_B.yaml b/notebooks/c2c_configs/windfarm_B.yaml index 7b55de8..68320b9 100644 --- a/notebooks/c2c_configs/windfarm_B.yaml +++ b/notebooks/c2c_configs/windfarm_B.yaml @@ -4,14 +4,15 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - max_col_zero_frac: 0.8 - max_nan_frac_per_col: 0.05 - min_unique_value_count: 10 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + - name: low_unique_value_filter + params: + min_unique_value_count: 10 + max_col_zero_frac: 0.8 + - name: minmax_scaler data_splitter: shuffle: true diff --git a/notebooks/c2c_configs/windfarm_C.yaml b/notebooks/c2c_configs/windfarm_C.yaml index c618cc3..3f7b08a 100644 --- a/notebooks/c2c_configs/windfarm_C.yaml +++ b/notebooks/c2c_configs/windfarm_C.yaml @@ -4,14 +4,15 @@ train: upper_percentile: 
0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - max_col_zero_frac: 0.99 - max_nan_frac_per_col: 0.05 - min_unique_value_count: 10 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + - name: low_unique_value_filter + params: + min_unique_value_count: 10 + max_col_zero_frac: 0.99 + - name: minmax_scaler data_splitter: shuffle: true diff --git a/tests/config/test_config.py b/tests/config/test_config.py index ba39c46..a1a147e 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -3,7 +3,6 @@ import shutil import unittest -import numpy as np from energy_fault_detector.config import Config, InvalidConfigFile PROJECT_ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '..') @@ -22,10 +21,7 @@ def test_init(self): self.assertDictEqual(conf.config_dict['train'], { 'anomaly_score': {'name': 'mahalanobis', 'params': {'pca': True, 'pca_min_var': 0.85}}, - 'data_preprocessor': {'params': {'max_nan_frac_per_col': 0.05, - 'imputer_strategy': 'mean', - 'features_to_exclude': ['feature1', 'feature2'], - 'include_duplicate_value_to_nan': False}}, + 'data_preprocessor': None, # unspecified, default pipeline 'autoencoder': {'name': 'MultilayerAutoencoder', 'verbose': 0, 'params': {'layers': [300], @@ -39,7 +35,7 @@ def test_init(self): 'threshold_selector': {'name': 'FDR', 'params': {'target_false_discovery_rate': 0.8}, 'fit_on_val': False}, - 'data_splitter': {'train_block_size': 7, 'val_block_size': 3, 'type': 'DataSplitter'}, + 'data_splitter': {'train_block_size': 7, 'val_block_size': 3, 'type': 'BlockDataSplitter'}, 'data_clipping': {'lower_percentile': 0.01, 'upper_percentile': 0.99} }) self.assertDictEqual(conf.config_dict['root_cause_analysis'], diff --git a/tests/config/test_quickstart_config.py b/tests/config/test_quickstart_config.py new file mode 100644 index 0000000..ae55b30 --- /dev/null +++ b/tests/config/test_quickstart_config.py @@ -0,0 +1,47 @@ +from unittest import TestCase +from typing import Any, Dict + +from energy_fault_detector.config.config import Config +from energy_fault_detector.config.quickstart_config import generate_quickstart_config # adjust + + +class TestQuickstartConfig(TestCase): + def test_generate_quickstart_config_valid_dict(self) -> None: + """Should return a valid config dict that Config accepts and includes required sections.""" + cfg: Dict[str, Any] = generate_quickstart_config( + output_path=None, + angle_columns=["theta_deg"], + counter_columns=["energy_total_kwh"], + scaler="standard", + imputer_strategy="mean", + early_stopping=False, + ) + + # Basic structure checks + self.assertIn("train", cfg) + train = cfg["train"] + self.assertIn("data_preprocessor", train) + self.assertIn("steps", train["data_preprocessor"]) + self.assertIn("autoencoder", train) + self.assertIn("params", train["autoencoder"]) + self.assertIn("threshold_selector", train) + self.assertIn("params", train["threshold_selector"]) + + # Ensure certain steps exist + step_names = [s["name"] for s in train["data_preprocessor"]["steps"]] + self.assertIn("column_selector", step_names) + self.assertIn("simple_imputer", step_names) + self.assertTrue( + any(n in ("standard_scaler", "minmax_scaler") for n in step_names), + "Expected a scaler step in the pipeline." 
+ ) + + # Should not raise: validate via Config + Config(config_dict=cfg) + + def test_generate_quickstart_config_validation_split_guard(self) -> None: + """If validation split not in (0, 1) it should raise ValueError.""" + with self.assertRaises(ValueError): + _ = generate_quickstart_config( + validation_split=0.0, # invalid by design + ) diff --git a/tests/core/test_model_factory.py b/tests/core/test_model_factory.py index 90d4ca2..28ffef5 100644 --- a/tests/core/test_model_factory.py +++ b/tests/core/test_model_factory.py @@ -27,7 +27,6 @@ def test_model_creation(self): # Test for data preprocessor data_preprocessor = model_factory.data_preprocessor self.assertIsInstance(data_preprocessor, DataPreprocessor) - self.assertEqual(data_preprocessor.max_nan_frac_per_col, 0.05) # Test for threshold selector threshold_selector = model_factory.threshold_selector diff --git a/tests/data_preprocessing/test_column_selector.py b/tests/data_preprocessing/test_column_selector.py index 78a2c84..761f40e 100644 --- a/tests/data_preprocessing/test_column_selector.py +++ b/tests/data_preprocessing/test_column_selector.py @@ -51,3 +51,35 @@ def test_missing_columns(self): def test_not_fitted(self): with self.assertRaises(NotFittedError): self.column_selector.transform(self.raw_dataframe) + + def test_fit_with_features_to_select(self): + # Select a mix (case-insensitive); Sensor_3 should be dropped due to NaN fraction (0.5 >= 0.2) + selector = ColumnSelector(max_nan_frac_per_col=0.2, + features_to_select=['sensor_1', 'SENSOR_2', 'sensor_3', 'sensor_5']) + selector.fit(self.raw_dataframe) + expected_attributes = ["Sensor_1", "Sensor_2", "Sensor_5"] + assert_array_equal(expected_attributes, selector.feature_names_out_) + + def test_transform_with_features_to_select(self): + # Keep only Sensor_1 and Sensor_5 + selector = ColumnSelector(max_nan_frac_per_col=0.2, + features_to_select=['sensor_1', 'sensor_5']) + expected_df = self.raw_dataframe[["Sensor_1", "Sensor_5"]] + df = selector.fit_transform(self.raw_dataframe) + # Check values and column order + assert_array_equal(expected_df.columns.values, df.columns.values) + assert_array_equal(expected_df.values, df.values) + + def test_features_to_select_case_insensitive(self): + # Mixed casing in selection should match columns + selector = ColumnSelector(max_nan_frac_per_col=0.2, + features_to_select=['SeNsOr_1', 'seNSor_5']) + selector.fit(self.raw_dataframe) + expected_attributes = ["Sensor_1", "Sensor_5"] + assert_array_equal(expected_attributes, selector.feature_names_out_) + + def test_init_mutually_exclusive_args(self): + with self.assertRaises(ValueError): + ColumnSelector(max_nan_frac_per_col=0.2, + features_to_exclude=['sensor_1'], + features_to_select=['sensor_1', 'sensor_5']) diff --git a/tests/data_preprocessing/test_counter_diff_transformer.py b/tests/data_preprocessing/test_counter_diff_transformer.py new file mode 100644 index 0000000..9b4c178 --- /dev/null +++ b/tests/data_preprocessing/test_counter_diff_transformer.py @@ -0,0 +1,365 @@ +import unittest +from datetime import datetime, timedelta +from typing import List + +import numpy as np +import pandas as pd + +from energy_fault_detector.data_preprocessing.counter_diff_transformer import CounterDiffTransformer + + +class TestCounterDiffTransformer(unittest.TestCase): + """Unit tests for CounterDiffTransformer.""" + + def setUp(self) -> None: + """Create small helper datasets used across tests.""" + # Regular 1-second interval index + self.t0 = datetime(2024, 1, 1, 0, 0, 0) + self.idx_1s = 
pd.date_range(self.t0, periods=5, freq="1s", tz="UTC") + + def _df( + self, + values_a: List[float], + values_b: List[float] | None = None, + index: pd.DatetimeIndex | None = None, + ) -> pd.DataFrame: + """Helper to build a DataFrame with optional second counter.""" + index = index if index is not None else self.idx_1s + if len(values_a) < len(index): + index = index[:len(values_a)] + data = {"counter_a": values_a} + if values_b is not None: + data["counter_b"] = values_b + return pd.DataFrame(data, index=index) + + def test_fit_requires_datetime_index_when_rate_or_mask(self) -> None: + """fit should error on non-DatetimeIndex when rate/mask are requested.""" + df = pd.DataFrame({"counter_a": [0, 1, 2]}, index=[0, 1, 2]) + + # compute_rate=True requires DatetimeIndex + with self.assertRaises(ValueError): + CounterDiffTransformer(counters=["counter_a"], compute_rate=True).fit(df) + + # gap_policy='mask' requires DatetimeIndex + with self.assertRaises(ValueError): + CounterDiffTransformer(counters=["counter_a"], gap_policy="mask").fit(df) + + # If neither rate nor mask, fit should succeed + CounterDiffTransformer(counters=["counter_a"], gap_policy="ignore").fit(df) + + def test_fit_requires_monotonic_index(self) -> None: + """fit should error on non-monotonic DatetimeIndex when rate/mask are requested.""" + idx = pd.DatetimeIndex( + [self.t0, self.t0 + timedelta(seconds=2), self.t0 + timedelta(seconds=1)], + tz="UTC", + ) + df = self._df(values_a=[0, 1, 2], index=idx) + with self.assertRaises(ValueError): + CounterDiffTransformer(counters=["counter_a"], compute_rate=True).fit(df) + + # No problem if index is sorted + CounterDiffTransformer(counters=["counter_a"], compute_rate=True).fit(df.sort_index()) + + def test_diff_zero_strategy_default(self) -> None: + """Default 'zero' strategy: negative diff -> increment equals current value.""" + # 0 -> 1 -> 3 -> 0 (reset) -> 2 + df = self._df(values_a=[0, 1, 4, 1, 3]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="zero", + fill_first="nan", + keep_original=False, + gap_policy="ignore", + ).fit(df) + + out = tr.transform(df) + self.assertListEqual(list(out.columns), ["counter_a_diff"]) + + expected = pd.Series([np.nan, 1, 3, 1, 2], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_diff_fill_first_zero(self) -> None: + """First increment filled with zero when fill_first='zero'.""" + df = self._df(values_a=[5, 7, 8, 10, 12]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="zero", + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + + out = tr.transform(df) + expected = pd.Series([0, 2, 1, 2, 2], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_rollover_strategy_with_value(self) -> None: + """'rollover' strategy uses provided rollover value to compute increment.""" + # 95 -> 98 -> 2 (rollover at 100) => inc: NaN/0, 3, 2 + (100 - 98) = 4 + df = self._df(values_a=[95, 98, 2, 7, 20]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="rollover", + rollover_values={"counter_a": 100.0}, + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + + out = tr.transform(df) + expected = pd.Series([0, 3, 4, 5, 13], index=df.index, name="counter_a_diff") + 
pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_rollover_strategy_without_value_errors(self) -> None: + """'rollover' without a rollover_value should raise a ValueError.""" + df = self._df(values_a=[50, 10]) # negative diff + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="rollover", + rollover_values={}, # missing + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + with self.assertRaises(ValueError): + tr.transform(df) + + def test_nan_strategy(self) -> None: + """'nan' strategy sets negative diffs to NaN.""" + df = self._df(values_a=[10, 8, 9]) + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="nan", + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + out = tr.transform(df) + expected = pd.Series([0, np.nan, 1], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_auto_strategy_prefers_rollover_when_available(self) -> None: + """'auto' uses rollover if a value is supplied; else behaves like 'zero'.""" + df = self._df(values_a=[95, 98, 2]) + + # With rollover value -> like 'rollover' + tr1 = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="auto", + rollover_values={"counter_a": 100.0}, + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out1 = tr1.transform(df) + expected1 = pd.Series([0, 3, 4], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out1["counter_a_diff"], expected1, check_dtype=False) + + # Without rollover value -> like 'zero' + tr2 = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="auto", + rollover_values={}, # none + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out2 = tr2.transform(df) + expected2 = pd.Series([0, 3, 2], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out2["counter_a_diff"], expected2, check_dtype=False) + + def test_small_negative_tolerance(self) -> None: + """Small negative diff within tolerance is clamped to zero.""" + df = self._df(values_a=[10.0, 9.9995, 10.5]) + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="zero", + small_negative_tolerance=0.01, + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out = tr.transform(df) + # diff: 0, -0.0005 (-> 0), 0.5005 + expected = pd.Series([0.0, 0.0, 0.5005], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected) + + def test_compute_rate(self) -> None: + """Rate equals increment divided by dt seconds.""" + idx = pd.DatetimeIndex( + [self.t0, self.t0 + timedelta(seconds=2), self.t0 + timedelta(seconds=5)], + tz="UTC", + ) + df = self._df(values_a=[0, 4, 7], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=True, + reset_strategy="zero", + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out = tr.transform(df) + # increments: [0, 4, 3]; dt: [NaN, 2, 3]; rate: [0, 2, 1] + expected = pd.Series([0.0, 2.0, 1.0], index=df.index, name="counter_a_rate") + pd.testing.assert_series_equal(out["counter_a_rate"], expected) + + def test_gap_masking_with_max_gap_seconds(self) -> None: + """Values at positions where dt > threshold should be masked (NaN).""" + idx = pd.DatetimeIndex( + [ + self.t0, + self.t0 + timedelta(seconds=1), + self.t0 + 
timedelta(seconds=10), # big gap from previous + self.t0 + timedelta(seconds=11), + ], + tz="UTC", + ) + df = self._df(values_a=[0, 1, 2, 3], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="mask", + max_gap_seconds=8.0, # gap = 9 seconds + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + # increments: [0,1,1,1]; dt: [NaN,1,9,1]; mask where dt>5 -> index 2 + self.assertTrue(np.isnan(out["counter_a_diff"].iloc[2])) + self.assertEqual(out["counter_a_diff"].iloc[1], 1.0) + self.assertEqual(out["counter_a_diff"].iloc[3], 1.0) + + def test_gap_masking_with_factor_median(self) -> None: + """Threshold computed as factor * median(dt).""" + idx = pd.DatetimeIndex( + [ + self.t0, + self.t0 + timedelta(seconds=2), + self.t0 + timedelta(seconds=4), + self.t0 + timedelta(seconds=20), # gap 16 > factor*median (median=2) + ], + tz="UTC", + ) + df = self._df(values_a=[0, 2, 3, 5], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="mask", + max_gap_seconds=None, + max_gap_factor=3.0, # 3 * median = 6 + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + self.assertTrue(np.isnan(out["counter_a_diff"].iloc[3])) # masked at data gap + self.assertEqual(out["counter_a_diff"].iloc[1], 2.0) + self.assertEqual(out["counter_a_diff"].iloc[2], 1.0) + + def test_gap_policy_ignore(self) -> None: + """No masking when gap_policy='ignore'.""" + idx = pd.DatetimeIndex( + [self.t0, self.t0 + timedelta(seconds=1), self.t0 + timedelta(seconds=10)], + tz="UTC", + ) + df = self._df(values_a=[0, 1, 30], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + expected = pd.Series([0, 1, 29], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_keep_original_false_drops_counters(self) -> None: + """When keep_original=False, original counters are dropped from output.""" + df = self._df(values_a=[0, 1, 2], values_b=[0, 10, 20]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + keep_original=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + # 'counter_b' should be kept, 'counter_a' replaced by 'counter_a_diff' + self.assertListEqual(list(out.columns), ["counter_b", "counter_a_diff"]) + + def test_keep_original_true_keeps_counters(self) -> None: + """When keep_original=True, original counters remain alongside outputs.""" + df = self._df(values_a=[0, 1, 2], values_b=[0, 10, 20]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + keep_original=True, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + self.assertListEqual(list(out.columns), ["counter_a", "counter_b", "counter_a_diff"]) + + def test_feature_names_out(self) -> None: + """get_feature_names_out returns correct output ordering.""" + df = self._df(values_a=[0, 1, 2], values_b=[0, 10, 20]) + tr = CounterDiffTransformer( + counters=["counter_a", "missing_counter"], + compute_rate=False, + keep_original=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + # Only present counters are transformed; others ignored + self.assertEqual(tr.counters_, ["counter_a"]) + self.assertEqual(tr.get_feature_names_out(), ["counter_b", "counter_a_diff"]) + + out = tr.transform(df) + 
self.assertListEqual(tr.get_feature_names_out(), list(out.columns)) + + def test_non_numeric_values_raise_error(self) -> None: + """Non-numeric values should be coerced to NaN then diff computed.""" + df = self._df(values_a=[0, "1", "3", "bad", 7]) # 'bad' -> NaN + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + with self.assertRaises(ValueError): + tr.transform(df) + + def test_inverse_transform(self) -> None: + """inverse_transform returns input unchanged.""" + df = self._df(values_a=[0, 1, 2]) + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + out = tr.transform(df) + back = tr.inverse_transform(out.copy()) + pd.testing.assert_frame_equal(out, back) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/data_preprocessing/test_data_clipper.py b/tests/data_preprocessing/test_data_clipper.py index dd9cd87..effefa9 100644 --- a/tests/data_preprocessing/test_data_clipper.py +++ b/tests/data_preprocessing/test_data_clipper.py @@ -2,6 +2,7 @@ import unittest import pandas as pd +import pandas.testing as pdt from energy_fault_detector.data_preprocessing.data_clipper import DataClipper @@ -29,3 +30,23 @@ def test_transform(self): ) self.data_clipper.fit(x_test) self.assertTrue(self.data_clipper.transform(x_test).equals(expected_output)) + + def test_transform_with_features_to_clip(self): + # Only clip 'feature1'; leave 'feature2' and angles unchanged + clipper = DataClipper(lower_percentile=0.2, upper_percentile=0.8, + features_to_clip=['feature1']) + x_test = pd.DataFrame( + {'feature1': [1, 2, 3, 4, 5], 'feature2': [4, 5, 6, 7, 8], 'angle1': [0, 45, 90, 135, 180], + 'angle2': [0, 45, 90, 135, 180]} + ) + expected_output = pd.DataFrame( + {'feature1': [1.8, 2, 3, 4, 4.2], 'feature2': [4, 5, 6, 7, 8], 'angle1': [0, 45, 90, 135, 180], + 'angle2': [0, 45, 90, 135, 180]} + ) + clipper.fit(x_test) + pdt.assert_frame_equal(clipper.transform(x_test), expected_output) + + def test_init_mutually_exclusive_args(self): + with self.assertRaises(ValueError): + DataClipper(lower_percentile=0.2, upper_percentile=0.8, + features_to_exclude=['angle1'], features_to_clip=['feature1']) diff --git a/tests/data_preprocessing/test_data_preprocessor.py b/tests/data_preprocessing/test_data_preprocessor.py index 5ebd235..2f33f6e 100644 --- a/tests/data_preprocessing/test_data_preprocessor.py +++ b/tests/data_preprocessing/test_data_preprocessor.py @@ -12,6 +12,17 @@ class TestDataPreprocessorPipeline(TestCase): def setUp(self) -> None: self.standard_preprocessor = DataPreprocessor( + steps=[ + {'name': 'column_selector', + 'params': {'max_nan_frac_per_col': 0.2}}, + {'name': 'angle_transform', + 'params': {'angles': ['Sensor_6']}}, + {'name': 'duplicate_values_to_nan'}, + {'name': 'low_unique_value_filter',} + ] + ) + # legacy set up + self.standard_preprocessor_old = DataPreprocessor( max_nan_frac_per_col=0.2, imputer_strategy='mean', angles=['Sensor_6'], @@ -20,17 +31,21 @@ def setUp(self) -> None: include_low_unique_value_filter=True, min_unique_value_count=2, ) - - self.ts_preprocessor = DataPreprocessor( - max_nan_frac_per_col=0.2, - min_unique_value_count=3, - max_col_zero_frac=0.30, - include_column_selector=True, - include_duplicate_value_to_nan=False, - include_low_unique_value_filter=True + self.another_preprocessor = DataPreprocessor( + steps=[ + {'name': 'column_selector', + 'params': {'max_nan_frac_per_col': 0.2}}, + 
{'name': 'angle_transform', + 'params': {'angles': ['Sensor_6']}}, + {'name': 'duplicate_values_to_nan', + 'params': {'n_max_duplicates': 4, + 'value_to_replace': 0}}, + {'name': 'low_unique_value_filter', + 'params': {'min_unique_value_count': 1}}, + ] ) - - self.extended_preprocessor = DataPreprocessor( + # legacy set up + self.another_preprocessor_old = DataPreprocessor( max_nan_frac_per_col=0.2, imputer_strategy='mean', min_unique_value_count=1, @@ -41,8 +56,16 @@ def setUp(self) -> None: include_duplicate_value_to_nan=True, include_low_unique_value_filter=True ) - + # Feature consistent, does not drop columns self.fc_preprocessor = DataPreprocessor( + steps=[ + {'name': 'column_selector', 'enabled': False}, + {'name': 'angle_transform', + 'params': {'angles': ['Sensor_6']}}, + ] + ) + # legacy set up + self.fc_preprocessor_old = DataPreprocessor( imputer_strategy='mean', angles=['Sensor_6'], include_low_unique_value_filter=False, @@ -94,12 +117,14 @@ def setUp(self) -> None: self.test_data3 = pd.DataFrame(index=time_index, data=data) def test_fit_standard_preprocessor(self): + self.standard_preprocessor_old.fit(self.test_data1) + check_is_fitted(self.standard_preprocessor_old.named_steps['scaler']) self.standard_preprocessor.fit(self.test_data1) check_is_fitted(self.standard_preprocessor.named_steps['scaler']) def test_fit_extended(self): - self.extended_preprocessor.fit(self.test_data3) - check_is_fitted(self.extended_preprocessor.named_steps['scaler']) + self.another_preprocessor_old.fit(self.test_data3) + check_is_fitted(self.another_preprocessor_old.named_steps['scaler']) def test_transform(self): # expected output @@ -139,8 +164,8 @@ def test_transform_extended(self): sincos = (sincos - sincos.mean(axis=0)) / sincos.std(axis=0) exp_result = np.hstack([exp_result, sincos]) - self.extended_preprocessor.fit(self.test_data3) - data = self.extended_preprocessor.transform(self.test_data3) + self.another_preprocessor_old.fit(self.test_data3) + data = self.another_preprocessor_old.transform(self.test_data3) assert_array_almost_equal(data, exp_result) @@ -156,8 +181,8 @@ def test_transform_fc(self): [1.21854359, 1.22474487, 0., 0., 0., 1.21773319, -1.32214018], [1.5666989, 1.63299316, 0., 0., 0., 1.56338116, -1.95410719]]) - self.fc_preprocessor.fit(self.test_data1) - data = self.fc_preprocessor.transform(self.test_data1) + self.fc_preprocessor_old.fit(self.test_data1) + data = self.fc_preprocessor_old.transform(self.test_data1) assert_array_almost_equal(data, exp_result) @@ -166,49 +191,134 @@ def test_not_fitted(self): self.standard_preprocessor.transform(self.test_data1) with self.assertRaises(NotFittedError): - self.ts_preprocessor.transform(self.test_data1) + self.another_preprocessor.transform(self.test_data1) def test_inverse_transform(self): - self.standard_preprocessor.fit(self.test_data1) + for preprocessor in [self.standard_preprocessor, self.standard_preprocessor_old]: + preprocessor.fit(self.test_data1) - output = self.standard_preprocessor.inverse_transform( - self.standard_preprocessor.transform(self.test_data1) - ).astype(float) - expected = self.test_data1[['Sensor_1', 'Sensor_2', 'Sensor_6']].astype(float) - expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. + output = preprocessor.inverse_transform( + preprocessor.transform(self.test_data1) + ).astype(float) + expected = self.test_data1[['Sensor_1', 'Sensor_2', 'Sensor_6']].astype(float) + expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. 
- assert_frame_equal( - output.reset_index(drop=True), - expected.reset_index(drop=True), - ) + assert_frame_equal( + output.reset_index(drop=True), + expected.reset_index(drop=True), + ) def test_inverse_transform_extended(self): - self.extended_preprocessor.fit(self.test_data3) - - output = self.extended_preprocessor.inverse_transform( - self.extended_preprocessor.transform(self.test_data3) - ).astype(float) - expected = self.test_data3[['Sensor_1', 'Sensor_2', 'Sensor_6', 'Sensor_7']].astype(float) - expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. - expected.loc['2021-05-02 08:00:00', 'Sensor_7'] = 0.555556 - - assert_frame_equal( - output.reset_index(drop=True), - expected.reset_index(drop=True), - ) + for preprocessor in [self.another_preprocessor, self.another_preprocessor_old]: + preprocessor.fit(self.test_data3) + + output = preprocessor.inverse_transform( + preprocessor.transform(self.test_data3) + ).astype(float) + expected = self.test_data3[['Sensor_1', 'Sensor_2', 'Sensor_6', 'Sensor_7']].astype(float) + expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. + expected.loc['2021-05-02 08:00:00', 'Sensor_7'] = 0.555556 + + assert_frame_equal( + output.reset_index(drop=True), + expected.reset_index(drop=True), + ) def test_inverse_transform_fc(self): - self.fc_preprocessor.fit(self.test_data1) - - output = self.fc_preprocessor.inverse_transform( - self.fc_preprocessor.transform(self.test_data1) - ).astype(float) - expected = self.test_data1.astype(float) - expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. - expected.loc[pd.isnull(expected['Sensor_3']), 'Sensor_3'] = 2. - expected.loc[pd.isnull(expected['Sensor_4']), 'Sensor_4'] = 0. - - assert_frame_equal( - output.reset_index(drop=True), - expected.reset_index(drop=True), + for preprocessor in [self.fc_preprocessor, self.fc_preprocessor_old]: + preprocessor.fit(self.test_data1) + + output = preprocessor.inverse_transform( + preprocessor.transform(self.test_data1) + ).astype(float) + expected = self.test_data1.astype(float) + expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. + expected.loc[pd.isnull(expected['Sensor_3']), 'Sensor_3'] = 2. + expected.loc[pd.isnull(expected['Sensor_4']), 'Sensor_4'] = 0. 
+ + assert_frame_equal( + output.reset_index(drop=True), + expected.reset_index(drop=True), + ) + + def test_steps_mode_no_duplicate_imputer(self) -> None: + """Providing 'simple_imputer' explicitly should not add a second default imputer.""" + dp = DataPreprocessor( + steps=[ + {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}}, + {"name": "simple_imputer", "params": {"strategy": "median"}}, + {"name": "standard_scaler"}, + ] + ) + # Count imputers by estimator type + n_imputers = sum( + est.__class__.__name__ == "SimpleImputer" for _, est in dp.steps + ) + self.assertEqual(n_imputers, 1, "There should be exactly one SimpleImputer.") + + # Ensure imputer precedes scaler + imputer_idx = next( + i for i, (_, est) in enumerate(dp.steps) if est.__class__.__name__ == "SimpleImputer" + ) + scaler_idx = next( + i for i, (_, est) in enumerate(dp.steps) + if est.__class__.__name__ in {"StandardScaler", "MinMaxScaler"} + ) + self.assertLess(imputer_idx, scaler_idx, "Imputer must precede scaler.") + + def test_steps_mode_default_imputer_inserted(self) -> None: + """Omitting 'simple_imputer' should auto-insert a default imputer before the scaler.""" + dp = DataPreprocessor( + steps=[ + {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}}, + {"name": "standard_scaler"}, + ] + ) + # Exactly one imputer should be present + n_imputers = sum( + est.__class__.__name__ == "SimpleImputer" for _, est in dp.steps + ) + self.assertEqual(n_imputers, 1, "A single default SimpleImputer should be added.") + + # Imputer must be before scaler + imputer_idx = next( + i for i, (_, est) in enumerate(dp.steps) if est.__class__.__name__ == "SimpleImputer" + ) + scaler_idx = next( + i for i, (_, est) in enumerate(dp.steps) + if est.__class__.__name__ in {"StandardScaler", "MinMaxScaler"} + ) + self.assertLess(imputer_idx, scaler_idx, "Default imputer must be inserted before scaler.") + + def test_steps_mode_alias_imputer_is_normalized(self) -> None: + """Using 'imputer' alias should be normalized to 'simple_imputer' internally.""" + dp = DataPreprocessor( + steps=[ + {"name": "imputer", "params": {"strategy": "mean"}}, # alias + {"name": "standard_scaler"}, + ] ) + # Named steps should include the canonical 'simple_imputer' + self.assertIn("simple_imputer", dp.named_steps) + + def test_singleton_violation_raises(self) -> None: + """Two enabled simple_imputer steps should raise a ValueError.""" + with self.assertRaises(ValueError): + _ = DataPreprocessor( + steps=[ + {"name": "simple_imputer", "params": {"strategy": "mean"}}, + {"name": "simple_imputer", "params": {"strategy": "median"}}, + {"name": "standard_scaler"}, + ] + ) + + def test_only_one_scaler_allowed(self) -> None: + """Defining more than one scaler should raise a ValueError.""" + with self.assertRaises(ValueError): + _ = DataPreprocessor( + steps=[ + {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}}, + {"name": "standard_scaler"}, + {"name": "minmax_scaler"}, + ] + ) diff --git a/tests/test_data/ensemble_config.yaml b/tests/test_data/ensemble_config.yaml deleted file mode 100644 index c610063..0000000 --- a/tests/test_data/ensemble_config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Model settings -train: - anomaly_score: - name: 'mahalanobis' - params: - pca: true - pca_min_var: 0.85 - data_preprocessor: - name: 'default' - autoencoder: - name: 'EnsembleAutoencoder' - params: - hyperparams_list: - - layers: [300] - code_size: 50 - learning_rate: 0.001 - decay_rate: 0.001 - batch_size: 144 - epochs: 10 - 
loss_name: 'mean_squared_error' - - layers: [200] - code_size: 25 - learning_rate: 0.001 - decay_rate: 0.001 - batch_size: 144 - epochs: 20 - loss_name: 'mean_squared_error' - threshold_selector: - name: 'FDR' - params: - target_false_discovery_rate: 0.8 diff --git a/tests/test_data/test_bad_early_stopping_config.yaml b/tests/test_data/test_bad_early_stopping_config.yaml index 76c5908..f6e9284 100644 --- a/tests/test_data/test_bad_early_stopping_config.yaml +++ b/tests/test_data/test_bad_early_stopping_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_conditional_ae_config.yaml b/tests/test_data/test_conditional_ae_config.yaml index f020a4a..66983fe 100644 --- a/tests/test_data/test_conditional_ae_config.yaml +++ b/tests/test_data/test_conditional_ae_config.yaml @@ -4,14 +4,7 @@ train: params: pca: true pca_min_var: 0.85 - data_preprocessor: - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 - include_duplicate_value_to_nan: false + data_preprocessor: # no further spec, so defaults are applied autoencoder: name: 'ConditionalAE' verbose: 0 diff --git a/tests/test_data/test_config.yaml b/tests/test_data/test_config.yaml index c6cf979..2f093ef 100644 --- a/tests/test_data/test_config.yaml +++ b/tests/test_data/test_config.yaml @@ -4,14 +4,7 @@ train: params: pca: true pca_min_var: 0.85 - data_preprocessor: - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 - include_duplicate_value_to_nan: false + data_preprocessor: # no further spec, so defaults are applied autoencoder: name: 'MultilayerAutoencoder' verbose: 0 diff --git a/tests/test_data/test_config_no_rca.yaml b/tests/test_data/test_config_no_rca.yaml index 7c83e0b..aa32fdc 100644 --- a/tests/test_data/test_config_no_rca.yaml +++ b/tests/test_data/test_config_no_rca.yaml @@ -5,13 +5,6 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_criticality_config.yaml b/tests/test_data/test_criticality_config.yaml index 7165be0..f366d4c 100644 --- a/tests/test_data/test_criticality_config.yaml +++ b/tests/test_data/test_criticality_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_early_stopping_val_block_config.yaml b/tests/test_data/test_early_stopping_val_block_config.yaml index 95ab325..62aad09 100644 --- a/tests/test_data/test_early_stopping_val_block_config.yaml +++ b/tests/test_data/test_early_stopping_val_block_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - 
imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_early_stopping_val_split_config.yaml b/tests/test_data/test_early_stopping_val_split_config.yaml index f1b256b..eb0d903 100644 --- a/tests/test_data/test_early_stopping_val_split_config.yaml +++ b/tests/test_data/test_early_stopping_val_split_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_export_default_adaptive.yaml b/tests/test_data/test_export_default_adaptive.yaml deleted file mode 100644 index 77b0d8b..0000000 --- a/tests/test_data/test_export_default_adaptive.yaml +++ /dev/null @@ -1,42 +0,0 @@ -train: - data_clipping: # (optional) if not specified, not applied. - # clip training data to remove outliers - lower_percentile: 0.01 - upper_percentile: 0.99 - - data_preprocessor: - params: - features_to_exclude: - - a - - b - max_nan_frac_per_col: 0.2 - imputer_strategy: mean - min_unique_value_count: 1 - angles: - - c - - data_splitter: - # use train_test_split without shuffle for LSTM and CNN models! - type: sklearn - validation_split: 0.3 - shuffle: False - - autoencoder: - name: default - params: - batch_size: 8 - learning_rate: 0.001 - epochs: 10 - code_size: 1 - layers: - - 5 - loss_name: mean_squared_error - verbose: 0 - - anomaly_score: - name: mahalanobis - - threshold_selector: - name: adaptive - params: - gamma: 0.1 \ No newline at end of file diff --git a/tests/test_data/test_export_full_prep_cnn_ad.yaml b/tests/test_data/test_export_full_prep_cnn_ad.yaml deleted file mode 100644 index c62b556..0000000 --- a/tests/test_data/test_export_full_prep_cnn_ad.yaml +++ /dev/null @@ -1,53 +0,0 @@ -train: - data_clipping: # (optional) if not specified, not applied. - # clip training data to remove outliers - lower_percentile: 0.01 - upper_percentile: 0.99 - - data_preprocessor: - params: - features_to_exclude: - - a - - b - max_nan_frac_per_col: 0.2 - imputer_strategy: mean - min_unique_value_count: 1 - angles: - - c - n_max_duplicates: 4 - value_to_replace: 0 - include_column_selector: True - include_duplicate_value_to_nan: True - include_low_unique_value_filter: True - ts_features: - - 'day_of_year' - - 'hour_of_day' - verbose: 0 - - data_splitter: - # use train_test_split without shuffle for LSTM and CNN models! 
- type: sklearn - validation_split: 0.5 - shuffle: False - - autoencoder: - name: CNN - # Data sampler configuration (only for Seq2Seq models) - time_series_sampler: - sequence_length: 2 - overlap: 0 # ignored in random mode, used in inference phase - params: - batch_size: 8 - learning_rate: 0.001 - epochs: 1 - filters: - - 5 - loss_name: mean_squared_error - - anomaly_score: - name: rmse - - threshold_selector: - name: fbeta - params: - beta: 0.5 \ No newline at end of file diff --git a/tests/test_data/verbose_config.yaml b/tests/test_data/verbose_config.yaml index c9eaec9..bfebf23 100644 --- a/tests/test_data/verbose_config.yaml +++ b/tests/test_data/verbose_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params:
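
The hunks above migrate every `data_preprocessor` block from the legacy flat `params` mapping to the new `steps` list, and the notebook cells build per-trial configs with `Config(config_dict=...)`. As a minimal sketch of how such a steps-based configuration can be assembled in code — assuming the keys shown in the `windfarm_*.yaml` files and tests above, with illustrative parameter values, and assuming `Config` accepts a dict without the remaining optional sections (e.g. `data_splitter`) — the pattern looks like this:

# Sketch only: step names and config keys follow the YAML files and tests in this patch;
# the exact set of required sections depends on Config validation.
from energy_fault_detector import FaultDetector, Config

cfg = {
    "train": {
        "data_preprocessor": {
            "steps": [
                {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.05}},
                {"name": "low_unique_value_filter",
                 "params": {"min_unique_value_count": 10, "max_col_zero_frac": 0.99}},
                {"name": "minmax_scaler"},
                # per test_steps_mode_default_imputer_inserted, a default simple_imputer
                # is auto-inserted before the scaler when it is omitted here
            ]
        },
        "autoencoder": {
            "name": "MultilayerAutoencoder",
            "params": {"layers": [200, 100], "code_size": 20, "batch_size": 128,
                       "learning_rate": 0.001, "epochs": 10},
        },
        "anomaly_score": {"name": "rmse"},
        "threshold_selector": {"name": "fbeta", "params": {"beta": 0.5}},
    }
}

model = FaultDetector(Config(config_dict=cfg))
# _ = model.fit(train_data, normal_index=normal_index, save_models=False)
# predictions = model.predict(test_data)

Note that, per the data_preprocessor tests above, at most one scaler and one simple_imputer step may be enabled; duplicates raise a ValueError.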