diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index e1b539d..2c6d39b 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -23,7 +23,8 @@ jobs: - name: Install dependencies run: | - pip install sphinx sphinx-rtd-theme sphinx-autodoc-typehints sphinx-copybutton + pip install -r requirements-docs.txt + pip install -r requirements.txt - name: Create VERSION file run: echo "${GITHUB_REF##*/}" > "${PKG_NAME}/VERSION" diff --git a/README.md b/README.md index be2e1f0..92de174 100644 --- a/README.md +++ b/README.md @@ -48,11 +48,15 @@ For an example using one of the CARE2Compare datasets, run: For more information, have a look at the notebook [Quick Fault Detection](./notebooks/Example%20-%20Quick%20Fault%20Detection.ipynb) -## Fault detection in 4 lines of code +## Fault detection quickstart ```python from energy_fault_detector import FaultDetector, Config +from energy_fault_detector.config import generate_quickstart_config +# 1) Generate and save a base config (YAML) +generate_quickstart_config(output_path="base_config.yaml") +# 2) Train and predict using the generated config fault_detector = FaultDetector(config=Config('base_config.yaml')) model_data = fault_detector.train(sensor_data=sensor_data, normal_index=normal_index) results = fault_detector.predict(sensor_data=test_sensor_data) @@ -60,7 +64,7 @@ results = fault_detector.predict(sensor_data=test_sensor_data) The pandas `DataFrame` `sensor_data` contains the operational data in wide format with the timestamp as index, the pandas `Series` `normal_index` indicates which timestamps are considered 'normal' operation and can be used to create -a normal behaviour model. The [`base_config.yaml`](energy_fault_detector/base_config.yaml) file contains all model +a normal behaviour model. The [`base_config.yaml`](energy_fault_detector/base_config.yaml) file contains the model settings, an example is found [here](energy_fault_detector/base_config.yaml). 
@@ -84,10 +88,8 @@ All contributions, bug reports, bug fixes, documentation improvements, enhanceme 2. Unification, standardisation and generic improvements 1. Additional options for all autoencoders (e.g. drop out, regularization) 2. Data preparation (e.g. extend imputation strategies). - 3. Download method for the Care2Compare class. - 3. Unify default value settings. - 4. No or low configuration - 5. Upgrade to Keras 3.0 + 3. No or low configuration need (e.g. use defaults where possible). + 4. Upgrade to Keras 3.0 3. Root cause analysis expansion 1. integrate SHAP and possibly other XAI-methods. @@ -98,12 +100,17 @@ This project is licensed under the [MIT License](./LICENSE). ## References If you use this work, please cite us: +**Fault detection in district heating substations**: +- Enabling Predictive Maintenance in District Heating Substations: A Labelled Dataset and Fault Detection Evaluation Framework based on Service Data. +PrePrint on ArXiv. https://doi.org/10.48550/arXiv.2511.14791 +- Dataset: PreDist Dataset - Operational data of district heating substations labelled with faults and maintenance information. Zenodo, Nov 2025, https://doi.org/10.5281/zenodo.17522254. + **ARCANA Algorithm**: Autoencoder-based anomaly root cause analysis for wind turbines. Energy and AI. 2021;4:100065. https://doi.org/10.1016/j.egyai.2021.100065 **CARE to Compare dataset and CARE-Score**: - Paper: CARE to Compare: A Real-World Benchmark Dataset for Early Fault Detection in Wind Turbine Data. Data. 2024; 9(12):138. https://doi.org/10.3390/data9120138 -- Dataset: Wind Turbine SCADA Data For Early Fault Detection. Zenodo, Mar. 2025, https://doi.org/10.5281/ZENODO.14958989. +- Dataset: Wind Turbine SCADA Data For Early Fault Detection. Zenodo, Oct. 2024, https://doi.org/10.5281/ZENODO.14958989. **Transfer learning methods**: Transfer learning applications for autoencoder-based anomaly detection in wind turbines. Energy and AI. 2024;17:100373. 
https://doi.org/10.1016/j.egyai.2024.100373 diff --git a/docs/advanced_config.yaml b/docs/advanced_config.yaml new file mode 100644 index 0000000..ac25ad0 --- /dev/null +++ b/docs/advanced_config.yaml @@ -0,0 +1,111 @@ +train: + # clip training data to remove outliers (only applied for training) + data_clipping: # (optional) if not specified, not applied. + lower_percentile: 0.01 + upper_percentile: 0.99 + # Choose one of: + # features_to_exclude: + # - do_not_clip_this_feature + # features_to_clip: + # - clip_only_this_feature + + data_preprocessor: + steps: + # Replace consecutive duplicate 0-values with NaN + - name: duplicate_to_nan + params: + value_to_replace: 0 + n_max_duplicates: 6 + features_to_exclude: + - do_not_replace_value_with_nan_for_this_feature + # Normalize counters to differences (configure your counter columns) + # If needed, you can create multiple counter_diff_transformer steps with different settings for different counters + - name: counter_diff_transformer + step_name: counter_diff_energy + params: + counters: + - energy_total_kwh + compute_rate: false + reset_strategy: zero + fill_first: nan + # Column selection: drop columns where > 20% is missing and exclude specific features + - name: column_selector + params: + max_nan_frac_per_col: 0.20 + features_to_exclude: + - feature1 + - feature2 + # Alternatively, keep only selected features: + # features_to_select: + # - temp_outdoor + # - flow + # - power + # Filter low unique value features or high-zero-fraction columns + - name: low_unique_value_filter + params: + min_unique_value_count: 2 + max_col_zero_frac: 0.99 + # Transform angles to sin/cos + - name: angle_transformer + params: + angles: + - angle1 + - angle2 + # Imputer (explicit; will be auto-inserted if omitted) + - name: simple_imputer + params: + strategy: mean + # Scaler (choose one; StandardScaler is auto-added by default if omitted) + - name: standard_scaler + params: + with_mean: true + with_std: true + + data_splitter: + # 
How to split data in train and validation sets for the autoencoder + type: sklearn + validation_split: 0.2 + shuffle: true # false by default (last part of the data is taken as validation data in this case) + # or block splitting, 4 weeks training, 1 week validation + # type: DataSplitter + # train_block_size: 4032 + # val_block_size: 1008 + + autoencoder: + name: MultilayerAutoencoder + params: + batch_size: 128 + # Use a ExponentialDecay schedule for the learning rate: + learning_rate: 0.001 # starting point + decay_rate: 0.99 + decay_steps: 100000 + # Set early stopping with max 1000 epochs, minimal improvement of 1e-4 and patience of 5 epochs + early_stopping: True + min_delta: 0.0001 + patience: 5 + epochs: 1000 + # architecture settings + layers: [200, 100, 50] + code_size: 20 + act: prelu # activation to use for hidden layers + last_act: linear # output layer activation + + anomaly_score: + name: rmse + params: + scale: false + + threshold_selector: + name: fbeta + params: + beta: 0.5 + +root_cause_analysis: + alpha: 0.5 + init_x_bias: recon + num_iter: 1000 + verbose: true + +predict: + criticality: + max_criticality: 144 diff --git a/docs/basic_config.yaml b/docs/basic_config.yaml new file mode 100644 index 0000000..d927187 --- /dev/null +++ b/docs/basic_config.yaml @@ -0,0 +1,45 @@ +train: + # clip training data to remove outliers (only applied for training) + data_clipping: # (optional) if not specified, not applied. 
+ # Use features_to_exclude or features_to_clip: [feature] to skip or to apply to specific features + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + steps: + # This drops features where > 20% is missing + - name: column_selector + params: + max_nan_frac_per_col: 0.2 + # This drops constants by default (controlled by `min_unique_value_count`) + - name: low_unique_value_filter + # SimpleImputer and StandardScaler are always added + + data_splitter: + # How to split data in train and validation sets for the autoencoder + type: sklearn + validation_split: 0.2 + shuffle: true + + autoencoder: + name: default + params: + layers: # Symmetric autoencoder: inputs - 200 - 100 - 50 - 20 - 50 - 100 - 200 - outputs + - 200 + - 100 + - 50 + code_size: 20 # Size of the bottleneck layer + + anomaly_score: + name: rmse + + threshold_selector: + fit_on_val: true + name: quantile + params: + quantile: 0.95 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/docs/conf.py b/docs/conf.py index c4de4e8..7aba3a6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -127,6 +127,11 @@ napoleon_use_param = True napoleon_use_rtype = True +napoleon_type_aliases = { + "Config": "energy_fault_detector.Config", + "FaultDetector": "energy_fault_detector.FaultDetector", +} + # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/configuration.rst b/docs/configuration.rst new file mode 100644 index 0000000..530b5bd --- /dev/null +++ b/docs/configuration.rst @@ -0,0 +1,136 @@ +.. _configuration_guide: + +Configuration +================================ +This page explains how to configure training, prediction, and optional root cause analysis (ARCANA). + +.. 
contents:: Table of Contents + :depth: 3 + :local: + +Quick start: minimal configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A minimal configuration that clips outliers, imputes missing values, and scales features: + +.. include:: basic_config.yaml + :literal: + +This setup: + +- Applies DataClipper if specified. +- Builds a DataPreprocessor with: + + - ColumnSelector that drops columns with more than 20% NaNs (configurable). + - LowUniqueValueFilter that removes constant features by default (configurable). + - SimpleImputer (mean) and a scaler (StandardScaler by default). If you do not add an imputer/scaler explicitly, + the pipeline ensures mean-imputation and StandardScaler are added. + +- Trains a default autoencoder (with provided architecture, otherwise default values), with an RMSE anomaly score and a + quantile threshold selector. +- Runs ARCANA with provided parameters when calling :py:obj:`FaultDetector.predict(..., root_cause_analysis=True) `. + If not provided, default ARCANA parameters are used (see :py:obj:`ARCANA docs `). + +If you leave out the data_preprocessor configuration (i.e., ``data_preprocessor: {}``), a default preprocessing pipeline +is generated, which drops constant features, features where >5% of the data is missing, imputes remaining missing values +with the mean value and scales the data to zero mean and unit standard deviation. + +Detailed configuration +^^^^^^^^^^^^^^^^^^^^^^ +Below is a more thorough configuration. It shows how to specify preprocessing steps and more model parameters. + +.. include:: advanced_config.yaml + :literal: + +DataPreprocessor specification +"""""""""""""""""""""""""""""" +A steps-based preprocessing pipeline can be configured under ``train.data_preprocessor.steps``. Each step is a dict +with the following keys: + +- ``name`` (str): the registered step name (see table below). +- ``enabled`` (bool, optional): default ``True``; set to ``False`` to skip a step. 
+- ``params`` (dict, optional): constructor arguments for the step. +- ``step_name`` (str, optional): custom key for the sklearn pipeline; useful if a step is repeated. + +Allowed step names and aliases: + ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| Step name | Purpose | Aliases | ++=========================+===============================================+================================================+ +| column_selector | Drop columns with too many NaNs | \- | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| low_unique_value_filter | Drop columns with low variance/many zeros | \- | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| angle_transformer | Convert angles to sin/cos pairs | angle_transform | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| counter_diff_transformer| Convert counters to differences/rates | counter_diff, counter_diff_transform | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| simple_imputer | Impute missing values | imputer | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| standard_scaler | Standardize features (z-score) | standardize, standardscaler, standard | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| minmax_scaler | Scale to [0, 1] | minmax | ++-------------------------+-----------------------------------------------+------------------------------------------------+ +| duplicate_to_nan | Replace consecutive duplicate values with NaN | duplicate_value_to_nan, 
duplicate_values_to_nan| ++-------------------------+-----------------------------------------------+------------------------------------------------+ + +For detailed documentation of the data preprocessor pipeline, refer to the +:py:obj:`DataPreprocessor ` docs. + +Other training configuration sections +""""""""""""""""""""""""""""""""""""" + +- Data clipping: + :py:obj:`DataClipper ` supports + ``features_to_exclude`` and ``features_to_clip`` for fine-grained control. + + +- Data splitter (``train.data_splitter``): + + - ``type``: one of ``BlockDataSplitter`` (aliases: ``blocks``, ``DataSplitter``), or ``sklearn`` (alias ``train_test_split``). + - For sklearn: ``validation_split`` (float in (0, 1)) and ``shuffle`` (bool). + - For :py:obj:`BlockDataSplitter `: ``train_block_size`` and ``val_block_size``. + - Early stopping guard: if ``train.autoencoder.params.early_stopping`` is true, you must either set a + valid ``validation_split`` in (0, 1), or use :py:obj:`BlockDataSplitter ` + with a positive ``val_block_size``. + + +- Autoencoder (``train.autoencoder``): + + - ``name``: class name in the registry. + - ``params``: architecture and training args (e.g., ``layers``, ``epochs``, ``learning_rate``, ``early_stopping``). + Refer to the autoencoder class docs (:py:obj:`autoencoders `) for specific params and their defaults. + +- Anomaly score (``train.anomaly_score``): + + - ``name``: score name (e.g., ``rmse``, ``mahalanobis``). + - ``params``: score-specific parameters. Refer to the :py:obj:`anomaly_scores ` docs. + +- Threshold selector (``train.threshold_selector``): + + - ``name``: e.g., ``quantile``, ``fbeta``, etc. + - ``fit_on_val``: fit the threshold on validation only. + - ``params``: selector-specific parameters (e.g., ``quantile`` for the quantile selector). + See the :py:obj:`threshold_selectors ` docs for more info on the settings. 
+ +Prediction options +^^^^^^^^^^^^^^^^^^ +Under ``predict``, you can set: + +- ``criticality.max_criticality``: cap the calculated criticality (anomaly counter) to this value. + + +Root cause analysis (ARCANA) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +If ``root_cause_analysis`` is provided, ARCANA will attempt to attribute anomalies to specific features using the +provided settings. If not provided, default settings are used. For detailed documentation refer to +:py:obj:`ARCANA docs `. + + +Old params data preprocessing configuration (for older versions) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Older configurations use params under ``train.data_preprocessor.params``. +These remain supported but are deprecated in favor of steps mode. +When both ``steps`` and legacy params are present, ``steps`` take precedence and legacy params are ignored with a warning. + +.. include:: old_config.yaml + :literal: diff --git a/docs/index.rst b/docs/index.rst index dda8d7b..c13d39e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,10 +1,10 @@ Energy Fault Detector - Autoencoder-based Fault Detection for the Future Energy System -============================================================ +====================================================================================== **Energy Fault Detector** is an open-source Python package designed for the automated detection of anomalies in operational data from renewable energy systems as well as power grids. It uses autoencoder-based normal behaviour models to identify irregularities in operational data. In addition to the classic anomaly detection, the package -includes the unique “ARCANA” approach for root cause analysis and thus allows interpretable early fault detection. +includes the unique ''ARCANA'' approach for root cause analysis and thus allows interpretable early fault detection. 
In addition to the pure ML models, the package also contains a range of preprocessing methods, which are particularly useful for analyzing systems in the energy sector. A holistic `EnergyFaultDetector` framework is provided for easy use of all these methods, which can be adapted to the respective use case via a single configuration file. @@ -27,11 +27,10 @@ To install the `energy-fault-detector` package, run: :glob: :maxdepth: 2 - The Energy Fault Detector package usage_examples + configuration logging - changelog - + The EnergyFaultDetector package Module index ================== diff --git a/docs/logging.rst b/docs/logging.rst index 485fcd0..7c943b3 100644 --- a/docs/logging.rst +++ b/docs/logging.rst @@ -1,45 +1,19 @@ Logging Configuration ===================== -The framework uses Python's built-in logging module to provide logging capabilities. By default, the logging -configuration is defined in a YAML file. You can customize this configuration to suit your needs. +The framework uses Python's built-in logging module for logging. +You can customize this configuration to suit your needs. Default Configuration --------------------- -The framework uses a default logging configuration file named ``logging.yaml``. +The framework uses a default logging configuration file ``energy_fault_detector/logging.yaml``. The logger used throughout the code is called ``energy_fault_detector``. -The default logging configuration is as follows. - -.. code-block:: yaml - - version: 1 - disable_existing_loggers: False - formatters: - simple: - format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - - handlers: - console: - class: logging.StreamHandler - level: DEBUG - formatter: simple - stream: ext://sys.stdout - - loggers: - energy_fault_detector: - level: INFO - handlers: [console] - propagate: no - - root: - level: INFO - handlers: [console] - You can silence the logger as follows: .. 
code-block:: python + import logging from energy_fault_detector.fault_detector import FaultDetector, Config diff --git a/docs/config_example.yaml b/docs/old_config.yaml similarity index 52% rename from docs/config_example.yaml rename to docs/old_config.yaml index 3dd8c39..4e0f454 100644 --- a/docs/config_example.yaml +++ b/docs/old_config.yaml @@ -1,10 +1,5 @@ train: - data_clipping: # (optional) if not specified, not applied. - # clip training data to remove outliers - lower_percentile: 0.01 - upper_percentile: 0.99 - features_to_exclude: - - do_not_clip_this_feature + # ... data_preprocessor: # only imputation and scaling are done by default, other steps can be skipped. @@ -28,39 +23,4 @@ train: duplicate_features_to_exclude: # DuplicateValuesToNan option - list of feature to not transform with DuplicateValuesToNan - do_not_replace_value_with_nan - data_splitter: # (optional) Define block size of train and validation blocks. Optional, if not specified, the defaults are used - # defaults: - type: DataSplitter # or sklearn - train_block_size: 5040 - val_block_size: 1680 # set val_block_size = 0 to use all data for training - - autoencoder: - name: 'MultilayerAutoencoder' - params: - batch_size: 128 - decay_rate: 0.001 # remove decay_rate+decay_steps for a fixed learning rate - decay_steps: 10000 - epochs: 10 - layers: - - 200 # Size of the first and last hidden layer - - 100 # Size of the second and second to last hidden layer - - 50 # Size of the third and third to last hidden layer - code_size: 20 # Size of the bottleneck - learning_rate: 0.001 - loss_name: 'mean_squared_error' - - anomaly_score: - name: 'rmse' - params: - scale: false - - threshold_selector: - name: 'fbeta' - params: - beta: 0.5 - -root_cause_analysis: # (optional) if not specified, no root_cause_analysis (ARCANA) is run - alpha: 0.8 - init_x_bias: recon - num_iter: 200 - + # ... 
diff --git a/docs/usage_examples.rst b/docs/usage_examples.rst index ed70e09..2dbe77e 100644 --- a/docs/usage_examples.rst +++ b/docs/usage_examples.rst @@ -14,65 +14,63 @@ refer to the example notebooks in the repository's notebooks folder. Energy Fault Detection -^^^^^^^^^^^^^^^ -The main interface for the `energy-fault-detector` package is the :py:obj:`FaultDetector ` class, which -needs a configuration object :py:obj:`Config `. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The main interface for the `energy-fault-detector` package is the :py:obj:`FaultDetector ` class, which +needs a configuration object :py:obj:`Config `. -To create a new :py:obj:`FaultDetector ` model, +To create a new :py:obj:`FaultDetector ` model, create a configuration, as described below in the :ref:`configuration` section, and run: .. code-block:: python - from energy_fault_detector.fault_detector import FaultDetector - from energy_fault_detector.config import Config + from energy_fault_detector import FaultDetector, Config - config = Config('configs/base_config.yaml') + config = Config('configs/basic_config.yaml') fault_detector = FaultDetector(config=config, model_directory='model_directory') - -To train new models, you need to provide the input data and call the ``fit`` method: +To train new models, you need to provide the input data and call the :py:obj:`FaultDetector.fit ` method: .. code-block:: python # get data from database / csv / API ... sensor_data = ... # a pandas DataFrame with timestamp as index and numerical sensor values as columns normal_index = ... # a pandas Series with timestamp as index and booleans indicating normal behaviour - # NOTE: The normal_index is optional, it is used to select training data for the autoencoder. - # If not provided, we assume all data represents normal behaviour. The other data points are used to set a - # threshold for the fault detection. + # NOTE: The normal_index is optional; it is used to select training data for the autoencoder. 
+ # If not provided, we assume all data represents normal behaviour. + # If you do not have any labels, you cannot use the F-beta-score- and FDR-based thresholds. - # If you do not use the models for time series, the index can also be a standard RangeIndex, as long as the - # sensor_data dataframe and the normal_index series have the same index. + # If you do not use the models for time series, the index can also be a standard RangeIndex, + # as long as the sensor_data DataFrame and the normal_index Series share the same index. model_data = fault_detector.fit(sensor_data=sensor_data, normal_index=normal_index, save_models=True) # to save model manually: # fault_detector.save_models('model_name') # model_name is optional -The trained models are saved locally in the provided ``model_directory``. The ``fit`` method returns a +The trained models are saved locally in the provided ``model_directory``. The :py:obj:`FaultDetector.fit ` method returns a :py:obj:`ModelMetadata ` object with the model metadata such as the model date and the model path. -To predict using the trained model, use the ``predict`` method: +To predict using the trained model, use the :py:obj:`FaultDetector.predict ` method: .. code-block:: python results = fault_detector.predict(sensor_data=test_sensor_data) The result is a :py:obj:`FaultDetectionResult ` object -with with the following information: +with the following information: -* predicted_anomalies: DataFrame with a column 'anomaly' (bool). -* reconstruction: DataFrame with reconstruction of the sensor data with timestamp as index. -* deviations: DataFrame with reconstruction errors. -* anomaly_score: DataFrame with anomaly scores for each timestamp. -* bias_data: DataFrame with ARCANA results with timestamp as index. None if ARCANA was not run. -* arcana_losses: DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run. -* tracked_bias: List of DataFrames. None if ARCANA was not run. 
+* predicted_anomalies: pandas Series with the predicted anomalies (bool). +* reconstruction: pandas DataFrame with reconstruction of the sensor data with timestamp as index. +* deviations: pandas DataFrame with reconstruction errors. +* anomaly_score: pandas Series with anomaly scores for each timestamp. +* bias_data: pandas DataFrame with ARCANA results with timestamp as index. None if ARCANA was not run. +* arcana_losses: pandas DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run. +* tracked_bias: List of pandas DataFrames. None if ARCANA was not run. You can also create a :py:obj:`FaultDetector ` object and load -trained models using the ``load_models`` method. In this case, you do not need to provide a ``model_path`` -in the ``predict`` method. +trained models using the :py:obj:`FaultDetector.load_models ` method. In this case, you do not need to provide a ``model_path`` +in the :py:obj:`predict ` method. .. code-block:: python @@ -95,11 +93,17 @@ The training configuration is set with a ``yaml`` file which contains ``train`` train new models and ``root_cause_analysis`` specification if you want to analyse the model predictions with the `ARCANA` algorithm. An example: -.. include:: config_example.yaml +.. include:: basic_config.yaml :literal: +If you leave out the data_preprocessor configuration (i.e., ``data_preprocessor: null``), a default preprocessing +pipeline is generated, which drops constant features, features where >5% of the data is missing, imputes remaining +missing values with the mean value and scales the data to zero mean and unit standard deviation. + +See the :ref:`Configuration guide ` for more details on the configuration file and options. + To update the configuration 'on the fly' (for example for hyperparameter optimization), you provide a new -configuration dictionary via the ``update_config`` method: +configuration dictionary via the :py:obj:`Config.update_config ` method: .. 
code-block:: python @@ -150,7 +154,7 @@ you can import the data preprocessor, autoencoder, anomaly score and threshold s This allows you to add additional steps or use different data preprocessing pipelines. -An example training pipeline (similar to the :py:obj:`FaultDetector ` class ) +An example training pipeline (similar to the :py:obj:`FaultDetector ` class) would be: .. code-block:: python diff --git a/energy_fault_detector/__init__.py b/energy_fault_detector/__init__.py index 33c5e19..e77c74a 100644 --- a/energy_fault_detector/__init__.py +++ b/energy_fault_detector/__init__.py @@ -46,4 +46,3 @@ version = f.readlines()[0].strip() __version__ = version -__all__ = ['FaultDetector', 'Config', 'registry', 'quick_fault_detector'] diff --git a/energy_fault_detector/base_config.yaml b/energy_fault_detector/base_config.yaml index f708a7e..aebcc2b 100644 --- a/energy_fault_detector/base_config.yaml +++ b/energy_fault_detector/base_config.yaml @@ -4,14 +4,13 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - imputer_strategy: mean - max_nan_frac_per_col: 0.2 - min_unique_value_count: 2 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.2 + - name: low_unique_value_filter + - name: simple_imputer + - name: standard_scaler autoencoder: name: default diff --git a/energy_fault_detector/config/__init__.py b/energy_fault_detector/config/__init__.py index 1184dd5..45c6372 100644 --- a/energy_fault_detector/config/__init__.py +++ b/energy_fault_detector/config/__init__.py @@ -2,3 +2,4 @@ from energy_fault_detector.config.config import Config from energy_fault_detector.config.base_config import InvalidConfigFile +from energy_fault_detector.config.quickstart_config import generate_quickstart_config diff --git a/energy_fault_detector/config/config.py b/energy_fault_detector/config/config.py index 9722615..65e6101 100644 
--- a/energy_fault_detector/config/config.py +++ b/energy_fault_detector/config/config.py @@ -34,9 +34,19 @@ 'data_preprocessor': { 'type': 'dict', 'required': True, - 'allow_unknown': True, + 'allow_unknown': False, + 'nullable': True, # if not specfied, create default pipeline 'schema': { - 'params': {'type': 'dict', 'required': False}, + 'params': {'type': 'dict', 'required': False, 'nullable': True,}, + 'steps': { + 'type': 'list', + 'required': False, + 'nullable': True, + 'schema': { + 'type': 'dict', + 'allow_unknown': True + } + }, } }, 'threshold_selector': { @@ -60,7 +70,7 @@ 'type': 'dict', 'required': False, # defaults if not specified 'schema': { - 'type': {'type': 'string', 'required': False, 'default': 'DataSplitter', + 'type': {'type': 'string', 'required': False, 'default': 'BlockDataSplitter', 'allowed': ['DataSplitter', 'BlockDataSplitter', 'blocks', 'sklearn', 'train_test_split']}, 'train_block_size': {'type': 'integer', 'required': False, 'dependencies': {'type': ['DataSplitter', 'BlockDataSplitter', 'blocks']}}, 'val_block_size': {'type': 'integer', 'required': False, 'dependencies': {'type': ['DataSplitter', 'BlockDataSplitter', 'blocks']}}, @@ -88,6 +98,7 @@ 'train': {'type': 'dict', 'schema': TRAIN_SCHEMA, 'required': False, 'allow_unknown': True}, 'predict': {'type': 'dict', 'schema': PREDICT_SCHEMA, 'required': False}, 'root_cause_analysis': {'type': 'dict', 'schema': ROOT_CAUSE_ANALYSIS_SCHEMA, 'required': False}, + 'dtype': {'type': 'string', 'required': False, 'allowed': ['float32', 'float64']} } @@ -179,15 +190,10 @@ def data_clipping_params(self) -> Dict[str, Any]: """Data clipping parameters.""" return self.config_dict.get('train', {}).get('data_clipping', {}) - @property - def angle_columns(self) -> List[str]: - """List of angle columns.""" - return self.config_dict.get('train', {}).get('data_preprocessor', {}).get('params', {}).get('angles', []) - @property def max_criticality(self) -> Optional[int]: """Max criticality value.""" - 
return self.config_dict.get('prediction', {}).get('criticality', {}).get('max_criticality', 144) + return self.config_dict.get('predict', {}).get('criticality', {}).get('max_criticality', 144) @property def fit_threshold_on_val(self) -> bool: @@ -198,3 +204,8 @@ def fit_threshold_on_val(self) -> bool: def verbose(self) -> int: """Verbosity Level of the Autoencoder.""" return self.config_dict.get('train', {}).get('autoencoder', {}).get('verbose', 1) + + @property + def dtype(self): + """Data type, float32 by default.""" + return self.config_dict.get('dtype', 'float32') diff --git a/energy_fault_detector/config/quickstart_config.py b/energy_fault_detector/config/quickstart_config.py new file mode 100644 index 0000000..208187b --- /dev/null +++ b/energy_fault_detector/config/quickstart_config.py @@ -0,0 +1,246 @@ +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import yaml + + +def _build_preprocessor_steps( + *, + max_nan_frac: float, + min_unique_value_count: int, + max_col_zero_frac: float, + angle_columns: Optional[List[str]], + counter_columns: Optional[List[str]], + imputer_strategy: str, + scaler: str, +) -> List[Dict[str, Any]]: + """ + Build the steps specification for the DataPreprocessor pipeline. + + This helper focuses solely on the steps list for the preprocessing pipeline + and keeps the public function small, readable, and testable. + + Args: + max_nan_frac (float): Maximum fraction of missing values allowed per column. + min_unique_value_count (int): Minimal number of unique values required for a column to remain. + max_col_zero_frac (float): Maximum allowed fraction of zeros in a column (used in the unique-value filter). + angle_columns (Optional[List[str]]): Optional list of column names to be angle-transformed. + counter_columns (Optional[List[str]]): Optional list of counter columns to be transformed to differences. 
+ imputer_strategy (str): SimpleImputer strategy, e.g., "mean", "median", "most_frequent", or "constant". + scaler (str): Scaler type; supports "standard" (and aliases) or "minmax" (and aliases). + + Returns: + List[Dict[str, Any]]: A steps list suitable for DataPreprocessor(steps=[...]). + + Notes: + + - The order is kept minimal here; DataPreprocessor enforces proper ordering internally. + + """ + steps: List[Dict[str, Any]] = [] + + # Optional counter-diff transformation (DataPreprocessor will place it early). + if counter_columns: + steps.append( + { + "name": "counter_diff_transformer", + "params": { + "counters": counter_columns, + "compute_rate": False, + "fill_first": "nan", + }, + } + ) + + # Column selection: drop columns with too many NaNs. + steps.append( + { + "name": "column_selector", + "params": {"max_nan_frac_per_col": max_nan_frac}, + } + ) + + # Filter for columns with very few unique values or many zeros. + steps.append( + { + "name": "low_unique_value_filter", + "params": { + "min_unique_value_count": min_unique_value_count, + "max_col_zero_frac": max_col_zero_frac, + }, + } + ) + + # Optional angle transformer (e.g., degrees => sin/cos). + if angle_columns: + steps.append( + { + "name": "angle_transformer", + "params": {"angles": angle_columns}, + } + ) + + # Explicit imputer; adding it avoids relying on DataPreprocessor defaults. + steps.append( + { + "name": "simple_imputer", + "params": {"strategy": imputer_strategy}, + } + ) + + # Final scaler with aliases supported for convenience. + scaler_key = scaler.lower() + if scaler_key in ("standard", "standardize", "standard_scaler"): + steps.append({"name": "standard_scaler"}) + elif scaler_key in ("minmax", "minmax_scaler", "normalize"): + steps.append({"name": "minmax_scaler"}) + else: + raise ValueError( + f"Unknown scaler '{scaler}'. Use 'standard' (aka 'standardize') or 'minmax'." 
+ ) + + return steps + + +def _dump_yaml_if_requested( + config: Dict[str, Any], + output_path: Optional[Union[str, Path]], +) -> None: + """ + Write the configuration dictionary to a YAML file if a path is provided. + + Args: + config (Dict[str, Any]): The configuration dictionary to serialize. + output_path (Optional[Union[str, Path]]): Destination path. If None, nothing is written. + + Raises: + RuntimeError: If PyYAML is not installed but output_path is not None. + """ + if output_path is None: + return + + if yaml is None: # pragma: no cover - optional dependency + raise RuntimeError( + "PyYAML is not installed; install 'pyyaml' or set output_path=None." + ) + + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + + with path.open("w", encoding="utf-8") as f: + yaml.safe_dump(config, f, sort_keys=False) + + +def generate_quickstart_config( + output_path: Optional[Union[str, Path]] = "base_config.yaml", + *, + # Preprocessor configuration + max_nan_frac: float = 0.05, + min_unique_value_count: int = 2, + max_col_zero_frac: float = 1.0, + angle_columns: Optional[List[str]] = None, + counter_columns: Optional[List[str]] = None, + imputer_strategy: str = "mean", + scaler: str = "standard", + # Early stopping + early_stopping: bool = False, + validation_split: float = 0.2, + # Thresholding + threshold_quantile: float = 0.99, + # Autoencoder defaults + batch_size: int = 128, + code_size: int = 20, + epochs: int = 10, + layers: Optional[List[int]] = None, + learning_rate: float = 1e-3, +) -> Dict[str, Any]: + """ + Generate a minimal, valid configuration for EnergyFaultDetector. + + This function returns a configuration dictionary that uses the steps-based + DataPreprocessor and sensible defaults for training. It can also write the + configuration to YAML if an output path is supplied. 
+ + Example: + from energy_fault_detector import FaultDetector, Config + cfg = generate_quickstart_config(output_path=None) + fault_detector = FaultDetector(config=Config(config_dict=cfg)) + + Args: + output_path (Optional[Union[str, Path]]): YAML output path; set None to return only the dict. + max_nan_frac (float): Max fraction of missing values per column for selection. Default: 0.05 + min_unique_value_count (int): Minimal unique values required to keep a column. Default: 2 + max_col_zero_frac (float): Max fraction of zeros allowed in a column. Default: 1.0 + angle_columns (Optional[List[str]]): Optional columns to transform as angles (sin/cos). Default: None + counter_columns (Optional[List[str]]): Optional counter columns to convert to differences. Default: None + imputer_strategy (str): Strategy for SimpleImputer ("mean", "median", etc.). Default: mean + scaler (str): Scaler selection ("standard" or "minmax"; common aliases allowed). Default: standard + early_stopping (bool): Enable early stopping in the autoencoder training. Default: False + validation_split (float): Fraction for validation in sklearn splitter (0 < val < 1). Default: 0.2 + threshold_quantile (float): Quantile for the "quantile" threshold selector. Default: 0.99 + batch_size (int): Autoencoder batch size. Default: 128 + code_size (int): Bottleneck code size. Default: 20 + epochs (int): Number of training epochs. Default: 10 + layers (Optional[List[int]]): Autoencoder layer sizes; defaults to [200, 100, 50] if None. + learning_rate (float): Optimizer learning rate. Default: 1e-3 + + Returns: + Dict[str, Any]: Configuration dictionary ready for Config(config_dict=...). + + Raises: + ValueError: If validation_split is not in (0, 1) (checked regardless of early_stopping), or if scaler is unknown. 
+ """ + if not (0 < validation_split < 1.0): + raise ValueError("validation_split must be in (0, 1).") + + # Fallback layers if none provided by user + if layers is None: + layers = [200, 100, 50] + + # Build the preprocessor steps list + steps = _build_preprocessor_steps( + max_nan_frac=max_nan_frac, + min_unique_value_count=min_unique_value_count, + max_col_zero_frac=max_col_zero_frac, + angle_columns=angle_columns, + counter_columns=counter_columns, + imputer_strategy=imputer_strategy, + scaler=scaler, + ) + + # Assemble training configuration + train_config: Dict[str, Any] = { + "data_preprocessor": {"steps": steps}, + "data_splitter": { + "type": "sklearn", + "validation_split": validation_split, + "shuffle": True, + }, + "autoencoder": { + "name": "default", + "params": { + "batch_size": batch_size, + "code_size": code_size, + "early_stopping": early_stopping, + "epochs": epochs, + "layers": layers, + "learning_rate": learning_rate, + }, + "verbose": 1, + }, + "anomaly_score": {"name": "rmse"}, + "threshold_selector": { + "fit_on_val": False, + "name": "quantile", + "params": {"quantile": threshold_quantile}, + }, + # Optional clipping (disabled by default; uncomment to enable): + # "data_clipping": {"lower_percentile": 0.001, "upper_percentile": 0.999}, + } + + config: Dict[str, Any] = {"train": train_config} + + # Optionally write YAML + _dump_yaml_if_requested(config=config, output_path=output_path) + + return config diff --git a/energy_fault_detector/core/__init__.py b/energy_fault_detector/core/__init__.py index 47eca0d..5229541 100644 --- a/energy_fault_detector/core/__init__.py +++ b/energy_fault_detector/core/__init__.py @@ -1,7 +1,8 @@ """This module contains class templates for most of the anomaly detection classes, such as autoencoders, anomaly scores, threshold selectors and data classes.""" -from energy_fault_detector.core.anomaly_score import AnomalyScore -from energy_fault_detector.core.autoencoder import Autoencoder -from 
energy_fault_detector.core.data_transformer import DataTransformer -from energy_fault_detector.core.threshold_selector import ThresholdSelector +from .anomaly_score import AnomalyScore +from .autoencoder import Autoencoder +from .data_transformer import DataTransformer +from .threshold_selector import ThresholdSelector +from .fault_detection_result import FaultDetectionResult, ModelMetadata \ No newline at end of file diff --git a/energy_fault_detector/_logs.py b/energy_fault_detector/core/_logs.py similarity index 56% rename from energy_fault_detector/_logs.py rename to energy_fault_detector/core/_logs.py index 2cd2a36..2a68147 100644 --- a/energy_fault_detector/_logs.py +++ b/energy_fault_detector/core/_logs.py @@ -1,34 +1,35 @@ """Logging settings""" import os +from pathlib import Path import logging.config as logging_config import yaml -def setup_logging(default_path: str = 'logging.yaml', env_key: str = 'LOG_CFG') -> None: +def setup_logging(default_path: str | Path = 'logging.yaml', env_key: str = 'LOG_CFG') -> None: """Setup logging configuration Args: - default_path (str): default logging configuration file. Default is 'logging.yaml' + default_path (str or Path): default logging configuration file. Default is 'logging.yaml' env_key (str): Environment variable holding logging config file path (overrides default_path). 
Default is 'LOG_CFG' """ - path = default_path + path = Path(default_path) value = os.getenv(env_key, None) if value: - path = value + path = Path(value) try: with open(path, 'rt', encoding='utf-8') as f: config = yaml.safe_load(f.read()) # check paths exist or create them: for _, handler in config['handlers'].items(): - if handler.get('filename'): - dirname = os.path.dirname(handler['filename']) - if dirname != '' and not os.path.exists(dirname): - os.makedirs(dirname) + filename = handler.get('filename') + if filename: + # Resolve path and create parent directories if they don't exist + Path(filename).parent.mkdir(parents=True, exist_ok=True) logging_config.dictConfig(config) except Exception as e: diff --git a/energy_fault_detector/core/fault_detection_model.py b/energy_fault_detector/core/fault_detection_model.py index 47df523..cd575bb 100644 --- a/energy_fault_detector/core/fault_detection_model.py +++ b/energy_fault_detector/core/fault_detection_model.py @@ -2,9 +2,10 @@ import os from abc import ABC, abstractmethod -from typing import Any, Optional, Union, List, Tuple +from typing import Optional, Union, List, Tuple import logging from datetime import datetime +from pathlib import Path import pandas as pd import numpy as np @@ -16,10 +17,10 @@ from energy_fault_detector.core.model_factory import ModelFactory from energy_fault_detector.core.fault_detection_result import ModelMetadata, FaultDetectionResult from energy_fault_detector.data_preprocessing import DataPreprocessor -from energy_fault_detector._logs import setup_logging +from energy_fault_detector.core._logs import setup_logging from energy_fault_detector.data_splitting.data_splitter import BlockDataSplitter -setup_logging(os.path.join(os.path.dirname(__file__), '..', 'logging.yaml')) +setup_logging(Path(__file__).parent.parent / 'logging.yaml') logger = logging.getLogger('energy_fault_detector') DATA_PREP_DIR = 'data_preprocessor' @@ -28,6 +29,8 @@ SCORE_DIR = 'anomaly_score' DataType = 
Union[pd.DataFrame, np.ndarray, List] +PathLike = Union[str, Path] +ModelPart = Union[DataPreprocessor, Autoencoder, AnomalyScore, ThresholdSelector] class NoTrainingData(Exception): @@ -50,9 +53,9 @@ class FaultDetectionModel(ABC): save_timestamps: a list of string timestamps, indicating when the model was saved. """ - def __init__(self, config: Optional[Config] = None, model_directory: str = 'models'): + def __init__(self, config: Optional[Config] = None, model_directory: PathLike = 'models'): self.config: Optional[Config] = config - self.model_directory: str = model_directory + self.model_directory: PathLike = model_directory self.anomaly_score: Optional[AnomalyScore] = None self.autoencoder: Optional[Autoencoder] = None @@ -65,6 +68,11 @@ def __init__(self, config: Optional[Config] = None, model_directory: str = 'mode # build models self._model_factory: Optional[ModelFactory] = ModelFactory(config) if config else None + if config is None: + logger.debug('No configuration set. Load models and config from path with the `FaultDetector.load_models`' + ' method.') + else: + self._init_models() def _init_models(self): """Initialize models.""" @@ -79,24 +87,34 @@ def _init_models(self): self.data_preprocessor = self._model_factory.data_preprocessor @abstractmethod - def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, asset_id: Union[int, str] = None, - **kwargs) -> ModelMetadata: + def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, save_models: bool = True, + overwrite_models: bool = False, **kwargs) -> ModelMetadata: """Fit models on the given sensor_data and save them locally and return the metadata. Args: - asset_id: asset ID of the asset for which the model should be trained. sensor_data: pandas DataFrame with the sensor data to use. The time stamp should be the index and the sensor values as columns. normal_index: a pandas Series indicating normal behaviour as boolean with the timestamp as index. 
+ save_models (bool, optional): Whether to save models. Defaults to True. + overwrite_models (bool, optional): If True, existing model directories can be overwritten. Defaults to + False. Returns: ModelMetadata object. """ - def train(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, asset_id: Union[int, str] = None, - **kwargs) -> ModelMetadata: - """Same as the `fit`-method.""" - return self.fit(sensor_data=sensor_data, normal_index=normal_index, asset_id=asset_id, **kwargs) + def train(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, **kwargs) -> ModelMetadata: + """Same as the `fit`-method. + + Args: + sensor_data: pandas DataFrame with the sensor data to use. + The time stamp should be the index and the sensor values as columns. + normal_index: a pandas Series indicating normal behaviour as boolean with the timestamp as index. + + Returns: + ModelMetadata object. + """ + return self.fit(sensor_data=sensor_data, normal_index=normal_index, **kwargs) @abstractmethod def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, asset_id: Union[int, str] = None @@ -176,11 +194,11 @@ def save_models(self, model_name: Union[str, int] = None, overwrite: bool = Fals return os.path.abspath(model_dir), current_datetime - def load_models(self, model_path: str) -> None: + def load_models(self, model_path: PathLike) -> None: """Load saved models given the model path. Args: - model_path: Path to the model files. + model_path (str, Path): Path to the model files. 
""" data_prep_dir = os.path.join(model_path, DATA_PREP_DIR) @@ -206,7 +224,7 @@ def load_models(self, model_path: str) -> None: self._model_factory = ModelFactory(self.config) @staticmethod - def _load_pickled_model(model_type: str, model_directory: str): + def _load_pickled_model(model_type: str, model_directory: str) -> ModelPart: """Load a pickled model of given type, using file name (which is the class name).""" model_class_name = os.listdir(model_directory)[0].split('.')[0] if model_type != 'data_preprocessor': diff --git a/energy_fault_detector/core/fault_detection_result.py b/energy_fault_detector/core/fault_detection_result.py index 52d8d44..6d90928 100644 --- a/energy_fault_detector/core/fault_detection_result.py +++ b/energy_fault_detector/core/fault_detection_result.py @@ -1,11 +1,13 @@ -import os from typing import Optional, List from dataclasses import dataclass +from pathlib import Path import pandas as pd import numpy as np +from ..utils.analysis import calculate_criticality + @dataclass class FaultDetectionResult: @@ -27,36 +29,102 @@ class FaultDetectionResult: """DataFrame with ARCANA results (ARCANA bias). None if ARCANA was not run.""" arcana_losses: Optional[pd.DataFrame] = None - """DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run.""" + """DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run. + Empty if losses were not tracked.""" tracked_bias: Optional[List[pd.DataFrame]] = None - """List of DataFrames containing the ARCANA bias every 50th iteration. None if ARCANA was not run.""" + """List of DataFrames containing the ARCANA bias every 50th iteration. None if ARCANA was not run. + Empty if bias was not tracked.""" + + def criticality(self, normal_idx: pd.Series | None = None, init_criticality: int = 0, max_criticality: int = 1000 + ) -> pd.Series: + """Criticality based on the predicted anomalies. 
+ + Args: + normal_idx (pd.Series, optional): A pandas Series with boolean values indicating normal operation, indexed + by timestamp. Ignored if None. + init_criticality (int, optional): The initial criticality value. Defaults to 0. + max_criticality (int, optional): The maximum criticality value. Defaults to 1000. + + """ + return calculate_criticality(self.predicted_anomalies, normal_idx, init_criticality, max_criticality) - def save(self, directory: str, **kwargs) -> None: + def save(self, directory: str | Path, **kwargs) -> None: """Saves the results to CSV files in the specified directory. Args: directory (str): The directory where the CSV files will be saved. - kwargs: other keywords args for `pd.DataFrame.to_csv` + kwargs: other keywords args for `pd.DataFrame.to_csv` (i.e. sep=',') """ # Ensure the directory exists - os.makedirs(directory, exist_ok=True) + directory = Path(directory) + directory.mkdir(exist_ok=True, parents=True) # Save each DataFrame as a CSV file - self.predicted_anomalies.to_csv(os.path.join(directory, 'predicted_anomalies.csv'), **kwargs) - self.reconstruction.to_csv(os.path.join(directory, 'reconstruction.csv'), **kwargs) - self.recon_error.to_csv(os.path.join(directory, 'reconstruction_errors.csv'), **kwargs) - self.anomaly_score.to_csv(os.path.join(directory, 'anomaly_scores.csv'), **kwargs) + self.predicted_anomalies.to_csv(directory / 'predicted_anomalies.csv', **kwargs) + self.reconstruction.to_csv(directory / 'reconstruction.csv', **kwargs) + self.recon_error.to_csv(directory / 'reconstruction_errors.csv', **kwargs) + self.anomaly_score.to_csv(directory / 'anomaly_scores.csv', **kwargs) if self.bias_data is not None: - self.bias_data.to_csv(os.path.join(directory, 'bias_data.csv'), **kwargs) + self.bias_data.to_csv(directory / 'bias_data.csv', **kwargs) if self.arcana_losses is not None: - self.arcana_losses.to_csv(os.path.join(directory, 'arcana_losses.csv'), **kwargs) + self.arcana_losses.to_csv(directory / 
'arcana_losses.csv', **kwargs) if self.tracked_bias is not None and len(self.tracked_bias) > 0: for idx, bias_df in enumerate(self.tracked_bias): - bias_df.to_csv(os.path.join(directory, f'tracked_bias_{idx}.csv'), **kwargs) + bias_df.to_csv(directory / f'tracked_bias_{idx}.csv', **kwargs) + + @classmethod + def load(cls, directory: str | Path, **kwargs) -> "FaultDetectionResult": + """Loads the results from CSV files in the specified directory. + + Args: + directory (str | Path): The directory where the CSV files are stored. + kwargs: other keywords args for `pd.read_csv` (e.g., sep=',') + + Returns: + FaultDetectionResult: The loaded result object. + """ + directory = Path(directory) + + # Default pandas loading arguments to ensure indices are restored correctly + params = {'index_col': 0, 'parse_dates': True} + params.update(kwargs) + + # Load mandatory fields + predicted_anomalies = pd.read_csv(directory / 'predicted_anomalies.csv', **params).iloc[:, 0] + # Ensure predicted_anomalies is explicitly a Series and boolean + predicted_anomalies = predicted_anomalies.astype(bool) + + reconstruction = pd.read_csv(directory / 'reconstruction.csv', **params) + recon_error = pd.read_csv(directory / 'reconstruction_errors.csv', **params) + anomaly_score = pd.read_csv(directory / 'anomaly_scores.csv', **params).iloc[:, 0] + + # Load optional fields if they exist + bias_data = None + if (directory / 'bias_data.csv').exists(): + bias_data = pd.read_csv(directory / 'bias_data.csv', **params) + + arcana_losses = None + if (directory / 'arcana_losses.csv').exists(): + arcana_losses = pd.read_csv(directory / 'arcana_losses.csv', **params) + + tracked_bias = None + tracked_files = sorted(directory.glob('tracked_bias_*.csv'), key=lambda p: (len(p.stem), p.stem)) # numeric order: _2 before _10 + if tracked_files: + tracked_bias = [pd.read_csv(f, **params) for f in tracked_files] + + return cls( + predicted_anomalies=predicted_anomalies, + reconstruction=reconstruction, + recon_error=recon_error, + anomaly_score=anomaly_score, + bias_data=bias_data, 
+ arcana_losses=arcana_losses, + tracked_bias=tracked_bias + ) @dataclass @@ -64,6 +132,6 @@ class ModelMetadata: """Class to encapsulate metadata about the FaultDetector model.""" model_date: str - model_path: str + model_path: str | Path train_recon_error: np.ndarray val_recon_error: Optional[np.ndarray] = None diff --git a/energy_fault_detector/core/model_factory.py b/energy_fault_detector/core/model_factory.py index 4aef0bf..1aca17f 100644 --- a/energy_fault_detector/core/model_factory.py +++ b/energy_fault_detector/core/model_factory.py @@ -32,9 +32,11 @@ def _initialize_models(self) -> None: # Retrieve training configuration train_dict = self.config['train'] - # data preprocessor + # data preprocessor - not specified leads to a default pipeline + data_prep_conf = (train_dict.get('data_preprocessor', {}) or {}) self._models['data_preprocessor'] = DataPreprocessor( - **train_dict.get('data_preprocessor', {}).get('params', {}) + steps=data_prep_conf.get('steps'), + **(data_prep_conf.get('params') or {}) # 'params' is nullable in the schema; None must not be unpacked ) # Loop through each model type and initialize the corresponding model diff --git a/energy_fault_detector/data_preprocessing/column_selector.py b/energy_fault_detector/data_preprocessing/column_selector.py index 3274cbf..2c5ced3 100644 --- a/energy_fault_detector/data_preprocessing/column_selector.py +++ b/energy_fault_detector/data_preprocessing/column_selector.py @@ -1,4 +1,3 @@ - from typing import Optional, List import numpy as np @@ -14,7 +13,8 @@ class ColumnSelector(DataTransformer): Args: max_nan_frac_per_col: maximum fraction of NaN values allowed per column. Defaults to 0.05. If the fraction exceeds max_nan_frac_per_col, the column is dropped. - features_to_exclude: list of features that should be dropped. Defaults to None. + features_to_exclude: columns to drop (case-insensitive). + features_to_select: columns to keep (case-insensitive). Mutually exclusive with features_to_exclude. Attributes: feature_names_in_: list of column names in input. 
@@ -23,12 +23,20 @@ class ColumnSelector(DataTransformer): columns_dropped_: list of columns that were dropped. """ - def __init__(self, max_nan_frac_per_col: float = 0.05, features_to_exclude: List[str] = None): - + def __init__( + self, + max_nan_frac_per_col: float = 0.05, + features_to_exclude: Optional[List[str]] = None, + features_to_select: Optional[List[str]] = None, + ): super().__init__() - + if features_to_exclude is not None and features_to_select is not None: + raise ValueError("Only one of features_to_exclude or features_to_select can be specified.") + if not (0.0 <= max_nan_frac_per_col <= 1.0): + raise ValueError("max_nan_frac_per_col must be within [0, 1].") self.max_nan_frac_per_col: float = max_nan_frac_per_col self.features_to_exclude: List[str] = features_to_exclude if features_to_exclude is not None else [] + self.features_to_select: Optional[List[str]] = features_to_select # pylint: disable=attribute-defined-outside-init # noinspection PyAttributeOutsideInit @@ -43,11 +51,17 @@ def fit(self, x: pd.DataFrame, y: Optional[np.array] = None) -> 'ColumnSelector' self.feature_names_in_ = x.columns.to_list() self.n_features_in_ = len(x.columns) - # drop features to exclude - ignore upper/lower case - to_drop = [col for col in x.columns if col.lower() in - [excluded_feature.lower() for excluded_feature in self.features_to_exclude] - ] - x_transformed = x.drop(to_drop, axis=1, errors='ignore') + # If features_to_select is provided - ignore upper/lower case + if self.features_to_select is not None: + select_lower = [f.lower() for f in self.features_to_select] + keep_cols = [col for col in x.columns if col.lower() in select_lower] + x_transformed = x[keep_cols] + else: + # drop features to exclude - ignore upper/lower case + to_drop = [col for col in x.columns if col.lower() in + [excluded_feature.lower() for excluded_feature in self.features_to_exclude] + ] + x_transformed = x.drop(to_drop, axis=1, errors='ignore') # drop columns which have more than 
max_nan_frac_per_col relative NaN frequency empty_percentage = x_transformed.isnull().mean(axis=0) @@ -69,8 +83,11 @@ def transform(self, x: pd.DataFrame) -> pd.DataFrame: # transformation is not possible. missing_columns = set(self.feature_names_out_) - set(x.columns) if len(missing_columns) > 0: - raise ValueError('ColumnSelector: There are columns missing in the prediction data, which were present in' - ' the training data. New models need to be trained!') + raise ValueError( + 'ColumnSelector: There are columns missing in the prediction data, which were present in' + ' the training data. Missing columns: ' + f"{', '.join(sorted(missing_columns))}. New models need to be trained!" + ) x = x[self.feature_names_out_] # ensure ordering return x diff --git a/energy_fault_detector/data_preprocessing/counter_diff_transformer.py b/energy_fault_detector/data_preprocessing/counter_diff_transformer.py new file mode 100644 index 0000000..c56fb6e --- /dev/null +++ b/energy_fault_detector/data_preprocessing/counter_diff_transformer.py @@ -0,0 +1,322 @@ + +from typing import Dict, List, Optional + +import numpy as np +import pandas as pd +from sklearn.utils.validation import check_is_fitted + +from energy_fault_detector.core.data_transformer import DataTransformer + + +class CounterDiffTransformer(DataTransformer): + """ + Transform monotonic counter columns into per-sample increments (default) or per-second rates (if compute_rate=True), + handling resets/rollovers and masking long time gaps. + + It handles counter resets/rollovers and optionally masks values after large time gaps, which helps avoid misleading + diffs/rates caused by missing data. + + Args: + counters: List of counter column names to transform. + compute_rate: If True, output per-second rates (increment / dt). If False (default), + output per-sample increments. + reset_strategy: One of {'zero', 'rollover', 'nan', 'auto'}: + + - 'zero' (default): if diff < 0, treat as reset-to-zero; increment = current_value. 
+ - 'rollover': if diff < 0, increment = current_value + (rollover_value - previous_value). + - 'nan': if diff < 0, set increment to NaN. + - 'auto': use 'rollover' if rollover_values contains the counter; otherwise 'zero'. + + rollover_values: Optional mapping counter -> known max value (used by 'rollover' or 'auto'). + small_negative_tolerance: Treat small negative diffs (``abs(diff) <= tol``) as 0 (noise). Default: 0.0. + fill_first: One of {'nan', 'zero'}. How to fill the first sample where diff is undefined. + keep_original: If True, keep original counters alongside new outputs. If False, drop them. + gap_policy: One of {'mask', 'ignore'}: + + - 'mask' (default): set output to NaN for rows where time delta > threshold. + - 'ignore': do nothing special for large gaps. + + max_gap_seconds: Explicit threshold (in seconds) for gap masking. If provided, overrides + max_gap_factor. + max_gap_factor: If max_gap_seconds is None, use threshold = factor * median(dt). + Default is 3.0. + + Notes: + - A DatetimeIndex is required if compute_rate=True or gap_policy='mask'. + - The inverse_transform is a no-op and returns the input unchanged. + + Examples: + - Diffs: [0, 1, 3, 0 (reset), 2] -> [NaN|0, 1, 2, 0|NaN, 2] + - Rates: increment / dt (in seconds), with large-gap rows optionally masked to NaN. 
+ """ + + def __init__( + self, + counters: List[str], + compute_rate: bool = False, + reset_strategy: str = "zero", + rollover_values: Optional[Dict[str, float]] = None, + small_negative_tolerance: float = 0.0, + fill_first: str = "nan", + keep_original: bool = False, + gap_policy: str = "mask", + max_gap_seconds: Optional[float] = None, + max_gap_factor: float = 3.0, + ) -> None: + super().__init__() + self.counters = counters or [] + self.compute_rate = compute_rate + self.reset_strategy = reset_strategy + self.rollover_values = rollover_values or {} + self.small_negative_tolerance = float(small_negative_tolerance) + self.fill_first = fill_first + self.keep_original = keep_original + self.gap_policy = gap_policy + self.max_gap_seconds = max_gap_seconds + self.max_gap_factor = float(max_gap_factor) + + def fit(self, x: pd.DataFrame, y: Optional[pd.Series] = None) -> "CounterDiffTransformer": + """Validate inputs and compute output schema. + + This method validates the time index (when needed), stores the list of counters that are + present in the input, and computes the output column layout such that transform() can + reproduce the same order deterministically. + + Args: + x: Input DataFrame. Requires a DatetimeIndex if compute_rate=True or gap_policy='mask'. + y: Unused. Present for estimator interface compatibility. + + Returns: + self + + Raises: + ValueError: If a DatetimeIndex is required but missing or non-monotonic. + """ + self.feature_names_in_ = x.columns.to_list() + self.n_features_in_ = len(x.columns) + + # DatetimeIndex is required for rates or for gap masking + if self.compute_rate or self.gap_policy == "mask": + if not isinstance(x.index, pd.DatetimeIndex): + raise ValueError( + "CounterDiffTransformer: DatetimeIndex required (rate or gap masking)." 
+ ) + if not x.index.is_monotonic_increasing: + raise ValueError("CounterDiffTransformer: index must be monotonic increasing.") + + # Keep only counters present in the DataFrame + self.counters_ = [c for c in self.counters if c in self.feature_names_in_] + + # Determine output suffix + self.output_suffix_ = "_rate" if self.compute_rate else "_diff" + + # Compose output feature order + new_cols = [f"{c}{self.output_suffix_}" for c in self.counters_] + if self.keep_original: + # Append new output columns after all original features + self.feature_names_out_ = list(self.feature_names_in_) + new_cols + else: + # Keep non-counter features first, then the new output columns + others = [col for col in self.feature_names_in_ if col not in self.counters_] + self.feature_names_out_ = others + new_cols + + # Track columns dropped when keep_original is False (for introspection/testing) + self.columns_dropped_ = [] if self.keep_original else [c for c in self.counters_] + return self + + def _time_deltas_seconds(self, x: pd.DataFrame) -> Optional[pd.Series]: + """Compute per-row time delta in seconds, or None if not needed. + + Returns NaN for the first row and when dt is 0 seconds (zero dt is masked to NaN to avoid + division by zero for rate calculations). + + Args: + x: Input DataFrame. + + Returns: + A Series of dt seconds aligned to x.index, or None if neither rate nor masking is used. + + Raises: + ValueError: If a DatetimeIndex is required but missing or non-monotonic. 
+ """ + if not (self.compute_rate or self.gap_policy == "mask"): + return None + if not isinstance(x.index, pd.DatetimeIndex): + raise ValueError("CounterDiffTransformer: DatetimeIndex required for rate or gap masking.") + if not x.index.is_monotonic_increasing: + raise ValueError("CounterDiffTransformer: index must be monotonic increasing.") + + # Create a series of timestamps to keep the original index for alignment + dt = pd.Series(x.index, index=x.index).diff().dt.total_seconds() + # Prevent division by zero when computing rates + dt = dt.mask(dt == 0, np.nan) + return dt + + def _gap_threshold(self, dt: pd.Series) -> Optional[float]: + """Compute the gap masking threshold in seconds, or None if masking disabled. + + Args: + dt: Series of time deltas in seconds. + + Returns: + Threshold in seconds or None if masking is not applicable. If max_gap_seconds is given, + it is used; otherwise threshold = max_gap_factor * median(dt). If median is not finite + or <= 0, returns None and masking is effectively disabled. + """ + if self.gap_policy != "mask" or dt is None: + return None + if self.max_gap_seconds is not None: + return float(self.max_gap_seconds) + + med = float(np.nanmedian(dt.values)) if len(dt) else np.nan + if not np.isfinite(med) or med <= 0: + return None + return self.max_gap_factor * med + + def _compute_increment( + self, + s: pd.Series, + strategy: str, + rollover_value: Optional[float], + ) -> pd.Series: + """Compute per-sample increment for a counter series with reset handling. + + This applies the selected reset strategy to negative diffs and also clamps small negative + diffs (within small_negative_tolerance) to zero to mitigate minor noise/clock skew. + + Args: + s: Input counter Series. + strategy: Reset strategy ('zero', 'rollover', 'nan', 'auto'). + rollover_value: Known rollover maximum (used by 'rollover' or 'auto'). 
+ + Returns: + Series of increments aligned to s.index, with the first element filled according to + fill_first ('zero' or 'nan'). + + Raises: + ValueError: If series contains non-numeric values (excluding existing NaNs), + or if strategy is 'rollover' but rollover_value is None, + or if an unknown reset strategy is provided. + """ + # Try to coerce to numeric; if this introduces new NaNs (beyond existing ones), error out + v = pd.to_numeric(s, errors="coerce") + if v.isna().sum() > s.isna().sum(): + raise ValueError( + "CounterDiffTransformer: non-numeric values found in counter series. " + "Ensure all counter values are numeric or NaN." + ) + prev = v.shift(1) + diff = v.diff() + + # Clamp small negative diffs to zero (treat as noise) + if self.small_negative_tolerance > 0: + small_neg = (diff < 0) & ((-diff) <= self.small_negative_tolerance) + diff = diff.mask(small_neg, 0.0) + + neg_mask = diff < 0 + + # Map 'auto' to a concrete strategy + if strategy == "auto": + strategy = "rollover" if rollover_value is not None else "zero" + + if strategy == "nan": + inc = diff.mask(neg_mask, np.nan) + elif strategy == "zero": + # Treat reset-to-zero as increment equals current value + inc = diff.where(~neg_mask, v) + elif strategy == "rollover": + if rollover_value is None: + # Explicit 'rollover' requires a value. + raise ValueError( + "CounterDiffTransformer: rollover strategy requires a rollover_value for the " + f"counter '{s.name}'. Use reset_strategy='auto' to fallback to 'zero' when not provided." + ) + # Add the wrapped amount: current + (rollover - previous) + inc = diff.where(~neg_mask, v + (rollover_value - prev)) + else: + raise ValueError(f"CounterDiffTransformer: unknown reset_strategy '{strategy}'") + + return inc + + def transform(self, x: pd.DataFrame) -> pd.DataFrame: + """Transform counters into diffs or rates, with optional gap masking. + + For each configured counter: + 1) Compute per-sample increment with reset handling. 
+ 2) If compute_rate=True, divide by dt seconds. + 3) If gap_policy='mask', set values to NaN where dt > gap_threshold. + + Args: + x: Input DataFrame. Requires a DatetimeIndex if compute_rate=True or gap_policy='mask'. + + Returns: + A DataFrame with transformed columns appended (if keep_original=True) or replacing the + original counters (if keep_original=False). Column order matches fit()'s schema. + + Raises: + ValueError: If DatetimeIndex is required but missing or non-monotonic. + """ + check_is_fitted(self) + x_ = x.copy() + + dt = self._time_deltas_seconds(x_) + gap_thr = self._gap_threshold(dt) if dt is not None else None + + new_cols = {} + for c in self.counters_: + increment = self._compute_increment( + x_[c], strategy=self.reset_strategy, rollover_value=self.rollover_values.get(c) + ) + series = (increment / dt) if self.compute_rate and dt is not None else increment + + # Ensure first sample respects fill_first setting + series.iloc[0] = 0.0 if self.fill_first == "zero" else np.nan + + # Mask large gaps for both diffs and rates to avoid misleading values + if gap_thr is not None: + series = series.mask(dt > gap_thr) + + new_cols[f"{c}{self.output_suffix_}"] = series + + # Attach new columns + for name, col in new_cols.items(): + x_[name] = col + + # Optionally remove original counter columns + if not self.keep_original: + x_ = x_.drop(columns=self.counters_, errors='ignore') + + # Reorder to the schema established during fit + x_ = x_[self.feature_names_out_] + return x_ + + def inverse_transform(self, x: pd.DataFrame) -> pd.DataFrame: + """If original counter columns are present, drop the derived columns and restore original feature order. + Otherwise, returns the input as is. + + Args: + x: Input DataFrame. + + Returns: + A copy restricted to the fit-time input columns, in their original order, when all of them are present; otherwise the input DataFrame unchanged.
+ """ + check_is_fitted(self) + x_ = x.copy() + orig_counters_present = all(c in x_.columns for c in self.counters_) + if orig_counters_present: + if all(col in x_.columns for col in self.feature_names_in_): + x_ = x_[self.feature_names_in_] + return x_ + return x + + def get_feature_names_out(self, input_features: Optional[List[str]] = None) -> List[str]: + """Return the output feature names determined in fit(). + + Args: + input_features: Unused. Present for compatibility with sklearn API. + + Returns: + List of output column names. + """ + check_is_fitted(self) + return self.feature_names_out_ diff --git a/energy_fault_detector/data_preprocessing/data_clipper.py b/energy_fault_detector/data_preprocessing/data_clipper.py index 3adf558..965c457 100644 --- a/energy_fault_detector/data_preprocessing/data_clipper.py +++ b/energy_fault_detector/data_preprocessing/data_clipper.py @@ -1,7 +1,7 @@ """Clip data before standardization or normalization""" import logging -from typing import Optional, List, Union +from typing import Optional, List import numpy as np import pandas as pd @@ -19,19 +19,42 @@ class DataClipper(DataTransformer): Args: lower_percentile (float): The lower percentile for clipping (default: 0.01). upper_percentile (float): The upper percentile for clipping (default: 0.99). - features_to_exclude (List[str]): A list of column names representing feature that should not be clipped. + features_to_exclude (List[str] | None): Column names that should not be clipped. + features_to_clip (List[str] | None): Column names that should be clipped (mutually exclusive with + features_to_exclude). + Configuration example: + + .. 
code-block:: text + + train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + features_to_exclude: + - do_not_clip_this_feature """ def __init__(self, lower_percentile: float = 0.01, upper_percentile: float = 0.99, - features_to_exclude: List[str] = None): + features_to_exclude: Optional[List[str]] = None, features_to_clip: Optional[List[str]] = None) -> None: + super().__init__() + if features_to_clip is not None and features_to_exclude is not None: + raise ValueError('Only one of features_to_clip or features_to_exclude can be specified.') + if not (0.0 <= lower_percentile <= 1.0) or not (0.0 <= upper_percentile <= 1.0): + raise ValueError('Percentiles must be within [0, 1].') + if lower_percentile >= upper_percentile: + raise ValueError('lower_percentile must be strictly less than upper_percentile.') + self.lower_percentile = lower_percentile self.upper_percentile = upper_percentile - self.feature_to_exclude: List[str] = features_to_exclude if features_to_exclude is not None else [] + self.feature_to_exclude: Optional[List[str]] = features_to_exclude + self.features_to_clip: Optional[List[str]] = features_to_clip - def fit(self, x: Union[np.array, pd.DataFrame], y: Optional[np.array] = None) -> 'DataClipper': + def fit(self, x: pd.DataFrame, y: Optional[np.array] = None) -> 'DataClipper': """Set feature names in and out.""" + if not isinstance(x, pd.DataFrame): + raise TypeError('DataClipper.fit expects a pandas DataFrame.') self.feature_names_in_ = x.columns.to_list() self.feature_names_out_ = x.columns.to_list() return self @@ -48,11 +71,23 @@ def transform(self, x: pd.DataFrame) -> pd.DataFrame: """ check_is_fitted(self) - # Exclude columns representing angles + + # Select feature to clip x_ = x.copy() - x_without_feature_to_exclude = x_[[col for col in x_.columns if col not in self.feature_to_exclude]] + if self.feature_to_exclude is not None: + selected_features = [col for col in x_.columns if col not in self.feature_to_exclude] + elif 
self.features_to_clip is not None: + selected_features = [col for col in x_.columns if col in self.features_to_clip] + else: + # Clip all numeric columns + selected_features = x_.columns.tolist() + # Exclude non-numeric columns - x_numeric = x_without_feature_to_exclude.select_dtypes(include=np.number) + x_numeric = x_[selected_features].select_dtypes(include=np.number) + + if x_numeric.shape[1] == 0: + logger.debug('DataClipper.transform: no numeric columns selected; returning input unchanged.') + return x_ # Clip the data using the specified percentiles x_clipped = x_numeric.clip( diff --git a/energy_fault_detector/data_preprocessing/data_preprocessor.py b/energy_fault_detector/data_preprocessing/data_preprocessor.py index 5d0c809..8883b46 100644 --- a/energy_fault_detector/data_preprocessing/data_preprocessor.py +++ b/energy_fault_detector/data_preprocessing/data_preprocessor.py @@ -1,6 +1,8 @@ """Generic class for building a preprocessing pipeline.""" -from typing import List, Optional +from collections import Counter, defaultdict +from typing import List, Optional, Dict, Any, Tuple +import warnings import pandas as pd from sklearn.pipeline import Pipeline @@ -12,179 +14,498 @@ from energy_fault_detector.data_preprocessing.low_unique_value_filter import LowUniqueValueFilter from energy_fault_detector.data_preprocessing.angle_transformer import AngleTransformer from energy_fault_detector.data_preprocessing.duplicate_value_to_nan import DuplicateValuesToNan +from energy_fault_detector.data_preprocessing.counter_diff_transformer import CounterDiffTransformer class DataPreprocessor(Pipeline, SaveLoadMixin): - """A data preprocessing pipeline that allows for configurable steps based on the extended pipeline. - - 0. (optional) Replace any consecutive duplicate zero-values (or another value) with NaN. This step should be - used if 0 can also represent missing values in the data. - 1. 
(optional) Column selection: A ColumnSelector object filters out columns/features with too many NaN values. - 2. (optional) Features containing angles are transformed to sine/cosine values. - 3. (optional) Low unique value filter: Remove columns/features with a low number of unique values or - high fraction of zeroes. The high fraction of zeros setting should be used if 0 can also represent missing - values in the data. - 4. Imputation with sklearn's SimpleImputer - 5. Scaling: Apply either sklearn's StandardScaler or MinMaxScaler. - - Args: - angles: List of angle features for transformation. Defaults to None. - If none provided (or empty list), this step is skipped. - imputer_strategy: Strategy for imputation ('mean', 'median', 'most_frequent', 'constant'). Defaults to 'mean'. - imputer_fill_value: Value to fill for imputation (if imputer_strategy=='constant'). - scale: Type of scaling ('standardize' or 'normalize'). Defaults to 'standardize'. - include_column_selector: Whether to include the column selector step. Defaults to True. - features_to_exclude: ColumnSelector option, list of features to exclude from processing. - max_nan_frac_per_col: ColumnSelector option, max fraction of NaN values allowed per column. Defaults to 0.05. - include_low_unique_value_filter: Whether to include the low unique value filter step. Defaults to True. - min_unique_value_count: Minimum number of unique values for low unique value filter. Defaults to 2. - max_col_zero_frac: Maximum fraction of zeroes for low unique value filter. Defaults to 1.0. - include_duplicate_value_to_nan: Whether to include the duplicate value replacement step. Defaults to False. - value_to_replace: Value to replace with NaN (if using duplicate value replacement). Defaults to None. - n_max_duplicates: Max number of consecutive duplicates to replace with NaN. Defaults to 144. - - Configuration example: - - .. 
code-block:: text - - train: - data_preprocessor: - params: - scale: normalize - imputer_strategy: mean - max_nan_frac_per_col: 0.05 - include_low_unique_value_filter: true - min_unique_value_count: 2 - max_col_zero_frac: 0.99 - angles: - - angle1 - - angle2 - features_to_exclude: - - feature1 - - feature2 - """ - - def __init__(self, - angles: Optional[List[str]] = None, - imputer_strategy: str = 'mean', - imputer_fill_value: Optional[int] = None, - scale: str = 'standardize', - include_column_selector: bool = True, - features_to_exclude: Optional[List[str]] = None, - max_nan_frac_per_col: float = 0.05, - include_low_unique_value_filter: bool = True, - min_unique_value_count: int = 2, - max_col_zero_frac: float = 1., - include_duplicate_value_to_nan: bool = False, - value_to_replace: float = 0, - n_max_duplicates: int = 144, - duplicate_features_to_exclude: Optional[List[str]] = None - ): - - self.include_column_selector = include_column_selector - self.features_to_exclude = features_to_exclude - self.max_nan_frac_per_col = max_nan_frac_per_col - - self.angles = angles - - self.include_low_unique_value_filter = include_low_unique_value_filter - self.min_unique_value_count = min_unique_value_count - self.max_col_zero_frac = max_col_zero_frac - - self.imputer_strategy = imputer_strategy - self.imputer_fill_value = imputer_fill_value - - self.scale = scale - - self.include_duplicate_value_to_nan = include_duplicate_value_to_nan - self.value_to_replace = value_to_replace - self.n_max_duplicates = n_max_duplicates - self.duplicate_features_to_exclude = duplicate_features_to_exclude - - # Define the scaler based on the chosen scale type - scaler = (StandardScaler(with_mean=True, with_std=True) - if scale in ['standardize', 'standard', 'standardscaler'] - else MinMaxScaler(feature_range=(0, 1))) - - # Configure the pipeline steps - steps = [] - - if include_duplicate_value_to_nan: - steps.append( - ('value_to_nan', - # Do not open source, very specific to our data 
problems - DuplicateValuesToNan(value_to_replace=value_to_replace, n_max_duplicates=n_max_duplicates, - features_to_exclude=duplicate_features_to_exclude)) - ) - if include_column_selector: - steps.append( - ('column_selector', - ColumnSelector(max_nan_frac_per_col=max_nan_frac_per_col, features_to_exclude=features_to_exclude)) - ) - if include_low_unique_value_filter: - steps.append( - ('low_unique_value_filter', - LowUniqueValueFilter(min_unique_value_count=min_unique_value_count, max_col_zero_frac=max_col_zero_frac)) - ) - if angles is not None and len(angles) > 0: - steps.append(('angle_transform', AngleTransformer(angles=angles))) + STEP_REGISTRY = { + 'duplicate_to_nan': DuplicateValuesToNan, + 'column_selector': ColumnSelector, + 'low_unique_value_filter': LowUniqueValueFilter, + 'angle_transformer': AngleTransformer, + 'counter_diff_transformer': CounterDiffTransformer, + 'simple_imputer': SimpleImputer, + 'standard_scaler': StandardScaler, + 'minmax_scaler': MinMaxScaler, + } + + NAME_ALIASES: Dict[str, str] = { + "angle_transform": "angle_transformer", + "counter_diff": "counter_diff_transformer", + "counter_diff_transform": "counter_diff_transformer", + "standardize": "standard_scaler", + "standard": "standard_scaler", + "standardscaler": "standard_scaler", + "minmax": "minmax_scaler", + "imputer": "simple_imputer", + "duplicate_value_to_nan": "duplicate_to_nan", + "duplicate_values_to_nan": "duplicate_to_nan", + } + + def __init__(self, steps: Optional[List[Dict[str, Any]]] = None, **params: Any) -> None: + """A data preprocessing pipeline that allows for configurable steps based on the extended pipeline. + + If both steps and legacy params are provided, steps take precedence and a warning is emitted. + When neither steps nor legacy params are provided, a default "old-style" pipeline is created which removes + features that are constant or contain more than 5% missing values.
Afterward, remaining missing + values are imputed with the mean and the features are scaled with the StandardScaler. - # default steps: - steps.append(('imputer', SimpleImputer(strategy=imputer_strategy, - fill_value=imputer_fill_value).set_output(transform='pandas'))) - steps.append(('scaler', scaler)) - - super().__init__(steps=steps) - self.set_output(transform="pandas") # set output of all transformers to pandas + Args: + steps: Optional list of step specifications. Each item is a dict with: + + - name: registered step name (see STEP_REGISTRY). + - enabled: optional bool (default True). + - params: dict of constructor arguments for the step. + - step_name: optional explicit pipeline name (defaults to name). + + **params: Legacy parameters used when steps is None or empty (see _legacy_keys()). + + Notes: + Enforced ordering in steps mode: + + 1) NaN introducing steps first (DuplicateValuesToNan, CounterDiffTransformer), + 2) ColumnSelector, then LowUniqueValueFilter (if present), + 3) Other steps, in their given order, + 4) SimpleImputer placed before scaler (always present; mean strategy by default), + 5) Scaler always last (StandardScaler by default). + + Configuration example: + + .. code-block:: text + + train: + data_preprocessor: + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + features_to_exclude: ['exclude_this_feature'] + - name: counter_diff_transformer + step_name: counter_flow + params: + counters: ['flow_total_m3'] + compute_rate: true + fill_first: 'zero' + - name: counter_diff_transformer + step_name: counter_energy + params: + counters: ['energy_total_kwh'] + compute_rate: false + fill_first: 'zero' + reset_strategy: 'rollover' + rollover_values: + 'energy_total_kwh': 100000.0 + """ - def inverse_transform(self, x: pd.DataFrame, **kwargs) -> pd.DataFrame: - """Reverses the scaler and angle transforms applied to the data. - Other transformations are not reversed.
+ self.steps_spec_: Optional[List[Dict[str, Any]]] = steps + self.params_: Dict[str, Any] = dict(params) + + if steps is not None and len(steps) > 0: + # Warn if legacy params are present alongside steps. + legacy_keys = set(self._legacy_keys()) + legacy_used = [k for k in self.params_.keys() if k in legacy_keys] + if legacy_used: + warnings.warn( + f"DataPreprocessor: 'steps' provided; legacy params are ignored: {legacy_used}", + UserWarning + ) + built_steps = self._build_from_steps_spec() + else: + # Build the default or legacy pipeline. If params is empty, defaults are applied. + built_steps = self._build_from_legacy() + + super().__init__(steps=built_steps) + # Ensure pandas output for supported transformers. + self.set_output(transform="pandas") + + def inverse_transform(self, x: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """Inverse-transform scaler and angles (other transforms are not reversed). Args: x: The transformed data. Returns: - A DataFrame with the inverse transformed data. + DataFrame with inverse scaling and angle back-transformation. """ + # Find scaler by type + scaler_key, _ = self._find_step_by_type((StandardScaler, MinMaxScaler)) + x_ = self.named_steps[scaler_key].inverse_transform(x.copy()) + x_ = pd.DataFrame(data=x_, columns=self.named_steps[scaler_key].get_feature_names_out()) - x_ = self.named_steps['scaler'].inverse_transform(x.copy()) - x_ = pd.DataFrame(data=x_, columns=self.named_steps['scaler'].get_feature_names_out()) - if 'angle_transform' in self.named_steps: - x_ = self.named_steps['angle_transform'].inverse_transform(x_) + # AngleTransformer supports inverse_transform; apply if present. + angle_key, _ = self._find_step_by_type((AngleTransformer,)) + if angle_key is not None: + x_ = self.named_steps[angle_key].inverse_transform(x_) + # Keep original index (important for time series). 
if isinstance(x, pd.DataFrame): - # ensure the index is kept x_.index = x.index - return x_ # pylint: disable=arguments-renamed - def transform(self, x: pd.DataFrame, **kwargs) -> pd.DataFrame: - """Transforms the input DataFrame using the pipeline. + def transform(self, x: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """Apply pipeline steps to the input DataFrame. Args: x: Input DataFrame. Returns: - a dataframe with the same index as the input dataframe. + DataFrame with the same index as input. """ - x_ = super().transform(X=x.copy()) - return pd.DataFrame(data=x_, - columns=self.get_feature_names_out(), - index=x.index) + return pd.DataFrame(data=x_, columns=self.get_feature_names_out(), index=x.index) # pylint: disable=arguments-renamed - def fit_transform(self, x: pd.DataFrame, **kwargs) -> pd.DataFrame: - """Fit the model and transform with the final estimator. + def fit_transform(self, x: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """Fit and transform in one step. Args: x: Input DataFrame. Returns: - Transformed DataFrame with the same index as the input dataframe. + Transformed DataFrame with the same index as input. 
""" - super().fit(X=x) return self.transform(x) + + def _find_step_by_type(self, types: Tuple[type, ...]) -> Tuple[Optional[str], Optional[object]]: + """Return the (step name, estimator) of the first step matching any of the given types.""" + for name, est in self.named_steps.items(): + if isinstance(est, types): + return name, est + return None, None + + @staticmethod + def _legacy_keys() -> List[str]: + """Return the list of supported legacy parameter keys.""" + return [ + "angles", + "imputer_strategy", + "imputer_fill_value", + "scale", + "include_column_selector", + "features_to_exclude", + "max_nan_frac_per_col", + "include_low_unique_value_filter", + "min_unique_value_count", + "max_col_zero_frac", + "include_duplicate_value_to_nan", + "value_to_replace", + "n_max_duplicates", + "duplicate_features_to_exclude", + "counter_columns_to_transform", + ] + + def _normalize_name(self, name: str) -> str: + """Normalize a user-provided step name to a canonical registry key.""" + return self.NAME_ALIASES.get(name, name) + + @staticmethod + def _validate_singletons(steps_spec: List[Dict[str, Any]]) -> None: + """Ensure only one instance of selected steps is present (enabled ones).""" + singleton_names = { + "angle_transformer", + "column_selector", + "low_unique_value_filter", + "simple_imputer", + # scaler handled separately (standard_scaler/minmax_scaler) in your code + } + counts: List[Tuple[str, int]] = [] + for name in singleton_names: + n = sum(1 for s in steps_spec if s.get("enabled", True) and s.get("name") == name) + if n > 1: + counts.append((name, n)) + if counts: + raise ValueError( + "Each of these steps may appear at most once: " + f"{[n for n, _ in counts]}. Found duplicates: {counts}" + ) + + def _build_from_legacy(self) -> List: + """Build pipeline from legacy parameters (old behavior + enforced ordering). + + Steps: + 0. (optional) Replace any consecutive duplicate zero-values (or another value) with NaN. 
This step should be + used if 0 can also represent missing values in the data. + 1. (optional) Normalize counters to differences. + 2. (optional) Column selection: A ColumnSelector object filters out columns/features with too many NaN values. + 3. (optional) Low unique value filter: Remove columns/features with a low number of unique values or + high fraction of zeroes. The high fraction of zeros setting should be used if 0 can also represent missing + values in the data. + 4. (optional) Features containing angles are transformed to sine/cosine values. + 5. Imputation with sklearn's SimpleImputer + 6. Scaling: Apply either sklearn's StandardScaler or MinMaxScaler. + + Use legacy parameters passed via **params. If empty, defaults are used. + - angles: List of angle features for transformation. Default: None (skipped). + - imputer_strategy: Strategy for imputation ('mean', 'median', 'most_frequent', 'constant'). Default: 'mean'. + - imputer_fill_value: Value to fill for imputation (if imputer_strategy=='constant'). + - scale: Type of scaling ('standardize' or 'normalize'). Default: 'standardize'. + - include_column_selector: Whether to include the column selector step. Default: True. + - features_to_exclude: ColumnSelector option, list of features to exclude from processing. + - max_nan_frac_per_col: ColumnSelector option, max fraction of NaN values allowed per column. Default: 0.05. + - include_low_unique_value_filter: Whether to include the low unique value filter step. Default: True. + - min_unique_value_count: Minimum number of unique values for low unique value filter. Default: 2. + - max_col_zero_frac: Maximum fraction of zeroes for low unique value filter. Default: 1.0. + - include_duplicate_value_to_nan: Whether to include the duplicate value replacement step. Default: False. + - value_to_replace: Value to replace with NaN (if using duplicate value replacement). Default: 0. + - n_max_duplicates: Max number of consecutive duplicates to replace with NaN.
Default: 144. + - counter_columns_to_transform: List of counters to normalize to differences. Default: None (skipped). + + Returns: + List of (name, estimator) tuples for the pipeline. + """ + steps: List = [] + params = self.params_ + + # 0. Replace any consecutive duplicate zero-values (or another value) with NaN. + if params.get("include_duplicate_value_to_nan", False): + steps.append( + ( + "value_to_nan", + DuplicateValuesToNan(value_to_replace=params.get("value_to_replace", 0), + n_max_duplicates=params.get("n_max_duplicates", 144), + features_to_exclude=params.get("duplicate_features_to_exclude")), + ) + ) + # 1. (optional) Normalize counters to differences. + counter_cols = params.get("counter_columns_to_transform", []) + if len(counter_cols) > 0: + steps.append( + ( + "counter_diff", + CounterDiffTransformer( + counters=counter_cols, + compute_rate=False, + reset_strategy="zero", + rollover_values=None, + small_negative_tolerance=0.0, + fill_first="nan", + keep_original=False, + gap_policy="mask", + max_gap_seconds=None, + max_gap_factor=3.0, + ), + ) + ) + # 2. ColumnSelector (default enabled) + if params.get("include_column_selector", True): + steps.append( + ( + "column_selector", + ColumnSelector(max_nan_frac_per_col=params.get("max_nan_frac_per_col", 0.05), + features_to_exclude=params.get("features_to_exclude")), + ) + ) + # 3. Optional value filters and angle transforms (before imputer) + if params.get("include_low_unique_value_filter", True): + steps.append( + ( + "low_unique_value_filter", + LowUniqueValueFilter( + min_unique_value_count=params.get("min_unique_value_count", 2), + max_col_zero_frac=params.get("max_col_zero_frac", 1.0), + ), + ) + ) + # 4. Apply optional angle transformations + angles = params.get("angles", []) + if len(angles) > 0: + steps.append(("angle_transform", AngleTransformer(angles=angles))) + # 5. 
Impute missing values with SimpleImputer + steps.append( + ( + "simple_imputer", + SimpleImputer( + strategy=params.get("imputer_strategy", "mean"), + fill_value=params.get("imputer_fill_value", None), + ).set_output(transform="pandas"), + ) + ) + # 6. Scale data + scale = params.get("scale", "standardize") + scaler = ( + StandardScaler(with_mean=True, with_std=True) + if scale in ["standardize", "standard", "standardscaler"] + else MinMaxScaler(feature_range=(0, 1)) + ) + steps.append(("scaler", scaler)) + return steps + + def _build_from_steps_spec(self) -> List: + """Build pipeline from steps specification (preferred mode) with enforced ordering. + + Each step has the following keys: + - name: registered step name (see STEP_REGISTRY). + - enabled: optional, defaults to True. + - params: dict of constructor parameters for the step. + - step_name: optional explicit pipeline key (defaults to name). + + Returns: + List of (name, estimator) tuples for the pipeline. + + Raises: + ValueError: If a step lacks 'name' or references an unknown step. + """ + self._validate_step_spec_keys(self.steps_spec_) + # Filter disabled steps first to simplify ordering. + enabled_spec = [s for s in self.steps_spec_ if s.get("enabled", True)] + self._validate_singletons(enabled_spec) + ordered_spec = self._order_steps_spec(enabled_spec) + # Assign unique step names for duplicates or missing step_name + ordered_spec = self._assign_unique_step_names(ordered_spec) + + steps: List = [] + scaler_defined = False + scaler_names = {"standard_scaler", "minmax_scaler"} + scaler_idx = None + for step_idx, spec in enumerate(ordered_spec): + name = spec.get("name") + if name is None: + raise ValueError("Each step spec requires a 'name'.") + if name in scaler_names: + scaler_defined = True + scaler_idx = step_idx + params = spec.get("params", {}) + cls = self.STEP_REGISTRY.get(name) + if cls is None: + raise ValueError(f"Unknown step name '{name}'. 
Register it in STEP_REGISTRY.") + estimator = cls(**params) + step_name = spec.get("step_name", name) + steps.append((step_name, estimator)) + + # Ensure an Imputer exists and is placed before the scaler. + if not any(n == "simple_imputer" for n, _ in steps): + default_imputer = SimpleImputer(strategy="mean").set_output(transform="pandas") + # Insert before scaler if scaler already present; else append. + if scaler_idx is not None: + steps.insert(scaler_idx, ("simple_imputer", default_imputer)) + else: + steps.append(("simple_imputer", default_imputer)) + + # Ensure a scaler exists and is last. If missing, add StandardScaler by default. + if not scaler_defined: + steps.append(("scaler", StandardScaler(with_mean=True, with_std=True))) + + return steps + + def _order_steps_spec(self, steps_spec: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Normalize ordering rules for a steps specification. + + Rules: + - NaN introducing steps first (DuplicateValuesToNan and CounterDiffTransformer) + - ColumnSelector (if present). + - Other steps + - Any imputer placed at the end, before scaler. If no imputer was defined, the SimpleImputer with imputation + strategy 'mean' is added. + - Scaler last (if present). If no scaler is added, the StandardScaler with default values is added. + + Args: + steps_spec: List of step dictionaries. + + Returns: + Reordered list of step dictionaries. + """ + # Normalize names to canonical keys for grouping + for s in steps_spec: + s["name"] = self._normalize_name(s.get("name")) + + # Separate groups by type for easy reassembly. 
+ column_selector = [s for s in steps_spec if s.get("name") == "column_selector"] + low_unique_value_filter = [s for s in steps_spec if s.get("name") == "low_unique_value_filter"] + duplicates = [s for s in steps_spec if s.get("name") == "duplicate_to_nan"] + counter = [s for s in steps_spec if s.get("name") == "counter_diff_transformer"] + imputer = [s for s in steps_spec if s.get("name") == "simple_imputer"] + scaler_names = {"standard_scaler", "minmax_scaler"} + scalers = [s for s in steps_spec if s.get("name") in scaler_names] + if len(scalers) > 1: + raise ValueError("Only one scaler can be used, two found in the steps specification: ." + f"{scalers}") + others = [ + s for s in steps_spec + if s.get("name") not in { + "column_selector", "duplicate_to_nan", "counter_diff_transformer", "simple_imputer", + "low_unique_value_filter", + } | scaler_names + ] + + # Keep 'others' in their original relative order. + ordered = [] + # these steps can introduce NaN values or add new features that may be constant + ordered.extend(duplicates) + ordered.extend(counter) + # drop columns based on the values (NaNs, no variance) + ordered.extend(column_selector) + ordered.extend(low_unique_value_filter) + # other transformations + ordered.extend(others) + # end with imputation and scaling + ordered.extend(imputer) # may be empty; a default imputer is added later if missing + ordered.extend(scalers) # may be empty; scaler gets default added later if missing + return ordered + + @staticmethod + def _assign_unique_step_names(specs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Assign unique pipeline step names. If step_name is provided, use it; if it collides, append _2, _3, ... + If step_name is not provided, use the 'name' key. If this key occurs multiple times, assign name_1, name_2, ... + + This method mutates specs in place and also returns it. + + Returns: + Altered step specifications.
+ """ + total_counts = Counter(s["name"] for s in specs) + used: set[str] = set() + per_base_index = defaultdict(int) + + for s in specs: + explicit = s.get("step_name") + if explicit: + base = explicit + candidate = base + i = 1 + while candidate in used: + i += 1 + candidate = f"{base}_{i}" + s["step_name"] = candidate + used.add(candidate) + continue + + base = s["name"] + if total_counts[base] == 1 and base not in used: + candidate = base + else: + per_base_index[base] += 1 + candidate = f"{base}_{per_base_index[base]}" + while candidate in used: + per_base_index[base] += 1 + candidate = f"{base}_{per_base_index[base]}" + + s["step_name"] = candidate + used.add(candidate) + + return specs + + @staticmethod + def _validate_step_spec_keys(steps_spec: List[Dict[str, Any]]) -> None: + """Validate that each step spec uses only allowed keys and includes 'name'. + + Allowed keys: {'name', 'enabled', 'params', 'step_name'}. + + Args: + steps_spec: Raw steps specification provided by the user. + + Raises: + ValueError: If a step is missing 'name' or contains unknown keys. + """ + allowed = {"name", "enabled", "params", "step_name"} + + for i, spec in enumerate(steps_spec): + if "name" not in spec: + raise ValueError(f"Step #{i} is missing required key 'name'.") + unknown = set(spec.keys()) - allowed + if unknown: + raise ValueError( + f"Step #{i} has unknown keys: {sorted(unknown)}. " + f"Allowed keys are: {sorted(allowed)}." + ) diff --git a/energy_fault_detector/data_preprocessing/low_unique_value_filter.py b/energy_fault_detector/data_preprocessing/low_unique_value_filter.py index b910047..3502634 100644 --- a/energy_fault_detector/data_preprocessing/low_unique_value_filter.py +++ b/energy_fault_detector/data_preprocessing/low_unique_value_filter.py @@ -14,9 +14,9 @@ class LowUniqueValueFilter(DataTransformer): exceeds `max_col_zero_frac`. Args: - min_unique_value_count (int): Minimum number of unique values a feature should have. Defaults to 2. 
- If set to 2, only constant features are dropped. - max_col_zero_frac (float): Maximum fraction of zeroes a column may contain. + min_unique_value_count (int): Minimum number of unique values a feature should have. Default: 2. + If set to 2, only constant features are dropped. + max_col_zero_frac (float): Maximum fraction of zeroes a column may contain. Default: 1.0 Attributes: feature_names_in_ (list): List of column names in input. @@ -25,7 +25,7 @@ class LowUniqueValueFilter(DataTransformer): columns_dropped_ (list): List of columns that were dropped during filtering. """ - def __init__(self, min_unique_value_count: int = 1, max_col_zero_frac: float = 0.9): + def __init__(self, min_unique_value_count: int = 2, max_col_zero_frac: float = 1.0): super().__init__() self.min_unique_value_count: int = min_unique_value_count diff --git a/energy_fault_detector/evaluation/__init__.py b/energy_fault_detector/evaluation/__init__.py index f136890..499c600 100644 --- a/energy_fault_detector/evaluation/__init__.py +++ b/energy_fault_detector/evaluation/__init__.py @@ -1,4 +1,5 @@ """Evaluation classes and methods, including the CARE-Score and Care2CompareDataset.""" -from energy_fault_detector.evaluation.care_score import CAREScore -from energy_fault_detector.evaluation.care2compare import Care2CompareDataset +from .care_score import CAREScore +from .care2compare import Care2CompareDataset +from .predist_dataset import PreDistDataset diff --git a/energy_fault_detector/evaluation/care2compare.py b/energy_fault_detector/evaluation/care2compare.py index 11b05cc..6fcfda9 100644 --- a/energy_fault_detector/evaluation/care2compare.py +++ b/energy_fault_detector/evaluation/care2compare.py @@ -18,24 +18,22 @@ class Care2CompareDataset: The data can be downloaded either manually from https://doi.org/10.5281/zenodo.14958989 (in this case specify `path`) or it can be downloaded automatically by setting download_dataset to True. 
- All data is loaded into memory, which might be problematic for large datasets (consider using DataLoader classes of - TensorFlow and PyTorch in that case). - By default, only the averages are read. See statistics argument of the data loading methods. - Methods: - get_event_info: Returns event info for a given event ID - iter_datasets: Reads datasets and yields the resulting training and test DataFrames while iterating over - event IDs. - format_event_dataset: Extracts normal_index from a loaded dataset and returns normal_index and sensor_data. - iter_formatted_datasets: Reads datasets, extracts normal_index and yields the resulting train and test - DataFrames as well as the normal_indexes while iterating over event IDs. - load_event_dataset: Reads dataset specified by event_id and returns training and test data. - load_and_format_event_dataset: Reads dataset specified by event_id and returns training and test data as well as - the corresponding normal indexes. - iter_train_datasets_per_asset: Reads datasets and yields the resulting training DataFrames while - iterating over asset IDs and aggregating event IDs for the same assets. - update_c2c_config: Updates a specified FaultDetector config based on provided feature descriptions. + Method overview: + + - get_event_info: Returns event info for a given event ID + - iter_datasets: Reads datasets and yields the resulting training and test DataFrames while iterating over + event IDs. + - format_event_dataset: Extracts normal_index from a loaded dataset and returns normal_index and sensor_data. + - iter_formatted_datasets: Reads datasets, extracts normal_index and yields the resulting train and test + DataFrames as well as the normal_indexes while iterating over event IDs. + - load_event_dataset: Reads dataset specified by event_id and returns training and test data. + - load_and_format_event_dataset: Reads dataset specified by event_id and returns training and test data as well as + the corresponding normal indexes. 
+ - iter_train_datasets_per_asset: Reads datasets and yields the resulting training DataFrames while + iterating over asset IDs and aggregating event IDs for the same assets. + - update_c2c_config: Updates a specified FaultDetector config based on provided feature descriptions. Args: path (Path): The directory path where the dataset is located. @@ -127,12 +125,14 @@ def iter_formatted_datasets(self, wind_farm: str = None, test_only: bool = False statistics=statistics, index_column=index_column, use_readable_columns=use_readable_columns): if not test_only: - train_sensor_data, train_normal_index = self.format_event_dataset(tup[0]) - test_sensor_data, test_normal_index = self.format_event_dataset(tup[1]) - yield train_sensor_data, train_normal_index, test_sensor_data, test_normal_index, tup[2] + (x_train, x_test), event_id = tup + train_sensor_data, train_normal_index = self.format_event_dataset(x_train) + test_sensor_data, test_normal_index = self.format_event_dataset(x_test) + yield train_sensor_data, train_normal_index, test_sensor_data, test_normal_index, event_id else: - test_sensor_data, test_normal_index = self.format_event_dataset(tup[0]) - yield test_sensor_data, test_normal_index, tup[1] + x_test, event_id = tup + test_sensor_data, test_normal_index = self.format_event_dataset(x_test) + yield test_sensor_data, test_normal_index, event_id def load_event_dataset(self, event_id: int, test_only: bool = False, statistics: List[str] = None, index_column: str = 'id', use_readable_columns: bool = True @@ -179,9 +179,10 @@ def load_and_format_event_dataset(self, event_id: int, statistics: List[str] = N Returns: Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: - If test_only=False, yields a tuple of train_sensor_data, train_status, - test_sensor_data and test_status. - If test_only=True, yields a tuple of test_sensor_data and test_status. + + - If test_only=False, yields a tuple of train_sensor_data, train_status, test_sensor_data and test_status. 
+ - If test_only=True, yields a tuple of test_sensor_data and test_status. + """ tup = self.load_event_dataset(event_id=event_id, test_only=test_only, statistics=statistics, index_column=index_column, use_readable_columns=use_readable_columns) @@ -265,6 +266,24 @@ def get_columns(feature_description_selection: pd.DataFrame) -> List[str]: columns.append(f'{row.sensor_name}_{stat}') return columns + def merge_unique(base: List[str], to_add: List[str]) -> List[str]: + """Merge two lists preserving order and removing duplicates.""" + seen = set() + out: List[str] = [] + for v in (base or []) + (to_add or []): + if v not in seen: + seen.add(v) + out.append(v) + return out + + def find_step(names: List[str]) -> dict | None: + """Find step by name specification.""" + for s in steps: + name = s.get('name') + if name in names: + return s + return None + feature_descriptions = self.feature_descriptions[wind_farm] angles = feature_descriptions.loc[feature_descriptions['is_angle']] to_exclude = feature_descriptions.loc[feature_descriptions['is_counter']] @@ -272,12 +291,28 @@ def get_columns(feature_description_selection: pd.DataFrame) -> List[str]: angle_columns = get_columns(angles) to_exclude_columns = get_columns(to_exclude) - config['train']['data_preprocessor']['params']['angles'] = ( - config['train']['data_preprocessor']['params'].get('angles', []) + angle_columns - ) - config['train']['data_preprocessor']['params']['features_to_exclude'] = ( - config['train']['data_preprocessor']['params'].get('features_to_exclude', []) + to_exclude_columns - ) + # old: + dp = config['train'].setdefault('data_preprocessor', {}) + params = dp.get('params') + steps = dp.get('steps') + if params: + params['angles'] = merge_unique(params.get('angles', []), angle_columns) + params['features_to_exclude'] = merge_unique(params.get('features_to_exclude', []), to_exclude_columns) + # new + else: + angle_step = find_step(['angle_transformer', 'angle_transform']) + if angle_step is None: + 
steps.append({'name': 'angle_transformer', 'params': {'angles': angle_columns}}) + else: + angle_params = angle_step.setdefault('params', {}) + angle_params['angles'] = merge_unique(angle_params.get('angles', []), angle_columns) + colsel_step = find_step(['column_selector']) + if colsel_step is None: + steps.append({'name': 'column_selector', 'params': {'features_to_exclude': to_exclude_columns}}) + else: + colsel_params = colsel_step.setdefault('params', {}) + colsel_params['features_to_exclude'] = merge_unique( + colsel_params.get('features_to_exclude', []), to_exclude_columns) config.update_config(config.config_dict) diff --git a/energy_fault_detector/evaluation/care_score.py b/energy_fault_detector/evaluation/care_score.py index 2357e5e..5d77172 100644 --- a/energy_fault_detector/evaluation/care_score.py +++ b/energy_fault_detector/evaluation/care_score.py @@ -27,7 +27,7 @@ class CAREScore: The CARE score combines Coverage, Accuracy, Reliability and Earliness to evaluate early fault-detection performance (see CARE to Compare: A Real-World Benchmark Dataset for Early Fault Detection in Wind Turbine Data, - https://doi.org/10.3390/data9120138). The goal of the CARE-Score is to evaluate the ability of a given model to + https://doi.org/10.3390/data9120138). The goal of the CARE-Score is to evaluate the ability of a given model to separate `normal behavior` from `actionable anomalies` (see glossary for definitions), that lead to a fault or indicate a fault. @@ -167,15 +167,17 @@ def evaluate_event(self, event_start: Union[int, pd.Timestamp], event_end: Union Returns: dict: Dictionary with computed metrics, e.g.: - { - 'event_id': int, - 'event_label': str, - 'weighted_score': float, - 'max_criticality': float, - 'f_beta_score': float or NaN, - 'accuracy': float, - 'tp': int, 'fp': int, 'tn': int, 'fn': int - } + .. 
code-block:: python + + { + 'event_id': int, + 'event_label': str, + 'weighted_score': float, + 'max_criticality': float, + 'f_beta_score': float or NaN, + 'accuracy': float, + 'tp': int, 'fp': int, 'tn': int, 'fn': int + } Raises: ValueError: If event_label is invalid, evaluate_until_event_end has an unknown value, @@ -184,7 +186,7 @@ def evaluate_event(self, event_start: Union[int, pd.Timestamp], event_end: Union Notes: - The function sorts inputs by index to ensure alignment. - If normal_index is provided, this also influences the criticality calculation: criticality does not change - if the expected behaviour is not normal. + if the expected behaviour is not normal. - If predicted_anomalies_event is empty, a ValueError is raised. - Use evaluate_until_event_end to control whether post-event predictions are considered. """ @@ -265,15 +267,17 @@ def get_final_score(self, event_selection: Optional[List[int]] = None, criticali score for anomaly events), average Accuracy (for normal events) and Reliability (eventwise F-score) using the configured weights. - If the average accuracy over all normal events < 0.5, CARE-score = average accuracy over all normal events - (worse than random guessing). - If no anomalies were detected, the CARE-score = 0. - Else, the CARE-score is calculated as: + - If the average accuracy over all normal events < 0.5, CARE-score = average accuracy over all normal events + (worse than random guessing). + - If no anomalies were detected, the CARE-score = 0. + - Else, the CARE-score is calculated as: - ( (average F-score over all anomaly events) * coverage_w - + (average weighted score over all anomaly events) * weighted_score_w - + (average accuracy over all normal events) * accuracy_w - + event wise F-score * eventwise_f_score_w ) / sum_of_weights + .. 
code-block:: text + + ( (average F-score over all anomaly events) * coverage_w + + (average weighted score over all anomaly events) * weighted_score_w + + (average accuracy over all normal events) * accuracy_w + + event wise F-score * eventwise_f_score_w ) / sum_of_weights where `sum_of_weights` = coverage_w + weighted_score_w + accuracy_w + eventwise_f_score_w. diff --git a/energy_fault_detector/evaluation/predist_dataset.py b/energy_fault_detector/evaluation/predist_dataset.py new file mode 100644 index 0000000..177bb4f --- /dev/null +++ b/energy_fault_detector/evaluation/predist_dataset.py @@ -0,0 +1,208 @@ +import pandas as pd +from pathlib import Path +from typing import Dict, Any, Union +import logging + +from ..utils.data_downloads import download_zenodo_data + +logger = logging.getLogger('energy_fault_detector') + + +class PreDistDataset: + """Loader and preprocessor for the PreDist dataset. + + The data can be downloaded either manually from https://doi.org/10.5281/zenodo.17522254 (in this case specify + `path`) or it can be downloaded automatically by setting download_dataset to True. + + Args: + path (Union[str, Path]): Path to the dataset root. + download_dataset (bool): If True, downloads the PreDist dataset from Zenodo. + + Attributes: + events (Dict[int, pd.DataFrame]): preloaded events dataframe for each manufacturer.
+ """ + + FAULT_HOURS_AFTER = 24 + FAULT_HOURS_BEFORE = 48 + + def __init__(self, path: Union[str, Path], download_dataset: bool = False): + if download_dataset: + logger.info("Downloading PreDist dataset from Zenodo (10.5281/zenodo.17522254)...") + path = download_zenodo_data(identifier="10.5281/zenodo.17522254", dest=path, overwrite=False) + + self.root_path = Path(path) + + # preload events + self.events: Dict[int, pd.DataFrame] = { + 1: self._load_events(manufacturer=1), + 2: self._load_events(manufacturer=2) + } + + def _load_events(self, manufacturer: int, filter_efd: bool = True) -> pd.DataFrame: + """Loads and combines all events from faults.csv and normal_events.csv. + + Args: + manufacturer (int): Dataset 1 or 2. + filter_efd (bool): Whether to filter events with efd possible or not. Default: True. + + Returns: + Events as dataframe, with start and end based on the possible anomaly start and report date for faults and + based on event start and end for normal events. + """ + + m_path = self.root_path / f"Manufacturer {manufacturer}" + + faults = pd.read_csv(m_path / 'faults.csv', sep=';', parse_dates=[ + 'Possible anomaly start', 'Report date', 'Possible anomaly end', + 'Training start', 'Training end' + ], index_col='Event ID') + + normals = pd.read_csv(m_path / 'normal_events.csv', sep=';', parse_dates=[ + 'Event start', 'Event end', 'Training start', 'Training end' + ], index_col='Event ID') + + if filter_efd: + # Only filter faults where early fault detection is possible (from a data perspective) + faults = faults[faults['efd_possible']] + + faults['Event type'] = 'anomaly' + faults['Event end'] = faults['Report date'] # for easy data selection later on + normals['Event type'] = 'normal' + + return pd.concat([faults, normals]) + + def load_substation_data(self, manufacturer: int, substation_id: int) -> pd.DataFrame: + """Loads raw CSV, maps string values, and cleans indices.""" + file_path = self.root_path / f"Manufacturer {manufacturer}" / 
'operational_data' / f"substation_{substation_id}.csv" + df = pd.read_csv(file_path, sep=';', index_col='timestamp', parse_dates=['timestamp'], low_memory=False) + df.index = df.index.tz_localize(None) + df = df.sort_index() + + # Mapping string values (EIN/AUS) to (1/0) + val_map = {'EIN': 1, 'AUS': 0} + status_cols = [c for c in df.columns + if any(x in c for x in ['s_hc1_heating_pump_status_setpoint', + 's_hc1.2_heating_pump_status', + 's_hc1.3_heating_pump_status', + 's_hc2_dhw_3-way_valve_status', + 's_dhw_3-way_valve_status', + 's_hc1.1_heating_pump_status'])] + for col in status_cols: + if col in df.columns: + df[col] = df[col].map(val_map).astype('Int32') + + # Map control unit mode to integer + mode_map = {'Nacht': -1, 'Standby': 0, 'Tag': 1} + for col in [c for c in df.columns if 'control_unit_mode' in c]: + df[col] = df[col].map(mode_map).astype('Int32') + + # Handle noisy outside temperature value for specific substations + # In these cases, the outside temperature is not known - the sensor value is just noise + if manufacturer == 2 and substation_id in [18, 61]: + df = df.drop(columns=['outdoor_temperature'], errors='ignore') + + return df[~df.index.duplicated(keep='first')] + + def create_normal_flag(self, data: pd.DataFrame, manufacturer: int, substation_id: int) -> pd.Series: + """Create a normal flag based on disturbances, so we can select normal behaviour for training models. + + Args: + data (pd.DataFrame): Dataframe containing sensor data for a specific substation. + manufacturer (int): Dataset 1 or 2. + substation_id (int): ID of the substation to load data from. + + Returns: + pd.Series: Normal flag (boolean) based on disturbances with the same timestamp index as data. 
+ """ + + dist_path = self.root_path / f"Manufacturer {manufacturer}" / 'disturbances.csv' + disturbances = pd.read_csv(dist_path, sep=';', parse_dates=['Event start']) + disturbances = disturbances[disturbances['substation ID'] == substation_id] + + normal_flag = pd.Series(True, index=data.index) + + # 1. Mark known anomalies from events_df + events_df = self.events[manufacturer] + anoms = events_df[(events_df['substation ID'] == substation_id) & (events_df['Event type'] == 'anomaly')] + for _, row in anoms.iterrows(): + # If we do not know when an anomaly started, we mark FAULT_HOURS_BEFORE before report + start = (row['Possible anomaly start'] + if pd.notna(row['Possible anomaly start']) + else (row['Report date'] - pd.Timedelta(hours=self.FAULT_HOURS_BEFORE))) + # If anomaly end was not provided, add some time after the fault for maintenance + # (This does not happen, anomaly end is always provided in this dataset) + end = (row['Possible anomaly end'] + if pd.notna(row['Possible anomaly end']) + else (row['Report date'] + pd.Timedelta(hours=self.FAULT_HOURS_AFTER))) + normal_flag.loc[start:end] = False + + # remove faults from disturbances already marked by the events dataframe + faults_in_disturbances = disturbances[disturbances['type'] == 'fault'] + faults_in_event_data = faults_in_disturbances[faults_in_disturbances['Event start'].isin(anoms['Report date'])] + disturbances = disturbances[~disturbances.index.isin(faults_in_event_data.index)] + + # 2. 
Mark disturbances (tasks, activities and remaining faults) + for _, dist in disturbances.iterrows(): + # round down to a multiple of 10 minutes to match timestamp index of the data + d_start = dist['Event start'].floor('10min') + if dist['type'] == 'fault': + normal_flag.loc[d_start - pd.Timedelta(hours=self.FAULT_HOURS_BEFORE): + d_start + pd.Timedelta(hours=self.FAULT_HOURS_AFTER)] = False + else: # task/activity: mark the full day as possibly anomalous behaviour + normal_flag.loc[d_start: d_start.normalize() + pd.Timedelta(days=1)] = False + + return normal_flag + + def get_event_data(self, manufacturer: int, event_id: int, max_training_days: int = 2*365) -> Dict[str, Any]: + """Extracts training and test slices for a specific event row (fault or normal). + """ + + # get info from event + event_row = self.events[manufacturer].loc[event_id] + substation_id = event_row['substation ID'] + train_start = event_row['Training start'] + train_end = event_row['Training end'] + event_end = event_row['Event end'] + event_type = event_row['Event type'].lower() + anomaly_end = event_row.loc['Possible anomaly end'] + + # Limit training data to at most max_training_days (default: 2 years) + train_start = max(train_start, train_end - pd.Timedelta(days=max_training_days)) + + data = self.load_substation_data(manufacturer, event_row['substation ID']) + + # Training data + train_data = data.loc[train_start:train_end] + + # Test data + if event_type == 'normal': + test_data = data.loc[train_end:event_end] + else: # anomaly + # By default, 7 days before report, up to 2 days after the possible anomaly end for visualisations + test_data = data.loc[event_end - pd.Timedelta(days=7):anomaly_end + pd.Timedelta(days=2)] + # Exception: event 67 of manufacturer 1 (3 months) + if event_id == 67 and manufacturer == 1: + test_data = data.loc[ + event_row['Possible anomaly start']:anomaly_end + pd.Timedelta(days=2) + ] + + # Drop columns that are missing in the evaluation period + eval_data = test_data.loc[:event_end] + eval_data = eval_data.dropna(how='all',
axis=1) + train_data = train_data[eval_data.columns] + test_data = test_data[eval_data.columns] + + # Create normal behaviour indicator + train_normal_flag = self.create_normal_flag(data=train_data, + manufacturer=manufacturer, + substation_id=substation_id) + test_normal_flag = self.create_normal_flag(data=test_data, + manufacturer=manufacturer, + substation_id=substation_id) + return { + 'train_data': train_data, + 'test_data': test_data, + 'train_normal_flag': train_normal_flag, + 'test_normal_flag': test_normal_flag, + 'event_data': event_row, + } diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 2e1432b..941e743 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -3,23 +3,24 @@ import logging from typing import Optional, Tuple, List from datetime import datetime -import os import warnings +from pathlib import Path import pandas as pd import numpy as np from tensorflow.keras.backend import clear_session from energy_fault_detector.core.fault_detection_model import FaultDetectionModel +from energy_fault_detector.core.fault_detection_result import FaultDetectionResult, ModelMetadata from energy_fault_detector.threshold_selectors import AdaptiveThresholdSelector from energy_fault_detector.data_preprocessing.data_preprocessor import DataPreprocessor from energy_fault_detector.data_preprocessing.data_clipper import DataClipper from energy_fault_detector.root_cause_analysis import Arcana from energy_fault_detector.config import Config -from energy_fault_detector._logs import setup_logging -from energy_fault_detector.core.fault_detection_result import FaultDetectionResult, ModelMetadata +from energy_fault_detector.core._logs import setup_logging + -setup_logging(os.path.join(os.path.dirname(__file__), 'logging.yaml')) +setup_logging(Path(__file__).parent / 'logging.yaml') logger = logging.getLogger('energy_fault_detector') @@ -41,7 +42,7 @@ class 
FaultDetector(FaultDetectionModel): save_timestamps: a list of string timestamps indicating when the model was saved. """ - def __init__(self, config: Optional[Config] = None, model_directory: str = 'fault_detector_model', + def __init__(self, config: Optional[Config] = None, model_directory: str | Path = 'fault_detector_model', model_subdir: Optional[str] = None): if model_subdir is not None: warnings.warn( @@ -52,11 +53,6 @@ def __init__(self, config: Optional[Config] = None, model_directory: str = 'faul ) super().__init__(config=config, model_directory=model_directory) - if config is None: - logger.debug('No configuration set. Load models and config from path with the `FaultDetector.load_models`' - ' method.') - else: - self._init_models() def preprocess_train_data(self, sensor_data: pd.DataFrame, normal_index: pd.Series, fit_preprocessor: bool = True ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series]: @@ -99,6 +95,10 @@ def preprocess_train_data(self, sensor_data: pd.DataFrame, normal_index: pd.Seri self.data_preprocessor.fit(x_normal) x_prepped = self.data_preprocessor.transform(x_normal) + + # Use float32 by default for performance, unless specified otherwise in config + x_prepped = x_prepped.astype(self.config.dtype) + return x_prepped, x, y def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, save_models: bool = True, @@ -287,6 +287,7 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, logger.debug('No model_path provided; using existing model instances.') x_prepped = self.data_preprocessor.transform(x).sort_index() + x_prepped = x_prepped.astype(self.config.dtype) column_order = x_prepped.columns if self.autoencoder.is_conditional: diff --git a/energy_fault_detector/main.py b/energy_fault_detector/main.py index 3f379dd..563d9fa 100644 --- a/energy_fault_detector/main.py +++ b/energy_fault_detector/main.py @@ -1,19 +1,19 @@ """Quick energy fault detector CLI tool, to try out the EnergyFaultDetector model on a 
specific dataset.""" -import os import argparse import logging import yaml +from pathlib import Path from dataclasses import dataclass, field from typing import List, Optional logger = logging.getLogger('energy_fault_detector') -here = os.path.abspath(os.path.dirname(__file__)) +here = Path(__file__).resolve().parent @dataclass class Options: - csv_test_data_path: Optional[str] = None + csv_test_data_path: Optional[str | Path] = None train_test_column_name: Optional[str] = None train_test_mapping: Optional[dict] = None time_column_name: Optional[str] = None @@ -27,7 +27,7 @@ class Options: enable_debug_plots: bool = False -def load_options_from_yaml(file_path: str) -> Options: +def load_options_from_yaml(file_path: str | Path) -> Options: """Load options from a YAML file and return an Options dataclass.""" with open(file_path, 'r') as file: options_dict = yaml.safe_load(file) @@ -76,19 +76,19 @@ def main(): parser.add_argument( 'csv_data_path', - type=str, + type=Path, help='Path to a CSV file containing training data.' ) parser.add_argument( '--options', - type=str, + type=Path, help='Path to a YAML file containing additional options.', default=None, required=False, ) parser.add_argument( '--results_dir', - type=str, + type=Path, help='Path to a directory where results will be saved.', default='results' ) @@ -107,13 +107,13 @@ def main(): logger.info(f"Options YAML: {args.options}") logger.info(f"Results Directory: {args.results_dir}") - os.makedirs(args.results_dir, exist_ok=True) + args.results_dir.mkdir(exist_ok=True) options = Options() # Initialize with default values if args.options: options = load_options_from_yaml(args.options) elif args.c2c_example: - options = load_options_from_yaml(os.path.join(here, 'c2c_options.yaml')) + options = load_options_from_yaml(here / 'c2c_options.yaml') print(options) @@ -136,12 +136,13 @@ def main(): min_anomaly_length=options.min_anomaly_length, save_dir=args.results_dir, ) - logger.info(f'Fault detection completed. 
Results are saved in {args.results_dir}.') + logger.info(f'Fault detection completed. Results are saved in the directory "{args.results_dir}".') prediction_results.save(args.results_dir) - event_meta_data.to_csv(os.path.join(args.results_dir, 'events.csv'), index=False) + event_meta_data.to_csv(args.results_dir / 'events.csv', index=False) except Exception as e: logger.error(f'An error occurred: {e}') + raise if __name__ == '__main__': diff --git a/energy_fault_detector/quick_fault_detection/configuration.py b/energy_fault_detector/quick_fault_detection/configuration.py index 412199d..cf65608 100644 --- a/energy_fault_detector/quick_fault_detection/configuration.py +++ b/energy_fault_detector/quick_fault_detection/configuration.py @@ -68,10 +68,37 @@ def update_preprocessor_config(config: Config, features_to_exclude: Union[List[s Returns: Config: Updated config object. """ + if features_to_exclude is not None: - config['train']['data_preprocessor']['params']['features_to_exclude'] = features_to_exclude + if config['train']['data_preprocessor'].get('params'): + # old data preprocessing configuration style + config['train']['data_preprocessor']['params']['features_to_exclude'] = features_to_exclude + else: + # new configuration style + steps = config['train']['data_preprocessor'].setdefault('steps', []) + column_selector_found = False + for step in steps: + if step['name'] == 'column_selector': + step['params']['features_to_exclude'] = features_to_exclude + column_selector_found = True + break + if not column_selector_found: + steps.append({'name': 'column_selector', 'params': {'features_to_exclude': features_to_exclude}}) if angles is not None: - config['train']['data_preprocessor']['params']['angles'] = angles + if config['train']['data_preprocessor'].get('params'): + # old data preprocessing configuration style + config['train']['data_preprocessor']['params']['angles'] = angles + else: + # new configuration style + steps = 
config['train']['data_preprocessor'].setdefault('steps', []) + angle_transformer_found = False + for step in steps: + if step['name'] == 'angle_transformer': + step['params']['angles'] = angles + angle_transformer_found = True + break + if not angle_transformer_found: + steps.append({'name': 'angle_transformer', 'params': {'angles': angles}}) return config diff --git a/energy_fault_detector/quick_fault_detection/optimization.py b/energy_fault_detector/quick_fault_detection/optimization.py index a8de03c..fcadc2f 100644 --- a/energy_fault_detector/quick_fault_detection/optimization.py +++ b/energy_fault_detector/quick_fault_detection/optimization.py @@ -127,6 +127,9 @@ def reconstruction_mse(trial: op.Trial) -> float: deviations = training_dict.val_recon_error score = float(np.mean((np.square(deviations)))) + # help garbage collection + del model + return score study = op.create_study(sampler=op.samplers.TPESampler(), diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index f244de5..dfc82d8 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -1,12 +1,13 @@ """Quick energy fault detection, to try out the EnergyFaultDetector model on a specific dataset.""" import os +from pathlib import Path import logging from typing import List, Optional, Tuple import pandas as pd -from energy_fault_detector._logs import setup_logging +from energy_fault_detector.core._logs import setup_logging from energy_fault_detector.fault_detector import FaultDetector from energy_fault_detector.utils.analysis import create_events from energy_fault_detector.root_cause_analysis.arcana_utils import calculate_mean_arcana_importances @@ -20,14 +21,14 @@ logger = logging.getLogger('energy_fault_detector') -def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = None, +def 
quick_fault_detector(csv_data_path: str | Path, csv_test_data_path: Optional[str | Path] = None, train_test_column_name: Optional[str] = None, train_test_mapping: Optional[dict] = None, time_column_name: Optional[str] = None, status_data_column_name: Optional[str] = None, status_mapping: Optional[dict] = None, status_label_confidence_percentage: Optional[float] = 0.95, features_to_exclude: Optional[List[str]] = None, angle_features: Optional[List[str]] = None, automatic_optimization: bool = True, enable_debug_plots: bool = False, - min_anomaly_length: int = 18, save_dir: Optional[str] = None + min_anomaly_length: int = 18, save_dir: Optional[str | Path] = None ) -> Tuple[FaultDetectionResult, pd.DataFrame]: """Analyzes provided data using an autoencoder based approach for identifying anomalies based on a learned normal behavior. Anomalies are then aggregated to events and further analyzed. diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index b2a0bf5..65180ee 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -22,9 +22,10 @@ class Arcana: """Anomaly root cause analysis. Tries to find which of the sensors/inputs caused - the reconstruction error of an autoencoder model. + the reconstruction error of an autoencoder model. Implementation details are found in + https://doi.org/10.1016/j.egyai.2021.100065. 
- This is done by minimizing the loss function: + This method minimizes the loss function: '(1 - alpha) L2(X_corr - autoencoder(X_corr)) + alpha * L1(X_corr - X_obs)' @@ -103,11 +104,12 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi loss 2 for each 50th iteration) track_bias: If True bias will be returned as a list arcana biases each 50th iteration) - Returns: - x_bias: pandas DataFrame - tracked_losses: A dataframe containing the combined loss, loss 1 (reconstruction) and - loss 2 (regularization) for each 50th iteration (if track_losses is False this list is empty) - tracked_bias: A List of dataframes representing x_bias + Returns: A tuple with the following three objects + + - x_bias: pandas DataFrame + - tracked_losses: A dataframe containing the combined loss, loss 1 (reconstruction) and + loss 2 (regularization) for each 50th iteration (if track_losses is False this list is empty) + - tracked_bias: A List of dataframes representing x_bias """ conditions = None diff --git a/energy_fault_detector/utils/data_downloads.py b/energy_fault_detector/utils/data_downloads.py index 6abeef1..1742155 100644 --- a/energy_fault_detector/utils/data_downloads.py +++ b/energy_fault_detector/utils/data_downloads.py @@ -2,7 +2,6 @@ from typing import List, Union import os import re -import sys import shutil import logging from pathlib import Path @@ -142,6 +141,30 @@ def safe_extract_zip(zip_path: Path, dest_dir: Path): zf.extractall(dest_dir) +def recursive_safe_extract(zip_path: Path, dest_dir: Path, remove_archives: bool = True): + """ + Recursively extracts ZIP files, including those found inside other ZIPs. + + Args: + zip_path: Path to the .zip archive. + dest_dir: Directory to extract into. + remove_archives: Whether to delete the .zip file after successful extraction. 
+ """ + logger.info(f"Extracting {zip_path.name} to {dest_dir}") + safe_extract_zip(zip_path, dest_dir) + + if remove_archives: + try: + zip_path.unlink() + except OSError as e: + logger.warning(f"Could not remove archive {zip_path}: {e}") + + # After extraction, check if any new .zip files were created in the dest_dir + for item in list(dest_dir.rglob("*.zip")): + recursive_safe_extract(item, item.parent, remove_archives=remove_archives) + + + def prepare_output_dir(out_dir: Path, overwrite: bool) -> None: """Ensure the output directory is ready. @@ -172,25 +195,40 @@ def prepare_output_dir(out_dir: Path, overwrite: bool) -> None: def download_zenodo_data(identifier: str = "10.5281/zenodo.15846963", dest: Path = "./downloads", - overwrite: bool = False) -> Union[List[Path], Path]: + remove_zip: bool = True, overwrite: bool = False, flatten_file_structure: bool = True, + expected_file_types: Union[List[str], str] = "*.csv") -> Path: """ Download a Zenodo record via API and unzip any .zip files. + Downloads all files associated with a given Zenodo record (by ID, DOI, or URL), + saves them to a local directory, and optionally flattens nested directories + that result from extracting ZIP archives. + Args: - identifier (str): Zenodo record ID, DOI (e.g., 10.5281/zenodo.15846963), or record URL - dest (Path): Output directory (default: downloads) + identifier (str): Zenodo record ID, DOI (e.g., 10.5281/zenodo.15846963), or record URL. + Defaults to the CARE2Compare dataset. + dest (Path): Local output directory to save downloaded files. (default: downloads) + remove_zip (bool): If True, ZIP archives will be removed after extraction. overwrite (bool): If True and dest already exists, contents of dest will be overwritten. + Default is False. + flatten_file_structure (bool): If True and unzipping results in a single top-level folder + with no conflicting root-level files matching `expected_file_types`, + moves its contents up one level. Default is True. 
+ expected_file_types (Union[List[str], str]): Glob pattern(s) used to detect existing relevant files + at the root. If any match, flattening is skipped. + Can be a string like '*.csv' or list like ['*.csv', '*.json']. Default is '*.csv'. Returns: - Union[List[Path], Path]: List of paths the extracted content of all downloaded zip files. If there is only one - downloaded zip file only one path is returned + Path: The absolute path to the directory containing the downloaded and unzipped data. """ + if isinstance(expected_file_types, str): + expected_file_types = [expected_file_types] session = requests.Session() try: record_id = parse_record_id(identifier) except ValueError as e: logger.error(e) - sys.exit(1) + raise out_dir = Path(dest) @@ -200,7 +238,7 @@ def download_zenodo_data(identifier: str = "10.5281/zenodo.15846963", dest: Path prepare_output_dir(out_dir, overwrite) except Exception as e: logger.error(f"Failed to prepare output directory: {e}") - sys.exit(1) + raise logger.info(f"Fetching record {record_id} metadata...") record = fetch_record(session, record_id) @@ -209,7 +247,7 @@ def download_zenodo_data(identifier: str = "10.5281/zenodo.15846963", dest: Path files = list_files(session, record) except RuntimeError as e: logger.error(e) - sys.exit(1) + raise downloaded = [] for f in files: @@ -228,33 +266,23 @@ def download_zenodo_data(identifier: str = "10.5281/zenodo.15846963", dest: Path # Unzip any downloaded .zip files for p in downloaded: if p.suffix.lower() == ".zip": - extract_dir = p.with_suffix("") # folder named after the zip - logger.info(f"Unzipping: {p.name} -> {extract_dir}") + extract_target = out_dir # Extract directly into dest + logger.info(f"Unzipping: {p.name} -> {extract_target}") try: - safe_extract_zip(p, extract_dir) + recursive_safe_extract(p, extract_target, remove_archives=remove_zip) except Exception as e: logger.error(f"Unzipping failed for {p.name}: {e}") - else: - try: - p.unlink() - logger.info(f"Removed archive: {p.name}") 
- except OSError as e: - logger.warning(f"Could not remove {p}: {e}") - - logger.info(f"Validating file structure.") - # Check resulting file structure and remove duplicate directory names if they exist due to unzipping. - root_paths = [] - for file_or_dir in os.listdir(out_dir): - root_path = out_dir / file_or_dir - if os.path.isdir(root_path): - root_paths.append(root_path) - if file_or_dir in os.listdir(root_path): - duplicate_dir_name = root_path / file_or_dir - logger.info(f"Removing redundant directory: {duplicate_dir_name}") - move_list = os.listdir(duplicate_dir_name) - for content in move_list: - shutil.move(src=duplicate_dir_name / content, - dst=root_path / content) - os.rmdir(duplicate_dir_name) - logger.info(f"File structure validated.") - return root_paths if len(root_paths) > 1 else root_paths[0] + + if flatten_file_structure: + logger.info(f"Flattening file structure.") + # Standardize structure: If unzipping created a single subfolder, move its contents up + # This often happens with Zenodo zips. 
+ subdirs = [d for d in out_dir.iterdir() if d.is_dir()] + if len(subdirs) == 1 and not any(next(out_dir.glob(pattern), None) for pattern in expected_file_types): + redundant_dir = subdirs[0] + logger.info(f"Flattening directory structure from {redundant_dir}") + for item in redundant_dir.iterdir(): + shutil.move(str(item), str(out_dir / item.name)) + redundant_dir.rmdir() + + return out_dir diff --git a/energy_fault_detector/utils/visualisation.py b/energy_fault_detector/utils/visualisation.py index 7a0c337..4eda52c 100644 --- a/energy_fault_detector/utils/visualisation.py +++ b/energy_fault_detector/utils/visualisation.py @@ -11,7 +11,8 @@ from energy_fault_detector.core import Autoencoder from energy_fault_detector.fault_detector import FaultDetector -from energy_fault_detector.utils.analysis import calculate_criticality + +MAX_PLOTS = 20 def plot_learning_curve(model: Union[Autoencoder, FaultDetector], ax: plt.Axes = None, label: str = '', @@ -71,7 +72,7 @@ def plot_reconstruction(data: pd.DataFrame, reconstruction: pd.DataFrame, featur missing = set(to_plot) - set(data.columns) raise ValueError(f'The columns {missing} are not present in the dataset.') - if len(to_plot) > 30: # You can adjust this threshold + if len(to_plot) > MAX_PLOTS: warnings.warn(f"You are attempting to plot a large number of features ({len(to_plot)}). " "This may result in a cluttered figure. Consider selecting fewer features to plot.") @@ -93,6 +94,67 @@ def plot_reconstruction(data: pd.DataFrame, reconstruction: pd.DataFrame, featur return fig, ax +def plot_reconstruction_with_model(model: FaultDetector, data: pd.DataFrame, + features_to_plot: Optional[List[str]] = None, + height_multiplier: float = 1.5, + original_scale: bool = True) -> Tuple[plt.Figure, plt.Axes]: + """Plots the data and its reconstruction using the provided model. Similar to plot_reconstruction, but uses the + 'model.predict' method to get the reconstruction. 
Counter values are plotted as diffs or rates with their + reconstruction. + + Args: + model (FaultDetector): Fitted model with data_preprocessor and autoencoder. + data (pd.DataFrame): Raw input data. + features_to_plot (Optional[List[str]], optional): Columns to plot. If None, uses reconstruction columns. + height_multiplier (float, optional): Vertical scaling for the figure. Defaults to 1.5. + original_scale (bool, optional): If True, y-limits are based on the observed plot-series + (min-std, max+std). Defaults to True. + + Returns: + Tuple[plt.Figure, plt.Axes]: The figure and axes. + """ + + # Get model predictions + predictions = model.predict(sensor_data=data) + reconstruction = predictions.reconstruction + # model data preprocessor + dp = model.data_preprocessor + + # Discover counter mappings (original -> derived) from CounterDiffTransformer steps + from energy_fault_detector.data_preprocessing.counter_diff_transformer import CounterDiffTransformer + counter_map = {} # original counter -> derived column (e.g., energy_total_kwh -> energy_total_kwh_diff) + for name, est in dp.named_steps.items(): + if isinstance(est, CounterDiffTransformer): + # Need fitted attributes + try: + counters = getattr(est, "counters_", []) + suffix = getattr(est, "output_suffix_", "_diff") + except Exception: + counters = [] + suffix = "_diff" + for c in counters: + counter_map[c] = f"{c}{suffix}" + + # Determine features to plot + to_plot = list(reconstruction.columns) if features_to_plot is None else features_to_plot + + # If any counters are in features to plot, transform input data, so we can plot the diffs with their reconstructions + if any(col in counter_map.keys() for col in to_plot): + # replace counters with their _diff/_rate name + to_plot = [col if col not in counter_map.keys() else counter_map[col] for col in to_plot] + dataset_to_plot = dp.inverse_transform(dp.transform(data.copy()))[to_plot] + else: + dataset_to_plot = data[to_plot].copy() + + return
plot_reconstruction( + dataset_to_plot, + reconstruction, + features_to_plot=to_plot, + height_multiplier=height_multiplier, + original_scale=original_scale + ) + + def plot_score_with_threshold(model: FaultDetector, data: pd.DataFrame, normal_index: pd.Series = None, ax: plt.Axes = None, figsize: Tuple[float, float] = (8, 3), show_predicted_anomaly: bool = False, show_threshold: bool = True, @@ -157,8 +219,7 @@ def plot_score_with_threshold(model: FaultDetector, data: pd.DataFrame, normal_i ax.plot(threshold, linestyle='-', linewidth=.7, label='threshold', c=threshold_color) if show_criticality: - crit = calculate_criticality(predictions.predicted_anomalies["anomaly"], normal_idx=normal_index, - max_criticality=max_criticality) + crit = predictions.criticality(normal_idx=normal_index, max_criticality=max_criticality) ax2 = ax.twinx() ax2.plot(crit, label='criticality counter', color=criticality_color) ax2.legend(loc='upper right', markerscale=3) @@ -166,9 +227,13 @@ def plot_score_with_threshold(model: FaultDetector, data: pd.DataFrame, normal_i ax.set_ylabel('anomaly score') - legend = ax.legend(loc='upper left', markerscale=3) - for h in legend.legend_handles: - h.set_alpha(1) + # Get handles and labels from the current axes + handles, labels = ax.get_legend_handles_labels() + if labels: + # Only create the legend if there are labels found + legend = ax.legend(loc='upper left', markerscale=3) + for h in legend.legend_handles: + h.set_alpha(1) return fig, ax diff --git a/notebooks/CARE to Compare.ipynb b/notebooks/CARE to Compare/CARE to Compare.ipynb similarity index 92% rename from notebooks/CARE to Compare.ipynb rename to notebooks/CARE to Compare/CARE to Compare.ipynb index 247c101..0b86a6e 100644 --- a/notebooks/CARE to Compare.ipynb +++ b/notebooks/CARE to Compare/CARE to Compare.ipynb @@ -2,6 +2,13 @@ "cells": [ { "cell_type": "markdown", + "id": "fe8e2b4b8752a26d", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, 
"source": [ "# CARE Score and Care2CompareDataset usage\n", "\n", @@ -12,14 +19,11 @@ "2. Using the CAREScore to evaluate a model on the dataset.\n", "3. Recreating the results of the CARE paper.\n", "4. Using Care2CompareDataset and CARE-Score for other datasets." - ], - "metadata": { - "collapsed": false - }, - "id": "fe8e2b4b8752a26d" + ] }, { "cell_type": "code", + "execution_count": null, "id": "initial_id", "metadata": { "collapsed": true, @@ -27,6 +31,7 @@ "outputs_hidden": true } }, + "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", @@ -36,51 +41,49 @@ "from energy_fault_detector.fault_detector import FaultDetector\n", "from energy_fault_detector.config import Config\n", "from energy_fault_detector.evaluation import CAREScore, Care2CompareDataset" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "75ac0a42f7c2f795", "metadata": {}, + "outputs": [], "source": [ "data_dir = Path('..') / '..' / 'Care_To_Compare_v6'" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "70f2af66920ba09b", "metadata": {}, + "outputs": [], "source": [ "c2c = Care2CompareDataset(path=data_dir, download_dataset=False) # If you have not downloaded the dataset yet, set download_dataset to True" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "8a1422f3e62850ab", "metadata": {}, + "outputs": [], "source": [ "c2c.event_info_all" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "48309c102d52aeb0", "metadata": {}, + "outputs": [], "source": [ "# select data for a specific event\n", "x, y = c2c.load_event_dataset(0, statistics=['average', 'std_dev'])\n", "x.head()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -91,8 +94,11 @@ ] }, { - "metadata": {}, "cell_type": "code", + 
"execution_count": null, + "id": "e9087f75625207d2", + "metadata": {}, + "outputs": [], "source": [ "c2c = Care2CompareDataset(data_dir)\n", "index_column = 'id' # us time_stamp as index column if you are using the TimestampTransformer\n", @@ -174,14 +180,14 @@ "\n", " # print final score:\n", " print('Final score: ', care_score.get_final_score())" - ], - "id": "e9087f75625207d2", - "outputs": [], - "execution_count": null + ] }, { - "metadata": {}, "cell_type": "code", + "execution_count": null, + "id": "a41c52ce61dd7e06", + "metadata": {}, + "outputs": [], "source": [ "# combine results and get final score over all events / wind farms\n", "all_evaluations = pd.concat([pd.read_csv(f'results_{wf}{suffix}.csv') for wf in ['A', 'B', 'C']])\n", @@ -203,10 +209,7 @@ "\n", "print('overall')\n", "care_score.get_final_score()" - ], - "id": "a41c52ce61dd7e06", - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", @@ -221,8 +224,10 @@ }, { "cell_type": "code", + "execution_count": null, "id": "ac23c5d8fe923863", "metadata": {}, + "outputs": [], "source": [ "# model config\n", "wf = 'A'\n", @@ -263,39 +268,49 @@ " predicted_anomalies=prediction.predicted_anomalies,\n", " ignore_normal_index=True,\n", " )\n" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, "id": "240be4ff7c0b1325", "metadata": {}, + "outputs": [], "source": [ "care_score.get_final_score()" - ], - "outputs": [], - "execution_count": null + ] }, { "cell_type": "markdown", + "id": "5b4cf2cd5b2f9516", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ "# Reproducing results from the Paper\n", "To reproduce the results from (https://doi.org/10.3390/data9120138), an additional filter is needed (though only for wind farm C):\n", "- determine cut-in and cut-off wind speeds by power curve analysis\n", "- Remove potentially anomalous data from the training data:\n", " - Remove rows where the 
wind speed is outside the normal operation range (below cut-in or above cut-off)\n", - " - Remove rows where the power is zero or near zero (e.g. $P < 0.01$)." - ], - "metadata": { - "collapsed": false - }, - "id": "5b4cf2cd5b2f9516" + " - Remove rows where the power is zero or near zero (e.g. $P < 0.01$).\n", + "\n", + "Note: The trained models may not reproduce the exact results reported in the paper due to random initialization, hardware differences, and random seeds. In practice, it is advisable to train each model 5–10 times and select the best-performing run." + ] }, { "cell_type": "markdown", + "id": "8feb0d8f0e917072", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + } + }, "source": [ - "# CARE Score and Care2CompareDataset usage on other datasets\n", + "# CARE Score usage on other datasets\n", "\n", "To use the CARE-Score with other datasets you need the following data:\n", "- define events containing anomalous data (the period before an actual fault)\n", @@ -309,11 +324,7 @@ "- Calculate the CARE score `CAREScore.get_final_score`\n", "\n", "For each of these events, you need to be able to train a proper model (for example one large model or a model for each event). 
For the CARE2Compare dataset we assumed 1 year of training data with >=70% normal operation is enough to create a normal behavior model.\n" - ], - "metadata": { - "collapsed": false - }, - "id": "8feb0d8f0e917072" + ] } ], "metadata": { @@ -332,7 +343,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/notebooks/c2c_configs/windfarm_A.yaml b/notebooks/CARE to Compare/c2c_configs/windfarm_A.yaml similarity index 75% rename from notebooks/c2c_configs/windfarm_A.yaml rename to notebooks/CARE to Compare/c2c_configs/windfarm_A.yaml index da7133a..5ef3264 100644 --- a/notebooks/c2c_configs/windfarm_A.yaml +++ b/notebooks/CARE to Compare/c2c_configs/windfarm_A.yaml @@ -4,14 +4,15 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - max_col_zero_frac: 0.99 - max_nan_frac_per_col: 0.05 - min_unique_value_count: 10 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + - name: low_unique_value_filter + params: + min_unique_value_count: 10 + max_col_zero_frac: 0.99 + - name: minmax_scaler data_splitter: shuffle: true diff --git a/notebooks/c2c_configs/windfarm_B.yaml b/notebooks/CARE to Compare/c2c_configs/windfarm_B.yaml similarity index 74% rename from notebooks/c2c_configs/windfarm_B.yaml rename to notebooks/CARE to Compare/c2c_configs/windfarm_B.yaml index 7b55de8..68320b9 100644 --- a/notebooks/c2c_configs/windfarm_B.yaml +++ b/notebooks/CARE to Compare/c2c_configs/windfarm_B.yaml @@ -4,14 +4,15 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - max_col_zero_frac: 0.8 - max_nan_frac_per_col: 0.05 - min_unique_value_count: 10 - scale: minmax + steps: + - name: column_selector + 
params: + max_nan_frac_per_col: 0.05 + - name: low_unique_value_filter + params: + min_unique_value_count: 10 + max_col_zero_frac: 0.8 + - name: minmax_scaler data_splitter: shuffle: true diff --git a/notebooks/c2c_configs/windfarm_C.yaml b/notebooks/CARE to Compare/c2c_configs/windfarm_C.yaml similarity index 69% rename from notebooks/c2c_configs/windfarm_C.yaml rename to notebooks/CARE to Compare/c2c_configs/windfarm_C.yaml index c618cc3..3f7b08a 100644 --- a/notebooks/c2c_configs/windfarm_C.yaml +++ b/notebooks/CARE to Compare/c2c_configs/windfarm_C.yaml @@ -4,14 +4,15 @@ train: upper_percentile: 0.999 data_preprocessor: - params: - include_column_selector: true - include_low_unique_value_filter: true - include_duplicate_value_to_nan: false - max_col_zero_frac: 0.99 - max_nan_frac_per_col: 0.05 - min_unique_value_count: 10 - scale: minmax + steps: + - name: column_selector + params: + max_nan_frac_per_col: 0.05 + - name: low_unique_value_filter + params: + min_unique_value_count: 10 + max_col_zero_frac: 0.99 + - name: minmax_scaler data_splitter: shuffle: true diff --git a/notebooks/Example - Hyperparameter Optimization.ipynb b/notebooks/Example - Hyperparameter Optimization.ipynb index 37abecf..344db94 100644 --- a/notebooks/Example - Hyperparameter Optimization.ipynb +++ b/notebooks/Example - Hyperparameter Optimization.ipynb @@ -10,119 +10,58 @@ "1. Optimizing the Autoencoder reconstruction using the MSE\n", "2. Optimizing the FaultDetector classification performance using the Fbeta score\n", "3. 
Optimizing the FaultDetector classification performance using the CARE-score\n", - "The optimization is done using the [CARE to Compare dataset](https://doi.org/10.5281/zenodo.14958989)\n", "\n", - "For this example you need to install Optuna, which is not contained in the standard requirements of the framework\n", - "Optuna [docs](https://optuna.readthedocs.io/en/stable/index.html) and [tutorials](https://optuna.readthedocs.io/en/stable/tutorial/index.html)\n", - "\n", - "-> Install additional requirements for this example using 'pip notebooks/example_requirements.txt'" + "The optimization is done using the [CARE to Compare dataset](https://doi.org/10.5281/zenodo.14958989)" ], - "id": "552412b97335ea1c" + "id": "acc177b6ece47b21" }, { + "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "from typing import List\n", + "from copy import deepcopy\n", "\n", "import optuna as op\n", - "import pandas as pd\n", "import numpy as np\n", "from sklearn.metrics import fbeta_score\n", "\n", "from energy_fault_detector import FaultDetector, Config\n", "from energy_fault_detector.evaluation import CAREScore, Care2CompareDataset" ], - "metadata": { - "collapsed": false - }, - "id": "217e454f48a9879b", - "outputs": [], - "execution_count": null - }, - { - "cell_type": "code", - "source": [ - "data_path = './Care_To_Compare'" - ], - "metadata": { - "collapsed": false - }, - "id": "1d2e20520349e34e", - "outputs": [], - "execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ - "def update_config(config: Config, feature_descriptions: pd.DataFrame) -> None:\n", - " \"\"\"Update config based on provided feature descriptions.\"\"\"\n", - "\n", - " def get_columns(feature_description_selection: pd.DataFrame) -> List[str]:\n", - " col_suffix = {\n", - " 'average': 'avg',\n", - " 'minimum': 'min',\n", - " 'maximum': 'max',\n", - " 'std_dev': 'std'\n", - " }\n", - " columns = []\n", - " for _, row in 
feature_description_selection.iterrows():\n", - " if row.statistics_type == 'average':\n", - " # in this case the column can be either sensor_i or sensor_i_avg, so we add both\n", - " columns.append(row.sensor_name)\n", - " for stat in row.statistics_type.split(','):\n", - " columns.append(f'{row.sensor_name}_{col_suffix[stat]}')\n", - " return columns\n", - "\n", - " angles = feature_descriptions.loc[feature_descriptions['is_angle']]\n", - " to_exclude = feature_descriptions.loc[feature_descriptions['is_counter']]\n", - "\n", - " angle_columns = get_columns(angles)\n", - " to_exclude_columns = get_columns(to_exclude)\n", - " \n", - " config['train']['data_preprocessor']['params']['angles'] = (\n", - " config['train']['data_preprocessor']['params'].get('angles', []) + angle_columns\n", - " )\n", - " config['train']['data_preprocessor']['params']['features_to_exclude'] = (\n", - " config['train']['data_preprocessor']['params'].get('features_to_exclude', []) + to_exclude_columns\n", - " )\n", - " \n", - " config.update_config(config.config_dict)\n" - ], - "id": "723566d33fa10db6", - "outputs": [], - "execution_count": null + "id": "f498cdd89c2cf406" }, { "metadata": {}, "cell_type": "code", - "source": [ - "c2c = Care2CompareDataset(data_path)" - ], - "id": "ec8c21f059bba3c8", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "data_path = './Care_To_Compare'", + "id": "641b5084847234b8" }, { "metadata": {}, "cell_type": "markdown", - "source": [ - "### Optimize autoencoder reconstruction" - ], - "id": "99fcaa7054047666" + "source": "## Optimize autoencoder reconstruction", + "id": "d5ac0225fa5e566b" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "model_config = Config('c2c_configs/windfarm_C.yaml') # starting point\n", - "\n", - "# our test set\n", + "# Our test set (a specific event from the CARE2Compare dataset)\n", "c2c = Care2CompareDataset(data_path)\n", "event_id = 47\n", - 
"train_data, normal_index, _, _ = c2c.get_formatted_event_dataset(event_id=event_id, index_column='time_stamp')\n", + "train_data, normal_index, _, _ = c2c.load_and_format_event_dataset(event_id=event_id, index_column='time_stamp')\n", "\n", - "# speed up for testing\n", + "# Model configuration starting point\n", + "model_config = Config('c2c_configs/windfarm_C.yaml')\n", + "c2c.update_c2c_config(model_config, 'C')\n", + "\n", + "# speed up for testing (select a small part of the dataset)\n", "N = 10000\n", "normal_index = normal_index.iloc[:N]\n", "train_data = train_data.iloc[:N]\n", @@ -140,36 +79,23 @@ " Returns:\n", " MSE of the reconstruction.\n", " \"\"\"\n", + " # Use a fresh config dict per trial\n", + " cfg = deepcopy(model_config.config_dict)\n", "\n", - " autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", + " autoencoder_params = cfg['train']['autoencoder']['params']\n", "\n", " # sample new parameters\n", - " autoencoder_params['batch_size'] = int(trial.suggest_categorical(\n", - " name='batch_size', choices=[32, 64, 128]\n", - " ))\n", - " autoencoder_params['learning_rate'] = trial.suggest_float(\n", - " name='learning_rate', low=1e-5, high=0.01, log=True\n", - " )\n", - " autoencoder_params['decay_rate'] = trial.suggest_float(\n", - " name='decay_rate', low=0.8, high=0.99\n", - " )\n", + " autoencoder_params['batch_size'] = int(trial.suggest_categorical(name='batch_size', choices=[32, 64, 128]))\n", + " autoencoder_params['learning_rate'] = trial.suggest_float(name='learning_rate', low=1e-5, high=0.01, log=True)\n", + " autoencoder_params['decay_rate'] = trial.suggest_float(name='decay_rate', low=0.8, high=0.99)\n", "\n", " # architecture\n", - " autoencoder_params['layers'][0] = trial.suggest_int(\n", - " name='layers_0', low=100, high=400\n", - " )\n", - " autoencoder_params['layers'][1] = trial.suggest_int(\n", - " name='layers_1', low=50, high=100\n", - " )\n", - " autoencoder_params['code_size'] = 
trial.suggest_int(\n", - " name='code_size', low=10, high=30\n", - " )\n", - "\n", - " # update the configuration\n", - " model_config.update_config(model_config.config_dict)\n", + " autoencoder_params['layers'][0] = trial.suggest_int(name='layers_0', low=100, high=400)\n", + " autoencoder_params['layers'][1] = trial.suggest_int(name='layers_1', low=50, high=100)\n", + " autoencoder_params['code_size'] = trial.suggest_int(name='code_size', low=10, high=30)\n", "\n", " # create a new model using our new configuration and train the model\n", - " model = FaultDetector(model_config)\n", + " model = FaultDetector(Config(config_dict=cfg))\n", " # For autoencoder optimization, we do not need to fit a threshold\n", " training_result = model.fit(train_data, normal_index=normal_index, fit_autoencoder_only=True, save_model=False)\n", "\n", @@ -179,81 +105,69 @@ "\n", " return score" ], - "id": "eadbdf08b64a43e4", - "outputs": [], - "execution_count": null + "id": "6cc9b7bee0de0a25" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "study = op.create_study(sampler=op.samplers.TPESampler(),\n", - " study_name='autoencoder_optimization',\n", - " direction='minimize')\n", + "study = op.create_study(sampler=op.samplers.TPESampler(), study_name='autoencoder_optimization', direction='minimize')\n", "\n", "# if we want to ensure that the first trial is done with the hyperparameters of the configuration, we need to enqueue a trial:\n", "autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", "study.enqueue_trial(params={\n", " 'batch_size': autoencoder_params['batch_size'],\n", " 'learning_rate': autoencoder_params['learning_rate'],\n", - " 'decay_rate': autoencoder_params['decay_rate'],\n", " 'layers_0': autoencoder_params['layers'][0],\n", " 'layers_1': autoencoder_params['layers'][1],\n", " 'code_size': autoencoder_params['code_size'],\n", - "})" - ], - "id": "ff13f4aeb3d8b6d0", - "outputs": [], - 
"execution_count": null - }, - { - "metadata": {}, - "cell_type": "code", - "source": [ + "})\n", + "\n", + "# Run optimization for 5 trials\n", "study.optimize(reconstruction_mse, n_trials=5)" ], - "id": "edfa14b97d00a0b9", - "outputs": [], - "execution_count": null + "id": "1036f11c1b97e12" }, { "metadata": {}, "cell_type": "code", - "source": [ - "study.best_params" - ], - "id": "91747046042e809a", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "study.best_params", + "id": "7d452e30dd1a1e0e" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ "# analyze results\n", "study.trials_dataframe()" ], - "id": "4cf3e2b09291ba57", - "outputs": [], - "execution_count": null + "id": "5f061454dc9f7980" }, { "metadata": {}, "cell_type": "markdown", - "source": "# Optimize fault detection model", - "id": "cb3c07b2be2cb5ed" + "source": "## Optimize fault detection model - F-beta score", + "id": "80664d61f648132f" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ + "# Our test set (a specific event from the CARE2Compare dataset)\n", "c2c = Care2CompareDataset(data_path)\n", - "\n", "event_id = 47\n", - "event_info = c2c.event_info_all[c2c.event_info_all['event_id'] == event_id].iloc[0]\n", - "\n", - "train_data, normal_index, test_data, test_normal_index = c2c.get_formatted_event_dataset(event_id=event_id, index_column='time_stamp')\n", + "train_data, normal_index, test_data, test_normal_index = c2c.load_and_format_event_dataset(event_id=event_id, index_column='time_stamp')\n", "\n", + "# Create a ground truth for this event\n", + "event_info = c2c.event_info_all[c2c.event_info_all['event_id'] == event_id].iloc[0]\n", "ground_truth = CAREScore.create_ground_truth(\n", " event_label=event_info['event_label'],\n", " event_start=event_info['event_start'],\n", @@ -261,21 +175,42 @@ " normal_index=test_normal_index\n", ")" ], - "id": "fa98c752d06b0e35", - 
"outputs": [], - "execution_count": null + "id": "3d4b96de7af51a2a" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "model_config = Config('c2c_configs/windfarm_C.yaml') # starting point\n", + "# Model configuration starting point\n", + "model_config = Config('c2c_configs/windfarm_C.yaml')\n", + "c2c.update_c2c_config(model_config, 'C')\n", "\n", - "# speed up for testing\n", + "# speed up for testing (select a small part of the dataset)\n", "N = 10000\n", "normal_index = normal_index.iloc[:N]\n", "train_data = train_data.iloc[:N]\n", "\n", + "# helper function to (re)set the scaling step of the DataPreprocessor\n", + "def set_scaler_step(cfg: dict, choice: str) -> dict:\n", + " \"\"\"Update cfg to use the chosen scaler.\"\"\"\n", + " dp = cfg['train'].setdefault('data_preprocessor', {})\n", + " steps = dp.get('steps')\n", + "\n", + " # Remove any existing scaler step(s)\n", + " scaler_names = {'standard_scaler', 'minmax_scaler'}\n", + " steps = [s for s in steps if s.get('name') not in scaler_names]\n", + " # Add the chosen scaler step\n", + " if choice == 'minmax':\n", + " steps.append({'name': 'minmax_scaler'})\n", + " else:\n", + " # 'standardize'\n", + " steps.append({'name': 'standard_scaler'})\n", + "\n", + " dp['steps'] = steps\n", + " return cfg\n", + "\n", "\n", "def f_score(trial: op.Trial) -> float:\n", " \"\"\"Returns the F-score of the model (only useful for datasets with anomalies).\n", @@ -286,69 +221,50 @@ " Returns:\n", " Score of the FaultDetector model \n", " \"\"\"\n", + " # Use a fresh config dict per trial\n", + " cfg = deepcopy(model_config.config_dict)\n", "\n", - " dataprep_params = model_config.config_dict['train']['data_preprocessor']['params']\n", - " autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", - "\n", - " dataprep_params['scale'] = trial.suggest_categorical(\n", - " name='scale', choices=['minmax', 'standardize']\n", - " )\n", + " # Scale choice 
(new steps mode or legacy fallback)\n", + " scale_choice = trial.suggest_categorical('scale', ['minmax', 'standardize'])\n", + " cfg = set_scaler_step(cfg, scale_choice)\n", "\n", - " autoencoder_params['batch_size'] = int(trial.suggest_categorical(\n", - " name='batch_size', choices=[32, 64, 128]\n", - " ))\n", - " autoencoder_params['learning_rate'] = trial.suggest_float(\n", - " name='learning_rate', low=1e-5, high=0.01, log=True\n", - " )\n", - " autoencoder_params['decay_rate'] = trial.suggest_float(\n", - " name='decay_rate', low=0.8, high=0.99\n", - " )\n", + " # Autoencoder params\n", + " autoencoder_params = cfg['train']['autoencoder']['params']\n", + " autoencoder_params['batch_size'] = int(trial.suggest_categorical(name='batch_size', choices=[32, 64, 128]))\n", + " autoencoder_params['learning_rate'] = trial.suggest_float(name='learning_rate', low=1e-5, high=0.01, log=True)\n", + " autoencoder_params['decay_rate'] = trial.suggest_float(name='decay_rate', low=0.8, high=0.99)\n", "\n", " # architecture\n", - " autoencoder_params['layers'][0] = trial.suggest_int(\n", - " name='layers_0', low=100, high=400\n", - " )\n", - " autoencoder_params['layers'][1] = trial.suggest_int(\n", - " name='layers_1', low=50, high=100\n", - " )\n", - " autoencoder_params['code_size'] = trial.suggest_int(\n", - " name='code_size', low=10, high=30\n", - " )\n", - "\n", - " # update the configuration\n", - " model_config.update_config(model_config.config_dict)\n", + " autoencoder_params['layers'][0] = trial.suggest_int(name='layers_0', low=100, high=400)\n", + " autoencoder_params['layers'][1] = trial.suggest_int(name='layers_1', low=50, high=100)\n", + " autoencoder_params['code_size'] = trial.suggest_int(name='code_size', low=10, high=30)\n", "\n", " # create a new model using our new configuration and train the model\n", - " model = FaultDetector(model_config)\n", + " model = FaultDetector(Config(config_dict=cfg))\n", " _ = model.fit(train_data, normal_index=normal_index, 
save_models=False)\n", " predictions = model.predict(test_data)\n", "\n", - " score = fbeta_score(\n", + " return fbeta_score(\n", " y_true=ground_truth.sort_index(),\n", " y_pred=predictions.predicted_anomalies.sort_index(),\n", " beta=0.5\n", - " )\n", - "\n", - " return score" + " )" ], - "id": "1651af56e86b8c17", - "outputs": [], - "execution_count": null + "id": "2d93c80b0090b855" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "study = op.create_study(sampler=op.samplers.TPESampler(),\n", - " study_name='ad_optimization',\n", - " direction='maximize')\n", + "study = op.create_study(sampler=op.samplers.TPESampler(), study_name='ad_optimization', direction='maximize')\n", "\n", "# if we want to ensure that the first trial is done with the hyperparameters of the configuration, we need to enqueue a trial:\n", "autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", "study.enqueue_trial(params={\n", " 'batch_size': autoencoder_params['batch_size'],\n", " 'learning_rate': autoencoder_params['learning_rate'],\n", - " 'decay_rate': autoencoder_params['decay_rate'],\n", " 'layers_0': autoencoder_params['layers'][0],\n", " 'layers_1': autoencoder_params['layers'][1],\n", " 'code_size': autoencoder_params['code_size'],\n", @@ -356,74 +272,64 @@ "\n", "study.optimize(f_score, n_trials=5)" ], - "id": "713d5956170a993e", - "outputs": [], - "execution_count": null + "id": "d8a1cd60b4664efd" }, { "metadata": {}, "cell_type": "code", - "source": [ - "study.trials_dataframe()" - ], - "id": "7b1b25ddfb3e5b2d", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "study.trials_dataframe()", + "id": "49d60813c32812ed" }, { "metadata": {}, "cell_type": "markdown", "source": [ - "### Optimize CARE score\n", - "Optimize the CARE Score. Note that this is extremely slow, as we train a model for each subdataset." 
+ "## Optimize fault detection model - CARE score\n", + "Optimize the CARE Score. Note that this takes a while, as we train a model for each subdataset." ], - "id": "bbdfaaaf03341cbc" + "id": "bb8fc58e17ba1b64" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ + "# Our test set - Wind Farm B from the CARE2Compare dataset\n", + "c2c = Care2CompareDataset(data_path)\n", "wind_farm = 'B'\n", + "\n", + "# Model configuration starting point\n", "model_config = Config('c2c_configs/windfarm_B.yaml')\n", + "c2c.update_c2c_config(model_config, 'B')\n", "\n", "# speed up for testing\n", - "N = 100\n", + "N = 10000\n", + "max_datasets = 15\n", "\n", "def care_objective(trial: op.Trial) -> float:\n", - " \"\"\"Returns the F-score of the model (only useful for datasets with anomalies).\n", - "\n", - " Args:\n", - " trial: optuna Trial object\n", + " \"\"\"Returns the CARE score of the FaultDetector model.\"\"\"\n", "\n", - " Returns:\n", - " Score of the FaultDetector model.\n", - " \"\"\"\n", + " # Use a fresh config dict per trial\n", + " cfg = deepcopy(model_config.config_dict)\n", "\n", - " autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", - " threshold_params = model_config.config_dict['train']['threshold_selector']['params']\n", + " autoencoder_params = cfg['train']['autoencoder']['params']\n", + " threshold_params = cfg['train']['threshold_selector']['params']\n", "\n", - " autoencoder_params['batch_size'] = int(trial.suggest_categorical(\n", - " name='batch_size', choices=[32, 64, 128]\n", - " ))\n", - " autoencoder_params['learning_rate'] = trial.suggest_float(\n", - " name='learning_rate', low=1e-5, high=0.01, log=True\n", - " )\n", + " autoencoder_params['batch_size'] = int(trial.suggest_categorical(name='batch_size', choices=[32, 64, 128]))\n", + " autoencoder_params['learning_rate'] = trial.suggest_float(name='learning_rate', low=1e-5, high=0.01, log=True)\n", "\n", " # 
architecture\n", - " autoencoder_params['layers'][0] = trial.suggest_int(\n", - " name='layers_0', low=20, high=100\n", - " )\n", - " autoencoder_params['code_size'] = trial.suggest_int(\n", - " name='code_size', low=5, high=20\n", - " )\n", + " autoencoder_params['layers'][0] = trial.suggest_int(name='layers_0', low=20, high=100)\n", + " autoencoder_params['code_size'] = trial.suggest_int(name='code_size', low=5, high=20)\n", "\n", " # threshold\n", " threshold_params['gamma'] = trial.suggest_float(name='gamma', low=0.05, high=0.3)\n", " threshold_params['nn_size'] = trial.suggest_int(name='nn_size', low=20, high=50)\n", "\n", - " # update the configuration with the new hyperparameters\n", - " model_config.update_config(model_config.config_dict)\n", - "\n", + " # Create a CAREScore object and train+evaluate each dataset for this wind farm\n", " care_score = CAREScore(coverage_beta=0.5, eventwise_f_score_beta=0.5, anomaly_detection_method='criticality')\n", " i = 1\n", " for x_train, y_train, x_test, y_test, event_id in c2c.iter_formatted_datasets(wind_farm=wind_farm, index_column='time_stamp'):\n", @@ -435,7 +341,7 @@ " y_test = y_test.iloc[:N]\n", " \n", " # create a new model using our new configuration and train the model\n", - " model = FaultDetector(model_config)\n", + " model = FaultDetector(Config(config_dict=cfg))\n", " _ = model.fit(x_train, normal_index=y_train, save_models=False)\n", " prediction = model.predict(x_test)\n", " event_info = c2c.event_info_all[c2c.event_info_all['event_id'] == event_id].iloc[0]\n", @@ -449,22 +355,22 @@ " ignore_normal_index=False\n", " )\n", " i += 1\n", + " if i > max_datasets:\n", + " break\n", "\n", " score = care_score.get_final_score()\n", "\n", " return score" ], - "id": "7cee58426b72d9a4", - "outputs": [], - "execution_count": null + "id": "ce3000b2686a463f" }, { "metadata": {}, "cell_type": "code", + "outputs": [], + "execution_count": null, "source": [ - "study = 
op.create_study(sampler=op.samplers.TPESampler(),\n", - " study_name='care_optimization',\n", - " direction='maximize')\n", + "study = op.create_study(sampler=op.samplers.TPESampler(), study_name='care_optimization', direction='maximize')\n", "\n", "# Ensure that the first trial is done with the hyperparameters of the provided configuration\n", "autoencoder_params = model_config.config_dict['train']['autoencoder']['params']\n", @@ -478,41 +384,37 @@ " 'nn_size': threshold_params['nn_size'],\n", "})\n", "\n", - "# since we loop through many datasets, train many models, we run the garbage collector after each trial\n", + "# Since we loop through many datasets, train many models, we run the garbage collector after each trial\n", "study.optimize(care_objective, n_trials=5, gc_after_trial=True)" ], - "id": "6621d7a2bf3ac717", - "outputs": [], - "execution_count": null + "id": "2063c7f60b979cc5" }, { "metadata": {}, "cell_type": "code", - "source": [ - "study.trials_dataframe()" - ], - "id": "9aa3d678f4fa22e3", "outputs": [], - "execution_count": null + "execution_count": null, + "source": "study.trials_dataframe()", + "id": "95c8498e85467561" } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.11.8" } }, "nbformat": 4, diff --git a/notebooks/PreDist/PreDist.ipynb b/notebooks/PreDist/PreDist.ipynb new file mode 100644 index 0000000..34ccd88 --- /dev/null +++ b/notebooks/PreDist/PreDist.ipynb @@ -0,0 +1,986 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b3569887686796a6", + "metadata": {}, + "source": [ + "# EnergyFaultDetector @ District Heating\n", + "\n", + "This 
notebook shows how to apply the EnergyFaultDetector on the PreDist dataset (available on [zenodo](https://doi.org/10.5281/zenodo.17522254)) and how to reproduce results from the accompanying paper (preprint available on [arXiv](https://doi.org/10.48550/arXiv.2511.14791))." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a149ecfec1850ff7", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:50:14.641401100Z", + "start_time": "2026-01-13T10:50:07.635583200Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import fbeta_score, precision_score, recall_score, ConfusionMatrixDisplay\n", + "\n", + "from predist_utils import train_or_get_model, find_optimal_threshold, get_arcana_importances, calculate_earliness\n", + "\n", + "from energy_fault_detector.evaluation import PreDistDataset\n", + "from energy_fault_detector import Config\n", + "from energy_fault_detector.utils.visualisation import plot_reconstruction\n", + "from energy_fault_detector.utils.analysis import create_events" + ] + }, + { + "cell_type": "markdown", + "id": "5cab669e1b0c15d", + "metadata": {}, + "source": [ + "### Load the PreDist dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b3f587b734b87ccc", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:50:14.704952800Z", + "start_time": "2026-01-13T10:50:14.647830300Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
substation IDReport dateProblem ENEvent description ENPossible anomaly startPossible anomaly endTraining startTraining endefd_possibleFault labelMonitoring potentialEvent typeEvent endEvent start
Event ID
1102014-05-04 14:44:00no DHWNo hot water. Actuator (DHW system) replaced.2014-05-03 16:00:002014-05-05 04:00:002012-03-28 09:00:002014-04-20 14:44:00TrueMotorised control valve (primary side): Actuat...3.4anomaly2014-05-04 14:44:00NaT
3122015-12-01 10:56:00no heatControl parameters updated.2015-11-29 12:00:002015-12-02 10:56:002015-03-01 00:00:002015-11-17 10:56:00TrueControl unit: Incorrect parameterisation4anomaly2015-12-01 10:56:00NaT
5112018-11-23 08:30:00no heatPump settings updated.NaT2018-11-26 09:56:592015-02-20 14:00:002018-11-09 08:30:00TrueFailure of the heating circuit pump3.8anomaly2018-11-23 08:30:00NaT
6212016-12-06 13:12:00not enough heatThe heaters are not getting warm enough. Suppl...NaT2016-12-07 13:12:002015-11-30 09:00:002016-11-22 13:12:00TrueControl unit: Incorrect parameterisation4anomaly2016-12-06 13:12:00NaT
7262020-06-13 10:38:00no DHWThe needle valve was closed. Readjusted.2020-06-12 12:00:002020-06-14 10:38:002018-10-18 13:00:002020-05-30 10:38:00TrueIncorrect setting of the differential pressure...3.1anomaly2020-06-13 10:38:00NaT
.............................................
585NaTNaNNaNNaTNaT2016-02-29 00:00:002018-02-28 00:00:00NaNNaNNaNnormal2018-03-07 00:00:002018-02-28
5922NaTNaNNaNNaTNaT2018-06-21 10:00:002019-01-31 00:00:00NaNNaNNaNnormal2019-02-07 00:00:002019-01-31
6114NaTNaNNaNNaTNaT2017-12-04 00:00:002019-12-05 00:00:00NaNNaNNaNnormal2019-12-12 00:00:002019-12-05
6619NaTNaNNaNNaTNaT2015-09-15 09:31:002017-06-14 00:00:00NaNNaNNaNnormal2017-06-21 00:00:002017-06-14
6813NaTNaNNaNNaTNaT2017-12-19 00:00:002019-12-20 00:00:00NaNNaNNaNnormal2019-12-27 00:00:002019-12-20
\n", + "

64 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " substation ID Report date Problem EN \\\n", + "Event ID \n", + "1 10 2014-05-04 14:44:00 no DHW \n", + "3 12 2015-12-01 10:56:00 no heat \n", + "5 11 2018-11-23 08:30:00 no heat \n", + "6 21 2016-12-06 13:12:00 not enough heat \n", + "7 26 2020-06-13 10:38:00 no DHW \n", + "... ... ... ... \n", + "58 5 NaT NaN \n", + "59 22 NaT NaN \n", + "61 14 NaT NaN \n", + "66 19 NaT NaN \n", + "68 13 NaT NaN \n", + "\n", + " Event description EN \\\n", + "Event ID \n", + "1 No hot water. Actuator (DHW system) replaced. \n", + "3 Control parameters updated. \n", + "5 Pump settings updated. \n", + "6 The heaters are not getting warm enough. Suppl... \n", + "7 The needle valve was closed. Readjusted. \n", + "... ... \n", + "58 NaN \n", + "59 NaN \n", + "61 NaN \n", + "66 NaN \n", + "68 NaN \n", + "\n", + " Possible anomaly start Possible anomaly end Training start \\\n", + "Event ID \n", + "1 2014-05-03 16:00:00 2014-05-05 04:00:00 2012-03-28 09:00:00 \n", + "3 2015-11-29 12:00:00 2015-12-02 10:56:00 2015-03-01 00:00:00 \n", + "5 NaT 2018-11-26 09:56:59 2015-02-20 14:00:00 \n", + "6 NaT 2016-12-07 13:12:00 2015-11-30 09:00:00 \n", + "7 2020-06-12 12:00:00 2020-06-14 10:38:00 2018-10-18 13:00:00 \n", + "... ... ... ... \n", + "58 NaT NaT 2016-02-29 00:00:00 \n", + "59 NaT NaT 2018-06-21 10:00:00 \n", + "61 NaT NaT 2017-12-04 00:00:00 \n", + "66 NaT NaT 2015-09-15 09:31:00 \n", + "68 NaT NaT 2017-12-19 00:00:00 \n", + "\n", + " Training end efd_possible \\\n", + "Event ID \n", + "1 2014-04-20 14:44:00 True \n", + "3 2015-11-17 10:56:00 True \n", + "5 2018-11-09 08:30:00 True \n", + "6 2016-11-22 13:12:00 True \n", + "7 2020-05-30 10:38:00 True \n", + "... ... ... \n", + "58 2018-02-28 00:00:00 NaN \n", + "59 2019-01-31 00:00:00 NaN \n", + "61 2019-12-05 00:00:00 NaN \n", + "66 2017-06-14 00:00:00 NaN \n", + "68 2019-12-20 00:00:00 NaN \n", + "\n", + " Fault label \\\n", + "Event ID \n", + "1 Motorised control valve (primary side): Actuat... 
\n", + "3 Control unit: Incorrect parameterisation \n", + "5 Failure of the heating circuit pump \n", + "6 Control unit: Incorrect parameterisation \n", + "7 Incorrect setting of the differential pressure... \n", + "... ... \n", + "58 NaN \n", + "59 NaN \n", + "61 NaN \n", + "66 NaN \n", + "68 NaN \n", + "\n", + " Monitoring potential Event type Event end Event start \n", + "Event ID \n", + "1 3.4 anomaly 2014-05-04 14:44:00 NaT \n", + "3 4 anomaly 2015-12-01 10:56:00 NaT \n", + "5 3.8 anomaly 2018-11-23 08:30:00 NaT \n", + "6 4 anomaly 2016-12-06 13:12:00 NaT \n", + "7 3.1 anomaly 2020-06-13 10:38:00 NaT \n", + "... ... ... ... ... \n", + "58 NaN normal 2018-03-07 00:00:00 2018-02-28 \n", + "59 NaN normal 2019-02-07 00:00:00 2019-01-31 \n", + "61 NaN normal 2019-12-12 00:00:00 2019-12-05 \n", + "66 NaN normal 2017-06-21 00:00:00 2017-06-14 \n", + "68 NaN normal 2019-12-27 00:00:00 2019-12-20 \n", + "\n", + "[64 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = PreDistDataset('./predist_data', download_dataset=False)\n", + "# Check events for manufacturer 1\n", + "dataset.events[1]" + ] + }, + { + "cell_type": "markdown", + "id": "ad56a689d11f8d2b", + "metadata": {}, + "source": [ + "### Create or load models (uses optimized configs)\n", + "\n", + "Models defined are:\n", + " - the default autoencoder,\n", + " - conditional autoencoder with day-of-week and hour-of-day time features, and\n", + " - day-of-year autoencoder with day-of-week, hour-of-day and day-of-year time features.\n", + "\n", + "The code size (bottleneck, latent dimension) of the autoencoder is represented as fraction of the input dimension." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c9177963f751115b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:50:14.854728100Z", + "start_time": "2026-01-13T10:50:14.744744800Z" + } + }, + "outputs": [], + "source": [ + "model_configs = {\n", + " 1: {\n", + " 'config_files': {\n", + " 'Default AE': './configs/m1_default_ae.yaml',\n", + " 'Conditional AE': './configs/m1_cond_ae.yaml',\n", + " 'Day-of-year AE': './configs/m1_doy_ae.yaml'\n", + " },\n", + " 'bottleneck': 0.65,\n", + " },\n", + " 2: {\n", + " 'config_files': {\n", + " 'Default AE': './configs/m2_default_ae.yaml',\n", + " 'Conditional AE': './configs/m2_cond_ae.yaml',\n", + " 'Day-of-year AE': './configs/m2_doy_ae.yaml'\n", + " },\n", + " 'bottleneck': 0.25\n", + " }\n", + "}\n", + "\n", + "time_features = {\n", + " 'Default AE': [],\n", + " 'Conditional AE': ['hour_of_day', 'day_of_week'],\n", + " 'Day-of-year AE': ['hour_of_day', 'day_of_week', 'day_of_year'],\n", + "}\n", + "\n", + "# Model file exists, load the model\n", + "load_from_file = True" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d6f9edc9349242ca", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:42.942459900Z", + "start_time": "2026-01-13T10:50:15.097719300Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.\n", + "[Parallel(n_jobs=100)]: Done 5 out of 64 | elapsed: 7.9s remaining: 1.5min\n", + "[Parallel(n_jobs=100)]: Done 12 out of 64 | elapsed: 9.0s remaining: 38.8s\n", + "[Parallel(n_jobs=100)]: Done 19 out of 64 | elapsed: 9.3s remaining: 22.0s\n", + "[Parallel(n_jobs=100)]: Done 26 out of 64 | elapsed: 9.5s remaining: 14.0s\n", + "[Parallel(n_jobs=100)]: Done 33 out of 64 | elapsed: 9.9s remaining: 9.3s\n", + "[Parallel(n_jobs=100)]: Done 40 out of 64 | elapsed: 10.1s remaining: 6.0s\n", + 
"[Parallel(n_jobs=100)]: Done 47 out of 64 | elapsed: 10.1s remaining: 3.7s\n", + "[Parallel(n_jobs=100)]: Done 54 out of 64 | elapsed: 10.1s remaining: 1.9s\n", + "[Parallel(n_jobs=100)]: Done 61 out of 64 | elapsed: 10.2s remaining: 0.5s\n", + "[Parallel(n_jobs=100)]: Done 64 out of 64 | elapsed: 10.6s finished\n", + "[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.\n", + "[Parallel(n_jobs=100)]: Done 5 out of 64 | elapsed: 1.4s remaining: 16.2s\n", + "[Parallel(n_jobs=100)]: Done 12 out of 64 | elapsed: 1.6s remaining: 7.0s\n", + "[Parallel(n_jobs=100)]: Done 19 out of 64 | elapsed: 1.8s remaining: 4.3s\n", + "[Parallel(n_jobs=100)]: Done 26 out of 64 | elapsed: 1.9s remaining: 2.8s\n", + "[Parallel(n_jobs=100)]: Done 33 out of 64 | elapsed: 4.8s remaining: 4.5s\n", + "[Parallel(n_jobs=100)]: Done 40 out of 64 | elapsed: 5.6s remaining: 3.3s\n", + "[Parallel(n_jobs=100)]: Done 47 out of 64 | elapsed: 6.0s remaining: 2.2s\n", + "[Parallel(n_jobs=100)]: Done 54 out of 64 | elapsed: 6.2s remaining: 1.2s\n", + "[Parallel(n_jobs=100)]: Done 61 out of 64 | elapsed: 6.3s remaining: 0.3s\n", + "[Parallel(n_jobs=100)]: Done 64 out of 64 | elapsed: 7.2s finished\n", + "[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.\n", + "[Parallel(n_jobs=100)]: Done 5 out of 64 | elapsed: 1.4s remaining: 16.9s\n", + "[Parallel(n_jobs=100)]: Done 12 out of 64 | elapsed: 1.6s remaining: 7.0s\n", + "[Parallel(n_jobs=100)]: Done 19 out of 64 | elapsed: 2.1s remaining: 4.9s\n", + "[Parallel(n_jobs=100)]: Done 26 out of 64 | elapsed: 2.4s remaining: 3.5s\n", + "[Parallel(n_jobs=100)]: Done 33 out of 64 | elapsed: 2.5s remaining: 2.4s\n", + "[Parallel(n_jobs=100)]: Done 40 out of 64 | elapsed: 2.5s remaining: 1.5s\n", + "[Parallel(n_jobs=100)]: Done 47 out of 64 | elapsed: 2.6s remaining: 0.9s\n", + "[Parallel(n_jobs=100)]: Done 54 out of 64 | elapsed: 2.6s remaining: 0.5s\n", + "[Parallel(n_jobs=100)]: Done 61 out of 64 | elapsed: 
2.7s remaining: 0.1s\n", + "[Parallel(n_jobs=100)]: Done 64 out of 64 | elapsed: 3.2s finished\n", + "[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.\n", + "[Parallel(n_jobs=100)]: Done 7 out of 56 | elapsed: 1.3s remaining: 8.9s\n", + "[Parallel(n_jobs=100)]: Done 13 out of 56 | elapsed: 1.3s remaining: 4.4s\n", + "[Parallel(n_jobs=100)]: Done 19 out of 56 | elapsed: 1.6s remaining: 3.2s\n", + "[Parallel(n_jobs=100)]: Done 25 out of 56 | elapsed: 1.7s remaining: 2.1s\n", + "[Parallel(n_jobs=100)]: Done 31 out of 56 | elapsed: 1.7s remaining: 1.4s\n", + "[Parallel(n_jobs=100)]: Done 37 out of 56 | elapsed: 1.9s remaining: 1.0s\n", + "[Parallel(n_jobs=100)]: Done 43 out of 56 | elapsed: 1.9s remaining: 0.6s\n", + "[Parallel(n_jobs=100)]: Done 49 out of 56 | elapsed: 2.0s remaining: 0.3s\n", + "[Parallel(n_jobs=100)]: Done 56 out of 56 | elapsed: 2.3s finished\n", + "[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.\n", + "[Parallel(n_jobs=100)]: Done 7 out of 56 | elapsed: 1.5s remaining: 10.4s\n", + "[Parallel(n_jobs=100)]: Done 13 out of 56 | elapsed: 2.0s remaining: 6.8s\n", + "[Parallel(n_jobs=100)]: Done 19 out of 56 | elapsed: 2.1s remaining: 4.0s\n", + "[Parallel(n_jobs=100)]: Done 25 out of 56 | elapsed: 2.1s remaining: 2.6s\n", + "[Parallel(n_jobs=100)]: Done 31 out of 56 | elapsed: 2.2s remaining: 1.8s\n", + "[Parallel(n_jobs=100)]: Done 37 out of 56 | elapsed: 2.3s remaining: 1.2s\n", + "[Parallel(n_jobs=100)]: Done 43 out of 56 | elapsed: 2.3s remaining: 0.7s\n", + "[Parallel(n_jobs=100)]: Done 49 out of 56 | elapsed: 2.4s remaining: 0.3s\n", + "[Parallel(n_jobs=100)]: Done 56 out of 56 | elapsed: 2.6s finished\n", + "[Parallel(n_jobs=100)]: Using backend LokyBackend with 100 concurrent workers.\n", + "[Parallel(n_jobs=100)]: Done 7 out of 56 | elapsed: 1.5s remaining: 10.7s\n", + "[Parallel(n_jobs=100)]: Done 13 out of 56 | elapsed: 1.6s remaining: 5.4s\n", + "[Parallel(n_jobs=100)]: Done 19 
out of 56 | elapsed: 1.9s remaining: 3.7s\n", + "[Parallel(n_jobs=100)]: Done 25 out of 56 | elapsed: 1.9s remaining: 2.4s\n", + "[Parallel(n_jobs=100)]: Done 31 out of 56 | elapsed: 2.2s remaining: 1.8s\n", + "[Parallel(n_jobs=100)]: Done 37 out of 56 | elapsed: 2.2s remaining: 1.1s\n", + "[Parallel(n_jobs=100)]: Done 43 out of 56 | elapsed: 2.2s remaining: 0.7s\n", + "[Parallel(n_jobs=100)]: Done 49 out of 56 | elapsed: 2.3s remaining: 0.3s\n", + "[Parallel(n_jobs=100)]: Done 56 out of 56 | elapsed: 2.9s finished\n" + ] + } + ], + "source": [ + "# Create or load model, predict and collect results for all events\n", + "from joblib import Parallel, delayed\n", + "\n", + "\n", + "# Select which dataset(s) and model configs to test:\n", + "manufacturers = [1, 2]\n", + "models = ['Default AE', 'Conditional AE', 'Day-of-year AE']\n", + "\n", + "results = {}\n", + "for manufacturer in manufacturers:\n", + " results[manufacturer] = {}\n", + " for model_name, config_file in model_configs[manufacturer]['config_files'].items():\n", + " if model_name not in models:\n", + " continue\n", + "\n", + " # get configuration and time features\n", + " conf = Config(config_file)\n", + "\n", + " # Prepare parameters for parallel execution\n", + " bottleneck_ratio = model_configs[manufacturer]['bottleneck']\n", + " events_to_process = dataset.events[manufacturer].index\n", + "\n", + " # Run parallel over events\n", + " # n_jobs=-1 uses all CPU cores. 
Adjust if memory is an issue.\n", + " parallel_results = Parallel(n_jobs=-1, verbose=10)(\n", + " delayed(train_or_get_model)(\n", + " event_id, dataset, manufacturer, model_name,\n", + " conf, bottleneck_ratio, load_from_file, time_features[model_name]\n", + " ) for event_id in events_to_process\n", + " )\n", + "\n", + " # Create the results dictionary\n", + " results[manufacturer][model_name] = dict(parallel_results)" + ] + }, + { + "cell_type": "markdown", + "id": "3c0ee0eeabed5068", + "metadata": {}, + "source": [ + "### Find optimal criticality threshold based on the reliability score\n", + "Calculate max criticality before the report timestamp and optimize criticality threshold. We use cross-validation to find the criticality threshold to prevent overfitting, so the model will generalize better to unseen data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4f741a688f149cd7", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:46.183337500Z", + "start_time": "2026-01-13T10:51:43.076058800Z" + } + }, + "outputs": [], + "source": [ + "max_criticality_results = {}\n", + "criticality_thresholds = {}\n", + "predicted_anomalies = {}\n", + "true_anomalies = {}\n", + "\n", + "for manufacturer in results.keys():\n", + " # prepare result dictionaries\n", + " max_criticality_results[manufacturer] = {}\n", + " criticality_thresholds[manufacturer] = {}\n", + " predicted_anomalies[manufacturer] = {}\n", + "\n", + " # save true anomalies for easy access later\n", + " true_anomalies[manufacturer] = (dataset.events[manufacturer]['Event type'] == 'anomaly').astype(int)\n", + "\n", + " for model_name, results_dict in results[manufacturer].items():\n", + "\n", + " # calculate max criticality for each event\n", + " max_criticality_list = []\n", + " for event_id, prediction in results_dict.items():\n", + " event_row = dataset.events[manufacturer].loc[event_id]\n", + " max_criticality = prediction.criticality().loc[:event_row['Report 
date']].max()\n", + " max_criticality_list += [(event_id, max_criticality)]\n", + "\n", + " # Transform results to pandas series with max criticality with event id as index\n", + " c = pd.DataFrame(max_criticality_list, columns=['event_id', 'max_criticality'])\n", + " c = c.set_index('event_id')['max_criticality']\n", + " max_criticality_results[manufacturer][model_name] = c\n", + "\n", + " criticality_threshold, _ = find_optimal_threshold(\n", + " true_anomalies=true_anomalies[manufacturer],\n", + " max_criticalities=max_criticality_results[manufacturer][model_name],\n", + " )\n", + " criticality_thresholds[manufacturer][model_name] = criticality_threshold\n", + " predicted_anomalies[manufacturer][model_name] = c > criticality_threshold" + ] + }, + { + "cell_type": "markdown", + "id": "ae299a2f61b2f5a5", + "metadata": {}, + "source": [ + "### Final results (reliability and eventwise precision+recall)\n", + "\n", + "Note: The trained models may not reproduce the exact results reported in the paper due to random initialization, hardware differences, and random seeds. In practice, it is advisable to train each model 5–10 times and select the best-performing run for the target application." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3b5c248383e4592e", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:47.519369900Z", + "start_time": "2026-01-13T10:51:46.252375100Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Manufacturer m1\n", + "Model Default AE:\n", + "Reliability: 0.87, Precision: 0.95, Recall: 0.66, Earliness: 0.59\n", + "Model Conditional AE:\n", + "Reliability: 0.90, Precision: 1.00, Recall: 0.66, Earliness: 0.60\n", + "Model Day-of-year AE:\n", + "Reliability: 0.92, Precision: 1.00, Recall: 0.69, Earliness: 0.65\n", + "Manufacturer m2\n", + "Model Default AE:\n", + "Reliability: 0.64, Precision: 0.77, Recall: 0.38, Earliness: 0.35\n", + "Model Conditional AE:\n", + "Reliability: 0.74, Precision: 0.82, Recall: 0.54, Earliness: 0.52\n", + "Model Day-of-year AE:\n", + "Reliability: 0.73, Precision: 0.86, Recall: 0.46, Earliness: 0.44\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABG0AAAExCAYAAADGGl/sAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAa0RJREFUeJzt3XdcVfX/B/DXYV32cIEogqKiuFNzpaChuDX3CnCluXdZXyeV5h5f0+xHoIgjZ2pluSeV2pc0U3KPVJxsGcLn9wdx8wrcoXcdeD17nMeje865577vVc+L++ZzPkcSQggQEREREREREZFZsTB1AUREREREREREVBCbNkREREREREREZohNGyIiIiIiIiIiM8SmDRERERERERGRGWLThoiIiIiIiIjIDLFpQ0RERERERERkhti0ISIiIiIiIiIyQ2zaEBERERERERGZITZtiIiIiIiIiIjMEJs2REREZujy5cto164dXFxcIEkSdu3apdfj37hxA5IkISoqSq/HLQ58fHwQFhZm6jKIiIiI2LQhIiIqytWrVzFixAhUqVIFtra2cHZ2RosWLbB8+XI8e/bMoK8dGhqK8+fP49NPP0V0dDQaNWpk0Ncrjv7880/Mnj0bN27cMHUpWpMkCZIkYdiwYYVu//jjj5X7PHr0SLk+Pj4eEydORPPmzWFrawtJkmT1vomIiKhwkhBCmLoIIiIic/Pdd9+hd+/eUCgUCAkJQe3atZGVlYUTJ05g+/btCAsLw9q1aw3y2s+ePYO9vT0+/vhjfPLJJwZ5DSEEMjMzYW1tDUtLS4O8hqlt27YNvXv3xuHDhxEYGKj18zIzM2FhYQFra2vDFVcESZJga2sLW1tbJCQkwMbGRmV7lSpVcO/ePWRkZODhw4coU6YMACAqKgpDhw6Fv78/rKysEBcXh+vXr8PHx8fo74GIiIj0hyNtiIiIXnL9+nX069cP3t7e+PPPP7F8+XIMHz4co0ePxqZNm/Dnn3+iVq1
aBnv9hw8fAgBcXV0N9hr5zYHi2rDRlRBCOXpKoVCYpGGTr3379khOTsYPP/ygsv7UqVO4fv06OnXqVOA5Xbt2RWJiIs6fP4+BAwcaq1QiIiIyMDZtiIiIXrJgwQKkpqYiIiIC5cuXL7C9atWqGD9+vPLx8+fPER4eDl9fXygUCvj4+OCjjz5CZmamyvN8fHzQuXNnnDhxAm+++SZsbW1RpUoVrF+/XrnP7Nmz4e3tDQCYOnUqJElSjpYICwsrdOTE7NmzIUmSyrr9+/fjrbfegqurKxwdHeHn54ePPvpIub2oOW0OHTqEli1bwsHBAa6urujWrRsuXrxY6OtduXIFYWFhcHV1hYuLCwYPHoz09PSiP9h/BAYGonbt2jh37hwCAgJgb2+PqlWrYtu2bQCAo0ePokmTJrCzs4Ofnx8OHDig8vybN29i1KhR8PPzg52dHUqXLo3evXurXA4UFRWF3r17AwBat26tvKToyJEjAP79s/jxxx/RqFEj2NnZ4csvv1Ruy5/TRgiB1q1bo2zZsnjw4IHy+FlZWahTpw58fX2Rlpam8T3rokKFCmjVqhU2btyosj4mJgZ16tRB7dq1CzynVKlScHJy0msdREREZHps2hAREb1kz549qFKlCpo3b67V/sOGDcPMmTPxxhtvYOnSpQgICMC8efPQr1+/AvteuXIFvXr1Qtu2bbF48WK4ubkhLCwMFy5cAAD06NEDS5cuBQD0798f0dHRWLZsmU71X7hwAZ07d0ZmZibmzp2LxYsXo2vXrjh58qTa5x04cADBwcF48OABZs+ejUmTJuHUqVNo0aJFofOj9OnTBykpKZg3bx769OmDqKgozJkzR6sanz59is6dO6NJkyZYsGABFAoF+vXrhy1btqBfv37o2LEj5s+fj7S0NPTq1QspKSnK554+fRqnTp1Cv379sGLFCowcORIHDx5EYGCgsmnUqlUrjBs3DgDw0UcfITo6GtHR0ahZs6byOPHx8ejfvz/atm2L5cuXo379+gXqlCQJX3/9NTIyMjBy5Ejl+lmzZuHChQuIjIyEg4ODVu9ZFwMGDMCePXuQmpoKIK8xuHXrVgwYMEDvr0VERERmTBAREZFSUlKSACC6deum1f5xcXECgBg2bJjK+ilTpggA4tChQ8p13t7eAoA4duyYct2DBw+EQqEQkydPVq67fv26ACAWLlyocszQ0FDh7e1doIZZs2aJFyN96dKlAoB4+PBhkXXnv0ZkZKRyXf369UW5cuXE48ePlet+//13YWFhIUJCQgq83pAhQ1SO+c4774jSpUsX+Zr5AgICBACxceNG5bpLly4JAMLCwkL8/PPPyvU//vhjgTrT09MLHDM2NlYAEOvXr1eu27p1qwAgDh8+XGD//D+Lffv2FbotNDRUZd2XX34pAIgNGzaIn3/+WVhaWooJEyZofK+6AiBGjx4tnjx5ImxsbER0dLQQQojvvvtOSJIkbty4ofz8i/rzXbhwoQAgrl+/rvf6iIiIyLg40oaIiOgFycnJAKD1pSbff/89AGDSpEkq6ydPngwgb0LjF/n7+6Nly5bKx2XLloWfnx+uXbv2yjW/LH8unG+//Ra5ublaPefevXuIi4tDWFgYSpUqpVxft25dtG3bVvk+X/TiyBMAaNmyJR4/fqz8DNVxdHRUGYnk5+cHV1dX1KxZE02aNFGuz///Fz8fOzs75f9nZ2fj8ePHqFq1KlxdXfHbb79p8W7zVK5cGcHBwVrt+9577yE4OBhjx47Fu+++C19fX3z22Wdav5au3Nzc0L59e2zatAkAsHHjRjRv3lx56RwRERGVDGzaEBERvcDZ2RkAVC7HUefmzZuwsLBA1apVVdZ7eHjA1dUVN2/eVFlfqVKlAsdwc3PD06dPX7Higvr27YsWLVpg2LBhcHd3R79+/fDNN9+obeDk1+nn51dgW82aNfHo0aMCc7e8/F7c3NwAQKv3UrFixQLz8Li4uMDLy6vAupeP+ezZM8y
cORNeXl5QKBQoU6YMypYti8TERCQlJWl87XyVK1fWel8AiIiIQHp6Oi5fvoyoqCiV5lFR7t+/r7Locqv4AQMGYP/+/bh16xZ27drFS6OIiIhKIDZtiIiIXuDs7AxPT0/88ccfOj3v5QZEUYq6W5MQ4pVfIycnR+WxnZ0djh07hgMHDuDdd9/FuXPn0LdvX7Rt27bAvq/jdd5LUc/V5phjx47Fp59+ij59+uCbb77BTz/9hP3796N06dJajywCoFXT5UVHjhxRTi59/vx5rZ5Tvnx5lWXLli1av17Xrl2hUCgQGhqKzMxM9OnTR6d6iYiISP6sTF0AERGRuencuTPWrl2L2NhYNGvWTO2+3t7eyM3NxeXLl1UmuU1ISEBiYqJeL2dxc3NDYmJigfUvj+YBAAsLC7z99tt4++23sWTJEnz22Wf4+OOPcfjwYQQFBRX6PoC8yXlfdunSJZQpU8YgE+6+im3btiE0NBSLFy9WrsvIyCjw2WjbSNPGvXv3MHbsWLRr1w42NjaYMmUKgoODNf757t+/X+WxLreKt7OzQ/fu3bFhwwZ06NABZcqUeaXaiYiISL440oaIiOgl06ZNg4ODA4YNG4aEhIQC269evYrly5cDADp27AgABe7wtGTJEgBAp06d9FaXr68vkpKScO7cOeW6e/fuYefOnSr7PXnypMBz8++M9PJtyPOVL18e9evXx7p161SaH3/88Qd++ukn5fs0B5aWlgVG86xcubLAKKL8JlNhjS5dDR8+HLm5uYiIiMDatWthZWWFoUOHahxVFBQUpLIUdgt5daZMmYJZs2ZhxowZr1M+ERERyRRH2hAREb3E19cXGzduRN++fVGzZk2EhISgdu3ayMrKwqlTp7B161aEhYUBAOrVq4fQ0FCsXbsWiYmJCAgIwK+//op169ahe/fuaN26td7q6tevHz744AO88847GDduHNLT07F69WpUr15dZQLeuXPn4tixY+jUqRO8vb3x4MEDfPHFF6hYsSLeeuutIo+/cOFCdOjQAc2aNcPQoUPx7NkzrFy5Ei4uLpg9e7be3sfr6ty5M6Kjo+Hi4gJ/f3/ExsbiwIEDKF26tMp+9evXh6WlJT7//HMkJSVBoVCgTZs2KFeunE6vFxkZie+++w5RUVGoWLEigLwm0aBBg7B69WqMGjVKb+/tZfXq1UO9evU07peUlISVK1cCgPLW7v/973/h6uoKV1dXjBkzxmA1EhERkeGwaUNERFSIrl274ty5c1i4cCG+/fZbrF69GgqFAnXr1sXixYsxfPhw5b7/93//hypVqiAqKgo7d+6Eh4cHpk+fjlmzZum1ptKlS2Pnzp2YNGkSpk2bhsqVK2PevHm4fPmyStOma9euuHHjBr7++ms8evQIZcqUQUBAAObMmaOc2LcwQUFB2LdvH2bNmoWZM2fC2toaAQEB+Pzzz3WetNeQli9fDktLS8TExCAjIwMtWrTAgQMHCtwJysPDA2vWrMG8efMwdOhQ5OTk4PDhwzo1be7cuYOJEyeiS5cuCA0NVa4fOHAgtm/fjmnTpqFDhw4m/3yePn1aYDRO/uVj3t7ebNoQERHJlCS0mS2QiIiIiIiIiIiMinPaEBERERERERGZITZtiIiIiIiIiIjMEJs2RERERERERERmiE0bIiIiIiIiIiIzxKYNEREREREREZEZYtOGiIiIiIiIiMgMsWlDRERERERERGSG2LQhIiIiIiIiIjJDbNoQEREREREREZkhNm2IiIiIiIiIiMwQmzZERERERERERGaITRsiIiIiIiIiIjPEpg0RERERERERkRli04aIiIiIiIiIyAyxaUNEREREREREZIbYtCEiIiIiIiIiMkNs2hARERERERERmSE2bYiIiIiIiIiIzBCbNkREREREREREZohNGyIiIiIiIiIiM8SmDRERERERERGRGWLThmTn8uXLaNeuHVxcXCBJEnbt2mWQ1wkMDERgYKBBjk1ERPoXFRUFSZJ
w48YN5TpdzuVhYWHw8fExSG26OnLkCCRJwpEjR0xdChFRibRv3z7Ur18ftra2kCQJiYmJpi6JSig2bUjv8n9ozl9sbW3h6emJ4OBgrFixAikpKa91/NDQUJw/fx6ffvopoqOj0ahRIz1Vrt7du3cxe/ZsxMXF6fzcL774ApIkoUmTJkXu8+Jn9vIycuTI16iciMgwrl69ihEjRqBKlSqwtbWFs7MzWrRogeXLl+PZs2emLq9Qr3MuN3fMGiIyB4b+LmAMjx8/Rp8+fWBnZ4dVq1YhOjoaDg4Opi5Lb6ZNmwZJktC3b99Ct9+4cUNtXsyfP9/IFZdsVqYugIqvuXPnonLlysjOzsb9+/dx5MgRTJgwAUuWLMHu3btRt25dnY/57NkzxMbG4uOPP8aYMWMMUHXR7t69izlz5sDHxwf169fX6bkxMTHw8fHBr7/+iitXrqBq1aqF7te2bVuEhIQUWF+9evVXKZmIyGC+++479O7dGwqFAiEhIahduzaysrJw4sQJTJ06FRcuXMDatWtNXSZ++uknlcfqzuVfffUVcnNzjVidfjFriMicGOK7gLGcPn0aKSkpCA8PR1BQkKnL0SshBDZt2gQfHx/s2bMHKSkpcHJyKnTf/v37o2PHjgXWN2jQwNBl0gvYtCGD6dChg8oomOnTp+PQoUPo3LkzunbtiosXL8LOzk6nYz58+BAA4Orqqs9SDer69es4deoUduzYgREjRiAmJgazZs0qdN/q1atj0KBBRq6QiEg3169fR79+/eDt7Y1Dhw6hfPnyym2jR4/GlStX8N1335mwwn/Z2Nhova+1tbUBKzEsZg0RmRtDfBcwlgcPHgCQ13cOAEhPT4e9vb3afY4cOYI7d+7g0KFDCA4Oxo4dOxAaGlrovm+88Qbzwgzw8igyqjZt2mDGjBm4efMmNmzYoLLt0qVL6NWrF0qVKgVbW1s0atQIu3fvVm6fPXs2vL29AQBTp06FJEnKuQdu3ryJUaNGwc/PD3Z2dihdujR69+6tMq9B/jEkSSpQV2HzILzoyJEjaNy4MQBg8ODByqGBUVFRGt9zTEwM3Nzc0KlTJ/Tq1QsxMTEan0NEZM4WLFiA1NRUREREqDRs8lWtWhXjx49XPn7+/DnCw8Ph6+sLhUIBHx8ffPTRR8jMzFR5no+PDzp37owTJ07gzTffhK2tLapUqYL169cXeI0LFy6gTZs2sLOzQ8WKFfHJJ58UOkrmxTltNJ3LC5vTJi0tDZMnT4aXlxcUCgX8/PywaNEiCCFU9pMkCWPGjMGuXbtQu3ZtKBQK1KpVC/v27VPZT9u80hWzhojkoKjvAufOnUNYWJjyclsPDw8MGTIEjx8/Vu5z+PBhSJKEnTt3Fjjuxo0bIUkSYmNjNdawdetWNGzYEHZ2dihTpgwGDRqEv//+W7k9MDBQ2cRo3LgxJElCWFhYoccKCAhAvXr1Ct3m5+eH4OBg5ePc3FwsW7YMtWrVgq2tLdzd3TFixAg8ffpU5XnffvstOnXqBE9PTygUCvj6+iI8PBw5OTkq+wUGBqJ27do4e/YsWrVqBXt7e3z00Uca339MTAz8/f3RunVrBAUFMS9kgE0bMrp3330XgOqQ9QsXLqBp06a4ePEiPvzwQyxevBgODg7o3r278sTco0cPLF26FEDeUL3o6GgsW7YMQN4QxlOnTqFfv35YsWIFRo4ciYMHDyIwMBDp6emvXXPNmjUxd+5cAMB7772H6OhoREdHo1WrVhqfGxMTgx49esDGxgb9+/fH5cuXcfr06UL3zcjIwKNHjwosWVlZr/0eiIj0Zc+ePahSpQqaN2+u1f7Dhg3DzJkz8cYbb2Dp0qUICAjAvHnz0K9fvwL7XrlyBb169ULbtm2xePFiuLm5ISwsDBcuXFDuc//+fbRu3RpxcXH48MMPMWHCBKxfvx7Lly9XW4eu53IhBLp27YqlS5eiffv2WLJkCfz8/DB16lRMmjSpwP4nTpzAqFGj0K9fPyx
YsAAZGRno2bOnypcOQ+UVs4aI5KKw7wL79+/HtWvXMHjwYKxcuRL9+vXD5s2b0bFjR2WTPDAwEF5eXoU2GWJiYuDr64tmzZqpfe2oqCj06dMHlpaWmDdvHoYPH44dO3bgrbfeUk40/PHHH+O9994DkHeJV3R0NEaMGFHkezl37hz++OMPlfWnT5/GX3/9pTJKZcSIEZg6dapy7rfBgwcjJiYGwcHByM7OVqnR0dERkyZNwvLly9GwYUPMnDkTH374YYHXf/z4MTp06ID69etj2bJlaN26tdr3n5mZie3bt6N///4A8r5THTp0CPfv3y90//T09ELz4vnz52pfh/RMEOlZZGSkACBOnz5d5D4uLi6iQYMGysdvv/22qFOnjsjIyFCuy83NFc2bNxfVqlVTrrt+/boAIBYuXKhyvPT09AKvERsbKwCI9evXK9fNmjVLFPbXPr/m69evK9cFBASIgIAA5ePTp08LACIyMrLI9/WyM2fOCABi//79yvdUsWJFMX78+AL7Aihy2bRpk9avSURkSElJSQKA6Natm1b7x8XFCQBi2LBhKuunTJkiAIhDhw4p13l7ewsA4tixY8p1Dx48EAqFQkyePFm5bsKECQKA+OWXX1T2c3Fxea1zeWhoqPD29lY+3rVrlwAgPvnkE5X9evXqJSRJEleuXFGuAyBsbGxU1v3+++8CgFi5cqVynbZ5dfjwYQFAHD58uMD+L2PWEJE5eZXvAoWdGzdt2lQgE6ZPny4UCoVITExUrnvw4IGwsrISs2bNUltXVlaWKFeunKhdu7Z49uyZcv3evXsFADFz5kyd3oMQQiQmJgpbW1vxwQcfqKwfN26ccHBwEKmpqUIIIY4fPy4AiJiYGJX99u3bV2B9YZ/FiBEjhL29vcp3pYCAAAFArFmzRm2NL9q2bZsAIC5fviyEECI5OVnY2tqKpUuXquyX/52rqCU2Nlbr16TXx5E2ZBKOjo7KmeOfPHmCQ4cOoU+fPkhJSVF2cB8/fozg4GBcvnxZZchiYV68HjY7OxuPHz9G1apV4erqit9++82g70WdmJgYuLu7K7ve+bO0b968ucAQRwDo1q0b9u/fX2DR1DUnIjKW5ORkAChy0sKXff/99wBQYGTK5MmTAaDA3Df+/v5o2bKl8nHZsmXh5+eHa9euqRyzadOmePPNN1X2GzhwoA7vRLvaLS0tMW7cuAK1CyHwww8/qKwPCgqCr6+v8nHdunXh7OysUrsh8opZQ0Ry8+J3AUD13Jg/GrBp06YAoHJuDAkJQWZmJrZt26Zct2XLFjx//lzj3CtnzpzBgwcPMGrUKNja2irXd+rUCTVq1HiludhcXFzQrVs3bNq0STkiKCcnB1u2bEH37t2Vd5zaunUrXFxc0LZtW5URKw0bNoSjoyMOHz5c6GeR/92oZcuWSE9Px6VLl1ReX6FQYPDgwVrXGxMTg0aNGiknqndyckKnTp2KvETqvffeKzQv/P39tX5Nen2ciJhMIjU1FeXKlQOQNxReCIEZM2ZgxowZhe7/4MEDVKhQocjjPXv2DPPmzUNkZCT+/vtvlbkGkpKS9Fu8lnJycrB582a0bt0a169fV65v0qQJFi9ejIMHD6Jdu3Yqz6lYsWKxm6GeiIoXZ2dnAND6lq03b96EhYVFgTsZeXh4wNXVFTdv3lRZX6lSpQLHcHNzU7nm/+bNm4Xe1trPz0+rmrR18+ZNeHp6FmhQ1axZU7n9RdrUru+8YtYQkRy9+F0AyPsl7pw5c7B582blJMD5Xjw31qhRA40bN0ZMTAyGDh0KIK8R0bRpU2XOJCUl4dmzZ8rn2NjYoFSpUspzdmFZUaNGDZw4caLIep89e1bgHO3h4QEgr5G0ZcsWHD9+HK1atcKBAweQkJCgvAwMAC5fvoykpCSV9/yiF9/zhQsX8J///AeHDh1S/qKksM8CACpUqKD1hPuJiYn4/vvvMWbMGFy5ckW5vkWLFti+fTv++uuvAncRrFatGvP
CDLBpQ0Z3584dJCUlKU+s+RNHTpkyRWWyrhcVddvSfGPHjkVkZCQmTJiAZs2awcXFBZIkoV+/fioTUxY2CTGAQn8T+boOHTqEe/fuYfPmzdi8eXOB7TExMQV+kCYiMnfOzs7w9PQscP2+JkWdf19maWlZ6Hrx0sS/5kib2rXNK20xa4hIbl7+LgAAffr0walTpzB16lTUr18fjo6OyM3NRfv27QucG0NCQjB+/HjcuXMHmZmZ+Pnnn/Hf//5XuX38+PFYt26d8nFAQACOHDnyWjVv2bKlwIiW/HN7cHAw3N3dsWHDBrRq1QobNmyAh4eHSrMjNzcX5cqVK3JES9myZQHkNVYCAgLg7OyMuXPnwtfXF7a2tvjtt9/wwQcfFPgsdLn71tatW5GZmYnFixdj8eLFBbbHxMRgzpw5Wh+PjIdNGzK66OhoAFA2aKpUqQIg71arr9rJ3bZtG0JDQ1VOQBkZGcoJxfK5ubkByDshvngLv5d/W1oYbb9w5IuJiUG5cuWwatWqAtt27NiBnTt3Ys2aNWZ7q0MioqJ07twZa9euRWxsrMZJH729vZGbm4vLly8rR6gAQEJCAhITE5V3BdSFt7c3Ll++XGB9fHy8xufqci739vbGgQMHkJKSojLaJn94+qvUrm1eaYtZQ0Ry8/J3gadPn+LgwYOYM2cOZs6cqdyvsPM8APTr1w+TJk3Cpk2b8OzZM1hbW6Nv377K7dOmTVO5VCr/5//8c3Z8fDzatGmjcsz4+Hi15/Tg4GDs37+/0G2WlpYYMGAAoqKi8Pnnn2PXrl0YPny4SiPf19cXBw4cQIsWLdSej48cOYLHjx9jx44dKpPkvziS8lXFxMSgdu3amDVrVoFtX375JTZu3MimjZninDZkVIcOHUJ4eDgqV66snHugXLlyCAwMxJdffol79+4VeM7Dhw81HtfS0rLAb2FXrlxZYARN/lwDx44dU65LS0tT6cYXJf+aVG1+sH727Bl27NiBzp07o1evXgWWMWPGICUlReWW5kREcjFt2jQ4ODhg2LBhSEhIKLD96tWryjs5dezYEQCUd/vLt2TJEgB5cwnoqmPHjvj555/x66+/Ktc9fPhQq9uW6nIu79ixI3JyclR+gwsAS5cuhSRJ6NChg26FQ/u80gazhojkprDvAvnNjZfPjS/nRr4yZcqgQ4cO2LBhA2JiYtC+fXuUKVNGud3f3x9BQUHKpWHDhgCARo0aoVy5clizZg0yMzOV+//www+4ePGi2jwqX768yjFf/kXzu+++i6dPn2LEiBFITU0tML9Onz59kJOTg/Dw8ALHfv78uTKTCvsssrKy8MUXXxRZmzZu376NY8eOoU+fPoXmxeDBg3HlyhX88ssvr/U6ZBgcaUMG88MPP+DSpUt4/vw5EhIScOjQIezfvx/e3t7YvXu3ygRgq1atwltvvYU6depg+PDhqFKlChISEhAbG4s7d+7g999/V/tanTt3RnR0NFxcXODv74/Y2FgcOHAApUuXVtmvXbt2qFSpEoYOHYqpU6fC0tISX3/9NcqWLYtbt26pfQ1fX1+4urpizZo1cHJygoODA5o0aYLKlSsX2Hf37t1ISUlB165dCz1W06ZNUbZsWcTExKj8ZuCvv/7Chg0bCuzv7u6Otm3bqq2PiMhYfH19sXHjRvTt2xc1a9ZESEgIateujaysLJw6dQpbt25FWFgYAKBevXoIDQ3F2rVrlcO+f/31V6xbtw7du3d/pclvp02bhujoaLRv3x7jx4+Hg4MD1q5dC29vb5w7d05j7dqey7t06YLWrVvj448/xo0bN1CvXj389NNP+PbbbzFhwgSVSYe1pW1eaYNZQ0TmTNvvAs7OzmjVqhUWLFiA7OxsVKhQAT/99JPa0SUhISHo1asXABTaCCmMtbU1Pv/8cwwePBgBAQHo378/EhISsHz5cvj4+GDixImv/F4bNGiA2rVrY+vWrahZsybeeOMNle0BAQEYMWIE5s2bh7i4OLR
r1w7W1ta4fPkytm7diuXLl6NXr15o3rw53NzcEBoainHjxkGSJERHR7/2JcIbN26EEKLIvOjYsSOsrKwQExOjMmfcb7/9VmheaHN7ddIjU9yyioq3/Fvk5S82NjbCw8NDtG3bVixfvlwkJycX+ryrV6+KkJAQ4eHhIaytrUWFChVE586dxbZt25T7FHXL76dPn4rBgweLMmXKCEdHRxEcHCwuXbokvL29RWhoqMq+Z8+eFU2aNBE2NjaiUqVKYsmSJVrd8lsIIb799lvh7+8vrKys1N7+u0uXLsLW1lakpaUV+TmFhYUJa2tr8ejRIyGE+tuwvlwHEZE5+Ouvv8Tw4cOFj4+PsLGxEU5OTqJFixZi5cqVKrclzc7OFnPmzBGVK1cW1tbWwsvLS0yfPl1lHyHybvndqVOnAq9T2Pn43LlzIiAgQNja2ooKFSqI8PBwERER8Vrn8pdv+S2EECkpKWLixInC09NTWFtbi2rVqomFCxeK3Nxclf0AiNGjRxeo/eUc0javtLnlN7OGiMzRq3wXuHPnjnjnnXeEq6urcHFxEb179xZ3794VAAq9lXdmZqZwc3MTLi4uKrfv1saWLVtEgwYNhEKhEKVKlRIDBw4Ud+7cKfQ9aLrl94sWLFggAIjPPvusyH3Wrl0rGjZsKOzs7ISTk5OoU6eOmDZtmrh7965yn5MnT4qmTZsKOzs74enpKaZNmyZ+/PHHApkQEBAgatWqpVVtderUEZUqVVK7T2BgoChXrpzIzs7WeMvvl79fkWFJQshgZj8iIiIiIiIi5F1S5OnpiS5duiAiIsLU5QAAli9fjokTJ+LGjRuF3k2Q6FVxThsiIiIiIiKSjV27duHhw4cICQkxdSkA8uagiYiIQEBAABs2pHec04aIiIiIiIjM3i+//IJz584hPDwcDRo0QEBAgEnrSUtLw+7du3H48GGcP38e3377rUnroeKJTRsiIiIiIiIye6tXr8aGDRtQv359REVFmbocPHz4EAMGDICrqys++uijIif6JXodnNOGiIiIiIiIiMgMcU4bIiIiIiIiIiIzxKYNEREREREREZEZ4pw2ZiQ3Nxd3796Fk5MTJEkydTlEeiWEQEpKCjw9PWFhoVu/OCMjA1lZWRr3s7Gxga2t7auWSCQLzAoqrl4nJwDtsoI5QSUBc4KKq5KaE2zamJG7d+/Cy8vL1GUQGdTt27dRsWJFrffPyMiAnVNp4Hm6xn09PDxw/fp1szvREukTs4KKO11zAtA+K5gTVBIwJ6i4K2k5waaNGXFycgIA2PiHQrK0MXE1JcOtI4tMXUKJkZKcjKqVvZR/z7WVlZUFPE+HotZgQN2/i5ws3L8QiaysLLM6yRLpG7PC+JgVxvGqOQFomRXMCSohmBPGx5wwjpKaE2zamJH84YuSpQ1PsEbi7Oxs6hJKnFcepmttC8lSUeRm8QpDJInkiFlhfMwK43qtyznUZAVzgkoK5oTxMSeMq6TlBJs2RCQPkpS3qNtOREQlm7qsYE4QEZEMc4JNGyKSBwvLvKUoQs02IiIqGdRlBXOCiIhkmBNs2hCRTFgAkrohi+Y5nJGIiIxJXVYwJ4iISH45waYNEckDL48iIiJNZDjsnYiIjEiGOcGmDRHJg6bLo9RtIyKikkFdVjAniIhIhjnBpg0RyYOk4fIotZdOERFRiaAuK5gTREQkw5xg04aI5IEjbYiISBMZ/gaViIiMSIY5waYNEcmDJGkYaWOe16ASEZERqcsK5gQREckwJ9i0ISJ5sJDyFnXbiYioZFOXFcwJIiKSYU6waUNE8sDLo4iISBMZDnsnIiIjkmFOsGlDRPLAiYiJiEgTGU4wSURERiTDnGDThojkgSNtiIhIExn+BpWIiIxIhjnBpg0RyYMkqZ8czEwnDiMiIiNSlxXMCSIikmFOsGlDRPLAy6OIiEgTGQ57JyIiI5JhTrBpQ0TyIGm4PEoyz+GMRERkROqygjlBREQyzAnzbCUREb0sfyi
jukVLq1evRt26deHs7AxnZ2c0a9YMP/zwg3J7RkYGRo8ejdKlS8PR0RE9e/ZEQkKCId4VERHpk55ygoiIiikZ5gSbNkQkD5L073DGQhftT7IVK1bE/PnzcfbsWZw5cwZt2rRBt27dcOHCBQDAxIkTsWfPHmzduhVHjx7F3bt30aNHD0O9MyIi0he1WWGeP4wTEZERyTAneHkUEcmDHu8e1aVLF5XHn376KVavXo2ff/4ZFStWREREBDZu3Ig2bdoAACIjI1GzZk38/PPPaNq06SuVT0RERiDDu4IQEZERyTAnONKGiORB7SibfycUS05OVlkyMzPVHjYnJwebN29GWloamjVrhrNnzyI7OxtBQUHKfWrUqIFKlSohNjbWoG+RiIhekxY5QUREJZgMc8I8qyIiell+V1zdAsDLywsuLi7KZd68eYUe7vz583B0dIRCocDIkSOxc+dO+Pv74/79+7CxsYGrq6vK/u7u7rh//76h3yUREb0OLXJCW5z/jIioGJJhTrBpQ0TyoOVExLdv30ZSUpJymT59eqGH8/PzQ1xcHH755Re8//77CA0NxZ9//mnMd0RERPqmxwkmOf8ZEVExJMOc4Jw2RCQLkiRBUnci/WdbfqdbExsbG1StWhUA0LBhQ5w+fRrLly9H3759kZWVhcTERJXRNgkJCfDw8Hit90BERIalNit0/GGc858RERU/cswJjrQhIlmQLCSNy+vIzc1FZmYmGjZsCGtraxw8eFC5LT4+Hrdu3UKzZs1e920QEZEBaZMTus59BnD+MyKi4kKOOcGRNkQkC9qOtNHG9OnT0aFDB1SqVAkpKSnYuHEjjhw5gh9//BEuLi4YOnQoJk2ahFKlSsHZ2Rljx45Fs2bN+JtTIiIzp81vUL28vFRWz5o1C7Nnzy70KefPn0ezZs2QkZEBR0dH5fxncXFxnP+MiEiG5JgTbNoQkSxYWFhAsih6cKBQs+1lDx48QEhICO7duwcXFxfUrVsXP/74I9q2bQsAWLp0KSwsLNCzZ09kZmYiODgYX3zxxWu/ByIiMix1WZGfE7dv31a5jFahUBR5vPz5z5KSkrBt2zaEhobi6NGj+i2aiIiMRo45waYNEcmCPkfaREREqN1ua2uLVatWYdWqVVofk4iITE+b36BqO/cZwPnPiIiKGznmBOe0ISJ5kLRYiIioZDNwTnD+MyIimZNhTnCkDRHJgoWFpOHyKHZtiIhKOnVZoWtOcP4zIqLiR445waYNEcmCBA2XR3GoDRFRiac+K3TLCc5/RkRU/MgxJ9i0ISJZ0Hhbb460ISIq8dRmhY45wfnPiIiKHznmBJs2RCQPGiYiFjpMRExERMWUmqxgThARkRxzgk0bIpIFTXePUn/pFBERlQTqsoI5QUREcswJNm2ISBY0XR6l9tIpIiIqEdRlBXOCiIjkmBNs2hCRLHCkDRERaSLH36ASEZHxyDEn2LQhIlmwsLCAhZpbfkPdNiIiKhHUZgVzgoioxJNjTrBpQ0SywJE2RESkiRx/g0pERMYjx5xg04aI5EH6Z1G3nYiISjZ1WcGcICIiGeYEmzaklSE938KQni3hVb4UAODStftYGPEDDpz6s8C+W5e/j6DmtTBwylp8f/ScsUsttk7+dgUrow/g90u3cP9RMjYsHI5OgfVMXZbR8PIoIvOnKSv2rBmPtxpWU3lO5PYTmDR/s9FrLc6++uYoVm44iAePk1G7WgV8PrU3GtbyMXVZRiHHYe9EJQlzwjwwJ+SVE2zaGNCRI0fQunVrPH36FK6urqYu57XcfZCIOf/9FldvP4QkSejfqQliFr2HgEHzcenafeV+7/dvDSFMWGgxlv4sE7WrV8Cgrs3w7rSvTF2O0fHyKCqOilNOANplRdTOk5j35V7lc55lZJuq3GJpx09n8Z9lO7Hkw75oWNsHazYdRs+xq3B620yULeVk6vIMTo7D3onUYU4wJ/SNOSG/nDDPVlIhwsLCIEkS5s+fr7J+165
dZvvhFif7jv+B/af+xLXbD3H11gN8snoP0tIz0ah2ZeU+tatXwOiBbTAmfIMJKy2+2raohf+83wWdW5ec0TUvyr89n7qFSjbmhOlpkxXPMrLw4HGKcklJyzBhxcXPFxsPIaR7cwzs2gw1qpTHkun9YG9rgw27Y01dmlEwJ0gd5oTpMSdMjzkhv5yQTdMGAGxtbfH555/j6dOnejtmVlaW3o5VUlhYSOjRtiHs7Wxw+vx1AICdwhpfhYdh6oJv8OBxiokrpOIovyuubiFiTpiPwrICAHq3b4Qr++fj1OaPMHN0V9gprE1YZfGSlf0ccZduI/BNP+U6CwsLBLzpp/JnUJwxJ0gT5oT5YE4YH3NCnjkhq6ZNUFAQPDw8MG/evCL32b59O2rVqgWFQgEfHx8sXrxYZbuPjw/Cw8MREhICZ2dnvPfee4iKioKrqyv27t0LPz8/2Nvbo1evXkhPT8e6devg4+MDNzc3jBs3Djk5OcpjRUdHo1GjRnBycoKHhwcGDBiABw8eGOz9m5q/ryduH12MhJPLsGR6X7w79SvEX88bxvjZpJ749dx1/HDsvImrpOJKgoamjbnOHEZGxZwwPXVZse3HMxgxcz26jlyBpVE/oU+HxvgyPNTEFRcfjxNTkZOTW2B4e9lSznjwONlEVRmX2qxgThCYE+aAOWE6zAl55oSs5rSxtLTEZ599hgEDBmDcuHGoWLGiyvazZ8+iT58+mD17Nvr27YtTp05h1KhRKF26NMLCwpT7LVq0CDNnzsSsWbMAAMePH0d6ejpWrFiBzZs3IyUlBT169MA777wDV1dXfP/997h27Rp69uyJFi1aoG/fvgCA7OxshIeHw8/PDw8ePMCkSZMQFhaG77//Xqv3k5mZiczMTOXj5GTz/ody+WYCWg2cB2dHO3R7uwG+mP0uOo9YjipeZdGyUXUEDJqv+SBEr0jTkEVzHc5IxlXccgIoPlkRf/0+1u08qdzvz6t3cf9RMnavHgefCmVw4+9HJqyaigt1WcGcIIA5YQ6YE2RKcswJWTVtAOCdd95B/fr1MWvWLERERKhsW7JkCd5++23MmDEDAFC9enX8+eefWLhwocpJtk2bNpg8ebLy8fHjx5GdnY3Vq1fD19cXANCrVy9ER0cjISEBjo6O8Pf3R+vWrXH48GHlSXbIkCHKY1SpUgUrVqxA48aNkZqaCkdHR43vZd68eZgzZ84rfxbGlv08B9fv5J0sf790Gw38K2Fkv0A8y8xG5YplcOPQQpX9138+DLFxV9Fl5HJTlEvFDCciJm0Vp5wAik9WTJxX8M4fZ/+4AQCo4lWWP4zrQWlXR1haWuDhE9XLlB8+SUa50s4mqsq45DjBJBkfc8K0mBOmw5yQZ07I6vKofJ9//jnWrVuHixcvqqy/ePEiWrRoobKuRYsWuHz5ssowxEaNGhU4pr29vfIECwDu7u7w8fFROVm6u7urDFc8e/YsunTpgkqVKsHJyQkBAQEAgFu3bmn1PqZPn46kpCTlcvv2ba2eZy4sJAk2NlZYtu4nvDVgHloNmq9cAOCjpdsxei4nJSb9sLCQNC5E+YpLTgDFJysKU6d63m+4Ex4lGbOkYsvG2gr1a3jh6Ol45brc3FwcO/0XGteprOaZxQdzgrTFnDAfzAnjYU7IMydkN9IGAFq1aoXg4GBMnz5dpeOtLQcHhwLrrK1VJ7iSJKnQdbm5uQCAtLQ0BAcHIzg4GDExMShbtixu3bqF4OBgrScjUygUUCgUOtdvCjNHd8WBUxdw+/5TONnbolf7RnirYTX0HPuFcmb3l925/xS37j42QbXFU2p6Jq7ffqh8fPPuY5yPvwNXF3t4eZQyYWXGIUnqu99m2hgnEykuOQEUn6zwqVAGvdo3wv6TF/AkKQ21q1XApxN74ORvl3Hhyl1Tl15sjBrQBqPmRKNBzUp4o5YPVm86jLRnmRjYpampSzMKdVnBnKAXMSdMgzlheswJ+eWELJs2ADB//nzUr18
ffn7/znxds2ZNnDx5UmW/kydPonr16rC0tNTr61+6dAmPHz/G/Pnz4eXlBQA4c+aMXl/DnJRxc8Tq2SFwL+OM5NQMXLjyN3qO/QJHfr1k6tJKjLiLN9Fl5Arl44+X7gAA9O/UBF/MftdUZRmPpOFEaqYnWTId5oTxqcuKCu6uCHzTD+/3aw17Oxv8nfAUew7FYdHXP5q67GKlR7uGeJSYis++/A4PHqegTvUK2LZidIkZ9q42K5gT9BLmhPExJ0yPOSG/nJBt06ZOnToYOHAgVqz490vs5MmT0bhxY4SHh6Nv376IjY3Ff//7X3zxxRd6f/1KlSrBxsYGK1euxMiRI/HHH38gPDxc769jLsZ9slGn/d0ajzFQJSXXWw2r4+np/5q6DJPRNGRRmOlwRjId5oTxqcuKvxMS0XkE5zgzhvf6BOC9PgGmLsMk1GUFc4JexpwwPuaEeWBOyCsnZDmnTb65c+cqhxcCwBtvvIFvvvkGmzdvRu3atTFz5kzMnTv3lYY8alK2bFlERUVh69at8Pf3x/z587Fo0SK9vw4R5eGcNvQqmBNEJQtzgnTFnCAqWeSYE5IQQpi6CMqTnJwMFxcXKOoMh2RpY+pySoSSPHLF2JKTk+Fe2gVJSUlwdtZ++GX+vwu/yTtgqSh4/Xi+nMw0xC/uofPxieSGWWF8zArjeNWcyH+upqxgTlBJwZwwPuaEcZTUnJDt5VFEVLLw8igiItJEjsPeiYjIeOSYE2zaEJEsSJKk4e5R5nmSJSIi41GXFcwJIiKSY06waUNEssCRNkREpIkcf4NKRETGI8ecYNOGiGRBgoaRNuZ6jz4iIjIadVnBnCAiIjnmBJs2RCQLkpS3qNtOREQlm7qsYE4QEZEcc4JNGyKSBV4eRUREmshx2DsRERmPHHOCTRsikgVORExERJrIcYJJIiIyHjnmhFZNm927d2t9wK5du75yMUREReFIG/PGnCAicyDH36CWJMwKIjI1OeaEVk2b7t27a3UwSZKQk5PzOvUQERVOw5w2ZjpvWInBnCAis6AuK5gTJsesICKTk2FOaNW0yc3NNXQdRERq8fIo88acICJzIMdh7yUJs4KITE2OOfFac9pkZGTA1tZWX7UQERVJ0+VR6raR6TAniMiY1GUFc8J8MSuIyFjkmBMWuj4hJycH4eHhqFChAhwdHXHt2jUAwIwZMxAREaH3AomIgH9vz6duIfPAnCAiU2FOyAezgohMQY45oXPT5tNPP0VUVBQWLFgAGxsb5fratWvj//7v//RaHBFRPgsLC40LmQfmBBGZCnNCPpgVRGQKcswJnatav3491q5di4EDB8LS0lK5vl69erh06ZJeiyMiyseRNvLBnCAiU2FOyAezgohMQY45ofOcNn///TeqVq1aYH1ubi6ys7P1UhQR0cs4EbF8MCeIyFTkOMFkScWsICJTkGNO6DzSxt/fH8ePHy+wftu2bWjQoIFeiiIieln+pGHqFjIPzAkiMhXmhHwwK4jIFOSYEzqPtJk5cyZCQ0Px999/Izc3Fzt27EB8fDzWr1+PvXv3GqJGIiJIUD9k0TxPsSUTc4KITEVdVjAnzAuzgohMQY45ofNIm27dumHPnj04cOAAHBwcMHPmTFy8eBF79uxB27ZtDVEjEREsLSSNC5kH5gQRmQpzQj6YFURkCnLMCZ1H2gBAy5YtsX//fn3XQkRUJM5pIy/MCSIyBTnOVVCSMSuIyNjkmBOv1LQBgDNnzuDixYsA8q5Jbdiwod6KIiJ6mYWUt6jbTuaFOUFExqYuK5gT5olZQUTGJMec0Llpc+fOHfTv3x8nT56Eq6srACAxMRHNmzfH5s2bUbFiRX3XSEQEyQJqJweTdL7YkwyFOUFEpqIuK5gT5oVZQUSmIMec0LmsYcOGITs7GxcvXsSTJ0/w5MkTXLx4Ebm5uRg2bJghaiQigqTFf9qaN28eGjduDCcnJ5QrVw7du3dHfHy8yj4ZGRkYPXo0SpcuDUd
HR/Ts2RMJCQn6flvFEnOCiExFXzkBMCsMjVlBRKYgx5zQuWlz9OhRrF69Gn5+fsp1fn5+WLlyJY4dO6br4YiItKLPiYiPHj2K0aNH4+eff8b+/fuRnZ2Ndu3aIS0tTbnPxIkTsWfPHmzduhVHjx7F3bt30aNHD0O8tWKHOUFEpqLPCSaZFYbFrCAiU5BjTuh8eZSXlxeys7MLrM/JyYGnp6euhyMi0ookabjltw7n2H379qk8joqKQrly5XD27Fm0atUKSUlJiIiIwMaNG9GmTRsAQGRkJGrWrImff/4ZTZs2fZW3UGIwJ4jIVNRlha7zSzIrDItZQUSmIMec0HmkzcKFCzF27FicOXNGue7MmTMYP348Fi1apOvhiIi0YiFJGhcASE5OVlkyMzM1HjspKQkAUKpUKQDA2bNnkZ2djaCgIOU+NWrUQKVKlRAbG2uAd1e8MCeIyFS0yYlXxazQL2YFEZmCHHNCq5E2bm5uKre/SktLQ5MmTWBllff058+fw8rKCkOGDEH37t21fnEiIm1ZWEhqJyLO3+bl5aWyftasWZg9e3aRz8vNzcWECRPQokUL1K5dGwBw//592NjYKCdGzOfu7o779++/2hso5pgTRGQO1GVF/vrk5GSV9QqFAgqFQu1xmRX6wawgIlOTY05o1bRZtmyZ1gckIjIEbS+Pun37NpydnZXrNZ1gR48ejT/++AMnTpzQR5klFnOCiMyBNsPedW3uA8wKfWFWEJGpyTEntGrahIaG6v2FiYh0YSlJsFTTtcn9Z5uzs7NK00adMWPGYO/evTh27JjKrUU9PDyQlZWFxMRElc54QkICPDw8Xu0NFHPMCSIyB+qyIj8ndG3uMyv0h1lBRKYmx5x4rTuRZ2RkFJg/gojIECRJ0rhoSwiBMWPGYOfOnTh06BAqV66ssr1hw4awtrbGwYMHlevi4+Nx69YtNGvWTG/vqSRgThCRMWmTE/nN/fylqB/GmRXGw6wgImORY07ofPeotLQ0fPDBB/jmm2/w+PHjAttzcnJ0PSQRkUYWUt6ibru2Ro8ejY0bN+Lbb7+Fk5OT8ppSFxcX2NnZwcXFBUOHDsWkSZNQqlQpODs7Y+zYsWjWrBnvBqIF5gQRmYq6rNDxTq7MCgNjVhCRKcgxJ3QeaTNt2jQcOnQIq1evhkKhwP/93/9hzpw58PT0xPr163U9HBGRViRJUk4cVtiiy0ib1atXIykpCYGBgShfvrxy2bJli3KfpUuXonPnzujZsydatWoFDw8P7NixwxBvrdhhThCRqajLCl1yAmBWGBqzgohMQY45ofNImz179mD9+vUIDAzE4MGD0bJlS1StWhXe3t6IiYnBwIEDdT0kEZFGmi6B0vXyKE1sbW2xatUqrFq1SuvjUh7mBBGZirqs0PWHcWaFYTEriMgU5JgTOo+0efLkCapUqQIg71qvJ0+eAADeeustHDt27JULISJSx9JC0riQeWBOEJGpMCfkg1lBRKYgx5zQuWlTpUoVXL9+HQBQo0YNfPPNNwDyuuUv33+ciEhfJC0WMg/MCSIyFeaEfDAriMgU5JgTOjdtBg8ejN9//x0A8OGHH2LVqlWwtbXFxIkTMXXqVL0XSEQEABaSpHEh88CcICJTYU7IB7OCiExBjjmh85w2EydOVP5/UFAQLl26hLNnz6Jq1aqoW7euXosjIsqXP0GYuu1kHpgTRGQq6rKCOWFemBVEZApyzAmdmzYv8/b2hre3tz5qISIqkiTlLeq2k3liThCRsajLCuaEeWNWEJExyDEntGrarFixQusDjhs37pWLISIqiqYhi+Y6nLGkYE4QkTlQlxXMCdNjVhCRqckxJ7Rq2ixdulSrg0mSxBMsERkEL48yb8wJIjIHchz2XpIwK4jI1OSYE1o1bfJndifj2B75ERwcnUxdRokwfPPvpi6hxMh6lvpaz7eA+pnTdZ5VnfSKOWF8myM+ZFYYyXtbmBXGkJX+ejkBqM8K5oTpMSuMa/e
6j+Hg6GzqMkqEQevPmrqEEiH7Nb9PAPLMidee04aIyBgsLCRYcqQNERGpoS4rmBNERCTHnGDThohkwULKW9RtJyKikk1dVjAniIhIjjnBpg0RyYIkSZDUTA6mbhsREZUM6rKCOUFERHLMCTZtiEgWLC3yFnXbiYioZFOXFcwJIiKSY06waUNEssBbfhMRkSZyvJUrEREZjxxz4pV6ScePH8egQYPQrFkz/P333wCA6OhonDhxQq/FERHls5Q0L2Q+mBNEZArMCXlhVhCRsckxJ3Ru2mzfvh3BwcGws7PD//73P2RmZgIAkpKS8Nlnn+m9QCIiALCApOyMF7rATM+yJRBzgohMRW1WMCfMCrOCiExBjjmhc9Pmk08+wZo1a/DVV1/B2tpaub5Fixb47bff9FocEVE+SdK8kHlgThCRqTAn5INZQUSmIMec0HlOm/j4eLRq1arAehcXFyQmJuqjJiKiAiwtACs19+F7bqYTh5VEzAkiMhV1WcGcMC/MCiIyBTnmhM5leXh44MqVKwXWnzhxAlWqVNFLUUREL+NIG/lgThCRqTAn5INZQUSmIMec0LlpM3z4cIwfPx6//PILJEnC3bt3ERMTgylTpuD99983RI1ERLCUJI0LmQfmBBGZCnNCPpgVRGQKcswJnS+P+vDDD5Gbm4u3334b6enpaNWqFRQKBaZMmYKxY8caokYiIlhIeYu67WQemBNEZCrqsoI5YV6YFURkCnLMCZ2bNpIk4eOPP8bUqVNx5coVpKamwt/fH46Ojoaoj4gIAJs2csKcICJTkeMP4yUVs4KITEGOOaFz0yafjY0N/P399VkLEVGRLC0kWKo5k6rbRqbBnCAiY1OXFcwJ88SsICJjkmNO6Ny0ad26NSQ113odOnTotQoiIiqMpsnBzPQS1BKJOUFEpqIuK5gT5oVZQUSmIMec0LlpU79+fZXH2dnZiIuLwx9//IHQ0FB91UVEpMLKQlJ7y29128i4mBNEZCrqsoI5YV6YFURkCnLMCZ2bNkuXLi10/ezZs5GamvraBRERFUrTbfjM8xxbIjEniMhk1GUFc8KsMCuIyCRkmBM63/K7KIMGDcLXX3+tr8MREamwgKRxIfPGnCAiQ2NOyB+zgogMSY458coTEb8sNjYWtra2+jocEZEKS4u8Rd12Mm/MCSIyNHVZwZyQB2YFERmSHHNC56ZNjx49VB4LIXDv3j2cOXMGM2bM0FthREQvspAkWKi5PkrdNjIu5gQRmYq6rGBOmBdmBRGZghxzQuemjYuLi8pjCwsL+Pn5Ye7cuWjXrp3eCiMiepGlpOGW32Z6ki2JmBNEZCrqsoI5YV6YFURkCnLMCZ2aNjk5ORg8eDDq1KkDNzc3Q9VERFQAb/ktD8wJIjIlOd7KtSRiVhCRqcgxJ3S6asvS0hLt2rVDYmKigcohIiqchRYLmR5zgohMiTkhD8wKIjIVOeaEznXVrl0b165dM0QtRERFyr/+VN1C5oE5QUSmwpyQD2YFEZmCHHNC56bNJ598gilTpmDv3r24d+8ekpOTVRYiIkNg00Y+mBNEZCrMCflgVhCRKcgxJ7Se02bu3LmYPHkyOnbsCADo2rUrpBfelBACkiQhJydH/1USUYknSYCaeYjN9hrUkoQ5QUSmpi4rmBPmgVlBRKYkx5zQumkzZ84cjBw5EocPHzZkPUREhZIkSeWHusK2k2kxJ4jI1NRlBXPCPDAriMiU5JgTWjdthBAAgICAAIMVQ0RUFE2Tg5nrxGElCXOCiExNXVYwJ8wDs4KITEmOOaHTLb/NtfNERMWfputMzfUa1JKGOUFEpqQuK5gT5oNZQUSmIsec0KlpU716dY0n2SdPnrxWQUREheHlUfLAnCAiU5LjsPeSiFlBRKYix5zQqWkzZ84cuLi4GKoWIqIiWUoSLNWcSNVtI+NhThCRKanLCuaE+WBWEJGpyDEndGra9OvXD+XKlTNULURERZL+WdRtJ9NjThCRKanLCua
E+WBWEJGpyDEntG7amOtQISIqGSRJ/W34eIoyPeYEEZmauqzgKco8MCuIyJTkmBM63z2KiMgUeHmU+WNOEJGpyXHYe0nDrCAiU5JjTmh9V6vc3FwOYyQik5G0+I9MizlBRKamz5w4duwYunTpAk9PT0iShF27dqlsF0Jg5syZKF++POzs7BAUFITLly/r8d0UT8wKIjIlOeaEud6KnIhIRf5QRnULERGVbPrMibS0NNSrVw+rVq0qdPuCBQuwYsUKrFmzBr/88gscHBwQHByMjIwMPbwTIiIyBDnmBJs2RCQLFv8MZSxqsdDxLMvfoBIRFT/qskLXnOjQoQM++eQTvPPOOwW2CSGwbNky/Oc//0G3bt1Qt25drF+/Hnfv3i2QJ0REZD7kmBNs2hCRLOh7pA1/g0pEVPxokxPJyckqS2Zmps6vc/36ddy/fx9BQUHKdS4uLmjSpAliY2P19XaIiEjP5JgTbNoQkSyoG2WjaZLiwvA3qERExY82OeHl5QUXFxflMm/ePJ1f5/79+wAAd3d3lfXu7u7KbUREZH7kmBNa3z2KCvLx8cGECRMwYcIEU5diFOcv3sC2PSdx5fo9PHmaghmT+6F545rK7UIIRG89jH2HziItLQP+fpUwZmhnVChf2oRVy1e1sg4IrlEW3qXs4WpnjVXHryPu72TldieFFXrVLw9/DyfYWVvi8sNUbDr7Nx6kZpmwasPRNDmYPici1tQZ79evn95ei4q3kpYTf1y8iZ17T+Hq9bt4kpiKjyb2RdPGNZTbhRDYuO0Ifjr8G9LSMlCzuhfeH9IJnsyJV1KtrAPa+f2bE1+cKJgTPevl5YS9tSX+epiKzb8V35wA1GdF/vrbt2/D2dlZuV6hUBilNqLClLScKEz6s0xEbTmIE7/+icSkNFStXB6jwjqiRtWKpi5N9rrU9kDjSq4o72KLrOe5uPwwDVt+u4N7yf+OHLG2kDCgUUU0rVwK1hYSzt1NRtQvt5Cc8dyElRuOHHPCLEbaxMbGwtLSEp06dTJ1KaRGRkY2qnh7YNTgwv+ctu4+gd37fsHYYV2w7JPhsFVY4z/zopGVlW3kSosHhZUF7iRmYOOZO4VuH93SB2UcbLDq+HWE//gXHqdlY1JrX9hYmsU/a73T9vIofQxn5G9QzQ9zQh4yM7NQ2dsdIwZ3LHT7jj0nsffHX/D+kE5YGD4MClsbzJq/AVlZxfMHQ0NTWP6TE2cLz4lRb/2TEyeuI/ynv/AkPRsTA4tvTgDa5YSzs7PK8io/jHt4eAAAEhISVNYnJCQot5FxMSfka/GaXTh77go+HNMLXy0eg4Z1q2JaeBQePUnW/GRSq6a7I/bHP8Ts7y/h8wOXYWUh4YOgalBY/ZsDAxt7oYGXK1YevYZPfvwLbvbWmBDoa8KqDUuOOWEWqR0REYGxY8fi2LFjuHv3rqnLoSI0blANoX3fRos3axbYJoTArh9+Rr93WqFZoxqo7O2BKaN74PHTFJw6c8kE1crfH/dSsOv8ffzv74KB5e5kA98yDog5cwc3njxDQkomYs7cgbWlhDe9XY1frBFYSpqGM+btp4/hjGR+mBPy0LB+NQzq0wbNGheeE7v3/YI+3VuhaaMaqFzJHRPf744niSn4mTnxSv64n4Jv/7ivMromXznHf3Li7B3cLCE5AWjKCv29TuXKleHh4YGDBw8q1yUnJ+OXX35Bs2bN9PdCpDXmhDxlZmXj+C9/YvigYNT190EFj9II7dMGFTxKY/dPv5q6PNlbcPAKjl99jL+TMnDr6TN8efIGyjgq4FPKHgBgZ22BwKqlEXP6Nv68n4IbT9Kx9uQNVC/nCN8yDiau3jDkmBMmb9qkpqZiy5YteP/999GpUydERUUptx05cgSSJOHgwYNo1KgR7O3t0bx5c8THx6scY/Xq1fD19YWNjQ38/PwQHR2tsl2SJHz55Zfo3Lkz7O3tUbNmTcTGxuLKlSsIDAyEg4MDmjdvjqtXryqfc/XqVXT
r1g3u7u5wdHRE48aNceDAgSLfx5AhQ9C5c2eVddnZ2ShXrhwiIiJe4xOSh/sPnuJpYioa1KmiXOdgbwu/qhVw6a/bJqyseLKyyPunm50rlOsEgOe5AtXKFs8TrKTFf0DecMakpCTlMn36dJ1fi79BNS/MieIh4UEiniamol5t1Zyo7lsR8ZeZE/pm/c9omuc5BXOiajH9QRzQnBW6SE1NRVxcHOLi4gDkXTobFxeHW7duQZIkTJgwAZ988gl2796N8+fPIyQkBJ6enujevbv+3xipxZyQr5ycXOTm5sLGWnXWDhsbK/xx6aaJqiq+7G0sAQBp/4xwrVzaAVaWFrhwL0W5z73kTDxKzSyx3yl0YaycMHnT5ptvvkGNGjXg5+eHQYMG4euvv4YQQmWfjz/+GIsXL8aZM2dgZWWFIUOGKLft3LkT48ePx+TJk/HHH39gxIgRGDx4MA4fPqxyjPDwcISEhCAuLg41atTAgAEDMGLECEyfPh1nzpyBEAJjxoxR7p+amoqOHTvi4MGD+N///of27dujS5cuuHXrVqHvY9iwYdi3bx/u3bunXLd3716kp6ejb9+++viozNrTxFQAgJuLo8p6NxdH5TbSn/vJGXicloUedcvD3toSlhYS2tcoi1L2NnCxtTZ1eQZhIWleAP0MZ+RvUM0Lc6J4eJqUlwWuLqo/BLq6OOBpUpopSirW8nPinRdyIriY5wSgXU5o68yZM2jQoAEaNGgAAJg0aRIaNGiAmTNnAgCmTZuGsWPH4r333kPjxo2RmpqKffv2wdbWVt9vizRgTsiXvZ0C/tW9sGH7ETx6koyc3FwcOBaHi3/dxpOnKZoPQFqTAAxqXBHxD1JxJzHvbqgudlbIzslFenaOyr5JGc/hYlc8s0KOOWHypk1ERAQGDRoEAGjfvj2SkpJw9OhRlX0+/fRTBAQEwN/fHx9++CFOnTqlvO3uokWLEBYWhlGjRqF69eqYNGkSevTogUWLFqkcY/DgwejTpw+qV6+ODz74ADdu3MDAgQMRHByMmjVrYvz48Thy5Ihy/3r16mHEiBGoXbs2qlWrhvDwcPj6+mL37t2Fvo/mzZsX6MpHRkaid+/ecHR0LPQ5mZmZBebfINJGjgC+OHED7k4KLO9ZG6t61YGfuyPO301G7ks/pBQXFpBgIalZzLQzTq+vJOcEwKygV5MjgNUn83JiWY/a+G/POvArl5cTAsUzJwANWaFjTgQGBkIIUWDJH8UhSRLmzp2L+/fvIyMjAwcOHED16tUN8K5IE+aEvHPiwzG9AAH0G7kQHQbMwc4ffkbrFnVgoes3aFIrtEklVHS1w6pj10xdiknJMSdM2rSJj4/Hr7/+iv79+wMArKys0Ldv3wLD/+rWrav8//LlywMAHjx4AAC4ePEiWrRoobJ/ixYtcPHixSKPkT+5aJ06dVTWZWRkKE9yqampmDJlCmrWrAlXV1c4Ojri4sWLRXbGgbzueGRkJIC8yyh++OEHlS7+y+bNm6cy94aXl1eR+5o7N9e8IMn/TWq+p0mpym2kX7eePsPcH//CuO3nMeXbC1h+9DocFJZ4lFY87woiabHogr9BlYeSnhNA8cmK/JGYiS+NqklMSoObS/Ecgm1qt54+Q/hPf2H8jvOYuvsCVhy7DkeFJR4W67tH6S8nSB6YE/LPCU+PUlgyZyj2rJ+BTaunYNW8kXiekwuPcqVMXVqxEfKmFxpUdMFn/0xKny/p2XNYW1rA3tpSZX8XWyskPSueN5ORY06YtGkTERGB58+fw9PTE1ZWVrCyssLq1auxfft2JCUlKfeztv53aJb0z5TOubm5Or1WYcdQd9wpU6Zg586d+Oyzz3D8+HHExcWhTp06yMoq+gedkJAQXLt2DbGxsdiwYQMqV66Mli1bFrn/9OnTVebeuH1bvtf0e5Rzg5urI+L++Ldzm5aegfgrf6NGdXkFh9w8y85FamYOyjnawMfNHnF/J2l+kgxJkqRx0QV/gyoPJT0
ngOKTFe7lXOHm6ojfL/ybE+npmfjr6h34VWNOGNKLOeHtZo/fi2lOAJqzgoof5kTxyQk7WxuUdnNCSuoznPn9Cpo3rmHqkoqFkDe90KiSKz776a8CTfvrj9PwPCcXtco7KdeVd1agjKMClx8Wz0uX5ZgTVpp3MYznz59j/fr1WLx4Mdq1a6eyrXv37ti0aRNq1ND8D7VmzZo4efIkQkNDletOnjwJf3//16rv5MmTCAsLwzvvvAMgr1N+48YNtc8pXbo0unfvjsjISMTGxmLw4MFq91coFCa/57sunmVk4u79J8rHCQ+e4uqNe3BytEO5Mq7o3qEpNu88hgoepeFezg3R3xxCaTcnNG/EE+6rUFhZoJyjjfJxGQcbeLnaIi0rB0/Ss9HQywUpmc/xJC0bFVxt0e+NCvjf30n4834xnUPohdvwFbWdihfmRB45ZcWzjCzcezEnHj7FtRv34eRoh7JlXNC1fRN8s/M4PD1Kw72sK2K2HkYpVyc0ZU68EoWVBcq+lBMVXW2Rnp8TFf/JifRsVHCxRd83KiDu7yT8mVBMcwJQnxXMiWKHOZFHTjlRmNNxlyEAeHmWwd37j7E2+kd4VSiD9oFvmLo02Qtr4oVmlUth6eGryMjOgYtt3tf/9OwcZOcIPMvOxZErjzGwUUWkZj7Hs+xchLzphb8epOLqo+LZtJFjTpisabN37148ffoUQ4cOhYuLi8q2nj17IiIiAgsXLtR4nKlTp6JPnz5o0KABgoKCsGfPHuzYsUPtzOzaqFatGnbs2IEuXbpAkiTMmDFDq278sGHD0LlzZ+Tk5Kic+IuDy1fv4oPwKOXjtdE/AgCCWtXH5FHvoHfXt5CRmY0VX+1BanoGavlVQviHg2BjUzwnsTI071J2mNqmqvJx3zcqAABOXX+CyF9uw8XWGn0aeMJZYYWkjOeIvfEUey8kFHU42ZM0NG3MtDFOr4E5IT9Xrt3Fx5+sUz6O2PATAKBNq3qYMLI7enRpgYzMbKz6vz1IS8+Af/VKmP3hINjYmOzHEVnzdrPDlBdyok+Df3Mi6tfbcLGzRu+XcuK7P4tvTgDqs4I5UfwwJ4qHtPQMRGzaj0ePk+HkaIeWTWphcP8gWFlZan4yqRXkVw4A8J9gP5X1X568geNXHwMAYk7fhhAVMT7QF1YWEs7fTUbUL0Vfwid3cswJk/2UFBERgaCgoAInWCDvJLtgwQKcO3dO43G6d++O5cuXY9GiRRg/fjwqV66MyMhIBAYGvlZ9S5YswZAhQ9C8eXOUKVMGH3zwgVaTegUFBaF8+fKoVasWPD09X6sGc1O3VmX8sHlOkdslSUJInzYI6dPGiFUVX389SMPwzb8Xuf3Q5Uc4dPmRESsyLU234dP1Fn1k/pgT8lPH3we7N84qcrskSRjYuzUG9m5txKqKr78epuG9LcyJF6nLCuZE8cOcKB4Cm9dBYPM6mncknQ1af1bjPtm5Aut+vY11v8rzsjpdyTEnJPHy/fDotaSmpqJChQqIjIxEjx49dHpucnIyXFxcsPfMdTg4Oml+Ar22iNN3TF1CiZH1LBXfvPcWkpKS4OzsrPXz8v9dHDl3G45ORT8vNSUZgXW9dD4+kbG9Tk4A//6b2PnrVWaFkUT/dtfUJZQIWemp2PIKOQFolxXMCZILfeXEj7/dgIMj/64bw9LjJfuOTMaS/SwVu8cElric4HhkPcnNzcWjR4+wePFiuLq6omvXrqYuiahY4eVRJHfMCSLDk+Owd6J8zAkiw5NjTrBpoye3bt1C5cqVUbFiRURFRcHKih8tkT7x8iiSO+YEkeHJcdg7UT7mBJHhyTEneCbQEx8fH/BKMyLDsZDyFnXbicwZc4LI8NRlBXOCzB1zgsjw5JgTbNoQkTxIUH8bPjM9yRIRkRGpywrmBBERyTAn2LQhIlng5VFERKSJHIe9ExGR8cgxJ9i0ISJZ4OVRRESkiRyHvRMRkfHIMSfYtCEieeDlUUREpIkMh70TEZE
RyTAn2LQhIlmwkCRYqLkPn7ptRERUMqjLCuYEERHJMSfYtCEiWeBAGyIi0kSGv0AlIiIjkmNOsGlDRPLArg0REWkix5/GiYjIeGSYE2zaEJEs8PIoIiLSRI7D3omIyHjkmBNs2hCRLHCgDRERaSLDX6ASEZERyTEn2LQhIlmQJAmSmu63um1ERFQyqMsK5gQREckxJ9i0ISJ5kAC151HzPMcSEZExqcsK5gQREckwJ9i0ISJZ4OVRRESkiRyHvRMRkfHIMSfYtCEiWeDlUUREpIkch70TEZHxyDEn2LQhIlmQNFweZabnWCIiMiJ1WcGcICIiOeYEmzZEJAts2hARkSZy/GGciIiMR445waYNEcmC9M9/6rYTEVHJpi4rmBNERCTHnGDThohkQYKGkTZGq4SIiMyVuqxgThARkRxzgk0bIpIFCylvUbediIhKNnVZwZwgIiI55gSbNkQkE7zpNxERaSLHm7kSEZHxyC8n2LQhIlngRMRERKSJHCeYJCIi45FjTrBpQ0SywMujiIhIEzkOeyciIuORY06waUNEssC7RxERkSZyvCsIEREZjxxzgk0bIpIFXh5FRESayHHYOxERGY8cc4JNGyKSBTZtiIhIEzn+ME5ERMYjx5xg04aIZIGXRxERkSZyHPZORETGI8ecYNOGiGSBI22IiEgTOf4GlYiIjEeOOcGmDRHJAps2RESkiRx/GCciIuORY06waUNEsiBJEizUnEklcz3LEhGR0ajLCuYEERHJMScsTF0AEREREREREREVxJE2RCQLvDyKiIg0keOwdyIiMh455gSbNkQkCxYaLo9St42IiEoGdVnBnCAiIjnmBJs2RCQL0j+Luu1ERFSyqcsK5gQREckxJzinDRHJgiRJGhciIirZ9J0Tq1atgo+PD2xtbdGkSRP8+uuvBqiaiIiMxRDfJwydFWzaEJEs5F9/qm4hIqKSTZ85sWXLFkyaNAmzZs3Cb7/9hnr16iE4OBgPHjwwTPFERGRw+v4+YYysYNOGiGRB0mIhIqKSTZ85sWTJEgwfPhyDBw+Gv78/1qxZA3t7e3z99df6LZqIiIxG398njJEVbNoQkSzw8igiItJEXzmRlZWFs2fPIigoSLnOwsICQUFBiI2NNUTpRERkBPr8PmGsrOBExGZECAEASE9NMXElJUfWs1RTl1BiZD9LA/Dv33NdpaQkqx2ymJKS/ErHJZIbZoXxZaUzK4zhdXMCUJ8V+TmRnKyaFwqFAgqFQmXdo0ePkJOTA3d3d5X17u7uuHTp0ivXR2QM+f+G0pgTRpPN7xRGYU45ARgvK9i0MSMpKXkn1j6BdU1cCZHhpKSkwMXFRev9bWxs4OHhgWqVvTTu6+HhARsbm9cpj8js5WfFwDb1TVsIkYHomhOA9lnh6OgILy/VfWbNmoXZs2frWiaR2crPiR6t6pi4EiLDKGk5waaNGfH09MTt27fh5OQkq0s9kpOT4eXlhdu3b8PZ2dnU5RR7cv28hRBISUmBp6enTs+ztbXF9evXkZWVpXFfGxsb2NravmqJRLIgx6yQ63lLruT6eb9qTgDaZ4UQosC/m8J+e1qmTBlYWloiISFBZX1CQgI8PDx0ro/ImOSYE4B8z11yJcfP25xyAjBeVrBpY0YsLCxQsWJFU5fxypydnWXzD744kOPnrWtHPJ+trS2bMUT/kHNWyPG8JWdy/LxfNScA/WaFjY0NGjZsiIMHD6J79+4AgNzcXBw8eBBjxozRy2sQGYqccwKQ57lLzuT2eZtLTgDGywo2bYiIiIiIXjJp0iSEhoaiUaNGePPNN7Fs2TKkpaVh8ODBpi6NiIjMhDGygk0bIiIiIqKX9O3bFw8fPsTMmTNx//591K9fH/v27Ssw4SQREZVcxsgKNm3otSkUCsyaNavIa/1Iv/h5E5Hc8LxlXPy89WfMmDG8HIrISHjuMi5+3vpj6KyQxOvcL4uIiIiIiIiIiAzCwtQFEBERERERERFRQWzaEBE
RERERERGZITZtiIiIiIiIiIjMEJs2ZLaOHDkCSZKQmJho6lKKNR8fHyxbtszUZRARvRJmheExJ4hIzpgThsecMCw2bUqIsLAwSJKE+fPnq6zftWsXJEkyUVXFS2xsLCwtLdGpUydTl0JE9EqYFYbFnCAiuWNOGBZzggrDpk0JYmtri88//xxPnz7V2zGzsrL0diy5i4iIwNixY3Hs2DHcvXvX1OUQEb0SZoXhMCeIqDhgThgOc4IKw6ZNCRIUFAQPDw/MmzevyH22b9+OWrVqQaFQwMfHB4sXL1bZ7uPjg/DwcISEhMDZ2RnvvfceoqKi4Orqir1798LPzw/29vbo1asX0tPTsW7dOvj4+MDNzQ3jxo1DTk6O8ljR0dFo1KgRnJyc4OHhgQEDBuDBgwcGe/+GlJqaii1btuD9999Hp06dEBUVpdyWPyTz4MGDaNSoEezt7dG8eXPEx8erHGP16tXw9fWFjY0N/Pz8EB0drbJdkiR8+eWX6Ny5M+zt7VGzZk3ExsbiypUrCAwMhIODA5o3b46rV68qn3P16lV069YN7u7ucHR0ROPGjXHgwIEi38eQIUPQuXNnlXXZ2dkoV64cIiIiXuMTIiK5YFYYBnOCiIoL5oRhMCeoSIJKhNDQUNGtWzexY8cOYWtrK27fvi2EEGLnzp0i/6/BmTNnhIWFhZg7d66Ij48XkZGRws7OTkRGRiqP4+3tLZydncWiRYvElStXxJUrV0RkZKSwtrYWbdu2Fb/99ps4evSoKF26tGjXrp3o06ePuHDhgtizZ4+wsbERmzdvVh4rIiJCfP/99+Lq1asiNjZWNGvWTHTo0EG5/fDhwwKAePr0qVE+o9cREREhGjVqJIQQYs+ePcLX11fk5uYKIf59H02aNBFHjhwRFy5cEC1bthTNmzdXPn/Hjh3C2tparFq1SsTHx4vFixcLS0tLcejQIeU+AESFChXEli1bRHx8vOjevbvw8fERbdq0Efv27RN//vmnaNq0qWjfvr3yOXFxcWLNmjXi/Pnz4q+//hL/+c9/hK2trbh586ZyH29vb7F06VIhhBAnT54UlpaW4u7duyq1OTg4iJSUFIN8dkRkPpgVhsOcIKLigDlhOMwJKgqbNiVE/glWCCGaNm0qhgwZIoRQPcEOGDBAtG3bVuV5U6dOFf7+/srH3t7eonv37ir7REZGCgDiypUrynUjRowQ9vb2Kv8wg4ODxYgRI4qs8fTp0wKA8jlyOcEKIUTz5s3FsmXLhBBCZGdnizJlyojDhw8LIf59HwcOHFDu/9133wkA4tmzZ8rnDx8+XOWYvXv3Fh07dlQ+BiD+85//KB/HxsYKACIiIkK5btOmTcLW1lZtrbVq1RIrV65UPn7xJCuEEP7+/uLzzz9XPu7SpYsICwvT9BEQUTHArDAc5gQRFQfMCcNhTlBReHlUCfT5559j3bp1uHjxosr6ixcvokWLFirrWrRogcuXL6sMQWzUqFGBY9rb28PX11f52N3dHT4+PnB0dFRZ9+JQxbNnz6JLly6oVKkSnJycEBAQAAC4devW671BI4uPj8evv/6K/v37AwCsrKzQt2/fAsP/6tatq/z/8uXLA4Dy8yjqs3/5z+jFY7i7uwMA6tSpo7IuIyMDycnJAPKGWU6ZMgU1a9aEq6srHB0dcfHiRbWf8bBhwxAZGQkASEhIwA8//IAhQ4Zo8UkQUXHCrNAf5gQRFUfMCf1hTpA6bNqUQK1atUJwcDCmT5/+Ss93cHAosM7a2lrlsSRJha7Lzc0FAKSlpSE4OBjOzs6IiYnB6dOnsXPnTgDym4gsIiICz58/h6enJ6ysrGBlZYXVq1dj+/btSEpKUu734ueRP7t+/uehrcKOoe64U6ZMwc6dO/HZZ5/h+PHjiIuLQ506ddR+xiEhIbh27RpiY2OxYcMGVK5cGS1bttSpTiKSP2aF/jAniKg4Yk7oD3OC1LEydQFkGvPnz0f9+vXh5+enXFezZk2cPHlSZb+
TJ0+ievXqsLS01OvrX7p0CY8fP8b8+fPh5eUFADhz5oxeX8MYnj9/jvXr12Px4sVo166dyrbu3btj06ZNqFGjhsbj5H/2oaGhynUnT56Ev7//a9V38uRJhIWF4Z133gGQ1ym/ceOG2ueULl0a3bt3R2RkJGJjYzF48ODXqoGI5ItZ8fqYE0RUnDEnXh9zgjRh06aEqlOnDgYOHIgVK1Yo102ePBmNGzdGeHg4+vbti9jYWPz3v//FF198offXr1SpEmxsbLBy5UqMHDkSf/zxB8LDw/X+Ooa2d+9ePH36FEOHDoWLi4vKtp49eyIiIgILFy7UeJypU6eiT58+aNCgAYKCgrBnzx7s2LFD7czs2qhWrRp27NiBLl26QJIkzJgxQ6tu/LBhw9C5c2fk5OSonPiJqGRhVrw+5gQRFWfMidfHnCBNeHlUCTZ37lyVf3BvvPEGvvnmG2zevBm1a9fGzJkzMXfuXISFhen9tcuWLYuoqChs3boV/v7+mD9/PhYtWqT31zG0iIgIBAUFFTjBAnkn2TNnzuDcuXMaj9O9e3csX74cixYtQq1atfDll18iMjISgYGBr1XfkiVL4ObmhubNm6NLly4IDg7GG2+8ofF5QUFBKF++PIKDg+Hp6flaNRCRvDErXg9zgoiKO+bE62FOkCaSEEKYuggiMi+pqamoUKECIiMj0aNHD1OXQ0REZoY5QURE6jAn9IeXRxGRUm5uLh49eoTFixfD1dUVXbt2NXVJRERkRpgTRESkDnNC/9i0ISKlW7duoXLlyqhYsSKioqJgZcVTBBER/Ys5QURE6jAn9I+XRxERERERERERmSFORExEREREREREZIbYtCEiIiIiIiIiMkNs2hARERERERERmSE2bYiIiIiIiIiIzBCbNmTWwsLC0L17d+XjwMBATJgwweh1HDlyBJIkITExsch9JEnCrl27tD7m7NmzUb9+/deq68aNG5AkCXFxca91HCIiOWNWqMesIKKSjjmhHnPCvLFpQzoLCwuDJEmQJAk2NjaoWrUq5s6di+fPnxv8tXfs2IHw8HCt9tXmpEhERIbBrCAiInWYE0Ta4U3T6ZW0b98ekZGRyMzMxPfff4/Ro0fD2toa06dPL7BvVlYWbGxs9PK6pUqV0stxiIjI8JgVRESkDnOCSDOOtKFXolAo4OHhAW9vb7z//vsICgrC7t27Afw7/PDTTz+Fp6cn/Pz8AAC3b99Gnz594OrqilKlSqFbt264ceOG8pg5OTmYNGkSXF1dUbp0aUybNg1CCJXXfXkoY2ZmJj744AN4eXlBoVCgatWqiIiIwI0bN9C6dWsAgJubGyRJQlhYGAAgNzcX8+bNQ+XKlWFnZ4d69eph27ZtKq/z/fffo3r16rCzs0Pr1q1V6tTWBx98gOrVq8Pe3h5VqlTBjBkzkJ2dXWC/L7/8El5eXrC3t0efPn2QlJSksv3//u//ULNmTdja2qJGjRr44osvdK6FiMgUmBWaMSuIqCRjTmjGnCA2bUgv7OzskJWVpXx88OBBxMfHY//+/di7dy+ys7MRHBwMJycnHD9+HCdPnoSjoyPat2+vfN7ixYsRFRWFr7/+GidOnMCTJ0+wc+dOta8bEhKCTZs2YcWKFbh48SK+/PJLODo6wsvLC9u3bwcAxMfH4969e1i+fDkAYN68eVi/fj3WrFmDCxcuYOLEiRg0aBCOHj0KIC8IevTogS5duiAuLg7Dhg3Dhx9+qPNn4uTkhKioKPz5559Yvnw5vvrqKyxdulRlnytXruCbb77Bnj17sG/fPvzvf//DqFGjlNtjYmIwc+ZMfPrpp7h48SI+++wzzJgxA+vWrdO5HiIiU2NWFMSsICL6F3OiIOYEQRDpKDQ0VHTr1k0IIURubq7Yv3+/UCgUYsqUKcrt7u7uIjMzU/mc6Oho4efnJ3Jzc5XrMjMzhZ2dnfjxxx+FEEKUL19eLFiwQLk9OztbVKxYUflaQggREBAgxo8fL4QQIj4
+XgAQ+/fvL7TOw4cPCwDi6dOnynUZGRnC3t5enDp1SmXfoUOHiv79+wshhJg+fbrw9/dX2f7BBx8UONbLAIidO3cWuX3hwoWiYcOGysezZs0SlpaW4s6dO8p1P/zwg7CwsBD37t0TQgjh6+srNm7cqHKc8PBw0axZMyGEENevXxcAxP/+978iX5eIyBSYFYVjVhAR5WFOFI45QS/jnDb0Svbu3QtHR0dkZ2cjNzcXAwYMwOzZs5Xb69Spo3LN6e+//44rV67AyclJ5TgZGRm4evUqkpKScO/ePTRp0kS5zcrKCo0aNSownDFfXFwcLC0tERAQoHXdV65cQXp6Otq2bauyPisrCw0aNAAAXLx4UaUOAGjWrJnWr5Fvy5YtWLFiBa5evYrU1FQ8f/4czs7OKvtUqlQJFSpUUHmd3NxcxMfHw8nJCVevXsXQoUMxfPhw5T7Pnz+Hi4uLzvUQERkbs0IzZgURlWTMCc2YE8SmDb2S1q1bY/Xq1bCxsYGnpyesrFT/Kjk4OKg8Tk1NRcOGDRETE1PgWGXLln2lGuzs7HR+TmpqKgDgu+++UzmxAXnX1OpLbGwsBg4ciDlz5iA4OBguLi7YvHkzFi9erHOtX331VYETvqWlpd5qJSIyFGaFeswKIirpmBPqMScIYNOGXpGDgwOqVq2q9f5vvPEGtmzZgnLlyhXoDOcrX748fvnlF7Rq1QpAXvf37NmzeOONNwrdv06dOsjNzcXRo0cRFBRUYHt+Vz4nJ0e5zt/fHwqFArdu3Sqym16zZk3lBGj5fv75Z81v8gWnTp2Ct7c3Pv74Y+W6mzdvFtjv1q1buHv3Ljw9PZWvY2FhAT8/P7i7u8PT0xPXrl3DwIEDdXp9IiJzwKxQj1lBRCUdc0I95gQBnIiYjGTgwIEoU6YMunXrhuPHj+P69es4cuQIxo0bhzt37gAAxo8fj/nz52PXrl24dOkSRo0ahcTExCKP6ePjg9DQUAwZMgS7du1SHvObb74BAHh7e0OSJOzduxcPHz5EamoqnJycMGXKFEycOBHr1q3D1atX8dtvv2HlypXKibhGjhyJy5cvY+rUqYiPj8fGjRsRFRWl0/utVq0abt26hc2bN+Pq1atYsWJFoROg2draIjQ0FL///juOHz+OcePGoU+fPvDw8AAAzJkzB/PmzcOKFSvw119/4fz584iMjMSSJUt0qoeISA6YFcwKIiJ1mBPMiRLJxHPqkAy9OGmYLtvv3bsnQkJCRJkyZYRCoRBVqlQRw4cPF0lJSUKIvEnCxo8fL5ydnYWrq6uYNGmSCAkJKXLSMCGEePbsmZg4caIoX768sLGxEVWrVhVff/21cvvcuXOFh4eHkCRJhIaGCiHyJjpbtmyZ8PPzE9bW1qJs2bIiODhYHD16VPm8PXv2iKpVqwqFQiFatmwpvv76a50nDZs6daooXbq0cHR0FH379hVLly4VLi4uyu2zZs0S9erVE1988YXw9PQUtra2olevXuLJkycqx42JiRH169cXNjY2ws3NTbRq1Urs2LFDCMFJw4jIfDErCsesICLKw5woHHOCXiYJUcSMTEREREREREREZDK8PIqIiIiIiIiIyAyxaUNEREREREREZIbYtCEiIiIiIiIiMkNs2hARERERERERmSE2bYiIiIiIiIiIzBCbNkREREREREREZohNGyIiIiIiIiIiM8SmDRERERERERGRGWLThoiIiIiIiIjIDLFpQ0RERERERERkhti0ISIiIiIiIiIyQ2zaEBERERERERGZof8HjWZepymyOYwAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for manufacturer in results.keys():\n", + " print(f'Manufacturer m{manufacturer}')\n", + "\n", + " fig, ax = plt.subplots(1, 3, figsize=(12, 3))\n", + " \n", + " for model_name, ax_ in zip(results[manufacturer].keys(), ax):\n", + " print(f'Model {model_name}:')\n", + "\n", + " reliability = fbeta_score(\n", + " true_anomalies[manufacturer], predicted_anomalies[manufacturer][model_name],\n", + " beta=0.5\n", + " )\n", + " precision = precision_score(\n", + " true_anomalies[manufacturer], predicted_anomalies[manufacturer][model_name]\n", + " )\n", + " recall = recall_score(\n", + " true_anomalies[manufacturer], predicted_anomalies[manufacturer][model_name]\n", + " )\n", + "\n", + " # Average earliness score over reports\n", + " earliness_scores = []\n", + " for event_id, predictions in results[manufacturer][model_name].items():\n", + " if dataset.events[manufacturer].loc[event_id]['Event type'] == 'anomaly':\n", + " criticality = predictions.criticality()\n", + " detection_time, earliness = calculate_earliness(\n", + " criticality_threshold=criticality_thresholds[manufacturer][model_name],\n", + " report_ts=dataset.events[manufacturer].loc[event_id]['Report date'],\n", + " criticality=criticality\n", + " )\n", + " earliness_scores.append(earliness)\n", + " avg_earliness = sum(earliness_scores) / len(earliness_scores)\n", + "\n", + " print(f'Reliability: {reliability:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, Earliness: {avg_earliness:.2f}')\n", + "\n", + " disp = ConfusionMatrixDisplay.from_predictions(\n", + " y_true=true_anomalies[manufacturer], y_pred=predicted_anomalies[manufacturer][model_name],\n", + " cmap='Blues',\n", + " labels=[False, True],\n", + " display_labels=['Normal', 'Anomaly'],\n", + " ax=ax_\n", + " )\n", + " ax_.set_title(model_name)\n", + " \n", + " fig.suptitle(f'Confusion matrix - M{manufacturer}')\n", + " plt.tight_layout()" + ] + }, + { + 
"cell_type": "markdown", + "id": "570ebfedc795dd61", + "metadata": {}, + "source": [ + "### Visualise results\n", + "\n", + "Here we show how to visualise results for a specific event and model.\n", + "\n", + "For the chosen event (event 49 of manufacturer 1), the customer called because of a lack of hot water. It turned out to be an operating error: the DHW controller was set to night mode, leading to a very low setpoint for the DHW storage temperatures." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a761ecfa7874e175", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:47.618674Z", + "start_time": "2026-01-13T10:51:47.602650600Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "substation ID 18\n", + "Report date 2019-05-04 07:19:00\n", + "Problem EN no DHW\n", + "Event description EN The DHW controller was set to night mode. Rese...\n", + "Possible anomaly start 2019-05-03 11:00:00\n", + "Possible anomaly end 2019-05-05 07:19:00\n", + "Training start 2016-12-16 10:00:00\n", + "Training end 2019-04-20 07:19:00\n", + "efd_possible True\n", + "Fault label Control unit: Incorrect parameterisation\n", + "Monitoring potential 4\n", + "Event type anomaly\n", + "Event end 2019-05-04 07:19:00\n", + "Event start NaT\n", + "Name: 49, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Choose event and model to plot results for\n", + "manufacturer = 1\n", + "model_name = 'Conditional AE'\n", + "event_id = 49\n", + "report_date = dataset.events[manufacturer].loc[event_id]['Report date']\n", + "\n", + "# Event details\n", + "dataset.events[manufacturer].loc[event_id]" + ] + }, + { + "cell_type": "markdown", + "id": "e1d0ace838ed0d5f", + "metadata": {}, + "source": [ + "#### Criticality\n", + "\n", + "Plot the criticality and the incident report timestamp."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5236c23c789fdd72", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:47.805480100Z", + "start_time": "2026-01-13T10:51:47.662858400Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, '')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "predictions = results[manufacturer][model_name][event_id]\n", + "\n", + "fig, ax = plt.subplots(1, 1, figsize=(8,3))\n", + "crit = predictions.criticality()\n", + "crit.plot(ax=ax, label=model_name)\n", + "\n", + "if pd.notna(report_date):\n", + " ax.axvline(report_date, label='incident report', c='r', linestyle='-')\n", + "\n", + "ax.legend(loc='upper left')\n", + "ax.set_ylabel('criticality')\n", + "ax.set_xlabel('')" + ] + }, + { + "cell_type": "markdown", + "id": "814da89e48b3860c", + "metadata": {}, + "source": [ + "#### ARCANA results\n", + "\n", + "Here we first determine anomalous events detected by the model and then calculate the ARCANA feature importances for the longest detected anomaly event.\n", + "Afterward, we plot the top 3 features with the highest ARCANA feature importances." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b8a234e31eeecc66", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:58.483098200Z", + "start_time": "2026-01-13T10:51:47.845224700Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "s_dhw_supply_temperature_setpoint 0.268510\n", + "s_hc1_supply_temperature_setpoint 0.215204\n", + "p_hc1_return_temperature 0.118098\n", + "p_net_supply_temperature 0.110434\n", + "outdoor_temperature 0.082548\n", + "s_dhw_lower_storage_temperature 0.058807\n", + "s_hc1_supply_temperature 0.058639\n", + "s_dhw_supply_temperature 0.029270\n", + "p_net_meter_flow 0.024234\n", + "p_net_return_temperature 0.023280\n", + "s_dhw_upper_storage_temperature 0.006409\n", + "p_net_meter_heat_power 0.004566\n", + "dtype: float32" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data = dataset.get_event_data(manufacturer, event_id)['test_data']\n", + "\n", + "# find longest detected anomaly event (continuous run of predicted anomalous timestamps)\n", + 
"anomaly_events, _ = create_events(\n", + " test_data,\n", + " predictions.predicted_anomalies,\n", + " min_event_length=12\n", + ")\n", + "longest_anomaly_event = anomaly_events[anomaly_events['duration'] == anomaly_events['duration'].max()].iloc[0]\n", + "\n", + "# Calculate ARCANA feature importances\n", + "top_features = get_arcana_importances(manufacturer, event_id, model_name, test_data.loc[longest_anomaly_event['start']:report_date])\n", + "\n", + "top_features" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d0cbfbada56ec052", + "metadata": { + "ExecuteTime": { + "end_time": "2026-01-13T10:51:59.243154700Z", + "start_time": "2026-01-13T10:51:58.602602600Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Plot the reconstruction of the top 3\n", + "fig, ax = plot_reconstruction(test_data, predictions.reconstruction, top_features.index[:3].to_list())\n", + "\n", + "for ax_ in ax:\n", + " ax_.axvline(report_date, label='incident report', color='r', linestyle='-')\n", + "\n", + "ax[0].legend(loc='upper left')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "efd", + "language": "python", + "name": "efd" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/PreDist/configs/m1_cond_ae.yaml b/notebooks/PreDist/configs/m1_cond_ae.yaml new file mode 100644 index 0000000..d314d40 --- /dev/null +++ b/notebooks/PreDist/configs/m1_cond_ae.yaml @@ -0,0 +1,56 @@ +train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + params: + imputer_strategy: mean + include_duplicate_value_to_nan: false + max_col_zero_frac: 0.5 + max_nan_frac_per_col: 0.8 + min_unique_value_count: 3 + scale: standardize + features_to_exclude: + - p_net_meter_energy # we use power and flow + - p_net_meter_volume + + autoencoder: + name: conditional + verbose: 0 + params: + act: prelu + last_act: linear + batch_size: 256 + code_size: 5 # set by bottleneck ratio (0.65) + early_stopping: true + min_delta: 0.0001 + patience: 5 + epochs: 100 + layers: [64, 32] + learning_rate: 0.0004486710512068144 + noise: 0.05 + loss_name: mean_squared_error + conditional_features: ['hour_of_day_sine', + 'hour_of_day_cosine', + 'day_of_week_sine', + 'day_of_week_cosine'] + + anomaly_score: + name: mahalanobis + + data_splitter: + type: sklearn + validation_split: 0.2 + shuffle: true + + threshold_selector: +
fit_on_val: false + name: quantile + params: + quantile: 0.99 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/notebooks/PreDist/configs/m1_default_ae.yaml b/notebooks/PreDist/configs/m1_default_ae.yaml new file mode 100644 index 0000000..bd8f9f8 --- /dev/null +++ b/notebooks/PreDist/configs/m1_default_ae.yaml @@ -0,0 +1,52 @@ +train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + params: + imputer_strategy: mean + include_duplicate_value_to_nan: false + max_col_zero_frac: 0.5 + max_nan_frac_per_col: 0.8 + min_unique_value_count: 3 + scale: standardize + features_to_exclude: + - p_net_meter_energy # we use power and flow + - p_net_meter_volume + + autoencoder: + name: default + verbose: 0 + params: + act: prelu + last_act: linear + batch_size: 256 + code_size: 5 # set by bottleneck ratio (0.65) + early_stopping: true + min_delta: 0.0001 + patience: 5 + epochs: 100 + layers: [64, 32] + learning_rate: 0.0004486710512068144 + noise: 0.05 + loss_name: mean_squared_error + + anomaly_score: + name: mahalanobis + + data_splitter: + type: sklearn + validation_split: 0.2 + shuffle: true + + threshold_selector: + fit_on_val: false + name: quantile + params: + quantile: 0.99 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/notebooks/PreDist/configs/m1_doy_ae.yaml b/notebooks/PreDist/configs/m1_doy_ae.yaml new file mode 100644 index 0000000..e2fc23d --- /dev/null +++ b/notebooks/PreDist/configs/m1_doy_ae.yaml @@ -0,0 +1,58 @@ +train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + params: + imputer_strategy: mean + include_duplicate_value_to_nan: false + max_col_zero_frac: 0.5 + max_nan_frac_per_col: 0.8 + min_unique_value_count: 3 + scale: standardize + features_to_exclude: + - p_net_meter_energy # we use power and flow + - p_net_meter_volume + + autoencoder: + name: conditional + verbose: 0 + params: +
act: prelu + last_act: linear + batch_size: 256 + code_size: 5 # set by bottleneck ratio (0.65) + early_stopping: true + min_delta: 0.0001 + patience: 5 + epochs: 100 + layers: [64, 32] + learning_rate: 0.0004486710512068144 + noise: 0.05 + loss_name: mean_squared_error + conditional_features: ['hour_of_day_sine', + 'hour_of_day_cosine', + 'day_of_week_sine', + 'day_of_week_cosine', + 'day_of_year_sine', + 'day_of_year_cosine'] + + anomaly_score: + name: mahalanobis + + data_splitter: + type: sklearn + validation_split: 0.2 + shuffle: true + + threshold_selector: + fit_on_val: false + name: quantile + params: + quantile: 0.99 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/notebooks/PreDist/configs/m2_cond_ae.yaml b/notebooks/PreDist/configs/m2_cond_ae.yaml new file mode 100644 index 0000000..a171ac4 --- /dev/null +++ b/notebooks/PreDist/configs/m2_cond_ae.yaml @@ -0,0 +1,65 @@ +train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + params: + imputer_strategy: mean + include_duplicate_value_to_nan: false + max_col_zero_frac: 0.5 + max_nan_frac_per_col: 0.8 + min_unique_value_count: 2 + scale: standardize + features_to_exclude: + - p_net_meter_energy # we use power and flow + - p_net_meter_volume + - s_dhw_control_unit_mode + - s_hc1.1_control_unit_mode + - s_hc1.2_control_unit_mode + - s_hc1.3_control_unit_mode + - s_hc1_control_unit_mode + - s_hc1.1_room_temperature_setpoint + - s_hc1.2_room_temperature_setpoint + - s_hc1.3_room_temperature_setpoint + - s_hc1_room_temperature_setpoint + + autoencoder: + name: conditional + verbose: 0 + params: + act: prelu + last_act: linear + batch_size: 256 + code_size: 5 # set by bottleneck ratio (0.25) + early_stopping: true + min_delta: 0.0001 + patience: 5 + epochs: 100 + layers: [64, 32] + learning_rate: 0.0005289609464733553 + noise: 0.15 + loss_name: mean_squared_error + conditional_features: [ 'hour_of_day_sine', +
'hour_of_day_cosine', + 'day_of_week_sine', + 'day_of_week_cosine' ] + + anomaly_score: + name: rmse + + data_splitter: + type: sklearn + validation_split: 0.2 + shuffle: true + + threshold_selector: + fit_on_val: false + name: quantile + params: + quantile: 0.99 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/notebooks/PreDist/configs/m2_default_ae.yaml b/notebooks/PreDist/configs/m2_default_ae.yaml new file mode 100644 index 0000000..c5a188e --- /dev/null +++ b/notebooks/PreDist/configs/m2_default_ae.yaml @@ -0,0 +1,61 @@ +train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + params: + imputer_strategy: mean + include_duplicate_value_to_nan: false + max_col_zero_frac: 0.5 + max_nan_frac_per_col: 0.8 + min_unique_value_count: 2 + scale: standardize + features_to_exclude: + - p_net_meter_energy # we use power and flow + - p_net_meter_volume + - s_dhw_control_unit_mode + - s_hc1.1_control_unit_mode + - s_hc1.2_control_unit_mode + - s_hc1.3_control_unit_mode + - s_hc1_control_unit_mode + - s_hc1.1_room_temperature_setpoint + - s_hc1.2_room_temperature_setpoint + - s_hc1.3_room_temperature_setpoint + - s_hc1_room_temperature_setpoint + + autoencoder: + name: default + verbose: 0 + params: + act: prelu + last_act: linear + batch_size: 256 + code_size: 5 # set by bottleneck ratio (0.25) + early_stopping: true + min_delta: 0.0001 + patience: 5 + epochs: 100 + layers: [64, 32] + learning_rate: 0.0005289609464733553 + noise: 0.15 + loss_name: mean_squared_error + + anomaly_score: + name: rmse + + data_splitter: + type: sklearn + validation_split: 0.2 + shuffle: true + + threshold_selector: + fit_on_val: false + name: quantile + params: + quantile: 0.99 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/notebooks/PreDist/configs/m2_doy_ae.yaml b/notebooks/PreDist/configs/m2_doy_ae.yaml new file mode 100644 index 0000000..5acfac6 --- /dev/null +++
b/notebooks/PreDist/configs/m2_doy_ae.yaml @@ -0,0 +1,67 @@ +train: + data_clipping: + lower_percentile: 0.001 + upper_percentile: 0.999 + + data_preprocessor: + params: + imputer_strategy: mean + include_duplicate_value_to_nan: false + max_col_zero_frac: 0.5 + max_nan_frac_per_col: 0.8 + min_unique_value_count: 2 + scale: standardize + features_to_exclude: + - p_net_meter_energy # we use power and flow + - p_net_meter_volume + - s_dhw_control_unit_mode + - s_hc1.1_control_unit_mode + - s_hc1.2_control_unit_mode + - s_hc1.3_control_unit_mode + - s_hc1_control_unit_mode + - s_hc1.1_room_temperature_setpoint + - s_hc1.2_room_temperature_setpoint + - s_hc1.3_room_temperature_setpoint + - s_hc1_room_temperature_setpoint + + autoencoder: + name: conditional + verbose: 0 + params: + act: prelu + last_act: linear + batch_size: 256 + code_size: 5 # set by bottleneck ratio (0.25) + early_stopping: true + min_delta: 0.0001 + patience: 5 + epochs: 100 + layers: [64, 32] + learning_rate: 0.0005289609464733553 + noise: 0.15 + loss_name: mean_squared_error + conditional_features: [ 'hour_of_day_sine', + 'hour_of_day_cosine', + 'day_of_week_sine', + 'day_of_week_cosine', + 'day_of_year_sine', + 'day_of_year_cosine' ] + + anomaly_score: + name: rmse + + data_splitter: + type: sklearn + validation_split: 0.2 + shuffle: true + + threshold_selector: + fit_on_val: false + name: quantile + params: + quantile: 0.99 + +root_cause_analysis: + alpha: 0.8 + init_x_bias: recon + num_iter: 1000 diff --git a/notebooks/PreDist/predist_utils.py b/notebooks/PreDist/predist_utils.py new file mode 100644 index 0000000..484e101 --- /dev/null +++ b/notebooks/PreDist/predist_utils.py @@ -0,0 +1,246 @@ +import os +from typing import List, Tuple +from pathlib import Path +from copy import deepcopy +import gc + +# suppress tensorflow warnings +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + +import pandas as pd +import numpy as np +import tensorflow as tf +from sklearn.model_selection import StratifiedKFold
+from sklearn.metrics import fbeta_score + +from energy_fault_detector import FaultDetector, Config +from energy_fault_detector.core import FaultDetectionResult +from energy_fault_detector.evaluation import PreDistDataset +from energy_fault_detector.root_cause_analysis.arcana_utils import calculate_mean_arcana_importances + + +def train_or_get_model(event_id: int, dataset: PreDistDataset, manufacturer: int, model_name: str, conf: Config, + bottleneck_ratio: float, load_from_file: bool, time_features: List[str] | None + ) -> Tuple[int, FaultDetectionResult]: + """Processes a single event: loads data, trains/loads model, and predicts. + + Args: + event_id (int): ID of the event to process. + dataset (PreDistDataset): Dataset containing the event data. + manufacturer (int): Manufacturer ID for the event. + model_name (str): Name of the model to use for training. + conf (Config): Base configuration for training. + bottleneck_ratio (float): Ratio to determine the bottleneck size for the autoencoder. + load_from_file (bool): Whether to load the model from file if available. Otherwise, train and save a new model. + time_features (List[str] | None): List of time features to use for conditional autoencoders. + + Returns: + Tuple[int, FaultDetectionResult]: A tuple containing the event ID and the fault detection result. 
+ """ + + # Local copy of time features and configuration to avoid mutation issues in parallel + ts_features = time_features.copy() if time_features else None + local_conf = deepcopy(conf) + + # Get specific event data + data = dataset.get_event_data(manufacturer, event_id) + train_data = data['train_data'] + test_data = data['test_data'] + + # Create a new model or load from file + model_path = Path(f'./models/m{manufacturer}/event_{event_id}/{model_name}') + + if model_path.exists() and load_from_file: + model = FaultDetector() + model.load_models(model_path) + if (model_path / 'ts_features.txt').exists(): + with open(model_path / 'ts_features.txt', 'r') as f: + ts_features = f.read().splitlines() + else: + # Add the code size to the AE configuration, based on the bottleneck ratio + # code_size is part of the model configuration, so we overwrite the parameter of the underlying dictionary. + bottleneck = calculate_bottleneck(train_data, local_conf, bottleneck_ratio) + local_conf['train']['autoencoder']['params']['code_size'] = bottleneck + + # For the conditional autoencoders, add time features + if ts_features: + train_data = add_cyclic_time_features(train_data, ts_features) + + model = FaultDetector(local_conf, model_directory=model_path) + model_data = model.fit(train_data, data['train_normal_flag'], save_models=True, overwrite_models=True) + if ts_features: + # For the conditional autoencoders, save the time features as well + with open(Path(model_data.model_path) / 'ts_features.txt', 'w') as f: + f.write('\n'.join(ts_features)) + + # Predict + if ts_features: + # For the conditional autoencoders, add time features + test_data = add_cyclic_time_features(test_data, ts_features) + predictions = model.predict(test_data) + + # memory cleanup + del model + tf.keras.backend.clear_session() + gc.collect() + + return event_id, predictions + + +def add_cyclic_time_features(df: pd.DataFrame, features: List[str]) -> pd.DataFrame: + """Calculates cyclical time features 
from the timestamp index.""" + df = df.copy() + if 'hour_of_day' in features: + phase = df.index.hour / 24 + df['hour_of_day_sine'] = np.sin(2 * np.pi * phase) + df['hour_of_day_cosine'] = np.cos(2 * np.pi * phase) + if 'day_of_week' in features: + phase = df.index.day_of_week / 7 + df['day_of_week_sine'] = np.sin(2 * np.pi * phase) + df['day_of_week_cosine'] = np.cos(2 * np.pi * phase) + if 'day_of_year' in features: + phase = df.index.dayofyear / (365 + df.index.is_leap_year) + df['day_of_year_sine'] = np.sin(2 * np.pi * phase) + df['day_of_year_cosine'] = np.cos(2 * np.pi * phase) + return df + + +def calculate_bottleneck(df: pd.DataFrame, config: Config, ratio: float) -> int: + """Calculates code_size (the bottleneck of the autoencoder) relative to input dimensions, accounting for excluded + features and conditional features. + + Args: + df (pd.DataFrame): Input dataframe to determine the number of input features of the AE. + config (Config): Configuration for the AE. + ratio (float, optional): Ratio between input and bottleneck dimensions. + + Returns: + int: The calculated bottleneck size for the autoencoder. 
+ """ + + # Get the conditional features from the config + ae_params = config['train']['autoencoder']['params'] + cond_features = ae_params.get('conditional_features', []) + + # Exclude conditions (not compressed) + input_dim = len(df.columns) - len([c for c in cond_features if c in df.columns]) + + # Check for feature exclusions in config + excluded = [] + dp_config = config['train'].get('data_preprocessor', {}) + if dp_config.get('params'): + # params-based data prep config + excluded = dp_config.get('params').get('features_to_exclude', []) + else: + # steps-based data prep config + steps = config['train']['data_preprocessor'].get('steps', []) + for step in steps: + if step['name'] == 'column_selector': + excluded = step['params'].get('features_to_exclude', []) + break + + # Remove the excluded features from the input dimension + input_dim -= len([e for e in excluded if e in df.columns]) + + return max(1, round(input_dim * ratio)) + + +def find_optimal_threshold(true_anomalies: pd.Series, + max_criticalities: pd.Series, + thresholds: np.ndarray = np.arange(1, 100), + k: int = 5) -> Tuple[int, float]: + """Finds the threshold maximizing reliability (Event-wise F0.5) using CV. + + Args: + true_anomalies (pd.Series): Series indicating whether each event is an anomaly. 1 = anomaly, 0 = normal. + max_criticalities (pd.Series): Series containing the maximum criticality of each event. + thresholds (np.ndarray, optional): Array of thresholds to evaluate. Defaults to np.arange(1, 100). + k (int, optional): Number of folds for CV. Defaults to 5. + + Returns: + Optimal criticality threshold and avg validation reliability score. 
+ """ + y_true = true_anomalies.values + max_criticalities = max_criticalities.values + skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42) + + chosen_thresholds = [] + val_scores = [] + + for train_idx, val_idx in skf.split(y_true, y_true): + y_train, max_crit_train = y_true[train_idx], max_criticalities[train_idx] + y_val, max_crit_val = y_true[val_idx], max_criticalities[val_idx] + + # Best threshold on training data + best_t = None + best_train_f05 = -1.0 + for t in thresholds: + y_pred_train = (max_crit_train >= t).astype(int) + f05_train = fbeta_score(y_train, y_pred_train, beta=0.5, zero_division=0) + if f05_train > best_train_f05: + best_train_f05 = f05_train + best_t = t + + chosen_thresholds.append(best_t) + + # Evaluate on validation data + y_pred_val = (max_crit_val >= best_t).astype(int) + f05_val = fbeta_score(y_val, y_pred_val, beta=0.5, zero_division=0) + val_scores.append(f05_val) + + robust_t = int(np.median(chosen_thresholds)) + mean_val_f05 = float(np.mean(val_scores)) + + return robust_t, mean_val_f05 + + +def get_arcana_importances(manufacturer: int, event_id: int, config_name: str, data: pd.DataFrame) -> pd.Series: + """Get ARCANA importances for a given event.""" + + model_path = Path(f'models/m{manufacturer}/event_{event_id}/{config_name}') + model = FaultDetector() + model.load_models(model_path) + + # Load the time features and add them to the data if available (for the conditional autoencoders) + if (model_path / 'ts_features.txt').exists(): + with open(model_path / 'ts_features.txt', 'r') as f: + ts_features = f.read().splitlines() + data = add_cyclic_time_features(data, ts_features) + + bias, _, _ = model.run_root_cause_analysis(data, track_losses=False, track_bias=False) + return calculate_mean_arcana_importances(bias).sort_values(ascending=False) + + +def calculate_earliness(criticality_threshold: int, report_ts: int | pd.Timestamp, criticality: pd.Series, + min_detection_time: pd.Timedelta = pd.Timedelta(hours=24) + ) 
-> Tuple[int | pd.Timestamp | None, float]: + """Calculate the detection time and earliness score: + + E = max(0, min(1, (report_ts - detection_timestamp) / min_detection_time)) + + The earliness score is 1 if the fault is detected at least min_detection_time before the report and 0 if the + fault is detected after the report or not detected at all. Between min_detection_time before the report and the + report timestamp, the earliness score linearly decreases to 0. + + Args: + criticality_threshold (int): Threshold for determining whether the event is detected. + report_ts (int | pd.Timestamp): Timestamp of the report. + criticality (pd.Series): Series containing the criticality pd.Series of each event. + min_detection_time (pd.Timedelta, optional): Minimum detection time. Defaults to pd.Timedelta(hours=24). + + Returns: + A tuple containing the detection time and earliness score. If not detected, the detection time is None and + the earliness score is 0. + """ + + crit_threshold_reached = criticality[criticality >= criticality_threshold] + if crit_threshold_reached.empty: + detection_time = None + earliness = 0 + return detection_time, earliness + + detection_timestamp = crit_threshold_reached.sort_index(ascending=True).index[0] + detection_time = report_ts - detection_timestamp + # max(earliness, 0) to handle detection after the fault is known + earliness = max(min(1, detection_time / min_detection_time), 0) + return detection_time, earliness diff --git a/tests/config/test_config.py b/tests/config/test_config.py index ba39c46..a1a147e 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -3,7 +3,6 @@ import shutil import unittest -import numpy as np from energy_fault_detector.config import Config, InvalidConfigFile PROJECT_ROOT = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '..') @@ -22,10 +21,7 @@ def test_init(self): self.assertDictEqual(conf.config_dict['train'], { 'anomaly_score': {'name': 'mahalanobis', 
'params': {'pca': True, 'pca_min_var': 0.85}}, - 'data_preprocessor': {'params': {'max_nan_frac_per_col': 0.05, - 'imputer_strategy': 'mean', - 'features_to_exclude': ['feature1', 'feature2'], - 'include_duplicate_value_to_nan': False}}, + 'data_preprocessor': None, # unspecified, default pipeline 'autoencoder': {'name': 'MultilayerAutoencoder', 'verbose': 0, 'params': {'layers': [300], @@ -39,7 +35,7 @@ def test_init(self): 'threshold_selector': {'name': 'FDR', 'params': {'target_false_discovery_rate': 0.8}, 'fit_on_val': False}, - 'data_splitter': {'train_block_size': 7, 'val_block_size': 3, 'type': 'DataSplitter'}, + 'data_splitter': {'train_block_size': 7, 'val_block_size': 3, 'type': 'BlockDataSplitter'}, 'data_clipping': {'lower_percentile': 0.01, 'upper_percentile': 0.99} }) self.assertDictEqual(conf.config_dict['root_cause_analysis'], diff --git a/tests/config/test_quickstart_config.py b/tests/config/test_quickstart_config.py new file mode 100644 index 0000000..ae55b30 --- /dev/null +++ b/tests/config/test_quickstart_config.py @@ -0,0 +1,47 @@ +from unittest import TestCase +from typing import Any, Dict + +from energy_fault_detector.config.config import Config +from energy_fault_detector.config.quickstart_config import generate_quickstart_config # adjust + + +class TestQuickstartConfig(TestCase): + def test_generate_quickstart_config_valid_dict(self) -> None: + """Should return a valid config dict that Config accepts and includes required sections.""" + cfg: Dict[str, Any] = generate_quickstart_config( + output_path=None, + angle_columns=["theta_deg"], + counter_columns=["energy_total_kwh"], + scaler="standard", + imputer_strategy="mean", + early_stopping=False, + ) + + # Basic structure checks + self.assertIn("train", cfg) + train = cfg["train"] + self.assertIn("data_preprocessor", train) + self.assertIn("steps", train["data_preprocessor"]) + self.assertIn("autoencoder", train) + self.assertIn("params", train["autoencoder"]) + 
self.assertIn("threshold_selector", train) + self.assertIn("params", train["threshold_selector"]) + + # Ensure certain steps exist + step_names = [s["name"] for s in train["data_preprocessor"]["steps"]] + self.assertIn("column_selector", step_names) + self.assertIn("simple_imputer", step_names) + self.assertTrue( + any(n in ("standard_scaler", "minmax_scaler") for n in step_names), + "Expected a scaler step in the pipeline." + ) + + # Should not raise: validate via Config + Config(config_dict=cfg) + + def test_generate_quickstart_config_validation_split_guard(self) -> None: + """If validation split not in (0, 1) it should raise ValueError.""" + with self.assertRaises(ValueError): + _ = generate_quickstart_config( + validation_split=0.0, # invalid by design + ) diff --git a/tests/core/test_fault_detection_result.py b/tests/core/test_fault_detection_result.py new file mode 100644 index 0000000..813d688 --- /dev/null +++ b/tests/core/test_fault_detection_result.py @@ -0,0 +1,73 @@ + +import unittest +import tempfile +import pandas as pd +from pathlib import Path + +from energy_fault_detector.core import FaultDetectionResult + + +class TestFaultDetectionResultSaveLoad(unittest.TestCase): + + def setUp(self): + # Create sample data + index = pd.date_range("2023-01-01", periods=5, freq="H") + + self.predicted_anomalies = pd.Series([False, True, False, True, False], index=index, name="anomaly") + self.reconstruction = pd.DataFrame({ + "sensor_1": [1.0, 2.0, 3.0, 4.0, 5.0], + "sensor_2": [2.0, 3.0, 4.0, 5.0, 6.0] + }, index=index) + self.recon_error = pd.DataFrame({ + "sensor_1": [0.1, 0.2, 0.1, 0.3, 0.1], + "sensor_2": [0.2, 0.1, 0.3, 0.1, 0.2] + }, index=index) + self.anomaly_score = pd.Series([0.1, 0.9, 0.2, 0.8, 0.15], index=index, name="score") + + # Optional fields + self.bias_data = pd.DataFrame({"bias": [0.01, 0.02]}, index=index[:2]) + self.arcana_losses = pd.DataFrame({"loss_a": [0.1, 0.2], "loss_b": [0.05, 0.1]}, index=index[:2]) + self.tracked_bias = [ + 
pd.DataFrame({"bias_step_0": [0.01]}, index=[index[0]]), + pd.DataFrame({"bias_step_1": [0.02]}, index=[index[1]]) + ] + + # Instantiate object + self.fdr = FaultDetectionResult( + predicted_anomalies=self.predicted_anomalies, + reconstruction=self.reconstruction, + recon_error=self.recon_error, + anomaly_score=self.anomaly_score, + bias_data=self.bias_data, + arcana_losses=self.arcana_losses, + tracked_bias=self.tracked_bias + ) + + def test_save_and_load_roundtrip(self): + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + + # Save result + self.fdr.save(tmp_path) + + # Load result back + loaded_fdr = FaultDetectionResult.load(tmp_path) + + # Compare core attributes + pd.testing.assert_series_equal(loaded_fdr.predicted_anomalies, self.fdr.predicted_anomalies, check_freq=False) + pd.testing.assert_frame_equal(loaded_fdr.reconstruction, self.fdr.reconstruction, check_freq=False) + pd.testing.assert_frame_equal(loaded_fdr.recon_error, self.fdr.recon_error, check_freq=False) + pd.testing.assert_series_equal(loaded_fdr.anomaly_score, self.fdr.anomaly_score, check_freq=False) + + # Compare optional attributes + pd.testing.assert_frame_equal(loaded_fdr.bias_data, self.fdr.bias_data, check_freq=False) + pd.testing.assert_frame_equal(loaded_fdr.arcana_losses, self.fdr.arcana_losses, check_freq=False) + + # Compare tracked_bias list of DataFrames + self.assertEqual(len(loaded_fdr.tracked_bias), len(self.fdr.tracked_bias)) + for loaded_df, original_df in zip(loaded_fdr.tracked_bias, self.fdr.tracked_bias): + pd.testing.assert_frame_equal(loaded_df, original_df, check_freq=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/core/test_model_factory.py b/tests/core/test_model_factory.py index 90d4ca2..28ffef5 100644 --- a/tests/core/test_model_factory.py +++ b/tests/core/test_model_factory.py @@ -27,7 +27,6 @@ def test_model_creation(self): # Test for data preprocessor data_preprocessor = model_factory.data_preprocessor 
self.assertIsInstance(data_preprocessor, DataPreprocessor) - self.assertEqual(data_preprocessor.max_nan_frac_per_col, 0.05) # Test for threshold selector threshold_selector = model_factory.threshold_selector diff --git a/tests/data_preprocessing/test_column_selector.py b/tests/data_preprocessing/test_column_selector.py index 78a2c84..761f40e 100644 --- a/tests/data_preprocessing/test_column_selector.py +++ b/tests/data_preprocessing/test_column_selector.py @@ -51,3 +51,35 @@ def test_missing_columns(self): def test_not_fitted(self): with self.assertRaises(NotFittedError): self.column_selector.transform(self.raw_dataframe) + + def test_fit_with_features_to_select(self): + # Select a mix (case-insensitive); Sensor_3 should be dropped due to NaN fraction (0.5 >= 0.2) + selector = ColumnSelector(max_nan_frac_per_col=0.2, + features_to_select=['sensor_1', 'SENSOR_2', 'sensor_3', 'sensor_5']) + selector.fit(self.raw_dataframe) + expected_attributes = ["Sensor_1", "Sensor_2", "Sensor_5"] + assert_array_equal(expected_attributes, selector.feature_names_out_) + + def test_transform_with_features_to_select(self): + # Keep only Sensor_1 and Sensor_5 + selector = ColumnSelector(max_nan_frac_per_col=0.2, + features_to_select=['sensor_1', 'sensor_5']) + expected_df = self.raw_dataframe[["Sensor_1", "Sensor_5"]] + df = selector.fit_transform(self.raw_dataframe) + # Check values and column order + assert_array_equal(expected_df.columns.values, df.columns.values) + assert_array_equal(expected_df.values, df.values) + + def test_features_to_select_case_insensitive(self): + # Mixed casing in selection should match columns + selector = ColumnSelector(max_nan_frac_per_col=0.2, + features_to_select=['SeNsOr_1', 'seNSor_5']) + selector.fit(self.raw_dataframe) + expected_attributes = ["Sensor_1", "Sensor_5"] + assert_array_equal(expected_attributes, selector.feature_names_out_) + + def test_init_mutually_exclusive_args(self): + with self.assertRaises(ValueError): + 
ColumnSelector(max_nan_frac_per_col=0.2, + features_to_exclude=['sensor_1'], + features_to_select=['sensor_1', 'sensor_5']) diff --git a/tests/data_preprocessing/test_counter_diff_transformer.py b/tests/data_preprocessing/test_counter_diff_transformer.py new file mode 100644 index 0000000..9b4c178 --- /dev/null +++ b/tests/data_preprocessing/test_counter_diff_transformer.py @@ -0,0 +1,365 @@ +import unittest +from datetime import datetime, timedelta +from typing import List + +import numpy as np +import pandas as pd + +from energy_fault_detector.data_preprocessing.counter_diff_transformer import CounterDiffTransformer + + +class TestCounterDiffTransformer(unittest.TestCase): + """Unit tests for CounterDiffTransformer.""" + + def setUp(self) -> None: + """Create small helper datasets used across tests.""" + # Regular 1-second interval index + self.t0 = datetime(2024, 1, 1, 0, 0, 0) + self.idx_1s = pd.date_range(self.t0, periods=5, freq="1s", tz="UTC") + + def _df( + self, + values_a: List[float], + values_b: List[float] | None = None, + index: pd.DatetimeIndex | None = None, + ) -> pd.DataFrame: + """Helper to build a DataFrame with optional second counter.""" + index = index if index is not None else self.idx_1s + if len(values_a) < len(index): + index = index[:len(values_a)] + data = {"counter_a": values_a} + if values_b is not None: + data["counter_b"] = values_b + return pd.DataFrame(data, index=index) + + def test_fit_requires_datetime_index_when_rate_or_mask(self) -> None: + """fit should error on non-DatetimeIndex when rate/mask are requested.""" + df = pd.DataFrame({"counter_a": [0, 1, 2]}, index=[0, 1, 2]) + + # compute_rate=True requires DatetimeIndex + with self.assertRaises(ValueError): + CounterDiffTransformer(counters=["counter_a"], compute_rate=True).fit(df) + + # gap_policy='mask' requires DatetimeIndex + with self.assertRaises(ValueError): + CounterDiffTransformer(counters=["counter_a"], gap_policy="mask").fit(df) + + # If neither rate nor mask, fit 
should succeed + CounterDiffTransformer(counters=["counter_a"], gap_policy="ignore").fit(df) + + def test_fit_requires_monotonic_index(self) -> None: + """fit should error on non-monotonic DatetimeIndex when rate/mask are requested.""" + idx = pd.DatetimeIndex( + [self.t0, self.t0 + timedelta(seconds=2), self.t0 + timedelta(seconds=1)], + tz="UTC", + ) + df = self._df(values_a=[0, 1, 2], index=idx) + with self.assertRaises(ValueError): + CounterDiffTransformer(counters=["counter_a"], compute_rate=True).fit(df) + + # No problem if index is sorted + CounterDiffTransformer(counters=["counter_a"], compute_rate=True).fit(df.sort_index()) + + def test_diff_zero_strategy_default(self) -> None: + """Default 'zero' strategy: negative diff -> increment equals current value.""" + # 0 -> 1 -> 4 -> 1 (reset) -> 3 + df = self._df(values_a=[0, 1, 4, 1, 3]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="zero", + fill_first="nan", + keep_original=False, + gap_policy="ignore", + ).fit(df) + + out = tr.transform(df) + self.assertListEqual(list(out.columns), ["counter_a_diff"]) + + expected = pd.Series([np.nan, 1, 3, 1, 2], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_diff_fill_first_zero(self) -> None: + """First increment filled with zero when fill_first='zero'.""" + df = self._df(values_a=[5, 7, 8, 10, 12]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="zero", + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + + out = tr.transform(df) + expected = pd.Series([0, 2, 1, 2, 2], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_rollover_strategy_with_value(self) -> None: + """'rollover' strategy uses provided rollover value to compute increment.""" + # 95 -> 98 -> 2 (rollover at
100) => inc: NaN/0, 3, 2 + (100 - 98) = 4 + df = self._df(values_a=[95, 98, 2, 7, 20]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="rollover", + rollover_values={"counter_a": 100.0}, + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + + out = tr.transform(df) + expected = pd.Series([0, 3, 4, 5, 13], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_rollover_strategy_without_value_errors(self) -> None: + """'rollover' without a rollover_value should raise a ValueError.""" + df = self._df(values_a=[50, 10]) # negative diff + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="rollover", + rollover_values={}, # missing + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + with self.assertRaises(ValueError): + tr.transform(df) + + def test_nan_strategy(self) -> None: + """'nan' strategy sets negative diffs to NaN.""" + df = self._df(values_a=[10, 8, 9]) + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="nan", + fill_first="zero", + keep_original=False, + gap_policy="ignore", + ).fit(df) + out = tr.transform(df) + expected = pd.Series([0, np.nan, 1], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_auto_strategy_prefers_rollover_when_available(self) -> None: + """'auto' uses rollover if a value is supplied; else behaves like 'zero'.""" + df = self._df(values_a=[95, 98, 2]) + + # With rollover value -> like 'rollover' + tr1 = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="auto", + rollover_values={"counter_a": 100.0}, + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out1 = tr1.transform(df) + expected1 = pd.Series([0, 3, 4], index=df.index, 
name="counter_a_diff") + pd.testing.assert_series_equal(out1["counter_a_diff"], expected1, check_dtype=False) + + # Without rollover value -> like 'zero' + tr2 = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="auto", + rollover_values={}, # none + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out2 = tr2.transform(df) + expected2 = pd.Series([0, 3, 2], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out2["counter_a_diff"], expected2, check_dtype=False) + + def test_small_negative_tolerance(self) -> None: + """Small negative diff within tolerance is clamped to zero.""" + df = self._df(values_a=[10.0, 9.9995, 10.5]) + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + reset_strategy="zero", + small_negative_tolerance=0.01, + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out = tr.transform(df) + # diff: 0, -0.0005 (-> 0), 0.5005 + expected = pd.Series([0.0, 0.0, 0.5005], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected) + + def test_compute_rate(self) -> None: + """Rate equals increment divided by dt seconds.""" + idx = pd.DatetimeIndex( + [self.t0, self.t0 + timedelta(seconds=2), self.t0 + timedelta(seconds=5)], + tz="UTC", + ) + df = self._df(values_a=[0, 4, 7], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=True, + reset_strategy="zero", + fill_first="zero", + gap_policy="ignore", + ).fit(df) + out = tr.transform(df) + # increments: [0, 4, 3]; dt: [NaN, 2, 3]; rate: [0, 2, 1] + expected = pd.Series([0.0, 2.0, 1.0], index=df.index, name="counter_a_rate") + pd.testing.assert_series_equal(out["counter_a_rate"], expected) + + def test_gap_masking_with_max_gap_seconds(self) -> None: + """Values at positions where dt > threshold should be masked (NaN).""" + idx = pd.DatetimeIndex( + [ + self.t0, + self.t0 + timedelta(seconds=1), + self.t0 + timedelta(seconds=10), # big 
gap from previous + self.t0 + timedelta(seconds=11), + ], + tz="UTC", + ) + df = self._df(values_a=[0, 1, 2, 3], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="mask", + max_gap_seconds=8.0, # gap = 9 seconds + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + # increments: [0,1,1,1]; dt: [NaN,1,9,1]; mask where dt>8 -> index 2 + self.assertTrue(np.isnan(out["counter_a_diff"].iloc[2])) + self.assertEqual(out["counter_a_diff"].iloc[1], 1.0) + self.assertEqual(out["counter_a_diff"].iloc[3], 1.0) + + def test_gap_masking_with_factor_median(self) -> None: + """Threshold computed as factor * median(dt).""" + idx = pd.DatetimeIndex( + [ + self.t0, + self.t0 + timedelta(seconds=2), + self.t0 + timedelta(seconds=4), + self.t0 + timedelta(seconds=20), # gap 16 > factor*median (median=2) + ], + tz="UTC", + ) + df = self._df(values_a=[0, 2, 3, 5], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="mask", + max_gap_seconds=None, + max_gap_factor=3.0, # 3 * median = 6 + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + self.assertTrue(np.isnan(out["counter_a_diff"].iloc[3])) # masked at data gap + self.assertEqual(out["counter_a_diff"].iloc[1], 2.0) + self.assertEqual(out["counter_a_diff"].iloc[2], 1.0) + + def test_gap_policy_ignore(self) -> None: + """No masking when gap_policy='ignore'.""" + idx = pd.DatetimeIndex( + [self.t0, self.t0 + timedelta(seconds=1), self.t0 + timedelta(seconds=10)], + tz="UTC", + ) + df = self._df(values_a=[0, 1, 30], index=idx) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + expected = pd.Series([0, 1, 29], index=df.index, name="counter_a_diff") + pd.testing.assert_series_equal(out["counter_a_diff"], expected, check_dtype=False) + + def test_keep_original_false_drops_counters(self) -> None: + """When
keep_original=False, original counters are dropped from output.""" + df = self._df(values_a=[0, 1, 2], values_b=[0, 10, 20]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + keep_original=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + # 'counter_b' should be kept, 'counter_a' replaced by 'counter_a_diff' + self.assertListEqual(list(out.columns), ["counter_b", "counter_a_diff"]) + + def test_keep_original_true_keeps_counters(self) -> None: + """When keep_original=True, original counters remain alongside outputs.""" + df = self._df(values_a=[0, 1, 2], values_b=[0, 10, 20]) + + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + keep_original=True, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + out = tr.transform(df) + self.assertListEqual(list(out.columns), ["counter_a", "counter_b", "counter_a_diff"]) + + def test_feature_names_out(self) -> None: + """get_feature_names_out returns correct output ordering.""" + df = self._df(values_a=[0, 1, 2], values_b=[0, 10, 20]) + tr = CounterDiffTransformer( + counters=["counter_a", "missing_counter"], + compute_rate=False, + keep_original=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + # Only present counters are transformed; others ignored + self.assertEqual(tr.counters_, ["counter_a"]) + self.assertEqual(tr.get_feature_names_out(), ["counter_b", "counter_a_diff"]) + + out = tr.transform(df) + self.assertListEqual(tr.get_feature_names_out(), list(out.columns)) + + def test_non_numeric_values_raise_error(self) -> None: + """transform should raise a ValueError when a counter contains non-numeric values.""" + df = self._df(values_a=[0, "1", "3", "bad", 7]) # 'bad' -> NaN + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + + with self.assertRaises(ValueError): + tr.transform(df) + + def test_inverse_transform(self) -> None: +
"""inverse_transform returns input unchanged.""" + df = self._df(values_a=[0, 1, 2]) + tr = CounterDiffTransformer( + counters=["counter_a"], + compute_rate=False, + gap_policy="ignore", + fill_first="zero", + ).fit(df) + out = tr.transform(df) + back = tr.inverse_transform(out.copy()) + pd.testing.assert_frame_equal(out, back) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/data_preprocessing/test_data_clipper.py b/tests/data_preprocessing/test_data_clipper.py index dd9cd87..effefa9 100644 --- a/tests/data_preprocessing/test_data_clipper.py +++ b/tests/data_preprocessing/test_data_clipper.py @@ -2,6 +2,7 @@ import unittest import pandas as pd +import pandas.testing as pdt from energy_fault_detector.data_preprocessing.data_clipper import DataClipper @@ -29,3 +30,23 @@ def test_transform(self): ) self.data_clipper.fit(x_test) self.assertTrue(self.data_clipper.transform(x_test).equals(expected_output)) + + def test_transform_with_features_to_clip(self): + # Only clip 'feature1'; leave 'feature2' and angles unchanged + clipper = DataClipper(lower_percentile=0.2, upper_percentile=0.8, + features_to_clip=['feature1']) + x_test = pd.DataFrame( + {'feature1': [1, 2, 3, 4, 5], 'feature2': [4, 5, 6, 7, 8], 'angle1': [0, 45, 90, 135, 180], + 'angle2': [0, 45, 90, 135, 180]} + ) + expected_output = pd.DataFrame( + {'feature1': [1.8, 2, 3, 4, 4.2], 'feature2': [4, 5, 6, 7, 8], 'angle1': [0, 45, 90, 135, 180], + 'angle2': [0, 45, 90, 135, 180]} + ) + clipper.fit(x_test) + pdt.assert_frame_equal(clipper.transform(x_test), expected_output) + + def test_init_mutually_exclusive_args(self): + with self.assertRaises(ValueError): + DataClipper(lower_percentile=0.2, upper_percentile=0.8, + features_to_exclude=['angle1'], features_to_clip=['feature1']) diff --git a/tests/data_preprocessing/test_data_preprocessor.py b/tests/data_preprocessing/test_data_preprocessor.py index 5ebd235..2f33f6e 100644 --- a/tests/data_preprocessing/test_data_preprocessor.py +++ 
b/tests/data_preprocessing/test_data_preprocessor.py @@ -12,6 +12,17 @@ class TestDataPreprocessorPipeline(TestCase): def setUp(self) -> None: self.standard_preprocessor = DataPreprocessor( + steps=[ + {'name': 'column_selector', + 'params': {'max_nan_frac_per_col': 0.2}}, + {'name': 'angle_transform', + 'params': {'angles': ['Sensor_6']}}, + {'name': 'duplicate_values_to_nan'}, + {'name': 'low_unique_value_filter',} + ] + ) + # legacy set up + self.standard_preprocessor_old = DataPreprocessor( max_nan_frac_per_col=0.2, imputer_strategy='mean', angles=['Sensor_6'], @@ -20,17 +31,21 @@ def setUp(self) -> None: include_low_unique_value_filter=True, min_unique_value_count=2, ) - - self.ts_preprocessor = DataPreprocessor( - max_nan_frac_per_col=0.2, - min_unique_value_count=3, - max_col_zero_frac=0.30, - include_column_selector=True, - include_duplicate_value_to_nan=False, - include_low_unique_value_filter=True + self.another_preprocessor = DataPreprocessor( + steps=[ + {'name': 'column_selector', + 'params': {'max_nan_frac_per_col': 0.2}}, + {'name': 'angle_transform', + 'params': {'angles': ['Sensor_6']}}, + {'name': 'duplicate_values_to_nan', + 'params': {'n_max_duplicates': 4, + 'value_to_replace': 0}}, + {'name': 'low_unique_value_filter', + 'params': {'min_unique_value_count': 1}}, + ] ) - - self.extended_preprocessor = DataPreprocessor( + # legacy set up + self.another_preprocessor_old = DataPreprocessor( max_nan_frac_per_col=0.2, imputer_strategy='mean', min_unique_value_count=1, @@ -41,8 +56,16 @@ def setUp(self) -> None: include_duplicate_value_to_nan=True, include_low_unique_value_filter=True ) - + # Feature consistent, does not drop columns self.fc_preprocessor = DataPreprocessor( + steps=[ + {'name': 'column_selector', 'enabled': False}, + {'name': 'angle_transform', + 'params': {'angles': ['Sensor_6']}}, + ] + ) + # legacy set up + self.fc_preprocessor_old = DataPreprocessor( imputer_strategy='mean', angles=['Sensor_6'], 
include_low_unique_value_filter=False, @@ -94,12 +117,14 @@ def setUp(self) -> None: self.test_data3 = pd.DataFrame(index=time_index, data=data) def test_fit_standard_preprocessor(self): + self.standard_preprocessor_old.fit(self.test_data1) + check_is_fitted(self.standard_preprocessor_old.named_steps['scaler']) self.standard_preprocessor.fit(self.test_data1) check_is_fitted(self.standard_preprocessor.named_steps['scaler']) def test_fit_extended(self): - self.extended_preprocessor.fit(self.test_data3) - check_is_fitted(self.extended_preprocessor.named_steps['scaler']) + self.another_preprocessor_old.fit(self.test_data3) + check_is_fitted(self.another_preprocessor_old.named_steps['scaler']) def test_transform(self): # expected output @@ -139,8 +164,8 @@ def test_transform_extended(self): sincos = (sincos - sincos.mean(axis=0)) / sincos.std(axis=0) exp_result = np.hstack([exp_result, sincos]) - self.extended_preprocessor.fit(self.test_data3) - data = self.extended_preprocessor.transform(self.test_data3) + self.another_preprocessor_old.fit(self.test_data3) + data = self.another_preprocessor_old.transform(self.test_data3) assert_array_almost_equal(data, exp_result) @@ -156,8 +181,8 @@ def test_transform_fc(self): [1.21854359, 1.22474487, 0., 0., 0., 1.21773319, -1.32214018], [1.5666989, 1.63299316, 0., 0., 0., 1.56338116, -1.95410719]]) - self.fc_preprocessor.fit(self.test_data1) - data = self.fc_preprocessor.transform(self.test_data1) + self.fc_preprocessor_old.fit(self.test_data1) + data = self.fc_preprocessor_old.transform(self.test_data1) assert_array_almost_equal(data, exp_result) @@ -166,49 +191,134 @@ def test_not_fitted(self): self.standard_preprocessor.transform(self.test_data1) with self.assertRaises(NotFittedError): - self.ts_preprocessor.transform(self.test_data1) + self.another_preprocessor.transform(self.test_data1) def test_inverse_transform(self): - self.standard_preprocessor.fit(self.test_data1) + for preprocessor in [self.standard_preprocessor, 
self.standard_preprocessor_old]: + preprocessor.fit(self.test_data1) - output = self.standard_preprocessor.inverse_transform( - self.standard_preprocessor.transform(self.test_data1) - ).astype(float) - expected = self.test_data1[['Sensor_1', 'Sensor_2', 'Sensor_6']].astype(float) - expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. + output = preprocessor.inverse_transform( + preprocessor.transform(self.test_data1) + ).astype(float) + expected = self.test_data1[['Sensor_1', 'Sensor_2', 'Sensor_6']].astype(float) + expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. - assert_frame_equal( - output.reset_index(drop=True), - expected.reset_index(drop=True), - ) + assert_frame_equal( + output.reset_index(drop=True), + expected.reset_index(drop=True), + ) def test_inverse_transform_extended(self): - self.extended_preprocessor.fit(self.test_data3) - - output = self.extended_preprocessor.inverse_transform( - self.extended_preprocessor.transform(self.test_data3) - ).astype(float) - expected = self.test_data3[['Sensor_1', 'Sensor_2', 'Sensor_6', 'Sensor_7']].astype(float) - expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. - expected.loc['2021-05-02 08:00:00', 'Sensor_7'] = 0.555556 - - assert_frame_equal( - output.reset_index(drop=True), - expected.reset_index(drop=True), - ) + for preprocessor in [self.another_preprocessor, self.another_preprocessor_old]: + preprocessor.fit(self.test_data3) + + output = preprocessor.inverse_transform( + preprocessor.transform(self.test_data3) + ).astype(float) + expected = self.test_data3[['Sensor_1', 'Sensor_2', 'Sensor_6', 'Sensor_7']].astype(float) + expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. 
+ expected.loc['2021-05-02 08:00:00', 'Sensor_7'] = 0.555556 + + assert_frame_equal( + output.reset_index(drop=True), + expected.reset_index(drop=True), + ) def test_inverse_transform_fc(self): - self.fc_preprocessor.fit(self.test_data1) - - output = self.fc_preprocessor.inverse_transform( - self.fc_preprocessor.transform(self.test_data1) - ).astype(float) - expected = self.test_data1.astype(float) - expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. - expected.loc[pd.isnull(expected['Sensor_3']), 'Sensor_3'] = 2. - expected.loc[pd.isnull(expected['Sensor_4']), 'Sensor_4'] = 0. - - assert_frame_equal( - output.reset_index(drop=True), - expected.reset_index(drop=True), + for preprocessor in [self.fc_preprocessor, self.fc_preprocessor_old]: + preprocessor.fit(self.test_data1) + + output = preprocessor.inverse_transform( + preprocessor.transform(self.test_data1) + ).astype(float) + expected = self.test_data1.astype(float) + expected.loc[pd.isnull(expected['Sensor_2']), 'Sensor_2'] = 5. + expected.loc[pd.isnull(expected['Sensor_3']), 'Sensor_3'] = 2. + expected.loc[pd.isnull(expected['Sensor_4']), 'Sensor_4'] = 0. 
+ + assert_frame_equal( + output.reset_index(drop=True), + expected.reset_index(drop=True), + ) + + def test_steps_mode_no_duplicate_imputer(self) -> None: + """Providing 'simple_imputer' explicitly should not add a second default imputer.""" + dp = DataPreprocessor( + steps=[ + {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}}, + {"name": "simple_imputer", "params": {"strategy": "median"}}, + {"name": "standard_scaler"}, + ] + ) + # Count imputers by estimator type + n_imputers = sum( + est.__class__.__name__ == "SimpleImputer" for _, est in dp.steps + ) + self.assertEqual(n_imputers, 1, "There should be exactly one SimpleImputer.") + + # Ensure imputer precedes scaler + imputer_idx = next( + i for i, (_, est) in enumerate(dp.steps) if est.__class__.__name__ == "SimpleImputer" + ) + scaler_idx = next( + i for i, (_, est) in enumerate(dp.steps) + if est.__class__.__name__ in {"StandardScaler", "MinMaxScaler"} + ) + self.assertLess(imputer_idx, scaler_idx, "Imputer must precede scaler.") + + def test_steps_mode_default_imputer_inserted(self) -> None: + """Omitting 'simple_imputer' should auto-insert a default imputer before the scaler.""" + dp = DataPreprocessor( + steps=[ + {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}}, + {"name": "standard_scaler"}, + ] + ) + # Exactly one imputer should be present + n_imputers = sum( + est.__class__.__name__ == "SimpleImputer" for _, est in dp.steps + ) + self.assertEqual(n_imputers, 1, "A single default SimpleImputer should be added.") + + # Imputer must be before scaler + imputer_idx = next( + i for i, (_, est) in enumerate(dp.steps) if est.__class__.__name__ == "SimpleImputer" + ) + scaler_idx = next( + i for i, (_, est) in enumerate(dp.steps) + if est.__class__.__name__ in {"StandardScaler", "MinMaxScaler"} + ) + self.assertLess(imputer_idx, scaler_idx, "Default imputer must be inserted before scaler.") + + def test_steps_mode_alias_imputer_is_normalized(self) -> None: + """Using 
'imputer' alias should be normalized to 'simple_imputer' internally.""" + dp = DataPreprocessor( + steps=[ + {"name": "imputer", "params": {"strategy": "mean"}}, # alias + {"name": "standard_scaler"}, + ] ) + # Named steps should include the canonical 'simple_imputer' + self.assertIn("simple_imputer", dp.named_steps) + + def test_singleton_violation_raises(self) -> None: + """Two enabled simple_imputer steps should raise a ValueError.""" + with self.assertRaises(ValueError): + _ = DataPreprocessor( + steps=[ + {"name": "simple_imputer", "params": {"strategy": "mean"}}, + {"name": "simple_imputer", "params": {"strategy": "median"}}, + {"name": "standard_scaler"}, + ] + ) + + def test_only_one_scaler_allowed(self) -> None: + """Defining more than one scaler should raise a ValueError.""" + with self.assertRaises(ValueError): + _ = DataPreprocessor( + steps=[ + {"name": "column_selector", "params": {"max_nan_frac_per_col": 0.2}}, + {"name": "standard_scaler"}, + {"name": "minmax_scaler"}, + ] + ) diff --git a/tests/test_data/ensemble_config.yaml b/tests/test_data/ensemble_config.yaml deleted file mode 100644 index c610063..0000000 --- a/tests/test_data/ensemble_config.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# Model settings -train: - anomaly_score: - name: 'mahalanobis' - params: - pca: true - pca_min_var: 0.85 - data_preprocessor: - name: 'default' - autoencoder: - name: 'EnsembleAutoencoder' - params: - hyperparams_list: - - layers: [300] - code_size: 50 - learning_rate: 0.001 - decay_rate: 0.001 - batch_size: 144 - epochs: 10 - loss_name: 'mean_squared_error' - - layers: [200] - code_size: 25 - learning_rate: 0.001 - decay_rate: 0.001 - batch_size: 144 - epochs: 20 - loss_name: 'mean_squared_error' - threshold_selector: - name: 'FDR' - params: - target_false_discovery_rate: 0.8 diff --git a/tests/test_data/test_bad_early_stopping_config.yaml b/tests/test_data/test_bad_early_stopping_config.yaml index 76c5908..f6e9284 100644 --- 
a/tests/test_data/test_bad_early_stopping_config.yaml +++ b/tests/test_data/test_bad_early_stopping_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_conditional_ae_config.yaml b/tests/test_data/test_conditional_ae_config.yaml index f020a4a..66983fe 100644 --- a/tests/test_data/test_conditional_ae_config.yaml +++ b/tests/test_data/test_conditional_ae_config.yaml @@ -4,14 +4,7 @@ train: params: pca: true pca_min_var: 0.85 - data_preprocessor: - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 - include_duplicate_value_to_nan: false + data_preprocessor: # no further spec, so defaults are applied autoencoder: name: 'ConditionalAE' verbose: 0 diff --git a/tests/test_data/test_config.yaml b/tests/test_data/test_config.yaml index c6cf979..2f093ef 100644 --- a/tests/test_data/test_config.yaml +++ b/tests/test_data/test_config.yaml @@ -4,14 +4,7 @@ train: params: pca: true pca_min_var: 0.85 - data_preprocessor: - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 - include_duplicate_value_to_nan: false + data_preprocessor: # no further spec, so defaults are applied autoencoder: name: 'MultilayerAutoencoder' verbose: 0 diff --git a/tests/test_data/test_config_no_rca.yaml b/tests/test_data/test_config_no_rca.yaml index 7c83e0b..aa32fdc 100644 --- a/tests/test_data/test_config_no_rca.yaml +++ b/tests/test_data/test_config_no_rca.yaml @@ -5,13 +5,6 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' 
- features_to_exclude: - - feature1 - - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_criticality_config.yaml b/tests/test_data/test_criticality_config.yaml index 7165be0..f366d4c 100644 --- a/tests/test_data/test_criticality_config.yaml +++ b/tests/test_data/test_criticality_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_early_stopping_val_block_config.yaml b/tests/test_data/test_early_stopping_val_block_config.yaml index 95ab325..62aad09 100644 --- a/tests/test_data/test_early_stopping_val_block_config.yaml +++ b/tests/test_data/test_early_stopping_val_block_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: diff --git a/tests/test_data/test_early_stopping_val_split_config.yaml b/tests/test_data/test_early_stopping_val_split_config.yaml index f1b256b..eb0d903 100644 --- a/tests/test_data/test_early_stopping_val_split_config.yaml +++ b/tests/test_data/test_early_stopping_val_split_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' 
params: diff --git a/tests/test_data/test_export_default_adaptive.yaml b/tests/test_data/test_export_default_adaptive.yaml deleted file mode 100644 index 77b0d8b..0000000 --- a/tests/test_data/test_export_default_adaptive.yaml +++ /dev/null @@ -1,42 +0,0 @@ -train: - data_clipping: # (optional) if not specified, not applied. - # clip training data to remove outliers - lower_percentile: 0.01 - upper_percentile: 0.99 - - data_preprocessor: - params: - features_to_exclude: - - a - - b - max_nan_frac_per_col: 0.2 - imputer_strategy: mean - min_unique_value_count: 1 - angles: - - c - - data_splitter: - # use train_test_split without shuffle for LSTM and CNN models! - type: sklearn - validation_split: 0.3 - shuffle: False - - autoencoder: - name: default - params: - batch_size: 8 - learning_rate: 0.001 - epochs: 10 - code_size: 1 - layers: - - 5 - loss_name: mean_squared_error - verbose: 0 - - anomaly_score: - name: mahalanobis - - threshold_selector: - name: adaptive - params: - gamma: 0.1 \ No newline at end of file diff --git a/tests/test_data/test_export_full_prep_cnn_ad.yaml b/tests/test_data/test_export_full_prep_cnn_ad.yaml deleted file mode 100644 index c62b556..0000000 --- a/tests/test_data/test_export_full_prep_cnn_ad.yaml +++ /dev/null @@ -1,53 +0,0 @@ -train: - data_clipping: # (optional) if not specified, not applied. - # clip training data to remove outliers - lower_percentile: 0.01 - upper_percentile: 0.99 - - data_preprocessor: - params: - features_to_exclude: - - a - - b - max_nan_frac_per_col: 0.2 - imputer_strategy: mean - min_unique_value_count: 1 - angles: - - c - n_max_duplicates: 4 - value_to_replace: 0 - include_column_selector: True - include_duplicate_value_to_nan: True - include_low_unique_value_filter: True - ts_features: - - 'day_of_year' - - 'hour_of_day' - verbose: 0 - - data_splitter: - # use train_test_split without shuffle for LSTM and CNN models! 
- type: sklearn - validation_split: 0.5 - shuffle: False - - autoencoder: - name: CNN - # Data sampler configuration (only for Seq2Seq models) - time_series_sampler: - sequence_length: 2 - overlap: 0 # ignored in random mode, used in inference phase - params: - batch_size: 8 - learning_rate: 0.001 - epochs: 1 - filters: - - 5 - loss_name: mean_squared_error - - anomaly_score: - name: rmse - - threshold_selector: - name: fbeta - params: - beta: 0.5 \ No newline at end of file diff --git a/tests/test_data/verbose_config.yaml b/tests/test_data/verbose_config.yaml index c9eaec9..bfebf23 100644 --- a/tests/test_data/verbose_config.yaml +++ b/tests/test_data/verbose_config.yaml @@ -5,13 +5,12 @@ train: pca: true pca_min_var: 0.85 data_preprocessor: - name: 'standard_preprocessor' - params: - max_nan_frac_per_col: 0.05 - imputer_strategy: 'mean' - features_to_exclude: - - feature1 - - feature2 + steps: + - name: column_selector + params: + features_to_exclude: + - feature1 + - feature2 autoencoder: name: 'MultilayerAutoencoder' params: