9 changes: 7 additions & 2 deletions README.md
@@ -64,7 +64,7 @@ results = fault_detector.predict(sensor_data=test_sensor_data)
 
 The pandas `DataFrame` `sensor_data` contains the operational data in wide format with the timestamp as index, the
 pandas `Series` `normal_index` indicates which timestamps are considered 'normal' operation and can be used to create
-a normal behaviour model. The [`base_config.yaml`](energy_fault_detector/base_config.yaml) file contains all model
+a normal behaviour model. The [`base_config.yaml`](energy_fault_detector/base_config.yaml) file contains the model
 settings, an example is found [here](energy_fault_detector/base_config.yaml).
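
For context, a minimal sketch of the inputs described here; the column names and values are illustrative, only the wide format, timestamp index, and boolean `normal_index` come from the README:

```python
import numpy as np
import pandas as pd

# Wide-format operational data: timestamp index, one column per sensor
idx = pd.date_range('2024-01-01', periods=1000, freq='10min')
sensor_data = pd.DataFrame(
    np.random.randn(1000, 3),
    index=idx,
    columns=['power', 'temp_in', 'temp_out'],
)
# Boolean Series marking 'normal' operation, used to fit the normal behaviour model
normal_index = pd.Series(True, index=idx)
```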


@@ -100,12 +100,17 @@ This project is licensed under the [MIT License](./LICENSE).
 ## References
 If you use this work, please cite us:
 
+**Fault detection in district heating substations**:
+- Enabling Predictive Maintenance in District Heating Substations: A Labelled Dataset and Fault Detection Evaluation Framework based on Service Data.
+  PrePrint on ArXiv. https://doi.org/10.48550/arXiv.2511.14791
+- Dataset: PreDist Dataset - Operational data of district heating substations labelled with faults and maintenance information. Zenodo, Nov 2025, https://doi.org/10.5281/zenodo.17522254.
+
 **ARCANA Algorithm**:
 Autoencoder-based anomaly root cause analysis for wind turbines. Energy and AI. 2021;4:100065. https://doi.org/10.1016/j.egyai.2021.100065
 
 **CARE to Compare dataset and CARE-Score**:
 - Paper: CARE to Compare: A Real-World Benchmark Dataset for Early Fault Detection in Wind Turbine Data. Data. 2024; 9(12):138. https://doi.org/10.3390/data9120138
-- Dataset: Wind Turbine SCADA Data For Early Fault Detection. Zenodo, Mar. 2025, https://doi.org/10.5281/ZENODO.14958989.
+- Dataset: Wind Turbine SCADA Data For Early Fault Detection. Zenodo, Oct. 2024, https://doi.org/10.5281/ZENODO.14958989.
 
 **Transfer learning methods**:
 Transfer learning applications for autoencoder-based anomaly detection in wind turbines. Energy and AI. 2024;17:100373. https://doi.org/10.1016/j.egyai.2024.100373
6 changes: 6 additions & 0 deletions energy_fault_detector/config/config.py
@@ -98,6 +98,7 @@
     'train': {'type': 'dict', 'schema': TRAIN_SCHEMA, 'required': False, 'allow_unknown': True},
     'predict': {'type': 'dict', 'schema': PREDICT_SCHEMA, 'required': False},
     'root_cause_analysis': {'type': 'dict', 'schema': ROOT_CAUSE_ANALYSIS_SCHEMA, 'required': False},
+    'dtype': {'type': 'string', 'required': False, 'allowed': ['float32', 'float64']}
 }


@@ -203,3 +204,8 @@ def fit_threshold_on_val(self) -> bool:
     def verbose(self) -> int:
         """Verbosity Level of the Autoencoder."""
         return self.config_dict.get('train', {}).get('autoencoder', {}).get('verbose', 1)
+
+    @property
+    def dtype(self):
+        """Data type, float32 by default."""
+        return self.config_dict.get('dtype', 'float32')
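
For context, a minimal sketch of the new `dtype` option; the `Config` constructor is not shown in this diff, so the raw `config_dict` lookup below mirrors the property rather than the real API:

```python
# Illustrative config dict with the new optional top-level key; the schema
# above only allows 'float32' or 'float64'.
config_dict = {
    'train': {},           # existing sections unchanged
    'dtype': 'float64',
}
# Mirrors the dtype property: falls back to 'float32' when the key is absent
dtype = config_dict.get('dtype', 'float32')
assert dtype == 'float64'
```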
9 changes: 5 additions & 4 deletions energy_fault_detector/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""This module contains class templates for most of the anomaly detection classes, such as
autoencoders, anomaly scores, threshold selectors and data classes."""

from energy_fault_detector.core.anomaly_score import AnomalyScore
from energy_fault_detector.core.autoencoder import Autoencoder
from energy_fault_detector.core.data_transformer import DataTransformer
from energy_fault_detector.core.threshold_selector import ThresholdSelector
from .anomaly_score import AnomalyScore
from .autoencoder import Autoencoder
from .data_transformer import DataTransformer
from .threshold_selector import ThresholdSelector
from .fault_detection_result import FaultDetectionResult, ModelMetadata
17 changes: 9 additions & 8 deletions energy_fault_detector/core/_logs.py
@@ -1,34 +1,35 @@
 """Logging settings"""
 
 import os
+from pathlib import Path
 import logging.config as logging_config
 
 import yaml
 
 
-def setup_logging(default_path: str = 'logging.yaml', env_key: str = 'LOG_CFG') -> None:
+def setup_logging(default_path: str | Path = 'logging.yaml', env_key: str = 'LOG_CFG') -> None:
     """Setup logging configuration
 
     Args:
-        default_path (str): default logging configuration file. Default is 'logging.yaml'
+        default_path (str or Path): default logging configuration file. Default is 'logging.yaml'
         env_key (str): Environment variable holding logging config file path (overrides default_path). Default is
             'LOG_CFG'
     """
 
-    path = default_path
+    path = Path(default_path)
     value = os.getenv(env_key, None)
     if value:
-        path = value
+        path = Path(value)
 
     try:
         with open(path, 'rt', encoding='utf-8') as f:
             config = yaml.safe_load(f.read())
         # check paths exist or create them:
         for _, handler in config['handlers'].items():
-            if handler.get('filename'):
-                dirname = os.path.dirname(handler['filename'])
-                if dirname != '' and not os.path.exists(dirname):
-                    os.makedirs(dirname)
+            filename = handler.get('filename')
+            if filename:
+                # Resolve path and create parent directories if they don't exist
+                Path(filename).parent.mkdir(parents=True, exist_ok=True)
 
         logging_config.dictConfig(config)
     except Exception as e:
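
For context, a usage sketch of the updated `setup_logging`; the config paths are illustrative. The `LOG_CFG` environment variable still takes precedence over `default_path`, and both now accept `str` or `Path`:

```python
import os
from pathlib import Path

from energy_fault_detector.core._logs import setup_logging

os.environ['LOG_CFG'] = 'configs/logging.yaml'    # hypothetical override location
setup_logging(default_path=Path('logging.yaml'))  # ignored while LOG_CFG is set
```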
19 changes: 11 additions & 8 deletions energy_fault_detector/core/fault_detection_model.py
@@ -2,9 +2,10 @@
 
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union, List, Tuple
+from typing import Optional, Union, List, Tuple
 import logging
 from datetime import datetime
+from pathlib import Path
 
 import pandas as pd
 import numpy as np
@@ -16,10 +17,10 @@
 from energy_fault_detector.core.model_factory import ModelFactory
 from energy_fault_detector.core.fault_detection_result import ModelMetadata, FaultDetectionResult
 from energy_fault_detector.data_preprocessing import DataPreprocessor
-from energy_fault_detector._logs import setup_logging
+from energy_fault_detector.core._logs import setup_logging
 from energy_fault_detector.data_splitting.data_splitter import BlockDataSplitter
 
-setup_logging(os.path.join(os.path.dirname(__file__), '..', 'logging.yaml'))
+setup_logging(Path(__file__).parent.parent / 'logging.yaml')
 logger = logging.getLogger('energy_fault_detector')
 
 DATA_PREP_DIR = 'data_preprocessor'
@@ -28,6 +29,8 @@
 SCORE_DIR = 'anomaly_score'
 
 DataType = Union[pd.DataFrame, np.ndarray, List]
+PathLike = Union[str, Path]
+ModelPart = Union[DataPreprocessor, Autoencoder, AnomalyScore, ThresholdSelector]
 
 
 class NoTrainingData(Exception):
@@ -50,9 +53,9 @@ class FaultDetectionModel(ABC):
         save_timestamps: a list of string timestamps, indicating when the model was saved.
     """
 
-    def __init__(self, config: Optional[Config] = None, model_directory: str = 'models'):
+    def __init__(self, config: Optional[Config] = None, model_directory: PathLike = 'models'):
         self.config: Optional[Config] = config
-        self.model_directory: str = model_directory
+        self.model_directory: PathLike = model_directory
 
         self.anomaly_score: Optional[AnomalyScore] = None
         self.autoencoder: Optional[Autoencoder] = None
@@ -191,11 +194,11 @@ def save_models(self, model_name: Union[str, int] = None, overwrite: bool = Fals
 
         return os.path.abspath(model_dir), current_datetime
 
-    def load_models(self, model_path: str) -> None:
+    def load_models(self, model_path: PathLike) -> None:
         """Load saved models given the model path.
 
         Args:
-            model_path: Path to the model files.
+            model_path (str, Path): Path to the model files.
         """
 
         data_prep_dir = os.path.join(model_path, DATA_PREP_DIR)
@@ -221,7 +224,7 @@ def load_models(self, model_path: str) -> None:
         self._model_factory = ModelFactory(self.config)
 
     @staticmethod
-    def _load_pickled_model(model_type: str, model_directory: str):
+    def _load_pickled_model(model_type: str, model_directory: str) -> ModelPart:
         """Load a pickled model of given type, using file name (which is the class name)."""
         model_class_name = os.listdir(model_directory)[0].split('.')[0]
         if model_type != 'data_preprocessor':
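
For context, a sketch of the relaxed path handling; `MyDetector` and `config` are hypothetical stand-ins for a concrete `FaultDetectionModel` subclass and its `Config`:

```python
from pathlib import Path

# Hypothetical concrete subclass; str still works for model_directory too
detector = MyDetector(config=config, model_directory=Path('models'))

# save_models returns the absolute model directory and a save timestamp
model_dir, saved_at = detector.save_models(model_name='baseline')

# load_models now accepts the PathLike alias (str or Path)
detector.load_models(Path(model_dir))
```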
96 changes: 82 additions & 14 deletions energy_fault_detector/core/fault_detection_result.py
@@ -1,11 +1,13 @@
 
-import os
 from typing import Optional, List
 from dataclasses import dataclass
+from pathlib import Path
 
 import pandas as pd
 import numpy as np
 
+from ..utils.analysis import calculate_criticality
+
 
 @dataclass
 class FaultDetectionResult:
@@ -27,43 +29,109 @@ class FaultDetectionResult:
     """DataFrame with ARCANA results (ARCANA bias). None if ARCANA was not run."""
 
     arcana_losses: Optional[pd.DataFrame] = None
-    """DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run."""
+    """DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run.
+    Empty if losses were not tracked."""
 
     tracked_bias: Optional[List[pd.DataFrame]] = None
-    """List of DataFrames containing the ARCANA bias every 50th iteration. None if ARCANA was not run."""
+    """List of DataFrames containing the ARCANA bias every 50th iteration. None if ARCANA was not run.
+    Empty if bias was not tracked."""
+
+    def criticality(self, normal_idx: pd.Series | None = None, init_criticality: int = 0, max_criticality: int = 1000
+                    ) -> pd.Series:
+        """Criticality based on the predicted anomalies.
+
+        Args:
+            normal_idx (pd.Series, optional): A pandas Series with boolean values indicating normal operation, indexed
+                by timestamp. Ignored if None.
+            init_criticality (int, optional): The initial criticality value. Defaults to 0.
+            max_criticality (int, optional): The maximum criticality value. Defaults to 1000.
+
+        """
+        return calculate_criticality(self.predicted_anomalies, normal_idx, init_criticality, max_criticality)
 
-    def save(self, directory: str, **kwargs) -> None:
+    def save(self, directory: str | Path, **kwargs) -> None:
         """Saves the results to CSV files in the specified directory.
 
         Args:
             directory (str): The directory where the CSV files will be saved.
-            kwargs: other keywords args for `pd.DataFrame.to_csv`
+            kwargs: other keywords args for `pd.DataFrame.to_csv` (i.e. sep=',')
         """
         # Ensure the directory exists
-        os.makedirs(directory, exist_ok=True)
+        directory = Path(directory)
+        directory.mkdir(exist_ok=True, parents=True)
 
         # Save each DataFrame as a CSV file
-        self.predicted_anomalies.to_csv(os.path.join(directory, 'predicted_anomalies.csv'), **kwargs)
-        self.reconstruction.to_csv(os.path.join(directory, 'reconstruction.csv'), **kwargs)
-        self.recon_error.to_csv(os.path.join(directory, 'reconstruction_errors.csv'), **kwargs)
-        self.anomaly_score.to_csv(os.path.join(directory, 'anomaly_scores.csv'), **kwargs)
+        self.predicted_anomalies.to_csv(directory / 'predicted_anomalies.csv', **kwargs)
+        self.reconstruction.to_csv(directory / 'reconstruction.csv', **kwargs)
+        self.recon_error.to_csv(directory / 'reconstruction_errors.csv', **kwargs)
+        self.anomaly_score.to_csv(directory / 'anomaly_scores.csv', **kwargs)
 
         if self.bias_data is not None:
-            self.bias_data.to_csv(os.path.join(directory, 'bias_data.csv'), **kwargs)
+            self.bias_data.to_csv(directory / 'bias_data.csv', **kwargs)
 
         if self.arcana_losses is not None:
-            self.arcana_losses.to_csv(os.path.join(directory, 'arcana_losses.csv'), **kwargs)
+            self.arcana_losses.to_csv(directory / 'arcana_losses.csv', **kwargs)
 
         if self.tracked_bias is not None and len(self.tracked_bias) > 0:
             for idx, bias_df in enumerate(self.tracked_bias):
-                bias_df.to_csv(os.path.join(directory, f'tracked_bias_{idx}.csv'), **kwargs)
+                bias_df.to_csv(directory / f'tracked_bias_{idx}.csv', **kwargs)
+
+    @classmethod
+    def load(cls, directory: str | Path, **kwargs) -> "FaultDetectionResult":
+        """Loads the results from CSV files in the specified directory.
+
+        Args:
+            directory (str | Path): The directory where the CSV files are stored.
+            kwargs: other keywords args for `pd.read_csv` (e.g., sep=',')
+
+        Returns:
+            FaultDetectionResult: The loaded result object.
+        """
+        directory = Path(directory)
+
+        # Default pandas loading arguments to ensure indices are restored correctly
+        params = {'index_col': 0, 'parse_dates': True}
+        params.update(kwargs)
+
+        # Load mandatory fields
+        predicted_anomalies = pd.read_csv(directory / 'predicted_anomalies.csv', **params).iloc[:, 0]
+        # Ensure predicted_anomalies is explicitly a Series and boolean
+        predicted_anomalies = predicted_anomalies.astype(bool)
+
+        reconstruction = pd.read_csv(directory / 'reconstruction.csv', **params)
+        recon_error = pd.read_csv(directory / 'reconstruction_errors.csv', **params)
+        anomaly_score = pd.read_csv(directory / 'anomaly_scores.csv', **params).iloc[:, 0]
+
+        # Load optional fields if they exist
+        bias_data = None
+        if (directory / 'bias_data.csv').exists():
+            bias_data = pd.read_csv(directory / 'bias_data.csv', **params)
+
+        arcana_losses = None
+        if (directory / 'arcana_losses.csv').exists():
+            arcana_losses = pd.read_csv(directory / 'arcana_losses.csv', **params)
+
+        tracked_bias = None
+        tracked_files = sorted(directory.glob('tracked_bias_*.csv'))
+        if tracked_files:
+            tracked_bias = [pd.read_csv(f, **params) for f in tracked_files]
+
+        return cls(
+            predicted_anomalies=predicted_anomalies,
+            reconstruction=reconstruction,
+            recon_error=recon_error,
+            anomaly_score=anomaly_score,
+            bias_data=bias_data,
+            arcana_losses=arcana_losses,
+            tracked_bias=tracked_bias
+        )


 @dataclass
 class ModelMetadata:
     """Class to encapsulate metadata about the FaultDetector model."""
 
     model_date: str
-    model_path: str
+    model_path: str | Path
     train_recon_error: np.ndarray
     val_recon_error: Optional[np.ndarray] = None
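
For context, a round-trip sketch for the new `load` classmethod; the toy data is illustrative, only the field and file names come from the code above:

```python
import numpy as np
import pandas as pd

from energy_fault_detector.core.fault_detection_result import FaultDetectionResult

idx = pd.date_range('2024-01-01', periods=4, freq='h')
result = FaultDetectionResult(
    predicted_anomalies=pd.Series([False, False, True, True], index=idx),
    reconstruction=pd.DataFrame({'sensor_a': np.zeros(4)}, index=idx),
    recon_error=pd.DataFrame({'sensor_a': np.ones(4)}, index=idx),
    anomaly_score=pd.Series(np.linspace(0.0, 1.0, 4), index=idx),
)
result.save('results_dir')                           # writes one CSV per field
restored = FaultDetectionResult.load('results_dir')  # indices parsed back as dates
assert restored.predicted_anomalies.equals(result.predicted_anomalies)
```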
5 changes: 3 additions & 2 deletions energy_fault_detector/evaluation/__init__.py
@@ -1,4 +1,5 @@
 """Evaluation classes and methods, including the CARE-Score and Care2CompareDataset."""
 
-from energy_fault_detector.evaluation.care_score import CAREScore
-from energy_fault_detector.evaluation.care2compare import Care2CompareDataset
+from .care_score import CAREScore
+from .care2compare import Care2CompareDataset
+from .predist_dataset import PreDistDataset
3 changes: 0 additions & 3 deletions energy_fault_detector/evaluation/care2compare.py
@@ -18,9 +18,6 @@ class Care2CompareDataset:
     The data can be downloaded either manually from https://doi.org/10.5281/zenodo.14958989 (in this case specify
     `path`) or it can be downloaded automatically by setting download_dataset to True.
 
-    All data is loaded into memory, which might be problematic for large datasets (consider using DataLoader classes of
-    TensorFlow and PyTorch in that case).
-
     By default, only the averages are read. See statistics argument of the data loading methods.
 
     Method overview:
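
For context, a usage sketch based on the docstring above; only `path` and `download_dataset` appear in this diff, so treat the exact constructor signature as an assumption:

```python
from energy_fault_detector.evaluation import Care2CompareDataset

# Point at a manual download from Zenodo...
ds = Care2CompareDataset(path='data/care2compare')
# ...or let the class fetch the dataset automatically
ds = Care2CompareDataset(download_dataset=True)
```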