From b09c63ac05b964559ce88671fd1a9c019dddbec8 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Sun, 5 Oct 2025 07:10:16 -0700 Subject: [PATCH 01/42] Add mode switching and metadata handling to quick CLI --- README.md | 26 ++++++-- energy_fault_detector/main.py | 37 ++++++++++- .../quick_fault_detector.py | 63 ++++++++++++++----- 3 files changed, 104 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 289a4ee..ad29e2c 100644 --- a/README.md +++ b/README.md @@ -36,14 +36,30 @@ To install the `energy-fault-detector` package, run: `pip install energy-fault-d ## Quick fault detection -For a quick demo on a specific dataset, run: +The `quick_fault_detector` CLI now supports dedicated training and prediction workflows: -```quick_fault_detector ``` +- **Train and evaluate a model** (default mode). This trains a new autoencoder, evaluates it on the provided + test slice, and reports where the model artefacts were stored. -For more options, run ```quick_fault_detector -h```. + ```bash + quick_fault_detector --mode train [--options options.yaml] + ``` -For an example using one of the CARE2Compare datasets, run: -```quick_fault_detector --c2c_example``` +- **Run predictions with an existing model**. Supply the dataset to score alongside the directory that contains the + saved model files returned from a previous training run. + + ```bash + quick_fault_detector --mode predict --model_path [--options options.yaml] + ``` + + The `--model_path` argument is mandatory in predict mode. + +Prediction artefacts (anomaly scores, reconstructions, and detected events) are written to the directory specified by +`--results_dir` (defaults to `./results`). For an example using one of the CARE2Compare datasets, run: + +```bash +quick_fault_detector --c2c_example +``` For more information, have a look at the notebook [Quick Failure Detection](./notebooks/Example%20-%20Quick%20Failure%20Detection.ipynb) diff --git a/energy_fault_detector/main.py b/energy_fault_detector/main.py index 3f379dd..f47ca61 100644 --- a/energy_fault_detector/main.py +++ b/energy_fault_detector/main.py @@ -92,6 +92,19 @@ def main(): help='Path to a directory where results will be saved.', default='results' ) + parser.add_argument( + '--mode', + type=str, + choices=['train', 'predict'], + default='train', + help="Run in 'train' mode to fit a new model or 'predict' to load an existing one." + ) + parser.add_argument( + '--model_path', + type=str, + help='Path to a previously saved model directory (required for predict mode).', + default=None + ) parser.add_argument( '--c2c_example', action='store_true', @@ -107,6 +120,7 @@ def main(): logger.info(f"Options YAML: {args.options}") logger.info(f"Results Directory: {args.results_dir}") + logger.info(f"Execution Mode: {args.mode}") os.makedirs(args.results_dir, exist_ok=True) options = Options() # Initialize with default values @@ -117,10 +131,13 @@ def main(): print(options) + if args.mode == 'predict' and args.model_path is None: + parser.error('--model_path must be provided when --mode is set to predict.') + # Call the quick_fault_detector function with parsed arguments try: from .quick_fault_detection import quick_fault_detector - prediction_results, event_meta_data = quick_fault_detector( + prediction_results, event_meta_data, model_metadata = quick_fault_detector( csv_data_path=args.csv_data_path, csv_test_data_path=options.csv_test_data_path, train_test_column_name=options.train_test_column_name, @@ -135,10 +152,26 @@ def main(): enable_debug_plots=options.enable_debug_plots, min_anomaly_length=options.min_anomaly_length, save_dir=args.results_dir, + mode=args.mode, + model_path=args.model_path, ) logger.info(f'Fault detection completed. Results are saved in {args.results_dir}.') prediction_results.save(args.results_dir) - event_meta_data.to_csv(os.path.join(args.results_dir, 'events.csv'), index=False) + predicted_anomalies_path = os.path.join(args.results_dir, 'predicted_anomalies.csv') + logger.info('Prediction data stored at %s.', predicted_anomalies_path) + events_path = os.path.join(args.results_dir, 'events.csv') + event_meta_data.to_csv(events_path, index=False) + logger.info('Event metadata stored at %s.', events_path) + + anomaly_count = int(prediction_results.predicted_anomalies['anomaly'].sum()) + logger.info('Detected %d anomalous timestamps grouped into %d events.', anomaly_count, + len(event_meta_data)) + + if model_metadata is not None and model_metadata.model_path: + logger.info('Trained model saved to %s.', model_metadata.model_path) + print(f"Model saved to: {model_metadata.model_path}") + elif args.mode == 'predict' and args.model_path is not None: + logger.info('Predictions generated using model at %s.', args.model_path) except Exception as e: logger.error(f'An error occurred: {e}') diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index d226341..ba60d73 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -3,6 +3,10 @@ import os import logging from typing import List, Optional, Tuple +try: + from typing import Literal +except ImportError: # pragma: no cover - for Python<3.8 compatibility + from typing_extensions import Literal import pandas as pd @@ -10,7 +14,7 @@ from energy_fault_detector.fault_detector import FaultDetector from energy_fault_detector.utils.analysis import create_events from energy_fault_detector.root_cause_analysis.arcana_utils import calculate_mean_arcana_importances -from energy_fault_detector.core.fault_detection_result import FaultDetectionResult +from energy_fault_detector.core.fault_detection_result import FaultDetectionResult, ModelMetadata from energy_fault_detector.quick_fault_detection.data_loading import load_train_test_data from energy_fault_detector.quick_fault_detection.configuration import select_config @@ -27,8 +31,10 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = status_label_confidence_percentage: Optional[float] = 0.95, features_to_exclude: Optional[List[str]] = None, angle_features: Optional[List[str]] = None, automatic_optimization: bool = True, enable_debug_plots: bool = False, - min_anomaly_length: int = 18, save_dir: Optional[str] = None - ) -> Tuple[FaultDetectionResult, pd.DataFrame]: + min_anomaly_length: int = 18, save_dir: Optional[str] = None, + mode: Literal['train', 'predict'] = 'train', + model_path: Optional[str] = None + ) -> Tuple[FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]]: """Analyzes provided data using an autoencoder based approach for identifying anomalies based on a learned normal behavior. Anomalies are then aggregated to events and further analyzed. Runs the entire fault detection module chain in one function call. Sections of this function call are: @@ -81,9 +87,13 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = the FaultDetector threshold) to define an anomaly event. save_dir (Optional[str]): Directory to save the output plots. If not provided, the plots are not saved. Defaults to None. + mode (Literal['train', 'predict']): Determines whether the detector should train a new model or load an + existing one for prediction. Defaults to 'train'. + model_path (Optional[str]): Path to a previously saved model directory. Required when mode='predict'. Returns: - Tuple(FaultDetectionResult, pd.DataFrame): FaultDetectionResult object with the following DataFrames: + Tuple(FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]): FaultDetectionResult object with the + following DataFrames: - predicted_anomalies: DataFrame with a column 'anomaly' (bool). - reconstruction: DataFrame with reconstruction of the sensor data with timestamp as index. @@ -94,8 +104,17 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = - tracked_bias: List of DataFrames. None if ARCANA was not run. and the detected anomaly events as dataframe. + + When run in training mode, the tuple additionally contains ModelMetadata with information about the saved + model. In prediction mode, the metadata element is None. """ - logger.info('Starting Automated Energy Fault Detection and Identification.') + if mode not in {'train', 'predict'}: + raise ValueError(f"Unsupported mode '{mode}'. Please choose 'train' or 'predict'.") + + if mode == 'predict' and model_path is None: + raise ValueError('`model_path` must be provided when running in predict mode.') + + logger.info('Starting Automated Energy Fault Detection and Identification (mode=%s).', mode) logger.info('Loading Data...') train_data, train_normal_index, test_data = load_train_test_data(csv_data_path=csv_data_path, csv_test_data_path=csv_test_data_path, @@ -104,16 +123,30 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = time_column_name=time_column_name, status_data_column_name=status_data_column_name, status_mapping=status_mapping) - logger.info('Selecting suitable config...') - config = select_config(train_data=train_data, normal_index=train_normal_index, - status_label_confidence_percentage=status_label_confidence_percentage, - features_to_exclude=features_to_exclude, angles=angle_features, - automatic_optimization=automatic_optimization) - logger.info('Training a Normal behavior model.') - anomaly_detector = FaultDetector(config=config) - anomaly_detector.fit(sensor_data=train_data, normal_index=train_normal_index) + model_metadata: Optional[ModelMetadata] = None + + if mode == 'train': + logger.info('Selecting suitable config...') + config = select_config(train_data=train_data, normal_index=train_normal_index, + status_label_confidence_percentage=status_label_confidence_percentage, + features_to_exclude=features_to_exclude, angles=angle_features, + automatic_optimization=automatic_optimization) + logger.info('Training a Normal behavior model.') + anomaly_detector = FaultDetector(config=config) + model_metadata = anomaly_detector.fit(sensor_data=train_data, normal_index=train_normal_index) + if model_metadata.model_path: + logger.info('Saved trained model to %s.', model_metadata.model_path) + else: + logger.info('Model was trained but not saved to disk.') + root_cause_analysis = False + else: + logger.info('Loading pre-trained model from %s.', model_path) + anomaly_detector = FaultDetector() + anomaly_detector.load_models(model_path=model_path) + root_cause_analysis = True + logger.info('Evaluating Test data based on the learned normal behavior.') - prediction_results = anomaly_detector.predict(sensor_data=test_data, root_cause_analysis=False) + prediction_results = anomaly_detector.predict(sensor_data=test_data, root_cause_analysis=root_cause_analysis) anomalies = prediction_results.predicted_anomalies['anomaly'] # Find anomaly events event_meta_data, event_data_list = create_events(sensor_data=test_data, @@ -137,7 +170,7 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = test_data=test_data, arcana_losses=arcana_loss_list, arcana_mean_importances=arcana_mean_importance_list, event_meta_data=event_meta_data, save_dir=save_dir) - return prediction_results, event_meta_data + return prediction_results, event_meta_data, model_metadata def analyze_event(anomaly_detector: FaultDetector, event_data: pd.DataFrame, track_losses: bool From 92c4b6235e7da298763b937fe710bc8db5375b4a Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 6 Oct 2025 01:38:51 -0700 Subject: [PATCH 02/42] Add unsupervised behaviour model for sensor anomalies --- .../utils/unsupervised_behavior_model.py | 328 ++++++++++++++++++ .../utils/test_unsupervised_behavior_model.py | 67 ++++ 2 files changed, 395 insertions(+) create mode 100644 energy_fault_detector/utils/unsupervised_behavior_model.py create mode 100644 tests/utils/test_unsupervised_behavior_model.py diff --git a/energy_fault_detector/utils/unsupervised_behavior_model.py b/energy_fault_detector/utils/unsupervised_behavior_model.py new file mode 100644 index 0000000..64055a3 --- /dev/null +++ b/energy_fault_detector/utils/unsupervised_behavior_model.py @@ -0,0 +1,328 @@ +"""Unsupervised behaviour modelling for wind turbine sensor data. + +This module provides a lightweight, fully unsupervised workflow that can be +used when only raw sensor logs of a single wind turbine are available. The +``UnsupervisedBehaviorModel`` learns robust per-sensor operating ranges and a +multivariate anomaly detector (Isolation Forest) that jointly flag +observations deviating from historic patterns. It can be reused for any wind +farm by passing a :class:`pandas.DataFrame` with numerical sensor columns. + +Example +------- +>>> import pandas as pd +>>> from energy_fault_detector.utils.unsupervised_behavior_model import ( +... UnsupervisedBehaviorModel, +... ) +>>> # sensor logs of a single turbine (no labels required) +>>> df = pd.DataFrame({"power": [1.1, 1.2, 5.5], "wind_speed": [7.5, 7.4, 25]}) +>>> model = UnsupervisedBehaviorModel().fit(df) +>>> model.get_normal_ranges() + lower median upper +power ... ... ... +wind_speed ... ... ... +>>> predictions = model.predict(df) +>>> predictions["is_anomaly"].tolist() +[False, False, True] + +The implementation purposely favours transparency over complex modelling so +that domain experts can inspect the inferred ranges and understand why a +record has been labelled anomalous. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from os import PathLike +from typing import Dict, List, Optional, Sequence, Tuple + +import numpy as np +import pandas as pd +from sklearn.ensemble import IsolationForest +from sklearn.preprocessing import RobustScaler + + +@dataclass(frozen=True) +class SensorRange: + """Container for the inferred normal operating range of a sensor.""" + + lower: float + upper: float + + def contains(self, values: pd.Series) -> pd.Series: + """Return a boolean series indicating which values fall inside the range.""" + + return (values >= self.lower) & (values <= self.upper) + + +class UnsupervisedBehaviorModel: + """Identify anomalous sensor behaviour without labelled data. + + The model combines per-sensor robust statistics with an Isolation Forest to + flag outliers. Per-sensor ranges are derived from empirical quantiles and an + inter-quantile margin so the approach remains interpretable. The Isolation + Forest captures multivariate anomalies that may not breach individual + thresholds but still indicate abnormal behaviour when considering the + sensors jointly. + + Parameters + ---------- + contamination: + Expected fraction of anomalies in the historic data. This is forwarded + to the Isolation Forest. + quantile_bounds: + Low/high quantiles that represent typical behaviour. A Tukey-style + whisker is added on top of this interval to soften the range. + random_state: + Optional seed for deterministic Isolation Forest behaviour. + """ + + def __init__( + self, + contamination: float = 0.05, + quantile_bounds: Tuple[float, float] = (0.01, 0.99), + random_state: Optional[int] = 42, + ) -> None: + if not 0 < contamination < 0.5: + raise ValueError("`contamination` must be between 0 and 0.5.") + + low_q, high_q = quantile_bounds + if not 0.0 < low_q < high_q < 1.0: + raise ValueError( + "`quantile_bounds` must satisfy 0 < low_quantile < high_quantile < 1." + ) + + self.contamination: float = contamination + self.quantile_bounds: Tuple[float, float] = (low_q, high_q) + self.random_state: Optional[int] = random_state + + self._scaler: Optional[RobustScaler] = None + self._model: Optional[IsolationForest] = None + self._feature_names: Optional[pd.Index] = None + self._normal_ranges: Dict[str, SensorRange] = {} + self._feature_medians: Optional[pd.Series] = None + + @staticmethod + def _select_numeric_features(data: pd.DataFrame) -> pd.DataFrame: + """Return only numeric columns, coercing boolean values to integers.""" + + numeric = data.select_dtypes(include=[np.number, "bool"]).copy() + for column in numeric.columns: + if numeric[column].dtype == bool: + numeric[column] = numeric[column].astype(float) + return numeric.apply(pd.to_numeric, errors="coerce") + + def fit(self, data: pd.DataFrame) -> "UnsupervisedBehaviorModel": + """Learn per-sensor ranges and fit the Isolation Forest. + + Parameters + ---------- + data: + A DataFrame containing historic sensor readings for a single wind + turbine. Only numeric columns are used. Missing values are imputed + with the column medians. + """ + + numeric_data = self._select_numeric_features(data) + if numeric_data.empty: + raise ValueError("`data` must contain at least one numeric column.") + + self._feature_names = numeric_data.columns + self._feature_medians = numeric_data.median() + + self._normal_ranges = {} + low_q, high_q = self.quantile_bounds + for column in self._feature_names: + series = numeric_data[column].dropna() + if series.empty: + # Degenerate column, fall back to the median and allow a narrow band. + median_value = self._feature_medians[column] + margin = max(abs(median_value) * 0.05, 1e-9) + self._normal_ranges[column] = SensorRange( + lower=median_value - margin, + upper=median_value + margin, + ) + continue + + low = float(series.quantile(low_q)) + high = float(series.quantile(high_q)) + iqr = high - low + if np.isclose(iqr, 0.0): + std = float(series.std(ddof=0)) + margin = 3.0 * std if not np.isclose(std, 0.0) else max( + abs(series.iloc[0]) * 0.05, + 1e-9, + ) + else: + margin = 1.5 * iqr + + self._normal_ranges[column] = SensorRange(lower=low - margin, upper=high + margin) + + filled_data = numeric_data.fillna(self._feature_medians) + self._scaler = RobustScaler().fit(filled_data) + scaled = self._scaler.transform(filled_data) + + self._model = IsolationForest( + contamination=self.contamination, + random_state=self.random_state, + ) + self._model.fit(scaled) + + return self + + @property + def feature_names(self) -> pd.Index: + if self._feature_names is None: + raise RuntimeError("The model has not been fitted yet.") + return self._feature_names + + def get_normal_ranges(self) -> pd.DataFrame: + """Return the learnt normal operating ranges for each sensor.""" + + if self._feature_medians is None: + raise RuntimeError("The model has not been fitted yet.") + + records = [] + for name in self.feature_names: + sensor_range = self._normal_ranges[name] + records.append( + { + "sensor": name, + "lower": sensor_range.lower, + "median": self._feature_medians[name], + "upper": sensor_range.upper, + } + ) + return pd.DataFrame.from_records(records).set_index("sensor") + + def _prepare_prediction_frame(self, data: pd.DataFrame) -> pd.DataFrame: + if self._feature_medians is None: + raise RuntimeError("The model has not been fitted yet.") + + numeric_data = self._select_numeric_features(data) + missing_columns = [col for col in self.feature_names if col not in numeric_data] + for column in missing_columns: + numeric_data[column] = self._feature_medians[column] + numeric_data = numeric_data[self.feature_names] + return numeric_data.fillna(self._feature_medians) + + def predict(self, data: pd.DataFrame) -> pd.DataFrame: + """Label observations as normal or anomalous. + + Parameters + ---------- + data: + DataFrame with the same sensor columns that were used during + :meth:`fit`. Additional columns are ignored. The returned DataFrame + contains a boolean ``is_anomaly`` column together with diagnostic + information. + """ + + if self._model is None or self._scaler is None: + raise RuntimeError("The model has not been fitted yet.") + + numeric_data = self._prepare_prediction_frame(data) + scaled = self._scaler.transform(numeric_data) + + anomaly_scores = -self._model.score_samples(scaled) + isolation_flags = self._model.predict(scaled) == -1 + + # Range violations per sensor. + range_flags = {} + for column in self.feature_names: + sensor_range = self._normal_ranges[column] + range_flags[column] = ~sensor_range.contains(numeric_data[column]) + range_violation_df = pd.DataFrame(range_flags) + violation_counts = range_violation_df.sum(axis=1) + + def violated_columns(row: pd.Series) -> List[str]: + return row.index[row].tolist() + + violations = range_violation_df.apply(violated_columns, axis=1) + + result = pd.DataFrame(index=data.index) + result["anomaly_score"] = anomaly_scores + result["isolation_forest_flag"] = isolation_flags + result["range_violation_count"] = violation_counts + result["violated_sensors"] = violations + result["is_anomaly"] = isolation_flags | (violation_counts > 0) + + return result + + def explain(self, data: pd.DataFrame) -> pd.DataFrame: + """Return prediction results joined with the original sensor values. + + This helper simplifies reporting by concatenating ``predict`` output and + the respective sensor readings for quick inspection. + """ + + prediction = self.predict(data) + numeric_data = self._prepare_prediction_frame(data) + return pd.concat([numeric_data, prediction], axis=1) + + +def aggregate_wind_farm_data( + dataset_root: str | PathLike[str], + wind_farm: str = "B", + statistics: Optional[Sequence[str]] = None, + index_column: str = "id", +) -> pd.DataFrame: + """Aggregate Care-to-Compare data for a specific wind farm. + + The Care-to-Compare (C2C) dataset is organised per anomaly event. For + unsupervised modelling it is convenient to concatenate all prediction + segments into a single DataFrame. The function keeps only sensor columns and + drops the metadata columns. Normal/anomalous status labels are intentionally + ignored to emulate a purely unsupervised scenario. + + Parameters + ---------- + dataset_root: + Root path that contains the extracted C2C dataset. + wind_farm: + Select which wind farm ("A", "B" or "C") to load. + statistics: + Optional statistics suffixes passed to :class:`Care2CompareDataset` to + control which sensor aggregates are loaded. + index_column: + Column used as index when reading the CSV files ("id" or "time_stamp"). + """ + + from energy_fault_detector.evaluation.care2compare import Care2CompareDataset + + dataset = Care2CompareDataset(dataset_root) + frames: List[pd.DataFrame] = [] + if statistics is None: + stat_list: Optional[List[str]] = None + elif isinstance(statistics, str): + stat_list = [statistics] + else: + stat_list = list(statistics) + + for (event_data, _event_id) in dataset.iter_datasets( + wind_farm=wind_farm, test_only=True, statistics=stat_list, + index_column=index_column, + ): + sensor_data = event_data.drop( + columns=[ + column + for column in ["asset_id", "id", "time_stamp", "status_type_id", "train_test"] + if column in event_data.columns + ], + errors="ignore", + ) + frames.append(sensor_data) + + if not frames: + raise ValueError( + "No datasets were found. Verify that `dataset_root` points to the extracted Care-to-Compare data." + ) + + return pd.concat(frames, axis=0) + + +__all__ = [ + "SensorRange", + "UnsupervisedBehaviorModel", + "aggregate_wind_farm_data", +] + diff --git a/tests/utils/test_unsupervised_behavior_model.py b/tests/utils/test_unsupervised_behavior_model.py new file mode 100644 index 0000000..62b3fc7 --- /dev/null +++ b/tests/utils/test_unsupervised_behavior_model.py @@ -0,0 +1,67 @@ +"""Tests for the unsupervised behaviour model utility.""" + +from __future__ import annotations + +import pytest + +np = pytest.importorskip("numpy") +pd = pytest.importorskip("pandas") + +from energy_fault_detector.utils.unsupervised_behavior_model import ( + SensorRange, + UnsupervisedBehaviorModel, +) + + +def _create_synthetic_data(seed: int = 7) -> pd.DataFrame: + rng = np.random.default_rng(seed) + base = pd.DataFrame( + { + "power": rng.normal(loc=1.0, scale=0.05, size=512), + "wind_speed": rng.normal(loc=12.0, scale=0.5, size=512), + } + ) + return base + + +def test_sensor_range_contains_works_for_bounds() -> None: + sensor_range = SensorRange(lower=0.0, upper=10.0) + series = pd.Series([-1.0, 0.0, 5.0, 10.0, 11.0]) + + result = sensor_range.contains(series) + + np.testing.assert_array_equal(result.values, np.array([False, True, True, True, False])) + + +def test_fit_computes_normal_ranges() -> None: + data = _create_synthetic_data() + + model = UnsupervisedBehaviorModel(contamination=0.05, quantile_bounds=(0.05, 0.95), random_state=1) + model.fit(data) + normal_ranges = model.get_normal_ranges() + + assert list(normal_ranges.index) == ["power", "wind_speed"] + # Verify that medians are close to the expected centre point. + assert abs(normal_ranges.loc["power", "median"] - 1.0) < 0.01 + assert abs(normal_ranges.loc["wind_speed", "median"] - 12.0) < 0.1 + + +def test_predict_flags_outliers() -> None: + data = _create_synthetic_data() + model = UnsupervisedBehaviorModel(contamination=0.05, quantile_bounds=(0.05, 0.95), random_state=1) + model.fit(data) + + evaluation = pd.DataFrame( + { + "power": [1.02, 3.0], + "wind_speed": [12.1, 5.0], + } + ) + + predictions = model.predict(evaluation) + + assert predictions.loc[0, "is_anomaly"] is False + assert predictions.loc[1, "is_anomaly"] is True + assert predictions.loc[1, "range_violation_count"] >= 1 + assert "wind_speed" in predictions.loc[1, "violated_sensors"] + From 1e4fb379e7ec5d6c10a02f2ed3269c7c39b586f1 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 6 Oct 2025 03:56:03 -0700 Subject: [PATCH 03/42] Add asset dataset splitting utility --- .../utils/asset_dataset_splitter.py | 183 ++++++++++++++++++ tests/utils/test_asset_dataset_splitter.py | 48 +++++ 2 files changed, 231 insertions(+) create mode 100644 energy_fault_detector/utils/asset_dataset_splitter.py create mode 100644 tests/utils/test_asset_dataset_splitter.py diff --git a/energy_fault_detector/utils/asset_dataset_splitter.py b/energy_fault_detector/utils/asset_dataset_splitter.py new file mode 100644 index 0000000..47761b9 --- /dev/null +++ b/energy_fault_detector/utils/asset_dataset_splitter.py @@ -0,0 +1,183 @@ +"""Utility for splitting asset datasets into train and prediction CSV files. + +The module provides a command line interface that accepts the path to a +directory containing ``.csv`` files separated by semicolons. Each file is +expected to contain records for one or more assets identified by the +``asset_id`` column. + +The script creates two output files per asset: + +``train_.csv`` + Contains all records marked as ``train`` in the ``train_test`` column and + all records whose ``status_type_id`` is either 0 or 2. + +``predict_.csv`` + Contains all records whose ``status_type_id`` is 1, 3, 4 or 5 and all + records whose ``train_test`` value is ``prediction`` regardless of the + ``status_type_id`` value. + +Before saving, the script removes helper columns as well as columns that +contain ``_max``, ``_min`` or ``_std`` from the resulting files. +""" + +from __future__ import annotations + +import argparse +from collections import defaultdict +from pathlib import Path +from typing import Dict, Iterable, List + +import pandas as pd + + +STATUS_TYPE_NORMAL = {"0", "2"} +STATUS_TYPE_ANOMALY = {"1", "3", "4", "5"} +DROP_COLUMNS = { + "asset_id", + "train_test", + "train_test_bool", + "status_type_id", + "status_type_bool", +} +DROP_COLUMN_SUBSTRINGS = ("_max", "_min", "_std") + + +def _read_asset_frames(path: Path) -> Dict[str, List[pd.DataFrame]]: + """Read ``.csv`` files from ``path`` and group rows by ``asset_id``. + + Args: + path: Directory containing the source ``.csv`` files. + + Returns: + Mapping of asset identifier to a list of data frames containing rows + for that asset. + """ + + asset_frames: Dict[str, List[pd.DataFrame]] = defaultdict(list) + for csv_file in sorted(path.glob("*.csv")): + df = pd.read_csv(csv_file, sep=";", dtype=str) + if "asset_id" not in df.columns: + raise ValueError( + f"File '{csv_file}' does not contain required column 'asset_id'." + ) + + for asset_id, group in df.groupby("asset_id", sort=False): + asset_frames[str(asset_id)].append(group.reset_index(drop=True)) + + return asset_frames + + +def _combine_frames(frames: Iterable[pd.DataFrame]) -> pd.DataFrame: + """Combine a list of frames into a single data frame.""" + + return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame() + + +def _clean_columns(df: pd.DataFrame) -> pd.DataFrame: + """Remove helper columns and columns containing ``_max``/``_min``/``_std``.""" + + to_drop = [col for col in df.columns if col in DROP_COLUMNS] + to_drop.extend( + col for col in df.columns if any(substr in col for substr in DROP_COLUMN_SUBSTRINGS) + ) + return df.drop(columns=to_drop, errors="ignore") + + +def _get_lowercase_series(df: pd.DataFrame, column: str) -> pd.Series: + series = df.get(column) + if series is None: + return pd.Series("", index=df.index, dtype=object) + return series.fillna("").astype(str).str.lower() + + +def _get_status_series(df: pd.DataFrame) -> pd.Series: + series = df.get("status_type_id") + if series is None: + return pd.Series("", index=df.index, dtype=object) + return series.fillna("").astype(str) + + +def _build_train_frame(df: pd.DataFrame) -> pd.DataFrame: + """Filter rows belonging to the training split.""" + + train_series = _get_lowercase_series(df, "train_test") + status_series = _get_status_series(df) + + is_train = train_series == "train" + is_normal_status = status_series.isin(STATUS_TYPE_NORMAL) + return df[is_train | is_normal_status].copy() + + +def _build_predict_frame(df: pd.DataFrame) -> pd.DataFrame: + """Filter rows belonging to the prediction split.""" + + train_series = _get_lowercase_series(df, "train_test") + status_series = _get_status_series(df) + + is_prediction = train_series == "prediction" + is_anomaly_status = status_series.isin(STATUS_TYPE_ANOMALY) + return df[is_prediction | is_anomaly_status].copy() + + +def _save_asset_frames(asset_id: str, df: pd.DataFrame, output_dir: Path) -> None: + """Create train and prediction files for ``asset_id`` from ``df``.""" + + if df.empty: + return + + train_df = _clean_columns(_build_train_frame(df)) + predict_df = _clean_columns(_build_predict_frame(df)) + + if not train_df.empty: + train_path = output_dir / f"train_{asset_id}.csv" + train_df.to_csv(train_path, sep=";", index=False) + + if not predict_df.empty: + predict_path = output_dir / f"predict_{asset_id}.csv" + predict_df.to_csv(predict_path, sep=";", index=False) + + +def split_asset_datasets(input_dir: Path, output_dir: Path | None = None) -> None: + """Split datasets per asset into train and prediction CSV files. + + Args: + input_dir: Directory containing the source ``.csv`` files. + output_dir: Optional directory to store the results. When ``None`` the + input directory is used. + """ + + if not input_dir.is_dir(): + raise NotADirectoryError(f"Input directory '{input_dir}' does not exist or is not a directory") + + output_dir = output_dir or input_dir + output_dir.mkdir(parents=True, exist_ok=True) + + asset_frames = _read_asset_frames(input_dir) + for asset_id, frames in asset_frames.items(): + asset_df = _combine_frames(frames) + _save_asset_frames(asset_id, asset_df, output_dir) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Split asset datasets into train and prediction files") + parser.add_argument( + "input_dir", + type=Path, + help="Path to the directory containing the source CSV files", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=None, + help="Optional output directory. Defaults to the input directory.", + ) + return parser.parse_args() + + +def main() -> None: + args = _parse_args() + split_asset_datasets(args.input_dir, args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/tests/utils/test_asset_dataset_splitter.py b/tests/utils/test_asset_dataset_splitter.py new file mode 100644 index 0000000..9398e54 --- /dev/null +++ b/tests/utils/test_asset_dataset_splitter.py @@ -0,0 +1,48 @@ +from pathlib import Path + +import pytest + +pd = pytest.importorskip("pandas") + +from energy_fault_detector.utils.asset_dataset_splitter import split_asset_datasets + + +def _write_csv(path: Path, data: pd.DataFrame) -> None: + path.write_text(data.to_csv(sep=";", index=False)) + + +def test_split_asset_datasets(tmp_path): + input_dir = tmp_path / "input" + input_dir.mkdir() + + data = pd.DataFrame( + { + "asset_id": ["asset_a", "asset_a", "asset_a", "asset_b"], + "train_test": ["train", "prediction", "train", "prediction"], + "train_test_bool": [True, False, True, False], + "status_type_id": [0, 1, 3, 2], + "status_type_bool": [False, True, True, False], + "temperature_avg": [10, 11, 12, 13], + "temperature_max": [20, 21, 22, 23], + "sensor_value": [100, 110, 120, 130], + "time_stamp": ["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"], + } + ) + + _write_csv(input_dir / "dataset.csv", data) + + split_asset_datasets(input_dir) + + train_a = pd.read_csv(input_dir / "train_asset_a.csv", sep=";") + predict_a = pd.read_csv(input_dir / "predict_asset_a.csv", sep=";") + predict_b = pd.read_csv(input_dir / "predict_asset_b.csv", sep=";") + + assert set(train_a.columns) == {"temperature_avg", "sensor_value", "time_stamp"} + assert set(predict_a.columns) == {"temperature_avg", "sensor_value", "time_stamp"} + + assert len(train_a) == 3 # includes both train rows and normal status rows + assert len(predict_a) == 2 # prediction row + anomaly status row + + # Asset B only has a prediction row with normal status. It should appear in prediction output + assert set(predict_b.columns) == {"temperature_avg", "sensor_value", "time_stamp"} + assert len(predict_b) == 1 From 340e413252d907ab21278bb1b7d474b8a9cc0960 Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Mon, 6 Oct 2025 22:05:02 +0530 Subject: [PATCH 04/42] discover the delimiter of the csv file and then use that for reading it --- .../utils/asset_dataset_splitter.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/energy_fault_detector/utils/asset_dataset_splitter.py b/energy_fault_detector/utils/asset_dataset_splitter.py index 47761b9..a508a1e 100644 --- a/energy_fault_detector/utils/asset_dataset_splitter.py +++ b/energy_fault_detector/utils/asset_dataset_splitter.py @@ -29,6 +29,8 @@ import pandas as pd +import csv + STATUS_TYPE_NORMAL = {"0", "2"} STATUS_TYPE_ANOMALY = {"1", "3", "4", "5"} @@ -43,23 +45,25 @@ def _read_asset_frames(path: Path) -> Dict[str, List[pd.DataFrame]]: - """Read ``.csv`` files from ``path`` and group rows by ``asset_id``. - - Args: - path: Directory containing the source ``.csv`` files. - - Returns: - Mapping of asset identifier to a list of data frames containing rows - for that asset. - """ - + """Read .csv files from `path` and group rows by asset_id, auto-detecting delimiter.""" asset_frames: Dict[str, List[pd.DataFrame]] = defaultdict(list) + for csv_file in sorted(path.glob("*.csv")): - df = pd.read_csv(csv_file, sep=";", dtype=str) + # Detect delimiter automatically + with open(csv_file, "r", encoding="utf-8", errors="ignore") as f: + sample = f.read(2048) # read a small chunk + sniffer = csv.Sniffer() + try: + dialect = sniffer.sniff(sample) + delimiter = dialect.delimiter + except csv.Error: + delimiter = ";" # fallback default + + # Read CSV with detected delimiter + df = pd.read_csv(csv_file, sep=delimiter, dtype=str) + if "asset_id" not in df.columns: - raise ValueError( - f"File '{csv_file}' does not contain required column 'asset_id'." - ) + raise ValueError(f"File '{csv_file}' does not contain required column 'asset_id'.") for asset_id, group in df.groupby("asset_id", sort=False): asset_frames[str(asset_id)].append(group.reset_index(drop=True)) From a498e52c45e1df298abb99c08984659c5252eb1d Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Mon, 6 Oct 2025 22:21:43 +0530 Subject: [PATCH 05/42] Add run and pre-process --- pre-process.py | 89 +++++++++++++++++++++++++++++++++ run.py | 130 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 pre-process.py create mode 100644 run.py diff --git a/pre-process.py b/pre-process.py new file mode 100644 index 0000000..d2b232c --- /dev/null +++ b/pre-process.py @@ -0,0 +1,89 @@ +import pandas as pd +import numpy as np + +def analyze_dataframe( + df: pd.DataFrame, + required_cols=None, + max_nan_frac_per_col=0.05, + min_unique_value_count=2, + max_col_zero_frac=0.99, + duplicate_value_to_nan=True, + n_max_duplicates=144, + value_to_replace=0, + imputer_strategy="mean" +): + """ + Analyze a DataFrame for preprocessing issues (without modifying it). + + Args: + df (pd.DataFrame): Input dataframe. + required_cols (list, optional): Fixed required columns. + max_nan_frac_per_col (float): Max allowed NaN fraction per column. + min_unique_value_count (int): Minimum unique values required in a column. + max_col_zero_frac (float): Max allowed fraction of zeros in numeric columns. + duplicate_value_to_nan (bool): Whether to check for long duplicate sequences. + n_max_duplicates (int): Max allowed consecutive duplicate values. + value_to_replace (int/float): Value considered for duplicate sequences. + imputer_strategy (str): Default imputation strategy for numeric NaNs. + + Returns: + dict: Report containing issues and suggestions. + """ + + if required_cols is None: + required_cols = [ + "time_stamp", "asset_id", "train_test", "train_test_bool", + "status_type_id", "status_type_bool" + ] + + # Retain only required + *_avg columns + cols_to_keep = required_cols + [c for c in df.columns if c.endswith("_avg")] + df = df[cols_to_keep] + + report = {} + + # 1. NaN fraction check + nan_fractions = df.isna().mean() + nan_violations = nan_fractions[nan_fractions > max_nan_frac_per_col] + report["high_nan_columns"] = nan_violations.to_dict() + + # 2. Unique value check + low_unique = {} + for col in df.columns: + uniq_count = df[col].nunique(dropna=True) + if uniq_count < min_unique_value_count: + low_unique[col] = uniq_count + report["low_unique_columns"] = low_unique + + # 3. Zero fraction check + zero_fractions = (df == 0).mean(numeric_only=True) + zero_violations = zero_fractions[zero_fractions > max_col_zero_frac] + report["high_zero_columns"] = zero_violations.to_dict() + + # 4. Duplicate sequence check + duplicate_report = {} + if duplicate_value_to_nan: + for col in df.columns: + if pd.api.types.is_numeric_dtype(df[col]): + consecutive_dupes = ( + (df[col] == value_to_replace) + .astype(int) + .groupby(df[col].ne(value_to_replace).cumsum()) + .cumsum() + ) + long_dupes = df[col][consecutive_dupes > n_max_duplicates] + if not long_dupes.empty: + duplicate_report[col] = long_dupes.index.tolist() + report["long_duplicate_sequences"] = duplicate_report + + # 5. Imputation strategy suggestion + imputer_suggestions = {} + for col in df.columns: + if df[col].isna().any(): + if pd.api.types.is_numeric_dtype(df[col]): + imputer_suggestions[col] = f"{imputer_strategy} imputation" + else: + imputer_suggestions[col] = "most_frequent imputation" + report["imputation_suggestions"] = imputer_suggestions + + return report diff --git a/run.py b/run.py new file mode 100644 index 0000000..636d2ac --- /dev/null +++ b/run.py @@ -0,0 +1,130 @@ +import pandas as pd +from sklearn.impute import SimpleImputer +from energy_fault_detector.quick_fault_detection import quick_fault_detector +from energy_fault_detector.config import Config +import tempfile +import os + +import pandas as pd +import numpy as np + +def analyze_dataframe( + df: pd.DataFrame, + required_cols=None, + max_nan_frac_per_col=0.05, + min_unique_value_count=2, + max_col_zero_frac=0.99, + duplicate_value_to_nan=True, + n_max_duplicates=144, + value_to_replace=0, + imputer_strategy="mean" +): + """ + Analyze a DataFrame for preprocessing issues (without modifying it). + + Args: + df (pd.DataFrame): Input dataframe. + required_cols (list, optional): Fixed required columns. + max_nan_frac_per_col (float): Max allowed NaN fraction per column. + min_unique_value_count (int): Minimum unique values required in a column. + max_col_zero_frac (float): Max allowed fraction of zeros in numeric columns. + duplicate_value_to_nan (bool): Whether to check for long duplicate sequences. + n_max_duplicates (int): Max allowed consecutive duplicate values. + value_to_replace (int/float): Value considered for duplicate sequences. + imputer_strategy (str): Default imputation strategy for numeric NaNs. + + Returns: + dict: Report containing issues and suggestions. + """ + + if required_cols is None: + required_cols = [ + "time_stamp", "train_test", "train_test_bool", + "status_type_id", "status_type_bool" + ] + + # Retain only required + *_avg columns + cols_to_keep = required_cols + [c for c in df.columns if c.endswith("_avg")] + df = df[cols_to_keep] + + report = {} + + # 1. NaN fraction check + nan_fractions = df.isna().mean() + nan_violations = nan_fractions[nan_fractions > max_nan_frac_per_col] + report["high_nan_columns"] = nan_violations.to_dict() + + # 2. Unique value check + low_unique = {} + for col in df.columns: + uniq_count = df[col].nunique(dropna=True) + if uniq_count < min_unique_value_count: + low_unique[col] = uniq_count + report["low_unique_columns"] = low_unique + + # 3. Zero fraction check + zero_fractions = (df == 0).mean(numeric_only=True) + zero_violations = zero_fractions[zero_fractions > max_col_zero_frac] + report["high_zero_columns"] = zero_violations.to_dict() + + # 4. Duplicate sequence check + duplicate_report = {} + if duplicate_value_to_nan: + for col in df.columns: + if pd.api.types.is_numeric_dtype(df[col]): + consecutive_dupes = ( + (df[col] == value_to_replace) + .astype(int) + .groupby(df[col].ne(value_to_replace).cumsum()) + .cumsum() + ) + long_dupes = df[col][consecutive_dupes > n_max_duplicates] + if not long_dupes.empty: + duplicate_report[col] = long_dupes.index.tolist() + # report["long_duplicate_sequences"] = duplicate_report + + # 5. Imputation strategy suggestion + imputer_suggestions = {} + for col in df.columns: + if df[col].isna().any(): + if pd.api.types.is_numeric_dtype(df[col]): + imputer_suggestions[col] = f"{imputer_strategy} imputation" + else: + imputer_suggestions[col] = "most_frequent imputation" + # report["imputation_suggestions"] = imputer_suggestions + + return df,report + + +farm_path = "/content/drive/MyDrive/Wind Turbine/Care Dataset/CARE_To_Compare/Wind Farm B/7.csv" +farm_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\7.csv" +# Load the data using pandas +df = pd.read_csv(farm_path) + +df,report = analyze_dataframe(df) +df.to_csv(r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\7_preprocessed.csv", index=False) +# import pprint +# pprint.pprint(report) + +# Identify numeric columns +numeric_cols = df.select_dtypes(include=['number']).columns +df_numeric = df[numeric_cols] + +# Handle missing values using SimpleImputer +imputer = SimpleImputer(strategy='mean') +df_imputed_numeric = pd.DataFrame(imputer.fit_transform(df_numeric), columns=numeric_cols) + +# Re-add non-numeric columns (time_stamp and status_type_id_bool) to the imputed dataframe +df_processed = df[['time_stamp', 'train_test_bool', 'status_type_bool']].copy() +df_processed[numeric_cols] = df_imputed_numeric + +# Create a temporary CSV file +with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp_file: + temp_csv_path = tmp_file.name + df_processed.to_csv(temp_csv_path, index=False) + +# Pass the path of the temporary CSV file to the quick_fault_detector +quick_fault_detector, quick_fault_detector_df = quick_fault_detector(temp_csv_path, None, "train_test_bool", None, "time_stamp", "status_type_bool") + +# Optionally, remove the temporary file after use +# os.remove(temp_csv_path) \ No newline at end of file From 3b7f9f2db313599fce3edf36358e0123d3a34677 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 6 Oct 2025 11:07:15 -0700 Subject: [PATCH 06/42] Add model output customization options for quick fault training --- energy_fault_detector/fault_detector.py | 7 +++++-- .../quick_fault_detector.py | 20 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 207f2e8..b582de0 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -89,7 +89,7 @@ def preprocess_train_data(self, sensor_data: pd.DataFrame, normal_index: pd.Seri def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, save_models: bool = True, overwrite_models: bool = False, fit_autoencoder_only: bool = False, fit_preprocessor: bool = True, - **kwargs) -> ModelMetadata: + model_name: Optional[str] = None, **kwargs) -> ModelMetadata: """Fit models on the given sensor_data and save them locally and return the metadata. Args: @@ -104,6 +104,8 @@ def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, save_mo fit_autoencoder_only (bool, optional): If True, only fit the data preprocessor and autoencoder objects. Defaults to False. fit_preprocessor (bool, optional): If True, the preprocessor is fitted. Defaults to True. + model_name (Optional[str], optional): Custom directory name for the saved model files. Defaults to + 'trained_model'. Returns: ModelMetadata: metadata of the trained model: model_date, model_path, model reconstruction errors @@ -128,7 +130,8 @@ def fit(self, sensor_data: pd.DataFrame, normal_index: pd.Series = None, save_mo # save the models if save_models: - model_path, model_date = self.save_models(model_name='trained_model', overwrite=overwrite_models) + save_dir_name = model_name or 'trained_model' + model_path, model_date = self.save_models(model_name=save_dir_name, overwrite=overwrite_models) else: model_date = datetime.now().strftime("%Y%m%d_%H%M%S") return ModelMetadata( diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index ba60d73..f06bcad 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -33,7 +33,10 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = automatic_optimization: bool = True, enable_debug_plots: bool = False, min_anomaly_length: int = 18, save_dir: Optional[str] = None, mode: Literal['train', 'predict'] = 'train', - model_path: Optional[str] = None + model_path: Optional[str] = None, + model_directory: Optional[str] = None, + model_subdir: Optional[str] = None, + model_name: Optional[str] = None ) -> Tuple[FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]]: """Analyzes provided data using an autoencoder based approach for identifying anomalies based on a learned normal behavior. Anomalies are then aggregated to events and further analyzed. @@ -90,6 +93,11 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = mode (Literal['train', 'predict']): Determines whether the detector should train a new model or load an existing one for prediction. Defaults to 'train'. model_path (Optional[str]): Path to a previously saved model directory. Required when mode='predict'. + model_directory (Optional[str]): Directory where trained model artifacts should be stored. Defaults to the + FaultDetector default when not provided. + model_subdir (Optional[str]): Optional subdirectory inside ``model_directory`` that should be used when saving + models. If not provided, a timestamp-based folder name is used. + model_name (Optional[str]): Custom directory name for the saved model artifacts. Defaults to 'trained_model'. Returns: Tuple(FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]): FaultDetectionResult object with the @@ -132,8 +140,14 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = features_to_exclude=features_to_exclude, angles=angle_features, automatic_optimization=automatic_optimization) logger.info('Training a Normal behavior model.') - anomaly_detector = FaultDetector(config=config) - model_metadata = anomaly_detector.fit(sensor_data=train_data, normal_index=train_normal_index) + fault_detector_kwargs = {} + if model_directory is not None: + fault_detector_kwargs['model_directory'] = model_directory + if model_subdir is not None: + fault_detector_kwargs['model_subdir'] = model_subdir + anomaly_detector = FaultDetector(config=config, **fault_detector_kwargs) + model_metadata = anomaly_detector.fit(sensor_data=train_data, normal_index=train_normal_index, + model_name=model_name) if model_metadata.model_path: logger.info('Saved trained model to %s.', model_metadata.model_path) else: From 3564bbc4958c355d73487ccd267bab53d44d17b6 Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Tue, 7 Oct 2025 13:53:41 +0530 Subject: [PATCH 07/42] Change config --- run.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/run.py b/run.py index 636d2ac..68de106 100644 --- a/run.py +++ b/run.py @@ -97,12 +97,14 @@ def analyze_dataframe( farm_path = "/content/drive/MyDrive/Wind Turbine/Care Dataset/CARE_To_Compare/Wind Farm B/7.csv" -farm_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\7.csv" +farm_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files\train_0.csv" +test_file_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files\predict_0.csv" +output_root_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\models" # Load the data using pandas df = pd.read_csv(farm_path) -df,report = analyze_dataframe(df) -df.to_csv(r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\7_preprocessed.csv", index=False) +# df,report = analyze_dataframe(df) +# df.to_csv(r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files\train_0_processed.csv", index=False) # import pprint # pprint.pprint(report) @@ -115,7 +117,7 @@ def analyze_dataframe( df_imputed_numeric = pd.DataFrame(imputer.fit_transform(df_numeric), columns=numeric_cols) # Re-add non-numeric columns (time_stamp and status_type_id_bool) to the imputed dataframe -df_processed = df[['time_stamp', 'train_test_bool', 'status_type_bool']].copy() +df_processed = df[['time_stamp']].copy() df_processed[numeric_cols] = df_imputed_numeric # Create a temporary CSV file @@ -126,5 +128,18 @@ def analyze_dataframe( # Pass the path of the temporary CSV file to the quick_fault_detector quick_fault_detector, quick_fault_detector_df = quick_fault_detector(temp_csv_path, None, "train_test_bool", None, "time_stamp", "status_type_bool") +# from energy_fault_detector.quick_fault_detection import quick_fault_detector + +prediction_results, events, metadata = quick_fault_detector( + csv_data_path=temp_csv_path, + csv_test_data_path=test_file_path, + mode="train", + time_column_name="time_stamp", # optional, if you need timestamp parsing + model_directory=output_root_path, # optional; defaults to the package setting + model_subdir="asset_0", # optional; becomes a subfolder under model_directory + model_name="farm_b" # optional; final folder for saved artifacts +) + + # Optionally, remove the temporary file after use # os.remove(temp_csv_path) \ No newline at end of file From b0ce9bc04e479cf69bedd3875fff3620ae33d74f Mon Sep 17 00:00:00 2001 From: teehari79 Date: Tue, 7 Oct 2025 02:04:28 -0700 Subject: [PATCH 08/42] Handle empty numeric columns during imputation --- run.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/run.py b/run.py index 68de106..417b1fa 100644 --- a/run.py +++ b/run.py @@ -109,16 +109,23 @@ def analyze_dataframe( # pprint.pprint(report) # Identify numeric columns -numeric_cols = df.select_dtypes(include=['number']).columns -df_numeric = df[numeric_cols] - -# Handle missing values using SimpleImputer -imputer = SimpleImputer(strategy='mean') -df_imputed_numeric = pd.DataFrame(imputer.fit_transform(df_numeric), columns=numeric_cols) - -# Re-add non-numeric columns (time_stamp and status_type_id_bool) to the imputed dataframe -df_processed = df[['time_stamp']].copy() -df_processed[numeric_cols] = df_imputed_numeric +numeric_cols = df.select_dtypes(include=["number"]).columns +df_processed = df[["time_stamp"]].copy() + +if len(numeric_cols) > 0: + df_numeric = df[numeric_cols] + + # Handle missing values using SimpleImputer + imputer = SimpleImputer(strategy="mean") + df_imputed_numeric = pd.DataFrame( + imputer.fit_transform(df_numeric), columns=numeric_cols + ) + + # Re-add non-numeric columns (time_stamp and status_type_id_bool) to the imputed dataframe + df_processed[numeric_cols] = df_imputed_numeric +else: + # If there are no numeric columns, simply keep the original non-numeric data + df_processed = df_processed.join(df.drop(columns=["time_stamp"])) # Create a temporary CSV file with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp_file: From f59e5335be834db9a7eaf362d652dd1232053ab0 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Tue, 7 Oct 2025 02:20:05 -0700 Subject: [PATCH 09/42] Guard preprocessing when time column missing --- run.py | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/run.py b/run.py index 68de106..ebe08e3 100644 --- a/run.py +++ b/run.py @@ -109,16 +109,34 @@ def analyze_dataframe( # pprint.pprint(report) # Identify numeric columns -numeric_cols = df.select_dtypes(include=['number']).columns -df_numeric = df[numeric_cols] - -# Handle missing values using SimpleImputer -imputer = SimpleImputer(strategy='mean') -df_imputed_numeric = pd.DataFrame(imputer.fit_transform(df_numeric), columns=numeric_cols) - -# Re-add non-numeric columns (time_stamp and status_type_id_bool) to the imputed dataframe -df_processed = df[['time_stamp']].copy() -df_processed[numeric_cols] = df_imputed_numeric +numeric_cols = df.select_dtypes(include=["number"]).columns +time_column = "time_stamp" + +# Prepare processed dataframe, keeping the time column if it exists +df_processed = pd.DataFrame(index=df.index) +if time_column in df.columns: + df_processed[time_column] = df[time_column] + +if len(numeric_cols) > 0: + df_numeric = df[numeric_cols] + + # Handle missing values using SimpleImputer + imputer = SimpleImputer(strategy="mean") + df_imputed_numeric = pd.DataFrame( + imputer.fit_transform(df_numeric), + columns=numeric_cols, + index=df.index, + ) + + df_processed = df_processed.join(df_imputed_numeric) + +# Include remaining non-numeric columns (excluding the time column if present) +non_numeric_cols = [col for col in df.columns if col not in numeric_cols] +other_non_numeric_cols = [ + col for col in non_numeric_cols if col != time_column +] +if other_non_numeric_cols: + df_processed = df_processed.join(df[other_non_numeric_cols]) # Create a temporary CSV file with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp_file: @@ -142,4 +160,4 @@ def analyze_dataframe( # Optionally, remove the temporary file after use -# os.remove(temp_csv_path) \ No newline at end of file +# os.remove(temp_csv_path) From 4d931e1f9c871bf3186ecb1a9fc78e49c46b377b Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Tue, 7 Oct 2025 15:13:44 +0530 Subject: [PATCH 10/42] fixed issue with csv delimiter --- run.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/run.py b/run.py index 417b1fa..7245991 100644 --- a/run.py +++ b/run.py @@ -100,8 +100,20 @@ def analyze_dataframe( farm_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files\train_0.csv" test_file_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files\predict_0.csv" output_root_path = r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\models" + # Load the data using pandas -df = pd.read_csv(farm_path) +import csv +with open(farm_path, "r", encoding="utf-8", errors="ignore") as f: + sample = f.read(2048) # read a small chunk + sniffer = csv.Sniffer() + try: + dialect = sniffer.sniff(sample) + delimiter = dialect.delimiter + except csv.Error: + delimiter = ";" # fallback default + +# Read CSV with detected delimiter +df = pd.read_csv(farm_path, sep=delimiter, dtype=str) # df,report = analyze_dataframe(df) # df.to_csv(r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files\train_0_processed.csv", index=False) @@ -110,6 +122,7 @@ def analyze_dataframe( # Identify numeric columns numeric_cols = df.select_dtypes(include=["number"]).columns +print ("Number columns:",numeric_cols) df_processed = df[["time_stamp"]].copy() if len(numeric_cols) > 0: @@ -133,7 +146,7 @@ def analyze_dataframe( df_processed.to_csv(temp_csv_path, index=False) # Pass the path of the temporary CSV file to the quick_fault_detector -quick_fault_detector, quick_fault_detector_df = quick_fault_detector(temp_csv_path, None, "train_test_bool", None, "time_stamp", "status_type_bool") +# quick_fault_detector, quick_fault_detector_df = quick_fault_detector(temp_csv_path, None, "train_test_bool", None, "time_stamp", "status_type_bool") # from energy_fault_detector.quick_fault_detection import quick_fault_detector From a562747e03d9827990191147c5e7727e70a2e586 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Tue, 7 Oct 2025 03:36:29 -0700 Subject: [PATCH 11/42] Add predict mode runner for quick fault detector --- run_predict.py | 184 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 run_predict.py diff --git a/run_predict.py b/run_predict.py new file mode 100644 index 0000000..bd4e2b4 --- /dev/null +++ b/run_predict.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +"""Utility script for running the quick fault detector in predict mode. + +This executable mirrors the high level behaviour of :mod:`run.py`, but focuses on +loading a previously trained model and applying it to new data. At a minimum the +user has to provide the path to the model directory and the CSV file that should be +evaluated. Optionally a dedicated training dataset as well as metadata about the +input columns can be supplied. + +Example usage:: + + python run_predict.py \ + --model-path path/to/model_directory \ + --csv-predict-data path/to/predict.csv \ + --time-column time_stamp + +The script prints a short textual summary of the run and leaves the detailed +artefacts (plots, CSVs) to the underlying ``quick_fault_detector`` implementation. +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any, Dict, Optional + +from energy_fault_detector.quick_fault_detection import quick_fault_detector + + +def _parse_mapping(value: Optional[str]) -> Optional[Dict[str, Any]]: + """Parse a JSON dictionary supplied via the command line. + + Args: + value: Raw argument string. + + Returns: + Parsed dictionary or ``None`` when ``value`` is ``None``. + + Raises: + argparse.ArgumentTypeError: If the value is not valid JSON or does not + decode to a dictionary. + """ + + if value is None: + return None + + try: + mapping = json.loads(value) + except json.JSONDecodeError as exc: # pragma: no cover - handled at runtime + raise argparse.ArgumentTypeError( + f"Failed to decode mapping '{value}': {exc}" + ) from exc + + if not isinstance(mapping, dict): + raise argparse.ArgumentTypeError("Mapping must decode to a JSON object (dict).") + + return mapping + + +def build_argument_parser() -> argparse.ArgumentParser: + """Create the command line argument parser used by the script.""" + + parser = argparse.ArgumentParser( + description="Run the quick fault detector in predict mode using a saved model.", + ) + parser.add_argument( + "--model-path", + required=True, + help="Path to the directory that contains the previously trained model.", + ) + parser.add_argument( + "--csv-predict-data", + required=True, + help="CSV file that contains the data points to analyse for anomalies.", + ) + parser.add_argument( + "--csv-train-data", + help=( + "Optional CSV file that contains the training data. When omitted, the" + " prediction data is reused as a stand-in to satisfy the loader." + ), + ) + parser.add_argument( + "--time-column", + help="Name of the timestamp column in the provided CSV files.", + ) + parser.add_argument( + "--status-column", + help=( + "Name of the column describing the operating status of the asset." + " Used to identify rows that represent normal behaviour." + ), + ) + parser.add_argument( + "--status-mapping", + type=_parse_mapping, + help="JSON dictionary mapping status values to booleans (e.g. '{\"OK\": true}').", + ) + parser.add_argument( + "--train-test-column", + help=( + "Column indicating whether a row belongs to the training (True) or" + " prediction (False) portion of the dataset." + ), + ) + parser.add_argument( + "--train-test-mapping", + type=_parse_mapping, + help="JSON dictionary mapping train/test column values to booleans.", + ) + parser.add_argument( + "--min-anomaly-length", + type=int, + default=18, + help="Minimal number of consecutive anomalies to form an event (default: 18).", + ) + parser.add_argument( + "--save-dir", + help="Directory where diagnostic output (plots, CSVs) should be written.", + ) + parser.add_argument( + "--debug-plots", + action="store_true", + help="Enable creation of additional debug plots during prediction.", + ) + return parser + + +def validate_paths(model_path: str, csv_predict_data: str, csv_train_data: Optional[str]) -> None: + """Ensure that the provided filesystem paths point to existing resources.""" + + model_dir = Path(model_path) + if not model_dir.exists(): + raise FileNotFoundError(f"Model directory '{model_dir}' does not exist.") + + predict_path = Path(csv_predict_data) + if not predict_path.is_file(): + raise FileNotFoundError(f"Prediction data file '{predict_path}' was not found.") + + if csv_train_data is not None: + train_path = Path(csv_train_data) + if not train_path.is_file(): + raise FileNotFoundError(f"Training data file '{train_path}' was not found.") + + +def main() -> None: + """Entry point for command line execution.""" + + parser = build_argument_parser() + args = parser.parse_args() + + validate_paths( + model_path=args.model_path, + csv_predict_data=args.csv_predict_data, + csv_train_data=args.csv_train_data, + ) + + train_data_path = args.csv_train_data or args.csv_predict_data + + prediction_results, event_metadata, _ = quick_fault_detector( + csv_data_path=train_data_path, + csv_test_data_path=args.csv_predict_data, + train_test_column_name=args.train_test_column, + train_test_mapping=args.train_test_mapping, + time_column_name=args.time_column, + status_data_column_name=args.status_column, + status_mapping=args.status_mapping, + min_anomaly_length=args.min_anomaly_length, + save_dir=args.save_dir, + enable_debug_plots=args.debug_plots, + mode="predict", + model_path=args.model_path, + ) + + print("Prediction finished successfully.") + print(f"Number of detected events: {len(event_metadata)}") + print("Anomaly score preview:") + print(prediction_results.anomaly_score.head()) + + +if __name__ == "__main__": # pragma: no cover - script entry point + main() + From 8d55c457e0951abe6afa733f847c1560c8087f5f Mon Sep 17 00:00:00 2001 From: teehari79 Date: Tue, 7 Oct 2025 04:22:19 -0700 Subject: [PATCH 12/42] Adjust prediction defaults for wind farm dataset --- .../quick_fault_detection/data_loading.py | 46 ++++++++++++------- .../quick_fault_detector.py | 12 +++-- run_predict.py | 37 +++++++++++---- 3 files changed, 67 insertions(+), 28 deletions(-) diff --git a/energy_fault_detector/quick_fault_detection/data_loading.py b/energy_fault_detector/quick_fault_detection/data_loading.py index 63bd462..d78f492 100644 --- a/energy_fault_detector/quick_fault_detection/data_loading.py +++ b/energy_fault_detector/quick_fault_detection/data_loading.py @@ -141,9 +141,10 @@ def load_data(csv_data_path: str, train_test_column_name: Optional[str] = None, split is performed if train_test_column_name is not None. Args: - csv_data_path (str): Path to a csv-file containing tabular data which must contain training data for the - autoencoder. This data can also contain test data for evaluation, but in this case train_test_column and - optionally train_test_mapping must be provided. + csv_data_path (Optional str): Path to a csv-file containing tabular data which must contain training data for + the autoencoder. This data can also contain test data for evaluation, but in this case train_test_column and + optionally train_test_mapping must be provided. When ``None`` the training data is skipped, which is useful + for pure prediction runs with a pre-trained model. train_test_column_name (Optional str): Name of the column which specifies which part of the data in csv_data_path is training data and which is test data. If this column does not contain boolean values or values which can be cast into boolean values, then train_test_mapping must be provided. Default is None. @@ -172,6 +173,7 @@ def load_data(csv_data_path: str, train_test_column_name: Optional[str] = None, # train test data split assumes that True stands for training data and False for prediction data train_test_split = get_boolean_feature(df=df, bool_data_column_name=train_test_column_name, boolean_mapping=train_test_mapping) + df = df.drop(columns=['id'], errors='ignore') sensor_data = get_sensor_data(df=df) normal_index = get_boolean_feature(df=df, bool_data_column_name=status_data_column_name, boolean_mapping=status_mapping) @@ -190,7 +192,7 @@ def load_data(csv_data_path: str, train_test_column_name: Optional[str] = None, return train_data, normal_index, test_data -def load_train_test_data(csv_data_path: str, csv_test_data_path: Optional[str] = None, +def load_train_test_data(csv_data_path: Optional[str], csv_test_data_path: Optional[str] = None, train_test_column_name: Optional[str] = None, train_test_mapping: Optional[dict] = None, time_column_name: Optional[str] = None, status_data_column_name: Optional[str] = None, status_mapping: Optional[dict] = None @@ -200,9 +202,9 @@ def load_train_test_data(csv_data_path: str, csv_test_data_path: Optional[str] = If no test data is provided an exception is raised. Args: - csv_data_path (str): Path to a csv-file containing tabular data which must contain training data for the + csv_data_path (Optional str): Path to a csv-file containing tabular data which must contain training data for the autoencoder. This data can also contain test data for evaluation, but in this case train_test_column and - optionally train_test_mapping must be provided. + optionally train_test_mapping must be provided. When ``None`` only the dedicated prediction data is used. csv_test_data_path (Optional str): Path to a csv file containing test data for evaluation. If test data is provided in both ways (i.e. via csv_test_data_path and in csv_data_path + train_test_column) then both test data sets will be fused into one. Default is None. @@ -225,21 +227,28 @@ def load_train_test_data(csv_data_path: str, csv_test_data_path: Optional[str] = booleans or at least castable to booleans. Default is None. Returns: tuple - train_data (pd.DataFrame): Contains training data for the AnomalyDetector (only numeric values). + train_data (pd.DataFrame): Contains training data for the AnomalyDetector (only numeric values). When no + training dataset is provided, this will be an empty dataframe with the same columns as the prediction data. train_normal_index (pd.Series): Contains boolean information about which rows of train_data are normal and which contain anomalous behavior. test_data (pd.DataFrame): Contains test data for the AnomalyDetector (only numeric values). """ - train_data, train_normal_index, test_data = load_data(csv_data_path=csv_data_path, - time_column_name=time_column_name, - status_data_column_name=status_data_column_name, - status_mapping=status_mapping, - train_test_column_name=train_test_column_name, - train_test_mapping=train_test_mapping) - # normal index for prediction data can be ignored + if csv_data_path is not None: + train_data, train_normal_index, test_data = load_data(csv_data_path=csv_data_path, + time_column_name=time_column_name, + status_data_column_name=status_data_column_name, + status_mapping=status_mapping, + train_test_column_name=train_test_column_name, + train_test_mapping=train_test_mapping) + else: + train_data = pd.DataFrame() + train_normal_index = pd.Series(dtype=bool) + test_data = None + if csv_test_data_path is not None: - separated_test_data, _, _ = load_data(csv_data_path=csv_test_data_path, time_column_name=time_column_name, - status_data_column_name=None) + separated_df = read_csv_file(csv_data_path=csv_test_data_path, time_column_name=time_column_name) + separated_df = separated_df.drop(columns=['id'], errors='ignore') + separated_test_data = get_sensor_data(df=separated_df) else: separated_test_data = None @@ -252,4 +261,9 @@ def load_train_test_data(csv_data_path: str, csv_test_data_path: Optional[str] = test_data = pd.concat([test_data, separated_test_data]) elif separated_test_data is not None: test_data = separated_test_data + if test_data is not None: + test_data = test_data.drop(columns=['id'], errors='ignore') + if train_data.empty and test_data is not None: + train_data = test_data.iloc[0:0] + train_normal_index = pd.Series(dtype=bool, index=train_data.index) return train_data, train_normal_index, test_data diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index f06bcad..61c4cd6 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -24,7 +24,7 @@ logger = logging.getLogger('energy_fault_detector') -def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = None, +def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optional[str] = None, train_test_column_name: Optional[str] = None, train_test_mapping: Optional[dict] = None, time_column_name: Optional[str] = None, status_data_column_name: Optional[str] = None, status_mapping: Optional[dict] = None, @@ -50,9 +50,10 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = 7. Visualization of output Args: - csv_data_path (str): Path to a csv-file containing tabular data which must contain training data for the - autoencoder. This data can also contain test data for evaluation, but in this case train_test_column and - optionally train_test_mapping must be provided. + csv_data_path (Optional[str]): Path to a csv-file containing tabular data which must contain training data for + the autoencoder. When running in prediction mode with a pre-trained model, this can be ``None`` to skip + loading training data. This data can also contain test data for evaluation, but in this case + ``train_test_column`` and optionally ``train_test_mapping`` must be provided. csv_test_data_path (Optional str): Path to a csv file containing test data for evaluation. If test data is provided in both ways (i.e. via csv_test_data_path and in csv_data_path + train_test_column) then both test data sets will be fused into one. Default is None. @@ -124,6 +125,9 @@ def quick_fault_detector(csv_data_path: str, csv_test_data_path: Optional[str] = logger.info('Starting Automated Energy Fault Detection and Identification (mode=%s).', mode) logger.info('Loading Data...') + if csv_data_path is None and csv_test_data_path is None: + raise ValueError('At least one data source must be provided via `csv_data_path` or `csv_test_data_path`.') + train_data, train_normal_index, test_data = load_train_test_data(csv_data_path=csv_data_path, csv_test_data_path=csv_test_data_path, train_test_column_name=train_test_column_name, diff --git a/run_predict.py b/run_predict.py index bd4e2b4..87eb586 100644 --- a/run_predict.py +++ b/run_predict.py @@ -28,6 +28,17 @@ from energy_fault_detector.quick_fault_detection import quick_fault_detector +DEFAULT_MODEL_PATH = ( + r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\models" + r"\asset_0\20251007_154309" +) +DEFAULT_PREDICT_DATA_PATH = ( + r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files" + r"\predict_5.csv" +) +DEFAULT_TIME_COLUMN = "time_stamp" + + def _parse_mapping(value: Optional[str]) -> Optional[Dict[str, Any]]: """Parse a JSON dictionary supplied via the command line. @@ -66,24 +77,34 @@ def build_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--model-path", - required=True, - help="Path to the directory that contains the previously trained model.", + default=DEFAULT_MODEL_PATH, + help=( + "Path to the directory that contains the previously trained model. " + "Defaults to the predefined wind farm model when not supplied." + ), ) parser.add_argument( "--csv-predict-data", - required=True, - help="CSV file that contains the data points to analyse for anomalies.", + default=DEFAULT_PREDICT_DATA_PATH, + help=( + "CSV file that contains the data points to analyse for anomalies. " + "Defaults to the provided wind farm dataset when not supplied." + ), ) parser.add_argument( "--csv-train-data", help=( - "Optional CSV file that contains the training data. When omitted, the" - " prediction data is reused as a stand-in to satisfy the loader." + "Optional CSV file that contains the training data. When omitted, no " + "additional training is performed and the provided model is reused." ), ) parser.add_argument( "--time-column", - help="Name of the timestamp column in the provided CSV files.", + default=DEFAULT_TIME_COLUMN, + help=( + "Name of the timestamp column in the provided CSV files. Defaults to " + "'time_stamp' for the predefined dataset." + ), ) parser.add_argument( "--status-column", @@ -156,7 +177,7 @@ def main() -> None: csv_train_data=args.csv_train_data, ) - train_data_path = args.csv_train_data or args.csv_predict_data + train_data_path = args.csv_train_data prediction_results, event_metadata, _ = quick_fault_detector( csv_data_path=train_data_path, From a4061f8806e52a6e5577acbd5810e1680aee1d33 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Tue, 7 Oct 2025 22:57:43 -0700 Subject: [PATCH 13/42] Improve ColumnSelector missing column error message --- .../data_preprocessing/column_selector.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/energy_fault_detector/data_preprocessing/column_selector.py b/energy_fault_detector/data_preprocessing/column_selector.py index 3274cbf..b897678 100644 --- a/energy_fault_detector/data_preprocessing/column_selector.py +++ b/energy_fault_detector/data_preprocessing/column_selector.py @@ -69,8 +69,13 @@ def transform(self, x: pd.DataFrame) -> pd.DataFrame: # transformation is not possible. missing_columns = set(self.feature_names_out_) - set(x.columns) if len(missing_columns) > 0: - raise ValueError('ColumnSelector: There are columns missing in the prediction data, which were present in' - ' the training data. New models need to be trained!') + missing_columns_sorted = sorted(missing_columns) + missing_columns_str = ', '.join(missing_columns_sorted) + raise ValueError( + 'ColumnSelector: There are columns missing in the prediction data, which were present in' + ' the training data. Missing columns: ' + f"{missing_columns_str}. New models need to be trained!" + ) x = x[self.feature_names_out_] # ensure ordering return x From f9a7aa29b75b3e8cf17a649d5512b8b9133027a0 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Tue, 7 Oct 2025 23:17:46 -0700 Subject: [PATCH 14/42] Enhance anomaly prediction metadata --- .../core/fault_detection_result.py | 2 +- energy_fault_detector/fault_detector.py | 36 +++++++++++++++++-- tests/test_fault_detector.py | 13 +++++-- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/energy_fault_detector/core/fault_detection_result.py b/energy_fault_detector/core/fault_detection_result.py index 716a1ad..df08e81 100644 --- a/energy_fault_detector/core/fault_detection_result.py +++ b/energy_fault_detector/core/fault_detection_result.py @@ -12,7 +12,7 @@ class FaultDetectionResult: """Class to encapsulate results from the fault detection process.""" predicted_anomalies: pd.DataFrame - """DataFrame with a column 'anomaly' (bool).""" + """DataFrame containing anomaly indicators and metadata per timestamp.""" reconstruction: pd.DataFrame """DataFrame with reconstruction of the sensor data with timestamp as index.""" diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index b582de0..d8c84cb 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -281,8 +281,40 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, df_inverse_scaled_pred = self.data_preprocessor.inverse_transform(x_predicted) scores = self.anomaly_score.transform(recon_error) - predicted_anomalies = self.predict_anomalies(scores, x_prepped) - predicted_anomalies = pd.DataFrame(data=predicted_anomalies, columns=['anomaly'], index=scores.index) + + if isinstance(scores, pd.DataFrame): + if scores.shape[1] != 1: + raise ValueError('Anomaly score DataFrame must contain exactly one column.') + score_series = scores.iloc[:, 0] + elif isinstance(scores, pd.Series): + score_series = scores + else: + score_series = pd.Series(scores, index=x_prepped.index) + + predicted_anomalies = self.predict_anomalies(score_series, x_prepped) + predicted_anomalies = pd.DataFrame(data=predicted_anomalies, columns=['anomaly'], index=score_series.index) + + threshold_value = getattr(self.threshold_selector, 'threshold', None) + + error_magnitudes = recon_error.reindex(predicted_anomalies.index).abs() + top_attributes = error_magnitudes.apply( + lambda row: [col for col in row[row > 0].sort_values(ascending=False).index[:3]], axis=1 + ) + top_attribute_strings = top_attributes.apply(lambda attrs: ', '.join(attrs)) + + predicted_anomalies['behaviour'] = predicted_anomalies['anomaly'].map( + {True: 'anamoly', False: 'normal'} + ) + predicted_anomalies['anamoly_score'] = score_series.reindex(predicted_anomalies.index) + predicted_anomalies['threshold_score'] = threshold_value + predicted_anomalies['cumulative_anamoly_score'] = ( + predicted_anomalies['anomaly'].astype(int).cumsum() + ) + predicted_anomalies['anamolous fields'] = predicted_anomalies['anomaly'].map( + lambda is_anomaly: '' + ) + anomaly_indices = predicted_anomalies['anomaly'] + predicted_anomalies.loc[anomaly_indices, 'anamolous fields'] = top_attribute_strings.loc[anomaly_indices] if root_cause_analysis: logger.info('Run root cause analysis.') diff --git a/tests/test_fault_detector.py b/tests/test_fault_detector.py index 5588ecd..3432009 100644 --- a/tests/test_fault_detector.py +++ b/tests/test_fault_detector.py @@ -99,6 +99,7 @@ def setUp(self) -> None: mock_data_preprocessor.reset_mock(return_value=True, side_effect=True) mock_score.reset_mock(return_value=True, side_effect=True) mock_threshold.reset_mock(return_value=True, side_effect=True) + mock_threshold.threshold = 0.5 def tearDown(self) -> None: # Remove the temporary directory after the test @@ -243,8 +244,16 @@ def test_predict(self, mock_find_arcana_bias): # expected results recon = self.predictions recon.index = [0, 1, 2] - anomalies = pd.DataFrame(data=[[False], [False], [True]], - columns=['anomaly']) + anomalies = pd.DataFrame( + data={ + 'anomaly': [False, False, True], + 'behaviour': ['normal', 'normal', 'anamoly'], + 'anamoly_score': [0.1, 0.2, 0.15], + 'threshold_score': [0.5, 0.5, 0.5], + 'cumulative_anamoly_score': [0, 0, 1], + 'anamolous fields': ['', '', 'c'], + } + ) pd.testing.assert_frame_equal(results.reconstruction, recon) pd.testing.assert_frame_equal(results.predicted_anomalies, anomalies) From 620ae61ccb8941bee3d7de878e34f03c7bb8f1e8 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 02:35:33 -0700 Subject: [PATCH 15/42] Handle missing training data when plotting quick fault results --- energy_fault_detector/quick_fault_detection/output.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/energy_fault_detector/quick_fault_detection/output.py b/energy_fault_detector/quick_fault_detection/output.py index d249aab..99d8959 100644 --- a/energy_fault_detector/quick_fault_detection/output.py +++ b/energy_fault_detector/quick_fault_detection/output.py @@ -50,8 +50,15 @@ def generate_output_plots(anomaly_detector: FaultDetector, train_data: pd.DataFr axs[0, 0].axvspan(event_meta_data.iloc[i]['start'], event_meta_data.iloc[i]['end'], alpha=0.1, color='red') axs[0, 1].set_title('Anomaly Score During Training') - viz.plot_score_with_threshold(model=anomaly_detector, data=train_data, normal_index=normal_index, ax=axs[0, 1]) - axs[0, 1].set_yscale('log') + if train_data is not None and not train_data.empty: + viz.plot_score_with_threshold(model=anomaly_detector, data=train_data, normal_index=normal_index, + ax=axs[0, 1]) + axs[0, 1].set_yscale('log') + else: + axs[0, 1].text(0.5, 0.5, 'No training data available.', + ha='center', va='center', fontsize=14, + bbox=dict(boxstyle='round,pad=0.5', facecolor='white', edgecolor='black', linewidth=1.5)) + axs[0, 1].set_axis_off() viz.plot_learning_curve(anomaly_detector, ax=axs[1, 0]) axs[1, 0].set_title('Model Learning curve') From 540eddf7f2ee56b5c24fb10ce095175233875284 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 04:19:32 -0700 Subject: [PATCH 16/42] Add support for ignoring features in ARCANA --- docs/config_example.yaml | 3 ++ docs/usage_examples.rst | 4 ++ energy_fault_detector/config/config.py | 5 ++ .../root_cause_analysis/arcana.py | 48 ++++++++++++++++++- tests/config/test_config.py | 6 ++- tests/root_cause_analysis/test_arcana.py | 9 ++++ tests/test_data/test_config.yaml | 3 ++ 7 files changed, 74 insertions(+), 4 deletions(-) diff --git a/docs/config_example.yaml b/docs/config_example.yaml index 3dd8c39..ecc7ae7 100644 --- a/docs/config_example.yaml +++ b/docs/config_example.yaml @@ -63,4 +63,7 @@ root_cause_analysis: # (optional) if not specified, no root_cause_analysis (ARC alpha: 0.8 init_x_bias: recon num_iter: 200 + ignore_features: + - windspeed + - output_power diff --git a/docs/usage_examples.rst b/docs/usage_examples.rst index ed70e09..e20b640 100644 --- a/docs/usage_examples.rst +++ b/docs/usage_examples.rst @@ -98,6 +98,10 @@ algorithm. An example: .. include:: config_example.yaml :literal: +The optional ``ignore_features`` list inside ``root_cause_analysis`` can be used to prevent ARCANA from adjusting +specific sensors (for example ``windspeed`` or ``output_power``). These features will be kept fixed during the +optimisation and therefore won't be reported as a root cause. + To update the configuration 'on the fly' (for example for hyperparameter optimization), you provide a new configuration dictionary via the ``update_config`` method: diff --git a/energy_fault_detector/config/config.py b/energy_fault_detector/config/config.py index 9722615..15df690 100644 --- a/energy_fault_detector/config/config.py +++ b/energy_fault_detector/config/config.py @@ -76,6 +76,11 @@ 'num_iter': {'type': 'integer', 'required': False}, 'epsilon': {'type': 'float', 'required': False}, 'verbose': {'type': 'boolean', 'required': False}, + 'ignore_features': { + 'type': 'list', + 'required': False, + 'schema': {'type': 'string'}, + }, } PREDICT_SCHEMA = { diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index 3381beb..453f201 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -2,7 +2,7 @@ import logging -from typing import Tuple, List +from typing import Tuple, List, Optional, Set import numpy as np import pandas as pd @@ -72,7 +72,7 @@ class Arcana: def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: str = 'recon', alpha: float = 0.8, num_iter: int = 400, epsilon: float = 1e-8, verbose: bool = False, - max_sample_threshold: int = 1000, **kwargs): + max_sample_threshold: int = 1000, ignore_features: Optional[List[str]] = None, **kwargs): self.keras_model: AE_TYPE = model @@ -89,6 +89,8 @@ def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: st self.num_iter: int = num_iter self.verbose: bool = verbose self.max_sample_threshold = max_sample_threshold + self.ignore_features: Set[str] = set(ignore_features or []) + self._feature_mask: Optional[tf.Tensor] = None def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bias: bool = False ) -> Tuple[pd.DataFrame, pd.DataFrame, List[pd.DataFrame]]: @@ -111,6 +113,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi """ feature_names = x.columns + self._feature_mask = self._build_feature_mask(feature_names) timestamps = x.index x = x.values.astype('float32') @@ -120,6 +123,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi x_bias = self.initialize_x_bias(x) x_bias = tf.Variable(x_bias, dtype=tf.float32) + self._apply_feature_mask(x_bias) tracked_losses = {'Combined Loss': [], 'Reconstruction Loss': [], 'Regularization Loss': [], 'Iteration': []} bias = x_bias.numpy() @@ -127,6 +131,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi tracked_bias = [bias] for i in range(self.num_iter): x_bias, losses, _ = self.update_x_bias(x, x_bias) + self._apply_feature_mask(x_bias) if i % 50 == 0: loss_1, loss_2, combined_loss = losses[0].numpy(), losses[1].numpy(), losses[2].numpy() if track_losses: @@ -141,11 +146,15 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi logger.info('%d Combined Loss: %.2f', i, combined_loss) x_bias = pd.DataFrame(data=x_bias.numpy(), columns=feature_names, index=timestamps) + self._apply_feature_mask_to_dataframe(x_bias) # return x_bias as a pandas DataFrame tracked_losses = pd.DataFrame(tracked_losses) tracked_losses = tracked_losses.set_index('Iteration') tracked_bias_dfs = [pd.DataFrame(data=bias, columns=feature_names, index=timestamps) for bias in tracked_bias] + for bias_df in tracked_bias_dfs: + self._apply_feature_mask_to_dataframe(bias_df) + self._feature_mask = None return x_bias, tracked_losses, tracked_bias_dfs def draw_samples(self, x: np.array) -> np.array: @@ -215,5 +224,40 @@ def update_x_bias(self, x: tf.Variable, x_bias: tf.Variable) -> BIAS_RETURN_TYPE # differentiate w.r.t. x_bias grad = grad_tape.gradient(loss_full, x_bias) + if self._feature_mask is not None: + grad = grad * self._feature_mask self.opt.apply_gradients(zip([grad], [x_bias])) + if self._feature_mask is not None: + x_bias.assign(x_bias * self._feature_mask) return x_bias, (loss_1, loss_2, loss_full), grad + + def _build_feature_mask(self, feature_names: pd.Index) -> Optional[tf.Tensor]: + """Create mask to zero gradients for ignored features.""" + if not self.ignore_features: + return None + mask = np.ones((1, len(feature_names)), dtype='float32') + for idx, name in enumerate(feature_names): + if name in self.ignore_features: + mask[0, idx] = 0.0 + if np.all(mask == 1.0): + return None + ignored = sorted(set(feature_names).intersection(self.ignore_features)) + if ignored: + logger.info('Ignoring %s feature(s) during ARCANA optimisation: %s', len(ignored), ', '.join(ignored)) + not_found = self.ignore_features.difference(feature_names) + if not_found: + logger.warning('Configured features to ignore not found in input data: %s', ', '.join(sorted(not_found))) + return tf.constant(mask, dtype=tf.float32) + + def _apply_feature_mask(self, x_bias: tf.Variable) -> None: + """Ensure ignored features remain unchanged.""" + if self._feature_mask is not None: + x_bias.assign(x_bias * self._feature_mask) + + def _apply_feature_mask_to_dataframe(self, df: pd.DataFrame) -> None: + """Zero-out ignored feature values in a DataFrame result.""" + if not self.ignore_features: + return + intersection = self.ignore_features.intersection(df.columns) + if intersection: + df.loc[:, list(intersection)] = 0.0 diff --git a/tests/config/test_config.py b/tests/config/test_config.py index bf29cba..21e754d 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -46,7 +46,8 @@ def test_init(self): self.assertDictEqual(conf.config_dict['root_cause_analysis'], {'alpha': 0.8, 'init_x_bias': 'recon', - 'num_iter': 200} + 'num_iter': 200, + 'ignore_features': ['windspeed', 'output_power']} ) def test_arcana_config(self): @@ -54,7 +55,8 @@ def test_arcana_config(self): self.assertTrue(conf.root_cause_analysis) self.assertDictEqual( conf.arcana_params, - {'alpha': 0.8, 'num_iter': 200, 'init_x_bias': 'recon'} + {'alpha': 0.8, 'num_iter': 200, 'init_x_bias': 'recon', + 'ignore_features': ['windspeed', 'output_power']} ) conf = Config(os.path.join(PROJECT_ROOT, './tests/test_data/test_config_no_rca.yaml')) diff --git a/tests/root_cause_analysis/test_arcana.py b/tests/root_cause_analysis/test_arcana.py index 4a25e2b..1b2332b 100644 --- a/tests/root_cause_analysis/test_arcana.py +++ b/tests/root_cause_analysis/test_arcana.py @@ -104,3 +104,12 @@ def test_draw_samples(self): selection = arcana.draw_samples(x=self.data) self.assertTupleEqual(self.data[:-1].shape, self.data[selection].shape) + def test_ignore_features(self): + ignore_cols = [self.data_frame.columns[0], 'non_existing_feature'] + arcana = Arcana(model=self.ml_ae, num_iter=5, ignore_features=ignore_cols) + subset = self.data_frame.iloc[:20] + bias, losses, _ = arcana.find_arcana_bias(subset, track_losses=True) + self.assertTrue((bias[ignore_cols[0]] == 0).all()) + # ensure optimisation still runs and logs losses + self.assertFalse(losses.empty) + diff --git a/tests/test_data/test_config.yaml b/tests/test_data/test_config.yaml index ee0f42e..23f0479 100644 --- a/tests/test_data/test_config.yaml +++ b/tests/test_data/test_config.yaml @@ -40,3 +40,6 @@ root_cause_analysis: alpha: 0.8 init_x_bias: recon num_iter: 200 + ignore_features: + - windspeed + - output_power From 48fa690ea254439d69aa846ee29f787c68a6408b Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 04:30:59 -0700 Subject: [PATCH 17/42] Save anomaly details during prediction runs --- run_predict.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/run_predict.py b/run_predict.py index 87eb586..9390e13 100644 --- a/run_predict.py +++ b/run_predict.py @@ -194,10 +194,22 @@ def main() -> None: model_path=args.model_path, ) + predicted_anomalies = prediction_results.predicted_anomalies + + if args.save_dir is not None: + output_dir = Path(args.save_dir) + else: + output_dir = Path(args.csv_predict_data).resolve().parent + + output_dir.mkdir(parents=True, exist_ok=True) + anomaly_details_path = output_dir / f"{Path(args.csv_predict_data).stem}_anomaly_details.csv" + predicted_anomalies.to_csv(anomaly_details_path) + print("Prediction finished successfully.") print(f"Number of detected events: {len(event_metadata)}") print("Anomaly score preview:") print(prediction_results.anomaly_score.head()) + print(f"Saved anomaly details to: {anomaly_details_path}") if __name__ == "__main__": # pragma: no cover - script entry point From 846742eaade31a496c40bcb215fb8f43e1800012 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 04:36:13 -0700 Subject: [PATCH 18/42] Clarify and extend ARCANA feature ignoring --- docs/config_example.yaml | 4 + docs/usage_examples.rst | 10 +++ energy_fault_detector/config/config.py | 5 ++ .../root_cause_analysis/arcana.py | 73 ++++++++++++++++++- tests/config/test_config.py | 6 +- tests/root_cause_analysis/test_arcana.py | 18 +++++ tests/test_data/test_config.yaml | 3 + 7 files changed, 115 insertions(+), 4 deletions(-) diff --git a/docs/config_example.yaml b/docs/config_example.yaml index 3dd8c39..8d26aed 100644 --- a/docs/config_example.yaml +++ b/docs/config_example.yaml @@ -63,4 +63,8 @@ root_cause_analysis: # (optional) if not specified, no root_cause_analysis (ARC alpha: 0.8 init_x_bias: recon num_iter: 200 + ignore_features: + # list exact column names or use wildcards such as "windspeed*" + - windspeed + - output_power diff --git a/docs/usage_examples.rst b/docs/usage_examples.rst index ed70e09..956f5be 100644 --- a/docs/usage_examples.rst +++ b/docs/usage_examples.rst @@ -98,6 +98,16 @@ algorithm. An example: .. include:: config_example.yaml :literal: +To prevent ARCANA from adjusting certain sensors you can edit the configuration YAML that you pass to +:py:obj:`Config ` (for example ``configs/base_config.yaml`` or a copy of it). Inside +the ``root_cause_analysis`` section add or update the optional ``ignore_features`` list. Any column names listed here +remain fixed during optimisation and are therefore never suggested as a root cause. + +Column names must match the data frame headers used at prediction time. You can either provide the exact column name or +use Unix shell-style wildcards to target multiple columns at once (for example ``windspeed*`` matches +``windspeed_avg`` and ``windspeed_peak``). If a configured pattern does not match any columns a warning is logged so you +can correct the entry. + To update the configuration 'on the fly' (for example for hyperparameter optimization), you provide a new configuration dictionary via the ``update_config`` method: diff --git a/energy_fault_detector/config/config.py b/energy_fault_detector/config/config.py index 9722615..15df690 100644 --- a/energy_fault_detector/config/config.py +++ b/energy_fault_detector/config/config.py @@ -76,6 +76,11 @@ 'num_iter': {'type': 'integer', 'required': False}, 'epsilon': {'type': 'float', 'required': False}, 'verbose': {'type': 'boolean', 'required': False}, + 'ignore_features': { + 'type': 'list', + 'required': False, + 'schema': {'type': 'string'}, + }, } PREDICT_SCHEMA = { diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index 3381beb..b82c03d 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -2,7 +2,8 @@ import logging -from typing import Tuple, List +from fnmatch import fnmatch +from typing import Tuple, List, Optional, Set import numpy as np import pandas as pd @@ -72,7 +73,7 @@ class Arcana: def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: str = 'recon', alpha: float = 0.8, num_iter: int = 400, epsilon: float = 1e-8, verbose: bool = False, - max_sample_threshold: int = 1000, **kwargs): + max_sample_threshold: int = 1000, ignore_features: Optional[List[str]] = None, **kwargs): self.keras_model: AE_TYPE = model @@ -89,6 +90,9 @@ def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: st self.num_iter: int = num_iter self.verbose: bool = verbose self.max_sample_threshold = max_sample_threshold + self.ignore_features: Tuple[str, ...] = tuple(ignore_features or []) + self._feature_mask: Optional[tf.Tensor] = None + self._ignored_columns: Set[str] = set() def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bias: bool = False ) -> Tuple[pd.DataFrame, pd.DataFrame, List[pd.DataFrame]]: @@ -111,6 +115,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi """ feature_names = x.columns + self._feature_mask = self._build_feature_mask(feature_names) timestamps = x.index x = x.values.astype('float32') @@ -120,6 +125,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi x_bias = self.initialize_x_bias(x) x_bias = tf.Variable(x_bias, dtype=tf.float32) + self._apply_feature_mask(x_bias) tracked_losses = {'Combined Loss': [], 'Reconstruction Loss': [], 'Regularization Loss': [], 'Iteration': []} bias = x_bias.numpy() @@ -127,6 +133,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi tracked_bias = [bias] for i in range(self.num_iter): x_bias, losses, _ = self.update_x_bias(x, x_bias) + self._apply_feature_mask(x_bias) if i % 50 == 0: loss_1, loss_2, combined_loss = losses[0].numpy(), losses[1].numpy(), losses[2].numpy() if track_losses: @@ -141,11 +148,15 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi logger.info('%d Combined Loss: %.2f', i, combined_loss) x_bias = pd.DataFrame(data=x_bias.numpy(), columns=feature_names, index=timestamps) + self._apply_feature_mask_to_dataframe(x_bias) # return x_bias as a pandas DataFrame tracked_losses = pd.DataFrame(tracked_losses) tracked_losses = tracked_losses.set_index('Iteration') tracked_bias_dfs = [pd.DataFrame(data=bias, columns=feature_names, index=timestamps) for bias in tracked_bias] + for bias_df in tracked_bias_dfs: + self._apply_feature_mask_to_dataframe(bias_df) + self._feature_mask = None return x_bias, tracked_losses, tracked_bias_dfs def draw_samples(self, x: np.array) -> np.array: @@ -215,5 +226,63 @@ def update_x_bias(self, x: tf.Variable, x_bias: tf.Variable) -> BIAS_RETURN_TYPE # differentiate w.r.t. x_bias grad = grad_tape.gradient(loss_full, x_bias) + if self._feature_mask is not None: + grad = grad * self._feature_mask self.opt.apply_gradients(zip([grad], [x_bias])) + if self._feature_mask is not None: + x_bias.assign(x_bias * self._feature_mask) return x_bias, (loss_1, loss_2, loss_full), grad + + def _build_feature_mask(self, feature_names: pd.Index) -> Optional[tf.Tensor]: + """Create mask to zero gradients for ignored features.""" + if not self.ignore_features: + self._ignored_columns = set() + return None + mask = np.ones((1, len(feature_names)), dtype='float32') + ignored_columns: Set[str] = set() + matched_patterns: Set[str] = set() + + for idx, name in enumerate(feature_names): + for pattern in self.ignore_features: + if fnmatch(name, pattern): + mask[0, idx] = 0.0 + ignored_columns.add(name) + matched_patterns.add(pattern) + break + + self._ignored_columns = ignored_columns + if np.all(mask == 1.0): + if unmatched := sorted(set(self.ignore_features) - matched_patterns): + logger.warning( + 'Configured features to ignore not found in input data: %s', + ', '.join(unmatched) + ) + return None + + if ignored_columns: + logger.info( + 'Ignoring %s feature(s) during ARCANA optimisation: %s', + len(ignored_columns), + ', '.join(sorted(ignored_columns)) + ) + + if unmatched := sorted(set(self.ignore_features) - matched_patterns): + logger.warning( + 'Configured features to ignore not found in input data: %s', + ', '.join(unmatched) + ) + + return tf.constant(mask, dtype=tf.float32) + + def _apply_feature_mask(self, x_bias: tf.Variable) -> None: + """Ensure ignored features remain unchanged.""" + if self._feature_mask is not None: + x_bias.assign(x_bias * self._feature_mask) + + def _apply_feature_mask_to_dataframe(self, df: pd.DataFrame) -> None: + """Zero-out ignored feature values in a DataFrame result.""" + if not self._ignored_columns: + return + intersection = self._ignored_columns.intersection(df.columns) + if intersection: + df.loc[:, list(intersection)] = 0.0 diff --git a/tests/config/test_config.py b/tests/config/test_config.py index bf29cba..21e754d 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -46,7 +46,8 @@ def test_init(self): self.assertDictEqual(conf.config_dict['root_cause_analysis'], {'alpha': 0.8, 'init_x_bias': 'recon', - 'num_iter': 200} + 'num_iter': 200, + 'ignore_features': ['windspeed', 'output_power']} ) def test_arcana_config(self): @@ -54,7 +55,8 @@ def test_arcana_config(self): self.assertTrue(conf.root_cause_analysis) self.assertDictEqual( conf.arcana_params, - {'alpha': 0.8, 'num_iter': 200, 'init_x_bias': 'recon'} + {'alpha': 0.8, 'num_iter': 200, 'init_x_bias': 'recon', + 'ignore_features': ['windspeed', 'output_power']} ) conf = Config(os.path.join(PROJECT_ROOT, './tests/test_data/test_config_no_rca.yaml')) diff --git a/tests/root_cause_analysis/test_arcana.py b/tests/root_cause_analysis/test_arcana.py index 4a25e2b..22e4194 100644 --- a/tests/root_cause_analysis/test_arcana.py +++ b/tests/root_cause_analysis/test_arcana.py @@ -104,3 +104,21 @@ def test_draw_samples(self): selection = arcana.draw_samples(x=self.data) self.assertTupleEqual(self.data[:-1].shape, self.data[selection].shape) + def test_ignore_features(self): + subset = self.data_frame.iloc[:20].copy() + subset.columns = [ + 'windspeed_avg', + 'output_power', + 'temperature', + 'pressure', + 'windspeed_peak', + 'other_feature' + ] + ignore_patterns = ['windspeed*', 'non_existing*'] + arcana = Arcana(model=self.ml_ae, num_iter=5, ignore_features=ignore_patterns) + bias, losses, _ = arcana.find_arcana_bias(subset, track_losses=True) + self.assertTrue((bias[['windspeed_avg', 'windspeed_peak']] == 0).all().all()) + self.assertEqual(arcana._ignored_columns, {'windspeed_avg', 'windspeed_peak'}) + # ensure optimisation still runs and logs losses + self.assertFalse(losses.empty) + diff --git a/tests/test_data/test_config.yaml b/tests/test_data/test_config.yaml index ee0f42e..23f0479 100644 --- a/tests/test_data/test_config.yaml +++ b/tests/test_data/test_config.yaml @@ -40,3 +40,6 @@ root_cause_analysis: alpha: 0.8 init_x_bias: recon num_iter: 200 + ignore_features: + - windspeed + - output_power From a086029c86dd32f04b895fba1cbc420eb5e3ca15 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 04:56:32 -0700 Subject: [PATCH 19/42] Mask ignored features in anomaly scoring and RCA --- docs/config_example.yaml | 4 ++ docs/usage_examples.rst | 24 +++++++ energy_fault_detector/config/config.py | 5 ++ energy_fault_detector/fault_detector.py | 33 +++++++++ .../root_cause_analysis/arcana.py | 68 ++++++++++++++++++- .../utils/feature_filters.py | 33 +++++++++ tests/config/test_config.py | 6 +- tests/root_cause_analysis/test_arcana.py | 18 +++++ tests/test_data/test_config.yaml | 3 + tests/utils/test_feature_filters.py | 37 ++++++++++ 10 files changed, 227 insertions(+), 4 deletions(-) create mode 100644 energy_fault_detector/utils/feature_filters.py create mode 100644 tests/utils/test_feature_filters.py diff --git a/docs/config_example.yaml b/docs/config_example.yaml index 3dd8c39..38ca43e 100644 --- a/docs/config_example.yaml +++ b/docs/config_example.yaml @@ -63,4 +63,8 @@ root_cause_analysis: # (optional) if not specified, no root_cause_analysis (ARC alpha: 0.8 init_x_bias: recon num_iter: 200 + ignore_features: + # Patterns apply to anomaly scoring and ARCANA. Wildcards such as "windspeed*" are supported. + - windspeed + - output_power diff --git a/docs/usage_examples.rst b/docs/usage_examples.rst index ed70e09..8e71056 100644 --- a/docs/usage_examples.rst +++ b/docs/usage_examples.rst @@ -98,6 +98,30 @@ algorithm. An example: .. include:: config_example.yaml :literal: +To keep specific sensors out of both anomaly scoring *and* ARCANA you can edit the configuration YAML that you pass to +:py:obj:`Config ` (for example ``configs/base_config.yaml`` or a copy of it). Inside +the ``root_cause_analysis`` section add or update the optional ``ignore_features`` list. Any column names listed here are +zeroed before the anomaly score is calculated and remain fixed during ARCANA optimisation, so they never trigger +anomalies and are never suggested as a root cause. + +Column names must match the data frame headers used during training or prediction. You can either provide the exact +column name or use Unix shell-style wildcards to target multiple columns at once (for example ``windspeed*`` matches +``windspeed_avg`` and ``windspeed_peak``). If a configured pattern does not match any columns a warning is logged so you +can correct the entry. + +.. code-block:: yaml + + root_cause_analysis: + ignore_features: + - windspeed* + - output_power + +When the configuration above is used, wind speed signals such as ``windspeed_avg`` are kept for model training but their +reconstruction error is ignored for anomaly scores and in ARCANA runs. The same YAML file can be supplied when calling +:py:meth:`FaultDetector.fit ` for training or +:py:meth:`FaultDetector.predict ` for inference so the same +set of exclusions is applied in both stages. + To update the configuration 'on the fly' (for example for hyperparameter optimization), you provide a new configuration dictionary via the ``update_config`` method: diff --git a/energy_fault_detector/config/config.py b/energy_fault_detector/config/config.py index 9722615..15df690 100644 --- a/energy_fault_detector/config/config.py +++ b/energy_fault_detector/config/config.py @@ -76,6 +76,11 @@ 'num_iter': {'type': 'integer', 'required': False}, 'epsilon': {'type': 'float', 'required': False}, 'verbose': {'type': 'boolean', 'required': False}, + 'ignore_features': { + 'type': 'list', + 'required': False, + 'schema': {'type': 'string'}, + }, } PREDICT_SCHEMA = { diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index d8c84cb..583ae84 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -17,6 +17,7 @@ from energy_fault_detector.config import Config from energy_fault_detector._logs import setup_logging from energy_fault_detector.core.fault_detection_result import FaultDetectionResult, ModelMetadata +from energy_fault_detector.utils.feature_filters import mask_ignored_features setup_logging(os.path.join(os.path.dirname(__file__), 'logging.yaml')) logger = logging.getLogger('energy_fault_detector') @@ -276,6 +277,7 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, else: x_predicted = self.autoencoder.predict(x_prepped, verbose=self.config.verbose) recon_error = self.autoencoder.get_reconstruction_error(x_prepped) + recon_error = self._mask_reconstruction_error(recon_error, 'anomaly scoring (prediction)') # inverse transform predictions so they are comparable to the raw data df_inverse_scaled_pred = self.data_preprocessor.inverse_transform(x_predicted) @@ -341,6 +343,7 @@ def predict_anomaly_score(self, sensor_data: pd.DataFrame) -> pd.Series: x_prepped = self.data_preprocessor.transform(sensor_data) recon_error = self.autoencoder.get_reconstruction_error(x_prepped) + recon_error = self._mask_reconstruction_error(recon_error, 'anomaly scoring (direct score)') scores = self.anomaly_score.transform(recon_error) return scores @@ -388,6 +391,7 @@ def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit # Fit score object only on normal data (all training + validation data) x_prepped_all = self.data_preprocessor.transform(x) deviations = self.autoencoder.get_reconstruction_error(x_prepped_all) + deviations = self._mask_reconstruction_error(deviations, 'anomaly scoring (training)') y_ = y.loc[deviations.index] self.anomaly_score.fit(deviations[y_.values]) # use series values for compatibility with a multi-index @@ -400,6 +404,7 @@ def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit x_val_all = x_prepped_all.sort_index().loc[x_val.index.min():] # including known anomalies re_val = self.autoencoder.get_reconstruction_error(x_val_all) + re_val = self._mask_reconstruction_error(re_val, 'anomaly scoring (validation)') scores = self.anomaly_score.transform(re_val) logger.info('Fit threshold.') @@ -410,3 +415,31 @@ def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit normal_index=y.loc[scores.index]) else: self.threshold_selector.fit(x=scores, y=y.loc[scores.index]) + + @property + def _ignore_feature_patterns(self) -> Tuple[str, ...]: + if self.config is None: + return tuple() + return tuple(self.config.arcana_params.get('ignore_features', [])) + + def _mask_reconstruction_error(self, recon_error: pd.DataFrame, context: str) -> pd.DataFrame: + """Zero-out ignored feature columns before scoring.""" + patterns = self._ignore_feature_patterns + if not patterns: + return recon_error + + masked, ignored_columns, unmatched = mask_ignored_features(recon_error, patterns) + if ignored_columns: + logger.info( + 'Ignoring %s feature(s) for %s: %s', + len(ignored_columns), + context, + ', '.join(sorted(ignored_columns)) + ) + if unmatched: + logger.warning( + 'Configured features to ignore not found in %s: %s', + context, + ', '.join(sorted(unmatched)) + ) + return masked diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index 3381beb..9dd956d 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -2,7 +2,7 @@ import logging -from typing import Tuple, List +from typing import Tuple, List, Optional, Set import numpy as np import pandas as pd @@ -13,6 +13,7 @@ from tensorflow.keras.optimizers import Adam from energy_fault_detector.core import Autoencoder +from energy_fault_detector.utils.feature_filters import resolve_ignored_columns logger = logging.getLogger('energy_fault_detector') @@ -72,7 +73,7 @@ class Arcana: def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: str = 'recon', alpha: float = 0.8, num_iter: int = 400, epsilon: float = 1e-8, verbose: bool = False, - max_sample_threshold: int = 1000, **kwargs): + max_sample_threshold: int = 1000, ignore_features: Optional[List[str]] = None, **kwargs): self.keras_model: AE_TYPE = model @@ -89,6 +90,9 @@ def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: st self.num_iter: int = num_iter self.verbose: bool = verbose self.max_sample_threshold = max_sample_threshold + self.ignore_features: Tuple[str, ...] = tuple(ignore_features or []) + self._feature_mask: Optional[tf.Tensor] = None + self._ignored_columns: Set[str] = set() def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bias: bool = False ) -> Tuple[pd.DataFrame, pd.DataFrame, List[pd.DataFrame]]: @@ -111,6 +115,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi """ feature_names = x.columns + self._feature_mask = self._build_feature_mask(feature_names) timestamps = x.index x = x.values.astype('float32') @@ -120,6 +125,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi x_bias = self.initialize_x_bias(x) x_bias = tf.Variable(x_bias, dtype=tf.float32) + self._apply_feature_mask(x_bias) tracked_losses = {'Combined Loss': [], 'Reconstruction Loss': [], 'Regularization Loss': [], 'Iteration': []} bias = x_bias.numpy() @@ -127,6 +133,7 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi tracked_bias = [bias] for i in range(self.num_iter): x_bias, losses, _ = self.update_x_bias(x, x_bias) + self._apply_feature_mask(x_bias) if i % 50 == 0: loss_1, loss_2, combined_loss = losses[0].numpy(), losses[1].numpy(), losses[2].numpy() if track_losses: @@ -141,11 +148,15 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi logger.info('%d Combined Loss: %.2f', i, combined_loss) x_bias = pd.DataFrame(data=x_bias.numpy(), columns=feature_names, index=timestamps) + self._apply_feature_mask_to_dataframe(x_bias) # return x_bias as a pandas DataFrame tracked_losses = pd.DataFrame(tracked_losses) tracked_losses = tracked_losses.set_index('Iteration') tracked_bias_dfs = [pd.DataFrame(data=bias, columns=feature_names, index=timestamps) for bias in tracked_bias] + for bias_df in tracked_bias_dfs: + self._apply_feature_mask_to_dataframe(bias_df) + self._feature_mask = None return x_bias, tracked_losses, tracked_bias_dfs def draw_samples(self, x: np.array) -> np.array: @@ -215,5 +226,58 @@ def update_x_bias(self, x: tf.Variable, x_bias: tf.Variable) -> BIAS_RETURN_TYPE # differentiate w.r.t. x_bias grad = grad_tape.gradient(loss_full, x_bias) + if self._feature_mask is not None: + grad = grad * self._feature_mask self.opt.apply_gradients(zip([grad], [x_bias])) + if self._feature_mask is not None: + x_bias.assign(x_bias * self._feature_mask) return x_bias, (loss_1, loss_2, loss_full), grad + + def _build_feature_mask(self, feature_names: pd.Index) -> Optional[tf.Tensor]: + """Create mask to zero gradients for ignored features.""" + if not self.ignore_features: + self._ignored_columns = set() + return None + mask = np.ones((1, len(feature_names)), dtype='float32') + ignored_columns, unmatched = resolve_ignored_columns(feature_names, self.ignore_features) + + if ignored_columns: + for idx, name in enumerate(feature_names): + if name in ignored_columns: + mask[0, idx] = 0.0 + + self._ignored_columns = ignored_columns + if np.all(mask == 1.0): + if unmatched: + logger.warning( + 'Configured features to ignore not found in input data: %s', + ', '.join(sorted(unmatched)) + ) + return None + + logger.info( + 'Ignoring %s feature(s) during ARCANA optimisation: %s', + len(ignored_columns), + ', '.join(sorted(ignored_columns)) + ) + + if unmatched: + logger.warning( + 'Configured features to ignore not found in input data: %s', + ', '.join(sorted(unmatched)) + ) + + return tf.constant(mask, dtype=tf.float32) + + def _apply_feature_mask(self, x_bias: tf.Variable) -> None: + """Ensure ignored features remain unchanged.""" + if self._feature_mask is not None: + x_bias.assign(x_bias * self._feature_mask) + + def _apply_feature_mask_to_dataframe(self, df: pd.DataFrame) -> None: + """Zero-out ignored feature values in a DataFrame result.""" + if not self._ignored_columns: + return + intersection = self._ignored_columns.intersection(df.columns) + if intersection: + df.loc[:, list(intersection)] = 0.0 diff --git a/energy_fault_detector/utils/feature_filters.py b/energy_fault_detector/utils/feature_filters.py new file mode 100644 index 0000000..aca76b0 --- /dev/null +++ b/energy_fault_detector/utils/feature_filters.py @@ -0,0 +1,33 @@ +"""Utilities for handling feature exclusion patterns.""" + +from fnmatch import fnmatch +from typing import Iterable, Set, Tuple + +import pandas as pd + + +def resolve_ignored_columns(columns: Iterable[str], patterns: Iterable[str]) -> Tuple[Set[str], Set[str]]: + """Resolve wildcard patterns against column names.""" + matched_columns: Set[str] = set() + matched_patterns: Set[str] = set() + + for name in columns: + for pattern in patterns: + if fnmatch(name, pattern): + matched_columns.add(name) + matched_patterns.add(pattern) + break + + unmatched_patterns = set(patterns) - matched_patterns + return matched_columns, unmatched_patterns + + +def mask_ignored_features(df: pd.DataFrame, patterns: Iterable[str]) -> Tuple[pd.DataFrame, Set[str], Set[str]]: + """Return a copy of *df* with ignored columns set to zero.""" + ignored_columns, unmatched_patterns = resolve_ignored_columns(df.columns, patterns) + if not ignored_columns: + return df, ignored_columns, unmatched_patterns + + masked = df.copy() + masked.loc[:, list(ignored_columns)] = 0.0 + return masked, ignored_columns, unmatched_patterns diff --git a/tests/config/test_config.py b/tests/config/test_config.py index bf29cba..21e754d 100644 --- a/tests/config/test_config.py +++ b/tests/config/test_config.py @@ -46,7 +46,8 @@ def test_init(self): self.assertDictEqual(conf.config_dict['root_cause_analysis'], {'alpha': 0.8, 'init_x_bias': 'recon', - 'num_iter': 200} + 'num_iter': 200, + 'ignore_features': ['windspeed', 'output_power']} ) def test_arcana_config(self): @@ -54,7 +55,8 @@ def test_arcana_config(self): self.assertTrue(conf.root_cause_analysis) self.assertDictEqual( conf.arcana_params, - {'alpha': 0.8, 'num_iter': 200, 'init_x_bias': 'recon'} + {'alpha': 0.8, 'num_iter': 200, 'init_x_bias': 'recon', + 'ignore_features': ['windspeed', 'output_power']} ) conf = Config(os.path.join(PROJECT_ROOT, './tests/test_data/test_config_no_rca.yaml')) diff --git a/tests/root_cause_analysis/test_arcana.py b/tests/root_cause_analysis/test_arcana.py index 4a25e2b..22e4194 100644 --- a/tests/root_cause_analysis/test_arcana.py +++ b/tests/root_cause_analysis/test_arcana.py @@ -104,3 +104,21 @@ def test_draw_samples(self): selection = arcana.draw_samples(x=self.data) self.assertTupleEqual(self.data[:-1].shape, self.data[selection].shape) + def test_ignore_features(self): + subset = self.data_frame.iloc[:20].copy() + subset.columns = [ + 'windspeed_avg', + 'output_power', + 'temperature', + 'pressure', + 'windspeed_peak', + 'other_feature' + ] + ignore_patterns = ['windspeed*', 'non_existing*'] + arcana = Arcana(model=self.ml_ae, num_iter=5, ignore_features=ignore_patterns) + bias, losses, _ = arcana.find_arcana_bias(subset, track_losses=True) + self.assertTrue((bias[['windspeed_avg', 'windspeed_peak']] == 0).all().all()) + self.assertEqual(arcana._ignored_columns, {'windspeed_avg', 'windspeed_peak'}) + # ensure optimisation still runs and logs losses + self.assertFalse(losses.empty) + diff --git a/tests/test_data/test_config.yaml b/tests/test_data/test_config.yaml index ee0f42e..23f0479 100644 --- a/tests/test_data/test_config.yaml +++ b/tests/test_data/test_config.yaml @@ -40,3 +40,6 @@ root_cause_analysis: alpha: 0.8 init_x_bias: recon num_iter: 200 + ignore_features: + - windspeed + - output_power diff --git a/tests/utils/test_feature_filters.py b/tests/utils/test_feature_filters.py new file mode 100644 index 0000000..571dfdf --- /dev/null +++ b/tests/utils/test_feature_filters.py @@ -0,0 +1,37 @@ +import pytest + +pd = pytest.importorskip('pandas') + +import pandas as pd + +from energy_fault_detector.utils.feature_filters import ( + mask_ignored_features, + resolve_ignored_columns, +) + + +def test_resolve_ignored_columns_matches_and_reports_unmatched(): + columns = ['windspeed_avg', 'output_power', 'temperature'] + patterns = ['windspeed*', 'non_existent'] + + ignored, unmatched = resolve_ignored_columns(columns, patterns) + + assert ignored == {'windspeed_avg'} + assert unmatched == {'non_existent'} + + +def test_mask_ignored_features_sets_columns_to_zero(): + df = pd.DataFrame({ + 'windspeed_avg': [1.0, -2.0], + 'temperature': [3.0, 4.0], + }) + + masked, ignored, unmatched = mask_ignored_features(df, ['windspeed*']) + + assert ignored == {'windspeed_avg'} + assert unmatched == set() + assert masked['windspeed_avg'].tolist() == [0.0, 0.0] + assert masked['temperature'].tolist() == [3.0, 4.0] + # original DataFrame is not modified + assert df['windspeed_avg'].tolist() == [1.0, -2.0] + From da0789c4f5d5df5ee1bdfcb26c50e90bb3aa9697 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 09:06:34 -0700 Subject: [PATCH 20/42] Save prediction outputs by asset --- .../quick_fault_detector.py | 45 ++++++++++++++++++- run_predict.py | 8 ++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index 61c4cd6..d33ace8 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -2,6 +2,7 @@ import os import logging +from pathlib import Path from typing import List, Optional, Tuple try: from typing import Literal @@ -36,7 +37,8 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio model_path: Optional[str] = None, model_directory: Optional[str] = None, model_subdir: Optional[str] = None, - model_name: Optional[str] = None + model_name: Optional[str] = None, + asset_name: Optional[str] = None ) -> Tuple[FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]]: """Analyzes provided data using an autoencoder based approach for identifying anomalies based on a learned normal behavior. Anomalies are then aggregated to events and further analyzed. @@ -99,6 +101,9 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio model_subdir (Optional[str]): Optional subdirectory inside ``model_directory`` that should be used when saving models. If not provided, a timestamp-based folder name is used. model_name (Optional[str]): Custom directory name for the saved model artifacts. Defaults to 'trained_model'. + asset_name (Optional[str]): Identifier of the asset whose data is being analysed. When running in prediction + mode this name is used to create a dedicated folder inside ``prediction_output``. If omitted the folder name + is inferred from ``csv_test_data_path`` when possible. Returns: Tuple(FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]): FaultDetectionResult object with the @@ -112,7 +117,8 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio - arcana_losses: DataFrame containing recorded values for all losses in ARCANA. None if ARCANA was not run. - tracked_bias: List of DataFrames. None if ARCANA was not run. - and the detected anomaly events as dataframe. + and the detected anomaly events as dataframe. When run in prediction mode, all DataFrames contained in the + ``FaultDetectionResult`` are additionally persisted as CSV files under ``prediction_output/``. When run in training mode, the tuple additionally contains ModelMetadata with information about the saved model. In prediction mode, the metadata element is None. @@ -188,6 +194,14 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio test_data=test_data, arcana_losses=arcana_loss_list, arcana_mean_importances=arcana_mean_importance_list, event_meta_data=event_meta_data, save_dir=save_dir) + if mode == 'predict': + _save_prediction_results( + prediction_results=prediction_results, + csv_test_data_path=csv_test_data_path, + save_dir=save_dir, + asset_name=asset_name, + ) + return prediction_results, event_meta_data, model_metadata @@ -209,3 +223,30 @@ def analyze_event(anomaly_detector: FaultDetector, event_data: pd.DataFrame, tra track_bias=False) importances_mean = calculate_mean_arcana_importances(bias_data=bias) return importances_mean, tracked_losses + + +def _save_prediction_results(prediction_results: FaultDetectionResult, + csv_test_data_path: Optional[str], + save_dir: Optional[str], + asset_name: Optional[str]) -> None: + """Persist prediction artefacts to disk when running in predict mode.""" + + base_directory: Path + if save_dir is not None: + base_directory = Path(save_dir) + elif csv_test_data_path is not None: + base_directory = Path(csv_test_data_path).resolve().parent + else: + base_directory = Path.cwd() + + inferred_asset_name: str + if asset_name: + inferred_asset_name = asset_name + elif csv_test_data_path is not None: + csv_path = Path(csv_test_data_path) + inferred_asset_name = csv_path.resolve().parent.name or csv_path.stem + else: + inferred_asset_name = 'asset' + + output_directory = base_directory / 'prediction_output' / inferred_asset_name + prediction_results.save(str(output_directory)) diff --git a/run_predict.py b/run_predict.py index 9390e13..1be296b 100644 --- a/run_predict.py +++ b/run_predict.py @@ -145,6 +145,13 @@ def build_argument_parser() -> argparse.ArgumentParser: action="store_true", help="Enable creation of additional debug plots during prediction.", ) + parser.add_argument( + "--asset-name", + help=( + "Optional identifier for the analysed asset. When provided, prediction CSV files are " + "written to a 'prediction_output/' directory." + ), + ) return parser @@ -192,6 +199,7 @@ def main() -> None: enable_debug_plots=args.debug_plots, mode="predict", model_path=args.model_path, + asset_name=args.asset_name, ) predicted_anomalies = prediction_results.predicted_anomalies From c7d851abe366974d36e84e692843ec5d5bab4afc Mon Sep 17 00:00:00 2001 From: teehari79 Date: Wed, 8 Oct 2025 21:24:55 -0700 Subject: [PATCH 21/42] Fix RCA ignore patterns and reset cumulative score --- energy_fault_detector/fault_detector.py | 3 +- .../root_cause_analysis/arcana.py | 71 +++---------------- 2 files changed, 10 insertions(+), 64 deletions(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 583ae84..0862522 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -309,8 +309,9 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, ) predicted_anomalies['anamoly_score'] = score_series.reindex(predicted_anomalies.index) predicted_anomalies['threshold_score'] = threshold_value + anomaly_flags = predicted_anomalies['anomaly'].astype(int) predicted_anomalies['cumulative_anamoly_score'] = ( - predicted_anomalies['anomaly'].astype(int).cumsum() + anomaly_flags.groupby((anomaly_flags == 0).cumsum()).cumsum() ) predicted_anomalies['anamolous fields'] = predicted_anomalies['anomaly'].map( lambda is_anomaly: '' diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index aedf3b6..1c46539 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -2,7 +2,6 @@ import logging -from fnmatch import fnmatch from typing import Tuple, List, Optional, Set import numpy as np @@ -94,8 +93,6 @@ def __init__(self, model: AE_TYPE, learning_rate: float = 0.001, init_x_bias: st self.ignore_features: Tuple[str, ...] = tuple(ignore_features or []) self._feature_mask: Optional[tf.Tensor] = None self._ignored_columns: Set[str] = set() - self.ignore_features: Set[str] = set(ignore_features or []) - self._feature_mask: Optional[tf.Tensor] = None def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bias: bool = False ) -> Tuple[pd.DataFrame, pd.DataFrame, List[pd.DataFrame]]: @@ -241,81 +238,31 @@ def _build_feature_mask(self, feature_names: pd.Index) -> Optional[tf.Tensor]: if not self.ignore_features: self._ignored_columns = set() return None + mask = np.ones((1, len(feature_names)), dtype='float32') ignored_columns, unmatched = resolve_ignored_columns(feature_names, self.ignore_features) if ignored_columns: + logger.info( + 'Ignoring %s feature(s) during ARCANA optimisation: %s', + len(ignored_columns), + ', '.join(sorted(ignored_columns)) + ) for idx, name in enumerate(feature_names): if name in ignored_columns: mask[0, idx] = 0.0 - self._ignored_columns = ignored_columns - if np.all(mask == 1.0): - if unmatched: - logger.warning( - 'Configured features to ignore not found in input data: %s', - ', '.join(sorted(unmatched)) - ) - return None - - logger.info( - 'Ignoring %s feature(s) during ARCANA optimisation: %s', - len(ignored_columns), - ', '.join(sorted(ignored_columns)) - ) - if unmatched: logger.warning( 'Configured features to ignore not found in input data: %s', ', '.join(sorted(unmatched)) ) - ignored_columns: Set[str] = set() - matched_patterns: Set[str] = set() - - for idx, name in enumerate(feature_names): - for pattern in self.ignore_features: - if fnmatch(name, pattern): - mask[0, idx] = 0.0 - ignored_columns.add(name) - matched_patterns.add(pattern) - break + self._ignored_columns = set(ignored_columns) - self._ignored_columns = ignored_columns if np.all(mask == 1.0): - if unmatched := sorted(set(self.ignore_features) - matched_patterns): - logger.warning( - 'Configured features to ignore not found in input data: %s', - ', '.join(unmatched) - ) return None - if ignored_columns: - logger.info( - 'Ignoring %s feature(s) during ARCANA optimisation: %s', - len(ignored_columns), - ', '.join(sorted(ignored_columns)) - ) - - if unmatched := sorted(set(self.ignore_features) - matched_patterns): - logger.warning( - 'Configured features to ignore not found in input data: %s', - ', '.join(unmatched) - ) - - return None - mask = np.ones((1, len(feature_names)), dtype='float32') - for idx, name in enumerate(feature_names): - if name in self.ignore_features: - mask[0, idx] = 0.0 - if np.all(mask == 1.0): - return None - ignored = sorted(set(feature_names).intersection(self.ignore_features)) - if ignored: - logger.info('Ignoring %s feature(s) during ARCANA optimisation: %s', len(ignored), ', '.join(ignored)) - not_found = self.ignore_features.difference(feature_names) - if not_found: - logger.warning('Configured features to ignore not found in input data: %s', ', '.join(sorted(not_found))) return tf.constant(mask, dtype=tf.float32) def _apply_feature_mask(self, x_bias: tf.Variable) -> None: @@ -327,9 +274,7 @@ def _apply_feature_mask_to_dataframe(self, df: pd.DataFrame) -> None: """Zero-out ignored feature values in a DataFrame result.""" if not self._ignored_columns: return + intersection = self._ignored_columns.intersection(df.columns) - if not self.ignore_features: - return - intersection = self.ignore_features.intersection(df.columns) if intersection: df.loc[:, list(intersection)] = 0.0 From 2dcdca411904d7b674c8b22b6401c5d18b216797 Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Thu, 9 Oct 2025 11:26:27 +0530 Subject: [PATCH 22/42] changed config --- docs/config_example.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/config_example.yaml b/docs/config_example.yaml index 4ae2cd2..8a16532 100644 --- a/docs/config_example.yaml +++ b/docs/config_example.yaml @@ -66,6 +66,8 @@ root_cause_analysis: # (optional) if not specified, no root_cause_analysis (ARC ignore_features: # Patterns apply to anomaly scoring and ARCANA. Wildcards such as "windspeed*" are supported. # list exact column names or use wildcards such as "windspeed*" - - windspeed - - output_power - + - wind_speed_59_avg + - power_58_avg + - wind_speed_60_avg + - wind_speed_61_avg + - power_62_avg \ No newline at end of file From 8b93bd7fcf7895567611492d4bf770368b45486e Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Thu, 9 Oct 2025 11:59:19 +0530 Subject: [PATCH 23/42] updated base_config.yaml with ignore features --- energy_fault_detector/base_config.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/energy_fault_detector/base_config.yaml b/energy_fault_detector/base_config.yaml index f708a7e..a2056a0 100644 --- a/energy_fault_detector/base_config.yaml +++ b/energy_fault_detector/base_config.yaml @@ -45,4 +45,12 @@ root_cause_analysis: alpha: 0.5 init_x_bias: recon num_iter: 1000 - verbose: true \ No newline at end of file + verbose: true + ignore_features: + # Patterns apply to anomaly scoring and ARCANA. Wildcards such as "windspeed*" are supported. + # list exact column names or use wildcards such as "windspeed*" + - wind_speed_59_avg + - power_58_avg + - wind_speed_60_avg + - wind_speed_61_avg + - power_62_avg \ No newline at end of file From 8432fec0e1d031223ec2ef437a4cd3ac8b9cba4f Mon Sep 17 00:00:00 2001 From: teehari79 Date: Thu, 9 Oct 2025 00:10:50 -0700 Subject: [PATCH 24/42] Make ignored feature matching case-insensitive --- energy_fault_detector/utils/feature_filters.py | 17 ++++++++++++++--- tests/utils/test_feature_filters.py | 10 ++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/energy_fault_detector/utils/feature_filters.py b/energy_fault_detector/utils/feature_filters.py index aca76b0..8ee17de 100644 --- a/energy_fault_detector/utils/feature_filters.py +++ b/energy_fault_detector/utils/feature_filters.py @@ -7,13 +7,24 @@ def resolve_ignored_columns(columns: Iterable[str], patterns: Iterable[str]) -> Tuple[Set[str], Set[str]]: - """Resolve wildcard patterns against column names.""" + """Resolve wildcard patterns against column names. + + Matching is case-insensitive to better support configuration values that + might not exactly match the column name casing. This avoids situations + where an ignored feature listed in the configuration (for example + ``power_58_avg``) would still appear in ARCANA results because the actual + column name differs only by letter case (such as ``Power_58_Avg``). + """ + matched_columns: Set[str] = set() matched_patterns: Set[str] = set() + normalised_patterns = [(pattern, pattern.lower()) for pattern in patterns] + for name in columns: - for pattern in patterns: - if fnmatch(name, pattern): + lower_name = name.lower() + for pattern, pattern_lower in normalised_patterns: + if fnmatch(name, pattern) or fnmatch(lower_name, pattern_lower): matched_columns.add(name) matched_patterns.add(pattern) break diff --git a/tests/utils/test_feature_filters.py b/tests/utils/test_feature_filters.py index 571dfdf..06df679 100644 --- a/tests/utils/test_feature_filters.py +++ b/tests/utils/test_feature_filters.py @@ -20,6 +20,16 @@ def test_resolve_ignored_columns_matches_and_reports_unmatched(): assert unmatched == {'non_existent'} +def test_resolve_ignored_columns_matches_case_insensitively(): + columns = ['Power_58_Avg', 'Other'] + patterns = ['power_58_avg'] + + ignored, unmatched = resolve_ignored_columns(columns, patterns) + + assert ignored == {'Power_58_Avg'} + assert unmatched == set() + + def test_mask_ignored_features_sets_columns_to_zero(): df = pd.DataFrame({ 'windspeed_avg': [1.0, -2.0], From d2917067cbff0691a8f7841769299a0b30a8b75f Mon Sep 17 00:00:00 2001 From: teehari79 Date: Thu, 9 Oct 2025 03:14:15 -0700 Subject: [PATCH 25/42] Ensure predict mode uses fallback config --- .../quick_fault_detection/quick_fault_detector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index d33ace8..ee0815d 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -12,6 +12,7 @@ import pandas as pd from energy_fault_detector._logs import setup_logging +from energy_fault_detector.config import Config from energy_fault_detector.fault_detector import FaultDetector from energy_fault_detector.utils.analysis import create_events from energy_fault_detector.root_cause_analysis.arcana_utils import calculate_mean_arcana_importances @@ -165,7 +166,9 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio root_cause_analysis = False else: logger.info('Loading pre-trained model from %s.', model_path) - anomaly_detector = FaultDetector() + fallback_config_path = Path(__file__).resolve().parent.parent / 'base_config.yaml' + fallback_config = Config(str(fallback_config_path)) + anomaly_detector = FaultDetector(config=fallback_config) anomaly_detector.load_models(model_path=model_path) root_cause_analysis = True From 68d16cf827f42b43e35c2859a880c7d2325bb040 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Fri, 10 Oct 2025 03:47:22 -0700 Subject: [PATCH 26/42] Ensure ARCANA respects ignore feature patterns --- energy_fault_detector/fault_detector.py | 25 ++++++-- tests/test_root_cause_ignore_features.py | 76 ++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 tests/test_root_cause_ignore_features.py diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 0862522..3e4973a 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -1,7 +1,7 @@ """Main fault detection class""" import logging -from typing import Optional, Any, Tuple, List +from typing import Optional, Any, Tuple, List, Iterable, Dict from datetime import datetime import os @@ -359,13 +359,15 @@ def predict_anomalies(self, scores: pd.Series, x_prepped: pd.DataFrame = None) - return predicted_anomalies def run_root_cause_analysis(self, sensor_data: pd.DataFrame, track_losses: bool = False, - track_bias: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, List[pd.DataFrame]]: + track_bias: bool = False, ignore_features: Optional[Iterable[str]] = None + ) -> Tuple[pd.DataFrame, pd.DataFrame, List[pd.DataFrame]]: """Run ARCANA Args: sensor_data: pandas DataFrame containing the sensor data which should be analyzed. track_losses: optional bool. If True the arcana losses will be tracked over the iterations track_bias: optional bool. If True the arcana bias will be tracked over the iterations + ignore_features: optional iterable of patterns identifying features to exclude from the analysis. Returns: Tuple of (pd.DataFrame, pd.DataFrame, List[pd.DataFrame]) df_arcana_bias: pandas dataframe containing the arcana bias. @@ -374,11 +376,22 @@ def run_root_cause_analysis(self, sensor_data: pd.DataFrame, track_losses: bool """ x_prepped = self.data_preprocessor.transform(sensor_data) - if self.config is None: - # backwards compatibility, old models did not save config, just use default parameters - rca = Arcana(model=self.autoencoder) + arcana_kwargs: Dict[str, Any] = {} + if self.config is not None: + arcana_kwargs.update(self.config.arcana_params) + + configured_ignore_patterns: Tuple[str, ...] + if ignore_features is not None: + configured_ignore_patterns = tuple(ignore_features) else: - rca = Arcana(model=self.autoencoder, **self.config.arcana_params) + configured_ignore_patterns = tuple( + arcana_kwargs.get('ignore_features') or self._ignore_feature_patterns + ) + + if configured_ignore_patterns: + arcana_kwargs['ignore_features'] = list(configured_ignore_patterns) + + rca = Arcana(model=self.autoencoder, **arcana_kwargs) df_arcana_bias, arcana_losses, tracked_bias = rca.find_arcana_bias(x=x_prepped, track_losses=track_losses, diff --git a/tests/test_root_cause_ignore_features.py b/tests/test_root_cause_ignore_features.py new file mode 100644 index 0000000..bdd1e66 --- /dev/null +++ b/tests/test_root_cause_ignore_features.py @@ -0,0 +1,76 @@ +import pytest + +pd = pytest.importorskip('pandas') + +from energy_fault_detector.config import Config +from energy_fault_detector.fault_detector import FaultDetector + + +class _IdentityPreprocessor: + def transform(self, data): + return data + + +def _build_config(ignore_features=None): + config_dict = { + 'train': { + 'anomaly_score': {'name': 'rmse', 'params': {}}, + 'autoencoder': {'name': 'default', 'params': {'layers': [1]}}, + 'data_preprocessor': {'params': {}}, + 'threshold_selector': {'name': 'quantile', 'params': {}}, + }, + 'root_cause_analysis': {}, + } + if ignore_features is not None: + config_dict['root_cause_analysis']['ignore_features'] = ignore_features + return Config(config_dict=config_dict) + + +def _build_fault_detector(config): + detector = FaultDetector.__new__(FaultDetector) + detector.config = config + detector.data_preprocessor = _IdentityPreprocessor() + detector.autoencoder = object() + return detector + + +def test_run_root_cause_analysis_uses_configured_ignore_features(monkeypatch): + config = _build_config(ignore_features=['temp_*']) + detector = _build_fault_detector(config) + + captured_kwargs = {} + + class DummyArcana: + def __init__(self, model, **kwargs): + captured_kwargs.update(kwargs) + + def find_arcana_bias(self, x, track_losses, track_bias): + return x.copy(), pd.DataFrame(), [] + + monkeypatch.setattr('energy_fault_detector.fault_detector.Arcana', DummyArcana) + + sensor_data = pd.DataFrame({'temp_a': [1.0], 'temp_b': [2.0]}) + detector.run_root_cause_analysis(sensor_data=sensor_data) + + assert captured_kwargs.get('ignore_features') == ['temp_*'] + + +def test_run_root_cause_analysis_allows_ignore_feature_override(monkeypatch): + config = _build_config(ignore_features=['temp_*']) + detector = _build_fault_detector(config) + + captured_kwargs = {} + + class DummyArcana: + def __init__(self, model, **kwargs): + captured_kwargs.update(kwargs) + + def find_arcana_bias(self, x, track_losses, track_bias): + return x.copy(), pd.DataFrame(), [] + + monkeypatch.setattr('energy_fault_detector.fault_detector.Arcana', DummyArcana) + + sensor_data = pd.DataFrame({'temp_a': [1.0], 'temp_b': [2.0]}) + detector.run_root_cause_analysis(sensor_data=sensor_data, ignore_features=['pressure_*']) + + assert captured_kwargs.get('ignore_features') == ['pressure_*'] From 3f6d7c74065f51e01dde6a630e794898447bf7e3 Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Fri, 10 Oct 2025 16:18:16 +0530 Subject: [PATCH 27/42] Added print statements for debugging --- energy_fault_detector/fault_detector.py | 4 +++- .../quick_fault_detection/quick_fault_detector.py | 1 + energy_fault_detector/root_cause_analysis/arcana.py | 5 +++++ energy_fault_detector/utils/analysis.py | 3 +++ 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 0862522..86c2f19 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -374,15 +374,17 @@ def run_root_cause_analysis(self, sensor_data: pd.DataFrame, track_losses: bool """ x_prepped = self.data_preprocessor.transform(sensor_data) + print("Arcana params:",self.config) if self.config is None: # backwards compatibility, old models did not save config, just use default parameters rca = Arcana(model=self.autoencoder) else: rca = Arcana(model=self.autoencoder, **self.config.arcana_params) - + print("Before finding Arcana bias") df_arcana_bias, arcana_losses, tracked_bias = rca.find_arcana_bias(x=x_prepped, track_losses=track_losses, track_bias=track_bias) + print("After finding Arcana bias") return df_arcana_bias, arcana_losses, tracked_bias def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit_on_validation: bool = False diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index ee0815d..6b42e3a 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -168,6 +168,7 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio logger.info('Loading pre-trained model from %s.', model_path) fallback_config_path = Path(__file__).resolve().parent.parent / 'base_config.yaml' fallback_config = Config(str(fallback_config_path)) + print("Fallback config:",fallback_config) anomaly_detector = FaultDetector(config=fallback_config) anomaly_detector.load_models(model_path=model_path) root_cause_analysis = True diff --git a/energy_fault_detector/root_cause_analysis/arcana.py b/energy_fault_detector/root_cause_analysis/arcana.py index 1c46539..7888279 100644 --- a/energy_fault_detector/root_cause_analysis/arcana.py +++ b/energy_fault_detector/root_cause_analysis/arcana.py @@ -115,7 +115,9 @@ def find_arcana_bias(self, x: pd.DataFrame, track_losses: bool = False, track_bi """ feature_names = x.columns + print("Before build feature mask") self._feature_mask = self._build_feature_mask(feature_names) + print("After build feature mask") timestamps = x.index x = x.values.astype('float32') @@ -235,12 +237,15 @@ def update_x_bias(self, x: tf.Variable, x_bias: tf.Variable) -> BIAS_RETURN_TYPE def _build_feature_mask(self, feature_names: pd.Index) -> Optional[tf.Tensor]: """Create mask to zero gradients for ignored features.""" + print("Ignore features set?",self.ignore_features) if not self.ignore_features: self._ignored_columns = set() return None mask = np.ones((1, len(feature_names)), dtype='float32') + print("Before ignoring columns") ignored_columns, unmatched = resolve_ignored_columns(feature_names, self.ignore_features) + print("ignored columns are:",ignored_columns) if ignored_columns: logger.info( diff --git a/energy_fault_detector/utils/analysis.py b/energy_fault_detector/utils/analysis.py index c45a44d..6bb9bd0 100644 --- a/energy_fault_detector/utils/analysis.py +++ b/energy_fault_detector/utils/analysis.py @@ -24,11 +24,14 @@ def create_events(sensor_data: pd.DataFrame, boolean_information: pd.Series, """ # Create a boolean mask for consecutive True values mask = (boolean_information != boolean_information.shift()).cumsum() + print("mask:",mask) # Group by the mask and filter groups where bool_series is True and has more # than consecutive_true_value_threshold consecutive True bool_mask = boolean_information.groupby(mask).transform(lambda data: data.all()).fillna(False) + print("Boolean mask:",mask) grouped_sensor_data = sensor_data[bool_mask].groupby(mask) + print("Grouped sensor data:",grouped_sensor_data) event_data = [group[1] for group in grouped_sensor_data if len(group[1]) >= min_event_length] event_meta_data = pd.DataFrame() From c3cffda5a52bf7e589df0cab01e2e0b01c208dd6 Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Fri, 10 Oct 2025 19:06:01 +0530 Subject: [PATCH 28/42] Updated fault_detector --- energy_fault_detector/fault_detector.py | 14 +++++++------- run_predict.py | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 47c222a..f30901d 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -375,12 +375,12 @@ def run_root_cause_analysis(self, sensor_data: pd.DataFrame, track_losses: bool """ x_prepped = self.data_preprocessor.transform(sensor_data) - print("Arcana params:",self.config) - if self.config is None: - # backwards compatibility, old models did not save config, just use default parameters - rca = Arcana(model=self.autoencoder) - else: - rca = Arcana(model=self.autoencoder, **self.config.arcana_params) + # print("Arcana params:",self.config) + # if self.config is None: + # # backwards compatibility, old models did not save config, just use default parameters + # rca = Arcana(model=self.autoencoder) + # else: + # rca = Arcana(model=self.autoencoder, **self.config.arcana_params) print("Before finding Arcana bias") arcana_kwargs: Dict[str, Any] = {} if self.config is not None: @@ -402,7 +402,7 @@ def run_root_cause_analysis(self, sensor_data: pd.DataFrame, track_losses: bool df_arcana_bias, arcana_losses, tracked_bias = rca.find_arcana_bias(x=x_prepped, track_losses=track_losses, track_bias=track_bias) - print("After finding Arcana bias") + # print("After finding Arcana bias", **arcana_kwargs) return df_arcana_bias, arcana_losses, tracked_bias def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit_on_validation: bool = False diff --git a/run_predict.py b/run_predict.py index 1be296b..1cff4c8 100644 --- a/run_predict.py +++ b/run_predict.py @@ -30,13 +30,14 @@ DEFAULT_MODEL_PATH = ( r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\models" - r"\asset_0\20251007_154309" + r"\asset_0\20251008_222632" ) DEFAULT_PREDICT_DATA_PATH = ( r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files" r"\predict_5.csv" ) DEFAULT_TIME_COLUMN = "time_stamp" +DEFAULT_ASSET = "asset_5" def _parse_mapping(value: Optional[str]) -> Optional[Dict[str, Any]]: @@ -147,6 +148,7 @@ def build_argument_parser() -> argparse.ArgumentParser: ) parser.add_argument( "--asset-name", + default = DEFAULT_ASSET, help=( "Optional identifier for the analysed asset. When provided, prediction CSV files are " "written to a 'prediction_output/' directory." From cde8346ecf2339bebff8505da8875b5138c1ec4e Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Fri, 10 Oct 2025 19:24:06 +0530 Subject: [PATCH 29/42] print arcana args --- energy_fault_detector/fault_detector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index f30901d..933d4c8 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -399,6 +399,8 @@ def run_root_cause_analysis(self, sensor_data: pd.DataFrame, track_losses: bool rca = Arcana(model=self.autoencoder, **arcana_kwargs) + print("Arcana args:", arcana_kwargs) + df_arcana_bias, arcana_losses, tracked_bias = rca.find_arcana_bias(x=x_prepped, track_losses=track_losses, track_bias=track_bias) From 832044d7de6efcc7274502f566afcfaf92d27e7a Mon Sep 17 00:00:00 2001 From: teehari79 Date: Fri, 10 Oct 2025 07:04:03 -0700 Subject: [PATCH 30/42] Allow overriding ignored features during prediction --- energy_fault_detector/fault_detector.py | 31 +++++++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 933d4c8..5bbc2c5 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -236,7 +236,8 @@ def tune(self, sensor_data: pd.DataFrame, normal_index: pd.Series, def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, root_cause_analysis: bool = False, track_losses: bool = False, - track_bias: bool = False) -> FaultDetectionResult: + track_bias: bool = False, ignore_features: Optional[Iterable[str]] = None + ) -> FaultDetectionResult: """Predict with given models for a specific asset Args: @@ -247,6 +248,8 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, root_cause_analysis (bool, optional): Whether to run ARCANA. Defaults to False. track_losses (bool, optional): Optional; if True, ARCANA losses will be tracked over the iterations. Defaults to False. track_bias (bool, optional): Optional; if True, ARCANA bias will be tracked over the iterations. Defaults to False. + ignore_features (Optional[Iterable[str]], optional): Iterable of patterns identifying features that should be + excluded from anomaly scoring and root cause analysis. Overrides the configuration defaults when provided. Returns: FaultDetectionResult: with the following DataFrames: @@ -275,8 +278,16 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, x_predicted = x_predicted[column_order] else: x_predicted = self.autoencoder.predict(x_prepped, verbose=self.config.verbose) + configured_ignore_patterns: Optional[Tuple[str, ...]] = None + if ignore_features is not None: + configured_ignore_patterns = tuple(ignore_features) + recon_error = self.autoencoder.get_reconstruction_error(x_prepped) - recon_error = self._mask_reconstruction_error(recon_error, 'anomaly scoring (prediction)') + recon_error = self._mask_reconstruction_error( + recon_error, + 'anomaly scoring (prediction)', + ignore_feature_patterns=configured_ignore_patterns, + ) # inverse transform predictions so they are comparable to the raw data df_inverse_scaled_pred = self.data_preprocessor.inverse_transform(x_predicted) @@ -322,7 +333,8 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, logger.info('Run root cause analysis.') df_arcana_bias, arcana_losses, tracked_bias = self.run_root_cause_analysis(sensor_data=sensor_data, track_losses=track_losses, - track_bias=track_bias) + track_bias=track_bias, + ignore_features=ignore_features) else: df_arcana_bias = None arcana_losses = None @@ -445,9 +457,18 @@ def _ignore_feature_patterns(self) -> Tuple[str, ...]: return tuple() return tuple(self.config.arcana_params.get('ignore_features', [])) - def _mask_reconstruction_error(self, recon_error: pd.DataFrame, context: str) -> pd.DataFrame: + def _mask_reconstruction_error( + self, + recon_error: pd.DataFrame, + context: str, + ignore_feature_patterns: Optional[Iterable[str]] = None, + ) -> pd.DataFrame: """Zero-out ignored feature columns before scoring.""" - patterns = self._ignore_feature_patterns + if ignore_feature_patterns is None: + patterns = self._ignore_feature_patterns + else: + patterns = tuple(ignore_feature_patterns) + if not patterns: return recon_error From 233ee34779c4719a16a993398c90d712deced4d3 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Fri, 10 Oct 2025 10:06:26 -0700 Subject: [PATCH 31/42] Ensure predict propagates ignore features --- energy_fault_detector/fault_detector.py | 7 +++- tests/test_root_cause_ignore_features.py | 47 +++++++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 5bbc2c5..214719d 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -278,9 +278,12 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, x_predicted = x_predicted[column_order] else: x_predicted = self.autoencoder.predict(x_prepped, verbose=self.config.verbose) - configured_ignore_patterns: Optional[Tuple[str, ...]] = None + configured_ignore_patterns: Optional[Tuple[str, ...]] if ignore_features is not None: configured_ignore_patterns = tuple(ignore_features) + else: + config_patterns = self._ignore_feature_patterns + configured_ignore_patterns = config_patterns or None recon_error = self.autoencoder.get_reconstruction_error(x_prepped) recon_error = self._mask_reconstruction_error( @@ -334,7 +337,7 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, df_arcana_bias, arcana_losses, tracked_bias = self.run_root_cause_analysis(sensor_data=sensor_data, track_losses=track_losses, track_bias=track_bias, - ignore_features=ignore_features) + ignore_features=configured_ignore_patterns) else: df_arcana_bias = None arcana_losses = None diff --git a/tests/test_root_cause_ignore_features.py b/tests/test_root_cause_ignore_features.py index bdd1e66..3f3fea8 100644 --- a/tests/test_root_cause_ignore_features.py +++ b/tests/test_root_cause_ignore_features.py @@ -10,6 +10,31 @@ class _IdentityPreprocessor: def transform(self, data): return data + def inverse_transform(self, data): + return data + + +class _IdentityAutoencoder: + conditional_features = () + + def predict(self, x, return_conditions=False, verbose=None): + return x + + def get_reconstruction_error(self, x): + return pd.DataFrame(0.0, index=x.index, columns=x.columns) + + +class _ZeroAnomalyScore: + def transform(self, recon_error): + return pd.Series(0.0, index=recon_error.index) + + +class _ZeroThresholdSelector: + threshold = 0.0 + + def predict(self, scores): + return pd.Series(False, index=scores.index) + def _build_config(ignore_features=None): config_dict = { @@ -30,7 +55,9 @@ def _build_fault_detector(config): detector = FaultDetector.__new__(FaultDetector) detector.config = config detector.data_preprocessor = _IdentityPreprocessor() - detector.autoencoder = object() + detector.autoencoder = _IdentityAutoencoder() + detector.anomaly_score = _ZeroAnomalyScore() + detector.threshold_selector = _ZeroThresholdSelector() return detector @@ -74,3 +101,21 @@ def find_arcana_bias(self, x, track_losses, track_bias): detector.run_root_cause_analysis(sensor_data=sensor_data, ignore_features=['pressure_*']) assert captured_kwargs.get('ignore_features') == ['pressure_*'] + + +def test_predict_passes_configured_ignore_features(monkeypatch): + config = _build_config(ignore_features=['temp_*']) + detector = _build_fault_detector(config) + + captured_kwargs = {} + + def _run_root_cause_analysis(self, sensor_data, track_losses, track_bias, ignore_features=None): + captured_kwargs['ignore_features'] = ignore_features + return pd.DataFrame(), pd.DataFrame(), [] + + monkeypatch.setattr(FaultDetector, 'run_root_cause_analysis', _run_root_cause_analysis) + + sensor_data = pd.DataFrame({'temp_a': [1.0, 2.0], 'temp_b': [3.0, 4.0]}) + detector.predict(sensor_data=sensor_data, root_cause_analysis=True) + + assert list(captured_kwargs.get('ignore_features')) == ['temp_*'] From 6aca67504f08577993950807c0e43d348d08dc5c Mon Sep 17 00:00:00 2001 From: teehari79 Date: Fri, 10 Oct 2025 10:38:50 -0700 Subject: [PATCH 32/42] Preserve ignore feature configuration when loading models --- .../core/fault_detection_model.py | 19 ++++++++++++--- tests/test_fault_detector.py | 23 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/energy_fault_detector/core/fault_detection_model.py b/energy_fault_detector/core/fault_detection_model.py index b4ee8a3..2933dd7 100644 --- a/energy_fault_detector/core/fault_detection_model.py +++ b/energy_fault_detector/core/fault_detection_model.py @@ -209,9 +209,22 @@ def load_models(self, model_path: str, load_threshold: bool = True) -> None: model_directory=os.path.join(model_path, SCORE_DIR) ) # for backwards compatibility - check whether config was saved: - if os.path.exists(os.path.join(model_path, 'config.yaml')): - self.config = Config(os.path.join(model_path, 'config.yaml')) - self._model_factory = ModelFactory(self.config) + config_path = os.path.join(model_path, 'config.yaml') + if os.path.exists(config_path): + loaded_config = Config(config_path) + if self.config is None: + self.config = loaded_config + else: + new_config_dict = loaded_config.config_dict.copy() + existing_rca = self.config.config_dict.get('root_cause_analysis', {}) + if existing_rca: + new_rca = new_config_dict.setdefault('root_cause_analysis', {}) + if 'ignore_features' not in new_rca and 'ignore_features' in existing_rca: + new_rca['ignore_features'] = existing_rca['ignore_features'] + self.config.update_config(new_config_dict) + self.config.configuration_file = config_path + if self._model_factory is None: + self._model_factory = ModelFactory(self.config) @staticmethod def _load_pickled_model(model_type: str, model_directory: str): diff --git a/tests/test_fault_detector.py b/tests/test_fault_detector.py index 3432009..811a77a 100644 --- a/tests/test_fault_detector.py +++ b/tests/test_fault_detector.py @@ -1,4 +1,5 @@ +import copy import os import unittest from unittest.mock import patch, MagicMock @@ -10,6 +11,7 @@ import numpy as np from energy_fault_detector.fault_detector import FaultDetector, Config +from energy_fault_detector.core.fault_detection_model import FaultDetectionModel mock_autoencoder = MagicMock() mock_data_preprocessor = MagicMock() @@ -158,6 +160,27 @@ def test_load_models(self): self.assertEqual(call_args[1]['model_type'], name) self.assertEqual(call_args[1]['model_directory'], os.path.join('path_to_saved_models', name)) + def test_load_models_preserves_ignore_features(self): + fault_detector = self._create_fault_detector(self.conf) + existing_factory = fault_detector._model_factory + existing_ignore = tuple(fault_detector.config.arcana_params.get('ignore_features', [])) + + with tempfile.TemporaryDirectory() as model_dir: + for subdir in ['data_preprocessor', 'autoencoder', 'threshold_selector', 'anomaly_score']: + os.makedirs(os.path.join(model_dir, subdir), exist_ok=True) + + config_without_ignore = copy.deepcopy(self.conf.config_dict) + config_without_ignore.setdefault('root_cause_analysis', {}).pop('ignore_features', None) + temp_config = Config(config_dict=config_without_ignore) + temp_config.write_config(os.path.join(model_dir, 'config.yaml'), overwrite=True) + + load_side_effects = ['prep', 'ae', 'threshold', 'score'] + with patch.object(FaultDetectionModel, '_load_pickled_model', side_effect=load_side_effects): + fault_detector.load_models(model_path=model_dir) + + self.assertIs(fault_detector._model_factory, existing_factory) + self.assertEqual(tuple(fault_detector.config.arcana_params.get('ignore_features', [])), existing_ignore) + def test_train(self): self.conf.write_config = MagicMock() fault_detector = self._create_fault_detector(self.conf) From 0f2ae75e584c9388931fd6ef565c90c56dd744ac Mon Sep 17 00:00:00 2001 From: teehari79 Date: Fri, 10 Oct 2025 11:30:43 -0700 Subject: [PATCH 33/42] Fix create_events boolean alignment --- energy_fault_detector/utils/analysis.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/energy_fault_detector/utils/analysis.py b/energy_fault_detector/utils/analysis.py index 6bb9bd0..c560562 100644 --- a/energy_fault_detector/utils/analysis.py +++ b/energy_fault_detector/utils/analysis.py @@ -22,16 +22,20 @@ def create_events(sensor_data: pd.DataFrame, boolean_information: pd.Series, - event_meta_data (pd.DataFrame): A DataFrame with columns 'start', 'end', and 'duration' for each event. - event_data (List[pd.DataFrame]): A list of DataFrames corresponding to the sensor data during the defined events. """ + # Ensure the boolean information uses the same index as the sensor data. When running predictions on + # different assets we encountered cases where the boolean series used a different index. Pandas silently + # reindexes boolean masks during ``DataFrame.__getitem__`` which, in this case, resulted in out of bounds + # indices and raised an ``IndexError``. Aligning the series to the sensor data index avoids the + # misalignment and guarantees a pure boolean mask of equal length. + boolean_information = boolean_information.reindex(sensor_data.index, fill_value=False) + # Create a boolean mask for consecutive True values mask = (boolean_information != boolean_information.shift()).cumsum() - print("mask:",mask) # Group by the mask and filter groups where bool_series is True and has more # than consecutive_true_value_threshold consecutive True bool_mask = boolean_information.groupby(mask).transform(lambda data: data.all()).fillna(False) - print("Boolean mask:",mask) grouped_sensor_data = sensor_data[bool_mask].groupby(mask) - print("Grouped sensor data:",grouped_sensor_data) event_data = [group[1] for group in grouped_sensor_data if len(group[1]) >= min_event_length] event_meta_data = pd.DataFrame() From bbd405ab02324fd3bb6922ee5d62496df6742739 Mon Sep 17 00:00:00 2001 From: hari-humanize Date: Sat, 11 Oct 2025 22:23:39 +0530 Subject: [PATCH 34/42] Added print statements for troubleshooting... Moved data splitter under root folder EnergyFaultDetector --- ..._dataset_splitter.py => asset_dataset_splitter.py | 0 energy_fault_detector/base_config.yaml | 2 +- energy_fault_detector/fault_detector.py | 7 +++++++ .../quick_fault_detection/quick_fault_detector.py | 2 ++ energy_fault_detector/utils/analysis.py | 12 ++++++++++++ run_predict.py | 4 ++-- 6 files changed, 24 insertions(+), 3 deletions(-) rename energy_fault_detector/utils/asset_dataset_splitter.py => asset_dataset_splitter.py (100%) diff --git a/energy_fault_detector/utils/asset_dataset_splitter.py b/asset_dataset_splitter.py similarity index 100% rename from energy_fault_detector/utils/asset_dataset_splitter.py rename to asset_dataset_splitter.py diff --git a/energy_fault_detector/base_config.yaml b/energy_fault_detector/base_config.yaml index a2056a0..e5fb253 100644 --- a/energy_fault_detector/base_config.yaml +++ b/energy_fault_detector/base_config.yaml @@ -44,7 +44,7 @@ train: root_cause_analysis: alpha: 0.5 init_x_bias: recon - num_iter: 1000 + num_iter: 900 verbose: true ignore_features: # Patterns apply to anomaly scoring and ARCANA. Wildcards such as "windspeed*" are supported. diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 214719d..71dd39c 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -272,6 +272,7 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, x_prepped = self.data_preprocessor.transform(x).sort_index() column_order = x_prepped.columns + print("ignore_features before predict:", self._ignore_feature_patterns) if hasattr(self.autoencoder, 'conditional_features'): x_predicted = self.autoencoder.predict(x_prepped, return_conditions=True, verbose=self.config.verbose) @@ -279,11 +280,16 @@ def predict(self, sensor_data: pd.DataFrame, model_path: Optional[str] = None, else: x_predicted = self.autoencoder.predict(x_prepped, verbose=self.config.verbose) configured_ignore_patterns: Optional[Tuple[str, ...]] + + print("ignore_features:", self._ignore_feature_patterns) if ignore_features is not None: configured_ignore_patterns = tuple(ignore_features) else: + print("config_patterns:", self._ignore_feature_patterns) config_patterns = self._ignore_feature_patterns configured_ignore_patterns = config_patterns or None + + print("configured_ignore_patterns:", configured_ignore_patterns) recon_error = self.autoencoder.get_reconstruction_error(x_prepped) recon_error = self._mask_reconstruction_error( @@ -458,6 +464,7 @@ def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit def _ignore_feature_patterns(self) -> Tuple[str, ...]: if self.config is None: return tuple() + print("ignore_features:", self.config.arcana_params) return tuple(self.config.arcana_params.get('ignore_features', [])) def _mask_reconstruction_error( diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index 6b42e3a..5efc5a7 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -170,7 +170,9 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio fallback_config = Config(str(fallback_config_path)) print("Fallback config:",fallback_config) anomaly_detector = FaultDetector(config=fallback_config) + print("anomaly_detector:",anomaly_detector.config.arcana_params) anomaly_detector.load_models(model_path=model_path) + print("anomaly_detector after load:",anomaly_detector.config.arcana_params) root_cause_analysis = True logger.info('Evaluating Test data based on the learned normal behavior.') diff --git a/energy_fault_detector/utils/analysis.py b/energy_fault_detector/utils/analysis.py index c560562..52b2cb1 100644 --- a/energy_fault_detector/utils/analysis.py +++ b/energy_fault_detector/utils/analysis.py @@ -27,6 +27,18 @@ def create_events(sensor_data: pd.DataFrame, boolean_information: pd.Series, # reindexes boolean masks during ``DataFrame.__getitem__`` which, in this case, resulted in out of bounds # indices and raised an ``IndexError``. Aligning the series to the sensor data index avoids the # misalignment and guarantees a pure boolean mask of equal length. + + + # + # print("boolean_information index:",boolean_information,boolean_information.index) + # print("min_event_length:",min_event_length) + # print("sensor_data index:",sensor_data,sensor_data.index) + sensor_data = sensor_data.groupby(level=0).mean() + if boolean_information.index.has_duplicates: + # ``Series.reindex`` does not support duplicate indices. Duplicate timestamps can appear when + # predictions are generated from concatenated batches. In this context we treat any duplicate + # timestamp as an event whenever at least one of the values is ``True``. + boolean_information = boolean_information.groupby(level=0).max() boolean_information = boolean_information.reindex(sensor_data.index, fill_value=False) # Create a boolean mask for consecutive True values diff --git a/run_predict.py b/run_predict.py index 1cff4c8..9360997 100644 --- a/run_predict.py +++ b/run_predict.py @@ -34,10 +34,10 @@ ) DEFAULT_PREDICT_DATA_PATH = ( r"D:\Personal\Ideas\Wind turbine\CARE_To_Compare\CARE_To_Compare\Wind Farm B\asset_files" - r"\predict_5.csv" + r"\predict_6.csv" ) DEFAULT_TIME_COLUMN = "time_stamp" -DEFAULT_ASSET = "asset_5" +DEFAULT_ASSET = "asset_6" def _parse_mapping(value: Optional[str]) -> Optional[Dict[str, Any]]: From cfbd6120a6d40e700b15a019620e28bf166dadf8 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Sat, 11 Oct 2025 11:20:08 -0700 Subject: [PATCH 35/42] Optimize asset dataset splitter memory usage --- asset_dataset_splitter.py | 109 +++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 36 deletions(-) diff --git a/asset_dataset_splitter.py b/asset_dataset_splitter.py index a508a1e..f5faf8c 100644 --- a/asset_dataset_splitter.py +++ b/asset_dataset_splitter.py @@ -23,9 +23,8 @@ from __future__ import annotations import argparse -from collections import defaultdict from pathlib import Path -from typing import Dict, Iterable, List +from typing import Dict, Iterable, MutableMapping import pandas as pd @@ -44,37 +43,42 @@ DROP_COLUMN_SUBSTRINGS = ("_max", "_min", "_std") -def _read_asset_frames(path: Path) -> Dict[str, List[pd.DataFrame]]: - """Read .csv files from `path` and group rows by asset_id, auto-detecting delimiter.""" - asset_frames: Dict[str, List[pd.DataFrame]] = defaultdict(list) +def _detect_delimiter(csv_file: Path) -> str: + """Return the detected delimiter for ``csv_file`` with ``;`` as fallback.""" - for csv_file in sorted(path.glob("*.csv")): - # Detect delimiter automatically - with open(csv_file, "r", encoding="utf-8", errors="ignore") as f: - sample = f.read(2048) # read a small chunk - sniffer = csv.Sniffer() - try: - dialect = sniffer.sniff(sample) - delimiter = dialect.delimiter - except csv.Error: - delimiter = ";" # fallback default + with open(csv_file, "r", encoding="utf-8", errors="ignore") as handle: + sample = handle.read(2048) + sniffer = csv.Sniffer() + try: + dialect = sniffer.sniff(sample) + return dialect.delimiter + except csv.Error: + return ";" - # Read CSV with detected delimiter - df = pd.read_csv(csv_file, sep=delimiter, dtype=str) - if "asset_id" not in df.columns: - raise ValueError(f"File '{csv_file}' does not contain required column 'asset_id'.") +def _iter_asset_frames( + csv_file: Path, *, chunksize: int | None = 100_000 +) -> Iterable[tuple[str, pd.DataFrame]]: + """Yield (asset_id, dataframe) pairs from ``csv_file`` without loading everything into memory.""" - for asset_id, group in df.groupby("asset_id", sort=False): - asset_frames[str(asset_id)].append(group.reset_index(drop=True)) + delimiter = _detect_delimiter(csv_file) + reader = pd.read_csv(csv_file, sep=delimiter, dtype=str, chunksize=chunksize) - return asset_frames + # ``pd.read_csv`` returns ``DataFrame`` when ``chunksize`` is ``None``. + if isinstance(reader, pd.DataFrame): + reader = [reader] + for chunk in reader: + if chunk.empty: + continue -def _combine_frames(frames: Iterable[pd.DataFrame]) -> pd.DataFrame: - """Combine a list of frames into a single data frame.""" + if "asset_id" not in chunk.columns: + raise ValueError( + f"File '{csv_file}' does not contain required column 'asset_id'." + ) - return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame() + for asset_id, group in chunk.groupby("asset_id", sort=False): + yield str(asset_id), group.reset_index(drop=True) def _clean_columns(df: pd.DataFrame) -> pd.DataFrame: @@ -123,8 +127,13 @@ def _build_predict_frame(df: pd.DataFrame) -> pd.DataFrame: return df[is_prediction | is_anomaly_status].copy() -def _save_asset_frames(asset_id: str, df: pd.DataFrame, output_dir: Path) -> None: - """Create train and prediction files for ``asset_id`` from ``df``.""" +def _append_asset_frames( + asset_id: str, + df: pd.DataFrame, + output_dir: Path, + header_written: MutableMapping[Path, bool], +) -> None: + """Append data for ``asset_id`` to the corresponding train/predict CSV files.""" if df.empty: return @@ -134,20 +143,41 @@ def _save_asset_frames(asset_id: str, df: pd.DataFrame, output_dir: Path) -> Non if not train_df.empty: train_path = output_dir / f"train_{asset_id}.csv" - train_df.to_csv(train_path, sep=";", index=False) + train_df.to_csv( + train_path, + sep=";", + index=False, + mode="a", + header=not header_written.get(train_path, False), + ) + header_written[train_path] = True if not predict_df.empty: predict_path = output_dir / f"predict_{asset_id}.csv" - predict_df.to_csv(predict_path, sep=";", index=False) - - -def split_asset_datasets(input_dir: Path, output_dir: Path | None = None) -> None: + predict_df.to_csv( + predict_path, + sep=";", + index=False, + mode="a", + header=not header_written.get(predict_path, False), + ) + header_written[predict_path] = True + + +def split_asset_datasets( + input_dir: Path, + output_dir: Path | None = None, + *, + chunksize: int | None = 100_000, +) -> None: """Split datasets per asset into train and prediction CSV files. Args: input_dir: Directory containing the source ``.csv`` files. output_dir: Optional directory to store the results. When ``None`` the input directory is used. + chunksize: Maximum number of rows per chunk read from the source files. + ``None`` loads the entire file at once. """ if not input_dir.is_dir(): @@ -156,10 +186,17 @@ def split_asset_datasets(input_dir: Path, output_dir: Path | None = None) -> Non output_dir = output_dir or input_dir output_dir.mkdir(parents=True, exist_ok=True) - asset_frames = _read_asset_frames(input_dir) - for asset_id, frames in asset_frames.items(): - asset_df = _combine_frames(frames) - _save_asset_frames(asset_id, asset_df, output_dir) + # Remove previously generated files to avoid appending to stale data. + for existing_file in output_dir.glob("train_*.csv"): + existing_file.unlink() + for existing_file in output_dir.glob("predict_*.csv"): + existing_file.unlink() + + header_written: Dict[Path, bool] = {} + + for csv_file in sorted(input_dir.glob("*.csv")): + for asset_id, asset_df in _iter_asset_frames(csv_file, chunksize=chunksize): + _append_asset_frames(asset_id, asset_df, output_dir, header_written) def _parse_args() -> argparse.Namespace: From 1d71b31a9aa3ceda81c2f80ae22d2435e86fabe2 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Sun, 12 Oct 2025 05:19:09 -0700 Subject: [PATCH 36/42] Add event identifiers to prediction outputs --- .../quick_fault_detector.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index 5efc5a7..b91020f 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -177,11 +177,27 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio logger.info('Evaluating Test data based on the learned normal behavior.') prediction_results = anomaly_detector.predict(sensor_data=test_data, root_cause_analysis=root_cause_analysis) - anomalies = prediction_results.predicted_anomalies['anomaly'] + predicted_anomalies = prediction_results.predicted_anomalies.copy() + anomalies = predicted_anomalies['anomaly'] # Find anomaly events event_meta_data, event_data_list = create_events(sensor_data=test_data, boolean_information=anomalies, min_event_length=min_anomaly_length) + event_ids = list(range(1, len(event_data_list) + 1)) + if not event_meta_data.empty: + event_meta_data = event_meta_data.copy() + event_meta_data.insert(0, 'event_id', event_ids) + event_meta_data['critical_event'] = True + + predicted_anomalies['event_id'] = pd.Series(pd.NA, index=predicted_anomalies.index, dtype='Int64') + predicted_anomalies['critical_event'] = False + + for event_id, event_data in zip(event_ids, event_data_list): + event_index = event_data.index + predicted_anomalies.loc[event_index, 'event_id'] = event_id + predicted_anomalies.loc[event_index, 'critical_event'] = True + + prediction_results.predicted_anomalies = predicted_anomalies arcana_mean_importance_list = [] arcana_loss_list = [] for i in range(len(event_meta_data)): From 4f7fe274d856540565d5111979384ecd02815ee2 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Sun, 12 Oct 2025 10:52:11 -0700 Subject: [PATCH 37/42] Fix threshold fitting mask alignment --- energy_fault_detector/fault_detector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index 71dd39c..be52b6e 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -437,7 +437,8 @@ def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit deviations = self.autoencoder.get_reconstruction_error(x_prepped_all) deviations = self._mask_reconstruction_error(deviations, 'anomaly scoring (training)') y_ = y.loc[deviations.index] - self.anomaly_score.fit(deviations[y_.values]) # use series values for compatibility with a multi-index + normal_mask = y_.astype(bool) + self.anomaly_score.fit(deviations.loc[normal_mask]) scores = self.anomaly_score.transform(deviations) if fit_on_validation: From ed6dd7d0bb4eecce66753b5c8dfd361e3e8ca736 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Sun, 12 Oct 2025 22:28:11 -0700 Subject: [PATCH 38/42] Fix threshold fitting mask alignment --- energy_fault_detector/fault_detector.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/energy_fault_detector/fault_detector.py b/energy_fault_detector/fault_detector.py index be52b6e..2fd626e 100644 --- a/energy_fault_detector/fault_detector.py +++ b/energy_fault_detector/fault_detector.py @@ -437,8 +437,9 @@ def _fit_threshold(self, x: pd.DataFrame, y: pd.Series, x_val: pd.DataFrame, fit deviations = self.autoencoder.get_reconstruction_error(x_prepped_all) deviations = self._mask_reconstruction_error(deviations, 'anomaly scoring (training)') y_ = y.loc[deviations.index] - normal_mask = y_.astype(bool) - self.anomaly_score.fit(deviations.loc[normal_mask]) + normal_mask = y_.fillna(False).astype(bool) + normal_indices = normal_mask.index[normal_mask.to_numpy()] + self.anomaly_score.fit(deviations.loc[normal_indices]) scores = self.anomaly_score.transform(deviations) if fit_on_validation: From aff30517d7bdb37f0eddd1b711d682ada4cfe8be Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 13 Oct 2025 00:37:12 -0700 Subject: [PATCH 39/42] Avoid large allocations in RMSE score fitting --- .../anomaly_scores/rmse_score.py | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/energy_fault_detector/anomaly_scores/rmse_score.py b/energy_fault_detector/anomaly_scores/rmse_score.py index 5bfc0a4..f5d3e23 100644 --- a/energy_fault_detector/anomaly_scores/rmse_score.py +++ b/energy_fault_detector/anomaly_scores/rmse_score.py @@ -49,8 +49,22 @@ def fit(self, x: DataType, y: Optional[pd.Series] = None) -> 'RMSEScore': if self.scale: # fitted attributes need trailing underscore - and are not initialized - self.std_x_: np.array = np.std(x, axis=0) - self.mean_x_: np.array = np.mean(x, axis=0) + data = self._to_numpy(x) + + if data.ndim == 1: + data = data.reshape(-1, 1) + + n_samples = data.shape[0] + if n_samples == 0: + raise ValueError("Cannot fit RMSEScore on empty data.") + + # Calculate mean and standard deviation without allocating an + # additional array the size of the dataset. + self.mean_x_: np.ndarray = np.sum(data, axis=0) / n_samples + sum_squared = np.einsum('ij,ij->j', data, data, optimize=True) + variance = sum_squared / n_samples - np.square(self.mean_x_) + variance = np.maximum(variance, 0.0) + self.std_x_: np.ndarray = np.sqrt(variance) self.fitted_ = True # nothing to fit return self @@ -70,17 +84,35 @@ def transform(self, x: DataType) -> pd.Series: check_is_fitted(self) + original_index = x.index if isinstance(x, pd.DataFrame) else None + data = self._to_numpy(x) + + if data.ndim == 1: + data = data.reshape(-1, 1) + if self.scale: # standardization of the reconstruction error in X if np.all(self.std_x_ > 0): - x = (x - self.mean_x_) / self.std_x_ + data = (data - self.mean_x_) / self.std_x_ else: - x = x - self.mean_x_ - # replace possible inf values with 0 - x[np.isinf(x)] = 0 + data = data - self.mean_x_ + # replace possible inf values with 0 + data[np.isinf(data)] = 0 + + if data.shape[1] == 0: + raise ValueError("Input data must contain at least one feature.") - scores = np.sqrt(np.mean(x ** 2, axis=1)) - if isinstance(x, (pd.DataFrame, pd.Series)): - scores = pd.Series(scores, index=x.index) + mean_squared = np.einsum('ij,ij->i', data, data, optimize=True) / data.shape[1] + scores = np.sqrt(mean_squared) + if original_index is not None: + scores = pd.Series(scores, index=original_index) return scores + + @staticmethod + def _to_numpy(x: DataType) -> np.ndarray: + if isinstance(x, pd.DataFrame): + return x.to_numpy(dtype=np.float64, copy=False) + if isinstance(x, pd.Series): + return x.to_numpy(dtype=np.float64, copy=False) + return np.asarray(x, dtype=np.float64) From 549f019f05e65e219eb381c8ebefefcbaa608386 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 13 Oct 2025 02:32:13 -0700 Subject: [PATCH 40/42] Fix quantile threshold selector alignment and tests --- .../threshold_selectors/quantile_threshold.py | 48 +++++++++++++++++-- .../test_quantile_threshold.py | 17 +++++++ 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/energy_fault_detector/threshold_selectors/quantile_threshold.py b/energy_fault_detector/threshold_selectors/quantile_threshold.py index f413e1a..c8e0a3b 100644 --- a/energy_fault_detector/threshold_selectors/quantile_threshold.py +++ b/energy_fault_detector/threshold_selectors/quantile_threshold.py @@ -54,14 +54,52 @@ def fit(self, x: Array1D, y: pd.Series = None) -> 'QuantileThresholdSelector': """ if isinstance(x, pd.Series): - x = x.sort_index().values - if isinstance(y, pd.Series): - y = y.sort_index().values + x_series = x.sort_index() + else: + x_series = pd.Series(np.asarray(x)) + + selected_scores: Union[pd.Series, np.ndarray] if y is not None: - x_ = x[y] + if isinstance(y, pd.DataFrame): + if y.shape[1] != 1: + raise ValueError('QuantileThresholdSelector requires a one-dimensional normal index.') + y_series = y.iloc[:, 0] + elif isinstance(y, pd.Series): + y_series = y + else: + y_array = np.asarray(y) + if y_array.ndim != 1: + raise ValueError('QuantileThresholdSelector requires a one-dimensional normal index.') + if y_array.shape[0] != len(x_series): + raise ValueError('Boolean index must be the same length as the anomaly scores.') + selected_scores = x_series[y_array.astype(bool)] + y_series = None + + if y is not None and isinstance(y, (pd.Series, pd.DataFrame)): + y_series = y_series.sort_index() + if isinstance(x, pd.Series): + x_series, y_series = x_series.align(y_series, join='inner') + else: + if not y_series.index.equals(x_series.index): + y_series = pd.Series(y_series.to_numpy(), index=x_series.index) + y_mask = y_series.fillna(False).astype(bool) + if len(y_mask) != len(x_series): + raise ValueError('Boolean index must be the same length as the anomaly scores.') + selected_scores = x_series[y_mask] + + if isinstance(selected_scores, pd.Series) and selected_scores.empty: + selected_scores = x_series else: - x_ = x + selected_scores = x_series + + if isinstance(selected_scores, pd.Series): + x_ = selected_scores.to_numpy(dtype=np.float64, copy=False) + else: + x_ = np.asarray(selected_scores, dtype=np.float64) + + if x_.size == 0: + x_ = x_series.to_numpy(dtype=np.float64, copy=False) self.threshold = float(np.quantile(x_, self.quantile)) diff --git a/tests/threshold_selectors/test_quantile_threshold.py b/tests/threshold_selectors/test_quantile_threshold.py index 740708a..b77bf0e 100644 --- a/tests/threshold_selectors/test_quantile_threshold.py +++ b/tests/threshold_selectors/test_quantile_threshold.py @@ -2,6 +2,7 @@ from unittest import TestCase import numpy as np +import pandas as pd from numpy.testing import assert_array_equal from energy_fault_detector.anomaly_scores.rmse_score import RMSEScore @@ -38,6 +39,22 @@ def test_fit_without_label(self) -> None: assert_array_equal(exp_threshold_with_anomaly, self.threshold_selector.threshold) + def test_fit_with_dataframe_normal_index(self) -> None: + scores = pd.Series(self.rmse.transform(self.train_data)) + normal_index = pd.DataFrame({'normal': self.normal_index}, index=scores.index) + + self.threshold_selector.fit(scores, normal_index) + + assert_array_equal(1.2392478743647883, self.threshold_selector.threshold) + + def test_fit_with_all_false_labels_uses_all_scores(self) -> None: + scores = pd.Series(self.rmse.transform(self.train_data)) + normal_index = pd.Series([False] * len(scores), index=scores.index) + + self.threshold_selector.fit(scores, normal_index) + + assert_array_equal(1.617323497052351, self.threshold_selector.threshold) + def test_predict(self) -> None: # expected output exp_anomalies = [True]*5 + [False]*23 + [True]*5 From a3e9e6edfcc394100783d255a4d304affb68fae5 Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 13 Oct 2025 07:18:42 -0700 Subject: [PATCH 41/42] Add REST API for prediction and configure model registry --- README.md | 20 ++ energy_fault_detector/api/__init__.py | 19 ++ energy_fault_detector/api/app.py | 210 ++++++++++++++++++ energy_fault_detector/api/artifacts/.gitkeep | 0 energy_fault_detector/api/model_registry.py | 91 ++++++++ energy_fault_detector/api/models/.gitkeep | 0 energy_fault_detector/api/service_config.yaml | 16 ++ energy_fault_detector/api/settings.py | 138 ++++++++++++ .../quick_fault_detector.py | 11 +- requirements.txt | 4 +- tests/test_model_registry.py | 54 +++++ tests/test_quick_fault_detector_rca.py | 87 ++++++++ 12 files changed, 648 insertions(+), 2 deletions(-) create mode 100644 energy_fault_detector/api/__init__.py create mode 100644 energy_fault_detector/api/app.py create mode 100644 energy_fault_detector/api/artifacts/.gitkeep create mode 100644 energy_fault_detector/api/model_registry.py create mode 100644 energy_fault_detector/api/models/.gitkeep create mode 100644 energy_fault_detector/api/service_config.yaml create mode 100644 energy_fault_detector/api/settings.py create mode 100644 tests/test_model_registry.py create mode 100644 tests/test_quick_fault_detector_rca.py diff --git a/README.md b/README.md index ad29e2c..fe2cf02 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,26 @@ quick_fault_detector --c2c_example For more information, have a look at the notebook [Quick Failure Detection](./notebooks/Example%20-%20Quick%20Failure%20Detection.ipynb) +## REST prediction API + +The project ships with a lightweight FastAPI application that exposes the prediction workflow via HTTP. The service +resolves models by name and version using the directory structure described in +[`energy_fault_detector/api/service_config.yaml`](energy_fault_detector/api/service_config.yaml). + +Start the API with: + +```bash +uvicorn energy_fault_detector.api.app:app --reload +``` + +By default the service reads its configuration from the bundled `service_config.yaml`. Provide the +`EFD_SERVICE_CONFIG` environment variable to point to a custom YAML file when you want to adapt the model root +directory, override default ignore patterns, or tweak other runtime parameters. Predictions are triggered with a `POST` +request to `/predict` and expect a JSON payload containing at least the `model_name` and `data_path` fields. Optional +fields such as `model_version`, `ignore_features`, and `asset_name` refine which artefacts are used and how the results +are stored. + + ## Fault detection in 5 lines of code ```python diff --git a/energy_fault_detector/api/__init__.py b/energy_fault_detector/api/__init__.py new file mode 100644 index 0000000..d561f23 --- /dev/null +++ b/energy_fault_detector/api/__init__.py @@ -0,0 +1,19 @@ +"""REST API utilities for the Energy Fault Detector package.""" + +from __future__ import annotations + +from importlib import import_module +from typing import Any, List + +__all__ = ["app"] + + +def __getattr__(name: str) -> Any: + if name == "app": + module = import_module("energy_fault_detector.api.app") + return module.app + raise AttributeError(name) + + +def __dir__() -> List[str]: # pragma: no cover - small helper + return sorted(__all__) diff --git a/energy_fault_detector/api/app.py b/energy_fault_detector/api/app.py new file mode 100644 index 0000000..e308c24 --- /dev/null +++ b/energy_fault_detector/api/app.py @@ -0,0 +1,210 @@ +"""FastAPI application exposing prediction endpoints.""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Any, Dict, List, Optional + +import pandas as pd +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field, validator + +from energy_fault_detector.quick_fault_detection import quick_fault_detector + +from .model_registry import ModelNotFoundError, ModelRegistry +from .settings import get_settings + + +logger = logging.getLogger(__name__) + + +def _merge_ignore_patterns(defaults: List[str], overrides: List[str]) -> List[str]: + merged: List[str] = [] + for values in (defaults or [], overrides or []): + for value in values: + if value not in merged: + merged.append(value) + return merged + + +def _dataframe_to_records(df: Optional[pd.DataFrame]) -> List[Dict[str, Any]]: + if df is None or df.empty: + return [] + + serialisable = df.reset_index() + serialisable = serialisable.where(pd.notnull(serialisable), None) + return json.loads(serialisable.to_json(orient="records", date_format="iso")) + + +def _determine_asset_name(model_name: str, model_version: str, explicit: Optional[str], template: str) -> str: + if explicit: + return explicit + + try: + return template.format(model_name=model_name, model_version=model_version) + except KeyError as exc: # pragma: no cover - configuration error + raise HTTPException( + status_code=500, + detail=f"Invalid asset name template; missing placeholder: {exc}", + ) from exc + + +def _derive_artifact_directory(save_root: Optional[Path], data_path: Path, asset_name: str) -> Path: + base_dir = save_root if save_root is not None else data_path.resolve().parent + return base_dir / "prediction_output" / asset_name + + +class PredictionRequest(BaseModel): + """Request body for the prediction endpoint.""" + + model_name: str = Field(..., description="Logical name of the model to load.") + data_path: str = Field(..., description="Path to the CSV file that should be analysed.") + model_version: Optional[str] = Field( + None, description="Specific model version to use. Defaults to the latest available version." + ) + ignore_features: List[str] = Field( + default_factory=list, + description="Feature names or wildcard patterns to exclude from root cause analysis.", + ) + asset_name: Optional[str] = Field( + None, description="Optional asset identifier to use when persisting prediction artefacts." + ) + debug_plots: Optional[bool] = Field( + None, description="Override the configuration setting that controls debug plot generation." + ) + + @validator("ignore_features", pre=True) + def _coerce_ignore_features(cls, value: Any) -> List[str]: # pragma: no cover - validation logic + if value is None: + return [] + if isinstance(value, str): + return [value] + return list(value) + + +class PredictionSummary(BaseModel): + total_samples: int + anomaly_samples: int + critical_samples: int + event_count: int + + +class PredictionResponse(BaseModel): + model_name: str + model_version: str + data_path: str + asset_name: str + ignore_features: List[str] + artifact_directory: Optional[str] + summary: PredictionSummary + events: List[Dict[str, Any]] + + +class HealthResponse(BaseModel): + status: str = "ok" + + +settings = get_settings() +model_registry = ModelRegistry( + root_directory=settings.model_store.root_directory, + default_version_strategy=settings.model_store.default_version_strategy, +) + +app = FastAPI(title="Energy Fault Detector", version="1.0.0") + + +@app.get("/health", response_model=HealthResponse) +def health_check() -> HealthResponse: + """Simple endpoint to verify that the service is running.""" + + return HealthResponse() + + +@app.post("/predict", response_model=PredictionResponse) +def predict(request: PredictionRequest) -> PredictionResponse: + """Execute the quick fault detector in prediction mode.""" + + try: + model_path, resolved_version = model_registry.resolve( + model_name=request.model_name, model_version=request.model_version + ) + except ModelNotFoundError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + + data_path = Path(request.data_path).expanduser().resolve() + if not data_path.exists(): + raise HTTPException(status_code=400, detail=f"Data file '{data_path}' does not exist.") + + ignore_patterns = _merge_ignore_patterns( + settings.prediction.default_ignore_features, request.ignore_features + ) + debug_plots = request.debug_plots if request.debug_plots is not None else settings.prediction.debug_plots + + asset_name = _determine_asset_name( + model_name=request.model_name, + model_version=resolved_version, + explicit=request.asset_name, + template=settings.prediction.default_asset_name, + ) + + save_root = settings.prediction.output_directory + if save_root is not None: + save_root.mkdir(parents=True, exist_ok=True) + + try: + prediction_results, event_metadata, _ = quick_fault_detector( + csv_data_path=None, + csv_test_data_path=str(data_path), + train_test_column_name=settings.prediction.train_test_column, + train_test_mapping=settings.prediction.train_test_mapping or None, + time_column_name=settings.prediction.time_column, + status_data_column_name=settings.prediction.status_column, + status_mapping=settings.prediction.status_mapping or None, + min_anomaly_length=settings.prediction.min_anomaly_length, + enable_debug_plots=debug_plots, + save_dir=str(save_root) if save_root is not None else None, + mode="predict", + model_path=str(model_path), + asset_name=asset_name, + rca_ignore_features=ignore_patterns or None, + ) + except FileNotFoundError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + except Exception as exc: # pragma: no cover - defensive programming + logger.exception("Prediction request failed: %s", exc) + raise HTTPException(status_code=500, detail="Prediction failed. See logs for details.") from exc + + predicted_anomalies = prediction_results.predicted_anomalies.copy() + total_samples = int(len(predicted_anomalies)) + anomaly_samples = int(predicted_anomalies.get("anomaly", pd.Series(dtype=bool)).fillna(False).sum()) + critical_samples = int( + predicted_anomalies.get("critical_event", pd.Series(dtype=bool)).fillna(False).sum() + ) + event_count = int(len(event_metadata)) if event_metadata is not None else 0 + + artifact_directory = _derive_artifact_directory(save_root, data_path, asset_name) + + response = PredictionResponse( + model_name=request.model_name, + model_version=resolved_version, + data_path=str(data_path), + asset_name=asset_name, + ignore_features=ignore_patterns, + artifact_directory=str(artifact_directory) if artifact_directory is not None else None, + summary=PredictionSummary( + total_samples=total_samples, + anomaly_samples=anomaly_samples, + critical_samples=critical_samples, + event_count=event_count, + ), + events=_dataframe_to_records(event_metadata), + ) + + return response + + +__all__ = ["app", "PredictionRequest", "PredictionResponse", "PredictionSummary"] diff --git a/energy_fault_detector/api/artifacts/.gitkeep b/energy_fault_detector/api/artifacts/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/energy_fault_detector/api/model_registry.py b/energy_fault_detector/api/model_registry.py new file mode 100644 index 0000000..a543593 --- /dev/null +++ b/energy_fault_detector/api/model_registry.py @@ -0,0 +1,91 @@ +"""Utilities for locating trained model artefacts.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, List, Optional, Tuple + + +class ModelNotFoundError(FileNotFoundError): + """Raised when the requested model or version does not exist.""" + + +def _looks_like_model_directory(path: Path) -> bool: + """Heuristic to determine whether ``path`` contains model artefacts.""" + + if not path.is_dir(): + return False + + if (path / "config.yaml").exists(): + return True + + expected_components = {"data_preprocessor", "autoencoder", "threshold_selector", "anomaly_score"} + child_directories = {child.name for child in path.iterdir() if child.is_dir()} + return bool(expected_components & child_directories) + + +def _sort_versions(versions: Iterable[Path]) -> List[Path]: + return sorted(versions, key=lambda version: version.name) + + +@dataclass +class ModelRegistry: + """Simple model registry resolving model names to artefact locations.""" + + root_directory: Path + default_version_strategy: str = "latest" + + def __post_init__(self) -> None: + self.root_directory = Path(self.root_directory).resolve() + + def resolve(self, model_name: str, model_version: Optional[str] = None) -> Tuple[Path, str]: + """Return the filesystem path for ``model_name`` and ``model_version``.""" + + model_root = self.root_directory / model_name + if not model_root.exists(): + raise ModelNotFoundError(f"Model '{model_name}' was not found under '{self.root_directory}'.") + + if model_version: + candidate = model_root / model_version + if not _looks_like_model_directory(candidate): + raise ModelNotFoundError( + f"Model '{model_name}' does not provide a version named '{model_version}'." + ) + return candidate, model_version + + versions = [child for child in model_root.iterdir() if _looks_like_model_directory(child)] + + if versions: + if self.default_version_strategy != "latest": + raise ValueError( + f"Unsupported version selection strategy '{self.default_version_strategy}'." + ) + selected = _sort_versions(versions)[-1] + return selected, selected.name + + if _looks_like_model_directory(model_root): + return model_root, model_root.name + + raise ModelNotFoundError( + f"No model versions found for '{model_name}' in '{model_root}'. Ensure the expected directory structure exists." + ) + + def list_versions(self, model_name: str) -> List[str]: + """Return all discovered versions for ``model_name``.""" + + model_root = self.root_directory / model_name + if not model_root.exists(): + raise ModelNotFoundError(f"Model '{model_name}' was not found under '{self.root_directory}'.") + + versions = [child for child in model_root.iterdir() if _looks_like_model_directory(child)] + if versions: + return [path.name for path in _sort_versions(versions)] + + if _looks_like_model_directory(model_root): + return [model_root.name] + + return [] + + +__all__ = ["ModelRegistry", "ModelNotFoundError"] diff --git a/energy_fault_detector/api/models/.gitkeep b/energy_fault_detector/api/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/energy_fault_detector/api/service_config.yaml b/energy_fault_detector/api/service_config.yaml new file mode 100644 index 0000000..1b8dcc7 --- /dev/null +++ b/energy_fault_detector/api/service_config.yaml @@ -0,0 +1,16 @@ +model_store: + root_directory: "./models" + default_version_strategy: "latest" + +prediction: + config_path: "../base_config.yaml" + time_column: "time_stamp" + status_column: null + status_mapping: {} + train_test_column: null + train_test_mapping: {} + min_anomaly_length: 18 + debug_plots: false + output_directory: "./artifacts" + default_asset_name: "{model_name}" + default_ignore_features: [] diff --git a/energy_fault_detector/api/settings.py b/energy_fault_detector/api/settings.py new file mode 100644 index 0000000..02b92ea --- /dev/null +++ b/energy_fault_detector/api/settings.py @@ -0,0 +1,138 @@ +"""Settings loader for the REST API.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from functools import lru_cache +from pathlib import Path +from typing import Dict, List, Optional + +import yaml + + +DEFAULT_CONFIG_PATH = Path(__file__).with_name("service_config.yaml") +CONFIG_ENV_VAR = "EFD_SERVICE_CONFIG" + + +@dataclass +class ModelStoreSettings: + """Settings describing where trained models are stored.""" + + root_directory: Path + default_version_strategy: str = "latest" + + +@dataclass +class PredictionSettings: + """Settings that control how predictions are executed.""" + + config_path: Path + time_column: Optional[str] = None + status_column: Optional[str] = None + status_mapping: Dict[str, bool] = field(default_factory=dict) + train_test_column: Optional[str] = None + train_test_mapping: Dict[str, bool] = field(default_factory=dict) + min_anomaly_length: int = 18 + debug_plots: bool = False + output_directory: Optional[Path] = None + default_asset_name: str = "{model_name}" + default_ignore_features: List[str] = field(default_factory=list) + + +@dataclass +class APISettings: + """Container for all API settings.""" + + model_store: ModelStoreSettings + prediction: PredictionSettings + + +def _resolve_path(base_dir: Path, value: Optional[str]) -> Optional[Path]: + """Resolve ``value`` relative to ``base_dir`` when it is not absolute.""" + + if value in (None, ""): + return None + + path = Path(value) + if not path.is_absolute(): + path = (base_dir / path).resolve() + return path + + +def _load_yaml(config_path: Path) -> Dict[str, object]: + with open(config_path, "r", encoding="utf-8") as stream: + data = yaml.safe_load(stream) or {} + if not isinstance(data, dict): + raise ValueError("The service configuration must be a YAML mapping/dictionary.") + return data + + +@lru_cache() +def get_settings() -> APISettings: + """Return the API settings, loading them from disk on first access.""" + + configured_path = os.environ.get(CONFIG_ENV_VAR) + config_path = Path(configured_path) if configured_path else DEFAULT_CONFIG_PATH + if not config_path.exists(): + raise FileNotFoundError( + f"Service configuration file '{config_path}' could not be found." + ) + + config_path = config_path.resolve() + raw_config = _load_yaml(config_path) + base_dir = config_path.parent + + model_store_cfg = raw_config.get("model_store", {}) or {} + prediction_cfg = raw_config.get("prediction", {}) or {} + + model_root = _resolve_path(base_dir, model_store_cfg.get("root_directory", "./models")) + if model_root is None: + raise ValueError("A model store root directory must be provided in the configuration.") + + version_strategy = model_store_cfg.get("default_version_strategy", "latest") + + prediction_config_path = _resolve_path( + base_dir, prediction_cfg.get("config_path", "../base_config.yaml") + ) + if prediction_config_path is None or not prediction_config_path.exists(): + raise FileNotFoundError( + f"Prediction configuration file '{prediction_config_path}' could not be found." + ) + + output_directory = _resolve_path(base_dir, prediction_cfg.get("output_directory")) + + default_asset_name = prediction_cfg.get("default_asset_name", "{model_name}") + default_ignore = prediction_cfg.get("default_ignore_features") or [] + + settings = APISettings( + model_store=ModelStoreSettings( + root_directory=model_root, + default_version_strategy=str(version_strategy), + ), + prediction=PredictionSettings( + config_path=prediction_config_path, + time_column=prediction_cfg.get("time_column"), + status_column=prediction_cfg.get("status_column"), + status_mapping=dict(prediction_cfg.get("status_mapping") or {}), + train_test_column=prediction_cfg.get("train_test_column"), + train_test_mapping=dict(prediction_cfg.get("train_test_mapping") or {}), + min_anomaly_length=int(prediction_cfg.get("min_anomaly_length", 18)), + debug_plots=bool(prediction_cfg.get("debug_plots", False)), + output_directory=output_directory, + default_asset_name=str(default_asset_name), + default_ignore_features=list(default_ignore), + ), + ) + + return settings + + +__all__ = [ + "APISettings", + "ModelStoreSettings", + "PredictionSettings", + "CONFIG_ENV_VAR", + "DEFAULT_CONFIG_PATH", + "get_settings", +] diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index b91020f..7aae922 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -39,7 +39,8 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio model_directory: Optional[str] = None, model_subdir: Optional[str] = None, model_name: Optional[str] = None, - asset_name: Optional[str] = None + asset_name: Optional[str] = None, + rca_ignore_features: Optional[List[str]] = None ) -> Tuple[FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]]: """Analyzes provided data using an autoencoder based approach for identifying anomalies based on a learned normal behavior. Anomalies are then aggregated to events and further analyzed. @@ -105,6 +106,9 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio asset_name (Optional[str]): Identifier of the asset whose data is being analysed. When running in prediction mode this name is used to create a dedicated folder inside ``prediction_output``. If omitted the folder name is inferred from ``csv_test_data_path`` when possible. + rca_ignore_features (Optional[List[str]]): Additional feature names or wildcard patterns that should be ignored + by the root cause analysis module during prediction. Values provided here extend the patterns that are stored + in the persisted model configuration. Returns: Tuple(FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]): FaultDetectionResult object with the @@ -168,6 +172,11 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio logger.info('Loading pre-trained model from %s.', model_path) fallback_config_path = Path(__file__).resolve().parent.parent / 'base_config.yaml' fallback_config = Config(str(fallback_config_path)) + if rca_ignore_features: + rca_config = fallback_config.config_dict.setdefault('root_cause_analysis', {}) + existing_patterns = rca_config.get('ignore_features', []) or [] + merged_patterns = list(dict.fromkeys([*existing_patterns, *rca_ignore_features])) + rca_config['ignore_features'] = merged_patterns print("Fallback config:",fallback_config) anomaly_detector = FaultDetector(config=fallback_config) print("anomaly_detector:",anomaly_detector.config.arcana_params) diff --git a/requirements.txt b/requirements.txt index 7c0686a..e696d23 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,6 @@ pandas>=2.0 cerberus matplotlib>=3.9.0 optuna>=4.0 -chardet>=5.0 \ No newline at end of file +chardet>=5.0 +fastapi>=0.111 +uvicorn[standard]>=0.23 diff --git a/tests/test_model_registry.py b/tests/test_model_registry.py new file mode 100644 index 0000000..8e54513 --- /dev/null +++ b/tests/test_model_registry.py @@ -0,0 +1,54 @@ +"""Tests for the model registry helper.""" + +from pathlib import Path + +import pytest + +from energy_fault_detector.api.model_registry import ModelNotFoundError, ModelRegistry + + +def _create_model_dir(base: Path, model_name: str, version: str) -> Path: + path = base / model_name / version + (path / "config.yaml").parent.mkdir(parents=True, exist_ok=True) + (path / "config.yaml").write_text("model: test", encoding="utf-8") + return path + + +def test_resolve_latest_version(tmp_path): + registry = ModelRegistry(root_directory=tmp_path) + _create_model_dir(tmp_path, "turbine", "20230101") + latest = _create_model_dir(tmp_path, "turbine", "20240202") + + resolved_path, version = registry.resolve("turbine") + + assert resolved_path == latest + assert version == "20240202" + + +def test_resolve_specific_version(tmp_path): + registry = ModelRegistry(root_directory=tmp_path) + expected = _create_model_dir(tmp_path, "asset", "v1") + + resolved_path, version = registry.resolve("asset", "v1") + + assert resolved_path == expected + assert version == "v1" + + +def test_resolve_versionless_model(tmp_path): + registry = ModelRegistry(root_directory=tmp_path) + path = tmp_path / "single" + (path / "config.yaml").parent.mkdir(parents=True, exist_ok=True) + (path / "config.yaml").write_text("model: test", encoding="utf-8") + + resolved_path, version = registry.resolve("single") + + assert resolved_path == path + assert version == "single" + + +def test_resolve_missing_model(tmp_path): + registry = ModelRegistry(root_directory=tmp_path) + + with pytest.raises(ModelNotFoundError): + registry.resolve("unknown") diff --git a/tests/test_quick_fault_detector_rca.py b/tests/test_quick_fault_detector_rca.py new file mode 100644 index 0000000..3cd206a --- /dev/null +++ b/tests/test_quick_fault_detector_rca.py @@ -0,0 +1,87 @@ +"""Ensure that additional RCA ignore patterns are respected during prediction.""" + +from __future__ import annotations + +from typing import List + +import pytest + +pd = pytest.importorskip("pandas") + +from energy_fault_detector.quick_fault_detection.quick_fault_detector import quick_fault_detector + + +class DummyResult: + def __init__(self) -> None: + self.predicted_anomalies = pd.DataFrame({"anomaly": [False], "critical_event": [False]}) + + def save(self, *_args, **_kwargs) -> None: + return None + + +class DummyFaultDetector: + instances: List["DummyFaultDetector"] = [] + + def __init__(self, config, **_kwargs) -> None: # noqa: D401 - matches interface + self.config = config + self.instances.append(self) + + def load_models(self, *_args, **_kwargs) -> None: + return None + + def predict(self, *_args, **_kwargs): + return DummyResult() + + +def test_quick_fault_detector_extends_ignore_features(monkeypatch, tmp_path): + DummyFaultDetector.instances = [] + + def fake_load_train_test_data(**_kwargs): + data = pd.DataFrame({"a": [1.0, 2.0]}) + return data, pd.Series([True, False]), data + + def fake_create_events(**_kwargs): + return pd.DataFrame(), [] + + def fake_analyze_event(**_kwargs): + return pd.Series(dtype=float), pd.DataFrame() + + monkeypatch.setattr( + "energy_fault_detector.quick_fault_detection.quick_fault_detector.load_train_test_data", + fake_load_train_test_data, + ) + monkeypatch.setattr( + "energy_fault_detector.quick_fault_detection.quick_fault_detector.create_events", + fake_create_events, + ) + monkeypatch.setattr( + "energy_fault_detector.quick_fault_detection.quick_fault_detector.analyze_event", + fake_analyze_event, + ) + monkeypatch.setattr( + "energy_fault_detector.quick_fault_detection.quick_fault_detector.generate_output_plots", + lambda **_kwargs: None, + ) + monkeypatch.setattr( + "energy_fault_detector.quick_fault_detection.quick_fault_detector._save_prediction_results", + lambda **_kwargs: None, + ) + monkeypatch.setattr( + "energy_fault_detector.quick_fault_detection.quick_fault_detector.FaultDetector", + DummyFaultDetector, + ) + + csv_path = tmp_path / "data.csv" + csv_path.write_text("a\n1\n2\n", encoding="utf-8") + + quick_fault_detector( + csv_data_path=None, + csv_test_data_path=str(csv_path), + mode="predict", + model_path=str(tmp_path), + rca_ignore_features=["feature_a"], + ) + + assert DummyFaultDetector.instances, "FaultDetector was not instantiated" + ignore_features = DummyFaultDetector.instances[0].config.config_dict["root_cause_analysis"]["ignore_features"] + assert "feature_a" in ignore_features From 62791f5753e616c87c9ec3385d4be8531edfc83b Mon Sep 17 00:00:00 2001 From: teehari79 Date: Mon, 13 Oct 2025 07:40:47 -0700 Subject: [PATCH 42/42] Add FastAPI prediction endpoint with enhanced validation --- energy_fault_detector/api/__init__.py | 5 + energy_fault_detector/api/prediction_api.py | 462 ++++++++++++++++++ energy_fault_detector/main.py | 2 +- .../quick_fault_detector.py | 36 +- requirements.txt | 5 +- run.py | 2 +- run_predict.py | 2 +- tests/api/test_prediction_api.py | 239 +++++++++ 8 files changed, 741 insertions(+), 12 deletions(-) create mode 100644 energy_fault_detector/api/__init__.py create mode 100644 energy_fault_detector/api/prediction_api.py create mode 100644 tests/api/test_prediction_api.py diff --git a/energy_fault_detector/api/__init__.py b/energy_fault_detector/api/__init__.py new file mode 100644 index 0000000..95f9992 --- /dev/null +++ b/energy_fault_detector/api/__init__.py @@ -0,0 +1,5 @@ +"""API utilities for the Energy Fault Detector package.""" + +from .prediction_api import app + +__all__ = ["app"] diff --git a/energy_fault_detector/api/prediction_api.py b/energy_fault_detector/api/prediction_api.py new file mode 100644 index 0000000..5ec7cce --- /dev/null +++ b/energy_fault_detector/api/prediction_api.py @@ -0,0 +1,462 @@ +"""FastAPI application exposing the quick fault detector prediction endpoint.""" + +from __future__ import annotations + +import logging +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple + +import pandas as pd +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field + +from energy_fault_detector._logs import setup_logging +from energy_fault_detector.config import Config +from energy_fault_detector.core.fault_detection_result import FaultDetectionResult +from energy_fault_detector.fault_detector import FaultDetector +from energy_fault_detector.quick_fault_detection.data_loading import get_sensor_data +from energy_fault_detector.quick_fault_detection.quick_fault_detector import analyze_event +from energy_fault_detector.utils.analysis import create_events + +DEFAULT_TIMESTAMP_COLUMN = "time_stamp" +DEFAULT_MIN_EVENT_LENGTH = 18 + +LOGGING_CONFIG_PATH = Path(__file__).resolve().parent.parent / "logging.yaml" +setup_logging(str(LOGGING_CONFIG_PATH)) + +logger = logging.getLogger("energy_fault_detector.prediction_api") + + +class PredictionAPIError(Exception): + """Base class for handled prediction API exceptions.""" + + status_code: int = 500 + error_code: str = "EFD_INTERNAL_ERROR" + + def __init__(self, message: str) -> None: + super().__init__(message) + self.message = message + + +class InvalidModelError(PredictionAPIError): + """Raised when a referenced model cannot be loaded.""" + + status_code = 404 + error_code = "EFD_MODEL_NOT_FOUND" + + +class SchemaMismatchError(PredictionAPIError): + """Raised when the input schema does not match the model expectations.""" + + status_code = 409 + error_code = "EFD_SCHEMA_MISMATCH" + + +class DataTypeMismatchError(PredictionAPIError): + """Raised when data types of provided columns do not match the trained model.""" + + status_code = 422 + error_code = "EFD_DATATYPE_MISMATCH" + + +class TimestampValidationError(PredictionAPIError): + """Raised when the timestamp column is missing or cannot be parsed.""" + + status_code = 422 + error_code = "EFD_TIMESTAMP_INVALID" + + +class EmptyInputError(PredictionAPIError): + """Raised when the provided payload does not contain any rows.""" + + status_code = 400 + error_code = "EFD_EMPTY_PAYLOAD" + + +class EventMetadata(BaseModel): + """Metadata describing an anomaly event.""" + + event_id: int = Field(..., description="Identifier of the anomaly event.") + start: datetime = Field(..., description="Start timestamp of the event.") + end: datetime = Field(..., description="End timestamp of the event.") + duration_seconds: float = Field(..., ge=0, description="Duration of the event in seconds.") + + +class SensorPoint(BaseModel): + """Time-series point describing sensor values and anomaly scores.""" + + timestamp: datetime = Field(..., description="Timestamp of the sensor snapshot.") + anomaly_score: Optional[float] = Field(None, description="Computed anomaly score.") + threshold_score: Optional[float] = Field(None, description="Threshold used for anomaly decision.") + behaviour: Optional[str] = Field(None, description="Model assessment of the behaviour at this timestamp.") + cumulative_anomaly_score: Optional[int] = Field(None, description="Cumulative anomaly counter.") + sensors: Dict[str, Optional[float]] = Field(..., description="Sensor readings contributing to the event.") + + +class EventSensorData(BaseModel): + """Detailed sensor time series for an event, including ARCANA summaries.""" + + event_id: int + points: List[SensorPoint] + arcana_mean_importances: Dict[str, float] = Field(default_factory=dict) + arcana_losses: Optional[List[Dict[str, Any]]] = None + + +class PredictionSuccessResponse(BaseModel): + """Successful prediction payload.""" + + status: str = Field("success", const=True) + events: List[EventMetadata] + event_sensor_data: List[EventSensorData] + + +class PredictionRequest(BaseModel): + """Payload describing a prediction request.""" + + model_path: str = Field(..., description="Path to the saved fault detector model.") + data: List[Dict[str, Any]] = Field(..., min_items=1, description="Prediction data as list of records.") + timestamp_column: str = Field( + DEFAULT_TIMESTAMP_COLUMN, + description="Name of the timestamp column in the provided data.", + ) + min_event_length: int = Field( + DEFAULT_MIN_EVENT_LENGTH, + ge=1, + description="Minimum number of consecutive anomalies required to form an event.", + ) + + +def _load_fault_detector(model_path: str) -> FaultDetector: + """Instantiate and load a :class:`FaultDetector` from disk.""" + + try: + fallback_config_path = Path(__file__).resolve().parent.parent / "base_config.yaml" + fallback_config = Config(str(fallback_config_path)) + detector = FaultDetector(config=fallback_config) + detector.load_models(model_path=model_path) + except FileNotFoundError as exc: # pragma: no cover - dependent on filesystem + raise InvalidModelError( + f"The provided model path '{model_path}' does not exist." + ) from exc + except (OSError, ValueError) as exc: # pragma: no cover - dependent on filesystem + raise InvalidModelError( + f"Failed to load model artefacts from '{model_path}'." + ) from exc + return detector + + +def _ensure_datetime(value: Any) -> datetime: + """Convert various timestamp representations to :class:`datetime`.""" + + if isinstance(value, pd.Timestamp): + return value.to_pydatetime() + if isinstance(value, datetime): + return value + return pd.to_datetime(value).to_pydatetime() + + +def _to_optional_float(value: Any) -> Optional[float]: + """Convert numeric values to floats while preserving missing values.""" + + if value is None: + return None + if isinstance(value, (float, int)): + return float(value) + if pd.isna(value): + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _extract_expected_columns(data_preprocessor: Any) -> List[str]: + """Retrieve the expected input columns from the saved data preprocessor.""" + + if data_preprocessor is None: + return [] + + named_steps = getattr(data_preprocessor, "named_steps", {}) + column_selector = named_steps.get("column_selector") + if column_selector is not None and hasattr(column_selector, "feature_names_out_"): + return list(column_selector.feature_names_out_) + + imputer = named_steps.get("imputer") + if imputer is not None and hasattr(imputer, "feature_names_in_"): + return list(imputer.feature_names_in_) + + if hasattr(data_preprocessor, "get_feature_names_out"): + try: + return list(data_preprocessor.get_feature_names_out()) + except ValueError: # pragma: no cover - depends on sklearn internals + pass + + if hasattr(data_preprocessor, "feature_names_in_"): + return list(data_preprocessor.feature_names_in_) + + return [] + + +def _validate_input_schema( + sensor_data: pd.DataFrame, + expected_columns: Sequence[str], + original_columns: Iterable[str], +) -> None: + """Validate that provided sensor data matches model expectations.""" + + original_column_set = set(original_columns) + + missing_columns: List[str] = [] + dtype_issues: List[str] = [] + for column in expected_columns: + if column not in sensor_data.columns: + if column in original_column_set: + dtype_issues.append(column) + else: + missing_columns.append(column) + continue + + converted = pd.to_numeric(sensor_data[column], errors="coerce") + non_convertible = sensor_data[column].notna() & converted.isna() + if non_convertible.any(): + dtype_issues.append(column) + + if missing_columns: + formatted = ", ".join(sorted(missing_columns)) + raise SchemaMismatchError( + f"Input data is missing required features: {formatted}." + ) + + if dtype_issues: + formatted = ", ".join(sorted(set(dtype_issues))) + raise DataTypeMismatchError( + f"The following features contain non-numeric values: {formatted}." + ) + + +def _build_event_sensor_payload( + event_id: int, + event_data: pd.DataFrame, + predicted_anomalies: pd.DataFrame, + arcana_mean_importances: Optional[pd.Series], + arcana_losses: Optional[pd.DataFrame], +) -> EventSensorData: + """Create the response payload for a single anomaly event.""" + + aligned_anomalies = predicted_anomalies.reindex(event_data.index) + + points: List[SensorPoint] = [] + for (timestamp, sensor_row), (_, anomaly_row) in zip(event_data.iterrows(), aligned_anomalies.iterrows()): + sensors = {column: _to_optional_float(value) for column, value in sensor_row.items()} + anomaly_score = anomaly_row.get("anamoly_score") + threshold_value = anomaly_row.get("threshold_score") + cumulative_score = anomaly_row.get("cumulative_anamoly_score") + + point = SensorPoint( + timestamp=_ensure_datetime(timestamp), + anomaly_score=_to_optional_float(anomaly_score), + threshold_score=_to_optional_float(threshold_value), + behaviour=str(anomaly_row.get("behaviour")) if anomaly_row.get("behaviour") is not None else None, + cumulative_anomaly_score=int(cumulative_score) if pd.notna(cumulative_score) else None, + sensors=sensors, + ) + points.append(point) + + mean_importances = ( + {feature: float(value) for feature, value in arcana_mean_importances.items()} + if arcana_mean_importances is not None + else {} + ) + + if arcana_losses is not None: + losses_payload = ( + arcana_losses.reset_index() + .rename(columns={arcana_losses.index.name or "index": "iteration"}) + .to_dict(orient="records") + ) + else: + losses_payload = None + + return EventSensorData( + event_id=event_id, + points=points, + arcana_mean_importances=mean_importances, + arcana_losses=losses_payload, + ) + + +def run_prediction( + request: PredictionRequest, + detector_loader: Optional[Callable[[str], FaultDetector]] = None, + events_factory: Optional[ + Callable[[pd.DataFrame, pd.Series, int], Tuple[pd.DataFrame, List[pd.DataFrame]]] + ] = None, + event_analyzer: Optional[ + Callable[[FaultDetector, pd.DataFrame, bool], Tuple[pd.Series, pd.DataFrame]] + ] = None, +) -> PredictionSuccessResponse: + """Execute a prediction run and build the response payload.""" + + if not request.data: + raise EmptyInputError("No prediction data provided.") + + timestamp_column = request.timestamp_column or DEFAULT_TIMESTAMP_COLUMN + + data_frame = pd.DataFrame(request.data) + if data_frame.empty: + raise EmptyInputError("No prediction data provided.") + + if timestamp_column not in data_frame.columns: + raise TimestampValidationError( + f"Timestamp column '{timestamp_column}' is missing from the input data." + ) + + try: + timestamp_index = pd.to_datetime(data_frame[timestamp_column], errors="raise") + except (ValueError, TypeError) as exc: + raise TimestampValidationError( + f"Failed to parse timestamp column '{timestamp_column}': {exc}" + ) from exc + + feature_data = data_frame.drop(columns=[timestamp_column]) + feature_data.index = timestamp_index + original_columns = list(feature_data.columns) + + sensor_data = get_sensor_data(feature_data.copy()) + if sensor_data.empty: + raise DataTypeMismatchError( + "None of the provided features could be converted to numeric sensor values." + ) + + loader = detector_loader or _load_fault_detector + detector = loader(request.model_path) + + expected_columns = _extract_expected_columns(detector.data_preprocessor) + if expected_columns: + _validate_input_schema(sensor_data, expected_columns, original_columns) + sensor_data = sensor_data[expected_columns] + else: + expected_columns = list(sensor_data.columns) + + sensor_data = sensor_data.sort_index() + + prediction_results: FaultDetectionResult = detector.predict( + sensor_data=sensor_data, root_cause_analysis=True + ) + predicted_anomalies = prediction_results.predicted_anomalies.copy() + anomalies = predicted_anomalies["anomaly"] + + event_creator = events_factory or create_events + event_meta_data, event_data_list = event_creator( + sensor_data=sensor_data, + boolean_information=anomalies, + min_event_length=request.min_event_length, + ) + + event_ids = list(range(1, len(event_data_list) + 1)) + + predicted_anomalies["event_id"] = pd.Series( + pd.NA, index=predicted_anomalies.index, dtype="Int64" + ) + predicted_anomalies["critical_event"] = False + for event_id, event_data in zip(event_ids, event_data_list): + event_index = event_data.index + predicted_anomalies.loc[event_index, "event_id"] = event_id + predicted_anomalies.loc[event_index, "critical_event"] = True + + analyzer = event_analyzer or analyze_event + event_sensor_payload: List[EventSensorData] = [] + for event_id, event_data in zip(event_ids, event_data_list): + arcana_mean_importances, arcana_losses = analyzer( + anomaly_detector=detector, + event_data=event_data, + track_losses=False, + ) + event_sensor_payload.append( + _build_event_sensor_payload( + event_id=event_id, + event_data=event_data, + predicted_anomalies=predicted_anomalies, + arcana_mean_importances=arcana_mean_importances, + arcana_losses=arcana_losses if arcana_losses is not None and not arcana_losses.empty else None, + ) + ) + + events_payload: List[EventMetadata] = [] + for event_id, (_, meta_row) in zip(event_ids, event_meta_data.iterrows()): + duration = meta_row["duration"] + if isinstance(duration, pd.Timedelta): + duration_seconds = float(duration.total_seconds()) + else: + duration_seconds = float(duration) + events_payload.append( + EventMetadata( + event_id=event_id, + start=_ensure_datetime(meta_row["start"]), + end=_ensure_datetime(meta_row["end"]), + duration_seconds=duration_seconds, + ) + ) + + logger.info( + "Prediction finished for model '%s'. Detected %d anomaly events.", + request.model_path, + len(events_payload), + ) + + return PredictionSuccessResponse( + status="success", + events=events_payload, + event_sensor_data=event_sensor_payload, + ) + + +app = FastAPI(title="Energy Fault Detector Prediction API") + + +@app.post( + "/predict", + response_model=PredictionSuccessResponse, + responses={ + 404: {"description": "Model not found."}, + 409: {"description": "Schema mismatch."}, + 422: {"description": "Data validation error."}, + 500: {"description": "Internal server error."}, + }, +) +async def predict(request: PredictionRequest) -> PredictionSuccessResponse: + """Predict anomalies for the provided payload.""" + + try: + return run_prediction(request) + except PredictionAPIError as exc: + logger.exception("Prediction request failed: %s", exc.message) + raise HTTPException( + status_code=exc.status_code, + detail={"status": "error", "code": exc.error_code, "message": exc.message}, + ) from exc + except Exception as exc: # pragma: no cover - safeguard for unexpected failures + logger.exception("Unexpected error during prediction request handling.") + raise HTTPException( + status_code=500, + detail={ + "status": "error", + "code": "EFD_INTERNAL_ERROR", + "message": "Unexpected internal server error.", + }, + ) from exc + + +__all__ = [ + "app", + "run_prediction", + "PredictionRequest", + "PredictionSuccessResponse", + "PredictionAPIError", + "SchemaMismatchError", + "DataTypeMismatchError", + "InvalidModelError", + "TimestampValidationError", + "EmptyInputError", +] diff --git a/energy_fault_detector/main.py b/energy_fault_detector/main.py index f47ca61..d57314d 100644 --- a/energy_fault_detector/main.py +++ b/energy_fault_detector/main.py @@ -137,7 +137,7 @@ def main(): # Call the quick_fault_detector function with parsed arguments try: from .quick_fault_detection import quick_fault_detector - prediction_results, event_meta_data, model_metadata = quick_fault_detector( + prediction_results, event_meta_data, _event_details, model_metadata = quick_fault_detector( csv_data_path=args.csv_data_path, csv_test_data_path=options.csv_test_data_path, train_test_column_name=options.train_test_column_name, diff --git a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py index b91020f..3fabf22 100644 --- a/energy_fault_detector/quick_fault_detection/quick_fault_detector.py +++ b/energy_fault_detector/quick_fault_detection/quick_fault_detector.py @@ -3,7 +3,7 @@ import os import logging from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Dict, Any try: from typing import Literal except ImportError: # pragma: no cover - for Python<3.8 compatibility @@ -40,7 +40,12 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio model_subdir: Optional[str] = None, model_name: Optional[str] = None, asset_name: Optional[str] = None - ) -> Tuple[FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]]: + ) -> Tuple[ + FaultDetectionResult, + pd.DataFrame, + List[Dict[str, Any]], + Optional[ModelMetadata], + ]: """Analyzes provided data using an autoencoder based approach for identifying anomalies based on a learned normal behavior. Anomalies are then aggregated to events and further analyzed. Runs the entire fault detection module chain in one function call. Sections of this function call are: @@ -107,7 +112,8 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio is inferred from ``csv_test_data_path`` when possible. Returns: - Tuple(FaultDetectionResult, pd.DataFrame, Optional[ModelMetadata]): FaultDetectionResult object with the + Tuple(FaultDetectionResult, pd.DataFrame, List[Dict[str, Any]], Optional[ModelMetadata]): FaultDetectionResult + object with the following DataFrames: - predicted_anomalies: DataFrame with a column 'anomaly' (bool). @@ -119,7 +125,9 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio - tracked_bias: List of DataFrames. None if ARCANA was not run. and the detected anomaly events as dataframe. When run in prediction mode, all DataFrames contained in the - ``FaultDetectionResult`` are additionally persisted as CSV files under ``prediction_output/``. + ``FaultDetectionResult`` are additionally persisted as CSV files under ``prediction_output/``. The + second return value contains event metadata while the third element of the tuple holds a list of dictionaries + with detailed information for every detected event (sensor data slices and ARCANA summaries). When run in training mode, the tuple additionally contains ModelMetadata with information about the saved model. In prediction mode, the metadata element is None. @@ -198,8 +206,8 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio predicted_anomalies.loc[event_index, 'critical_event'] = True prediction_results.predicted_anomalies = predicted_anomalies - arcana_mean_importance_list = [] - arcana_loss_list = [] + arcana_mean_importance_list: List[pd.Series] = [] + arcana_loss_list: List[Optional[pd.DataFrame]] = [] for i in range(len(event_meta_data)): logger.info(f'Analyzing anomaly events ({i + 1} of {len(event_meta_data)}).') event_data = event_data_list[i] @@ -208,8 +216,10 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio event_data=event_data, track_losses=enable_debug_plots) arcana_mean_importance_list.append(arcana_mean_importances) - if len(arcana_losses) > 0: + if arcana_losses is not None and not arcana_losses.empty: arcana_loss_list.append(arcana_losses) + else: + arcana_loss_list.append(None) logger.info('Generating Output Graphics.') logger.info(output_info) generate_output_plots(anomaly_detector=anomaly_detector, train_data=train_data, normal_index=train_normal_index, @@ -224,7 +234,17 @@ def quick_fault_detector(csv_data_path: Optional[str], csv_test_data_path: Optio asset_name=asset_name, ) - return prediction_results, event_meta_data, model_metadata + event_analysis: List[Dict[str, Any]] = [] + for event_id, event_data, importances, losses in zip(event_ids, event_data_list, + arcana_mean_importance_list, arcana_loss_list): + event_analysis.append({ + 'event_id': event_id, + 'event_data': event_data, + 'arcana_mean_importances': importances, + 'arcana_losses': losses, + }) + + return prediction_results, event_meta_data, event_analysis, model_metadata def analyze_event(anomaly_detector: FaultDetector, event_data: pd.DataFrame, track_losses: bool diff --git a/requirements.txt b/requirements.txt index 7c0686a..eb36231 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,7 @@ pandas>=2.0 cerberus matplotlib>=3.9.0 optuna>=4.0 -chardet>=5.0 \ No newline at end of file +chardet>=5.0 +fastapi>=0.110,<1.0 +uvicorn>=0.23,<0.32 +requests>=2.31,<3.0 diff --git a/run.py b/run.py index dfb1f3e..6e3a048 100644 --- a/run.py +++ b/run.py @@ -150,7 +150,7 @@ def analyze_dataframe( # from energy_fault_detector.quick_fault_detection import quick_fault_detector -prediction_results, events, metadata = quick_fault_detector( +prediction_results, events, _event_details, metadata = quick_fault_detector( csv_data_path=temp_csv_path, csv_test_data_path=test_file_path, mode="train", diff --git a/run_predict.py b/run_predict.py index 9360997..630f906 100644 --- a/run_predict.py +++ b/run_predict.py @@ -188,7 +188,7 @@ def main() -> None: train_data_path = args.csv_train_data - prediction_results, event_metadata, _ = quick_fault_detector( + prediction_results, event_metadata, _event_details, _ = quick_fault_detector( csv_data_path=train_data_path, csv_test_data_path=args.csv_predict_data, train_test_column_name=args.train_test_column, diff --git a/tests/api/test_prediction_api.py b/tests/api/test_prediction_api.py new file mode 100644 index 0000000..8cd57ac --- /dev/null +++ b/tests/api/test_prediction_api.py @@ -0,0 +1,239 @@ +from __future__ import annotations + +from types import SimpleNamespace +from typing import Tuple + +import pandas as pd +import pytest +from fastapi.testclient import TestClient + +from energy_fault_detector.api import prediction_api +from energy_fault_detector.api.prediction_api import ( + DataTypeMismatchError, + EmptyInputError, + InvalidModelError, + PredictionRequest, + PredictionSuccessResponse, + SchemaMismatchError, + TimestampValidationError, + run_prediction, +) +from energy_fault_detector.core.fault_detection_result import FaultDetectionResult + + +def _create_detector(result: FaultDetectionResult, expected_columns: Tuple[str, ...]): + column_selector = SimpleNamespace(feature_names_out_=list(expected_columns)) + imputer = SimpleNamespace(feature_names_in_=list(expected_columns)) + data_preprocessor = SimpleNamespace( + named_steps={"column_selector": column_selector, "imputer": imputer}, + get_feature_names_out=lambda: list(expected_columns), + ) + + class _Detector: + data_preprocessor = data_preprocessor + + @staticmethod + def predict(sensor_data: pd.DataFrame, root_cause_analysis: bool = False): # pylint: disable=unused-argument + return result + + return _Detector() + + +def _sample_fault_detection_result(timestamps: pd.DatetimeIndex) -> FaultDetectionResult: + sensor_values = pd.DataFrame( + { + "sensor_a": [1.0, 2.0, 3.0], + "sensor_b": [0.5, 0.6, 0.7], + }, + index=timestamps, + ) + + predicted_anomalies = pd.DataFrame( + { + "anomaly": [False, True, True], + "behaviour": ["normal", "anamoly", "anamoly"], + "anamoly_score": [0.1, 0.9, 1.1], + "threshold_score": [0.8, 0.8, 0.8], + "cumulative_anamoly_score": [0, 1, 2], + }, + index=timestamps, + ) + + anomaly_score = pd.DataFrame({"value": [0.1, 0.9, 1.1]}, index=timestamps) + + return FaultDetectionResult( + predicted_anomalies=predicted_anomalies, + reconstruction=sensor_values, + recon_error=sensor_values, + anomaly_score=anomaly_score, + bias_data=None, + arcana_losses=None, + tracked_bias=None, + ) + + +def test_run_prediction_success(): + timestamps = pd.to_datetime( + ["2024-01-01T00:00:00Z", "2024-01-01T00:01:00Z", "2024-01-01T00:02:00Z"] + ) + result = _sample_fault_detection_result(timestamps) + + detector = _create_detector(result, ("sensor_a", "sensor_b")) + + request = PredictionRequest( + model_path="dummy", + data=[ + {"time_stamp": ts.isoformat(), "sensor_a": float(idx + 1), "sensor_b": 0.5 + idx * 0.1} + for idx, ts in enumerate(timestamps) + ], + timestamp_column="time_stamp", + min_event_length=1, + ) + + event_data = result.reconstruction.iloc[1:] + event_meta = pd.DataFrame( + { + "start": [event_data.index[0]], + "end": [event_data.index[-1]], + "duration": [event_data.index[-1] - event_data.index[0]], + } + ) + + def events_factory(sensor_data: pd.DataFrame, boolean_information: pd.Series, min_event_length: int): # pylint: disable=unused-argument + return event_meta, [event_data] + + def event_analyzer(detector_obj, data, track_losses): # pylint: disable=unused-argument + importances = pd.Series({"sensor_a": 0.6, "sensor_b": 0.4}) + losses = pd.DataFrame( + {"Combined Loss": [0.2, 0.1], "Reconstruction Loss": [0.15, 0.05], "Regularization Loss": [0.05, 0.05]}, + index=[0, 50], + ) + losses.index.name = "Iteration" + return importances, losses + + response = run_prediction( + request, + detector_loader=lambda _: detector, + events_factory=events_factory, + event_analyzer=event_analyzer, + ) + + assert isinstance(response, PredictionSuccessResponse) + assert response.status == "success" + assert len(response.events) == 1 + assert response.events[0].event_id == 1 + assert len(response.event_sensor_data) == 1 + assert len(response.event_sensor_data[0].points) == len(event_data) + assert response.event_sensor_data[0].arcana_mean_importances == {"sensor_a": 0.6, "sensor_b": 0.4} + + +def test_run_prediction_schema_mismatch(): + timestamps = pd.to_datetime(["2024-01-01T00:00:00Z", "2024-01-01T00:01:00Z"]) + result = _sample_fault_detection_result(timestamps) + detector = _create_detector(result, ("sensor_a", "sensor_b")) + + request = PredictionRequest( + model_path="dummy", + data=[ + {"time_stamp": ts.isoformat(), "sensor_a": float(idx + 1)} + for idx, ts in enumerate(timestamps) + ], + timestamp_column="time_stamp", + min_event_length=1, + ) + + with pytest.raises(SchemaMismatchError): + run_prediction(request, detector_loader=lambda _: detector, events_factory=lambda *args, **kwargs: (pd.DataFrame(), [])) + + +def test_run_prediction_dtype_mismatch(): + timestamps = pd.to_datetime(["2024-01-01T00:00:00Z", "2024-01-01T00:01:00Z"]) + result = _sample_fault_detection_result(timestamps) + detector = _create_detector(result, ("sensor_a", "sensor_b")) + + request = PredictionRequest( + model_path="dummy", + data=[ + {"time_stamp": ts.isoformat(), "sensor_a": float(idx + 1), "sensor_b": "bad"} + for idx, ts in enumerate(timestamps) + ], + timestamp_column="time_stamp", + min_event_length=1, + ) + + with pytest.raises(DataTypeMismatchError): + run_prediction(request, detector_loader=lambda _: detector, events_factory=lambda *args, **kwargs: (pd.DataFrame(), [])) + + +def test_run_prediction_timestamp_validation(): + request = PredictionRequest( + model_path="dummy", + data=[{"sensor_a": 1.0, "sensor_b": 2.0}], + timestamp_column="time_stamp", + min_event_length=1, + ) + + with pytest.raises(TimestampValidationError): + run_prediction(request, detector_loader=lambda _: None, events_factory=lambda *args, **kwargs: (pd.DataFrame(), [])) + + +def test_run_prediction_invalid_model(): + timestamps = pd.to_datetime(["2024-01-01T00:00:00Z"]) + request = PredictionRequest( + model_path="dummy", + data=[{"time_stamp": timestamps[0].isoformat(), "sensor_a": 1.0, "sensor_b": 2.0}], + timestamp_column="time_stamp", + min_event_length=1, + ) + + def failing_loader(_: str): + raise InvalidModelError("model missing") + + with pytest.raises(InvalidModelError): + run_prediction(request, detector_loader=failing_loader, events_factory=lambda *args, **kwargs: (pd.DataFrame(), [])) + + +def test_run_prediction_empty_payload(): + request = PredictionRequest.construct( + model_path="dummy", + data=[], + timestamp_column="time_stamp", + min_event_length=1, + ) + + with pytest.raises(EmptyInputError): + run_prediction(request, detector_loader=lambda _: None, events_factory=lambda *args, **kwargs: (pd.DataFrame(), [])) + + +def test_predict_endpoint_success(monkeypatch): + response_payload = PredictionSuccessResponse(status="success", events=[], event_sensor_data=[]) + monkeypatch.setattr(prediction_api, "run_prediction", lambda req: response_payload) + client = TestClient(prediction_api.app) + + payload = { + "model_path": "dummy", + "data": [{"time_stamp": "2024-01-01T00:00:00Z", "sensor_a": 1.0, "sensor_b": 2.0}], + } + + response = client.post("/predict", json=payload) + assert response.status_code == 200 + assert response.json()["status"] == "success" + + +def test_predict_endpoint_error(monkeypatch): + def failing_run_prediction(req): # pylint: disable=unused-argument + raise SchemaMismatchError("missing columns") + + monkeypatch.setattr(prediction_api, "run_prediction", failing_run_prediction) + client = TestClient(prediction_api.app) + + payload = { + "model_path": "dummy", + "data": [{"time_stamp": "2024-01-01T00:00:00Z", "sensor_a": 1.0}], + } + + response = client.post("/predict", json=payload) + assert response.status_code == 409 + detail = response.json()["detail"] + assert detail["code"] == "EFD_SCHEMA_MISMATCH" + assert detail["status"] == "error"