85 changes: 85 additions & 0 deletions tab_err/api/high_level.py
@@ -214,6 +214,7 @@ def create_errors( # noqa: PLR0913
Defaults to None.
seed (int | None, optional): Random seed. Defaults to None.


Returns:
tuple[pd.DataFrame, pd.DataFrame]:
- The first element is a copy of 'data' with errors.
@@ -268,3 +269,87 @@ def create_errors( # noqa: PLR0913
# Create Errors & Return
dirty_data, error_mask = mid_level.create_errors(data_copy, config)
return dirty_data, error_mask


def create_errors_with_config( # noqa: PLR0913
data: pd.DataFrame,
error_rate: float,
n_error_models_per_column: int = 1,
error_types_to_include: list[ErrorType] | None = None,
error_types_to_exclude: list[ErrorType] | None = None,
error_mechanisms_to_include: list[ErrorMechanism] | None = None,
error_mechanisms_to_exclude: list[ErrorMechanism] | None = None,
seed: int | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, MidLevelConfig]:
"""Creates errors in a given DataFrame, at a rate of *approximately* max_error_rate and returns the config used to do so.

Args:
data (pd.DataFrame): The pandas DataFrame to create errors in.
error_rate (float): The maximum error rate to be introduced to each column in the DataFrame.
n_error_models_per_column (int, optional): The number of valid error models to apply to each column. Defaults to 1.
error_types_to_include (list[ErrorType] | None, optional): A list of the error types to be included when building error models. Defaults to None.
error_types_to_exclude (list[ErrorType] | None, optional): A list of the error types to be excluded when building error models. Defaults to None.
When both error_types_to_include and error_types_to_exclude are None, the maximum number of default error types will be used.
At least one of the two must be None; otherwise an error is raised.
error_mechanisms_to_include (list[ErrorMechanism] | None, optional): A list of the error mechanisms to be included when building error models.
Defaults to None.
error_mechanisms_to_exclude (list[ErrorMechanism] | None, optional): A list of the error mechanisms to be excluded when building error models.
Defaults to None.
seed (int | None, optional): Random seed. Defaults to None.


Returns:
tuple[pd.DataFrame, pd.DataFrame, MidLevelConfig]:
- The first element is a copy of 'data' with errors.
- The second element is the associated error mask.
- The third element is the MidLevelConfig mapping each column to the list of error models applied to it.
"""
random_generator = seed_randomness_and_get_generator(seed=seed)
# Input Checking
check_error_rate(error_rate)
check_data_emptiness(data)

# Set Up Data
data_copy = data.copy()
error_mask = pd.DataFrame(data=False, index=data.index, columns=data.columns)

# Build Dictionaries
col_type = _build_column_type_dictionary(
data=data, random_generator=random_generator, error_types_to_include=error_types_to_include, error_types_to_exclude=error_types_to_exclude
)
col_mechanisms = _build_column_mechanism_dictionary(
data=data,
random_generator=random_generator,
error_mechanisms_to_include=error_mechanisms_to_include,
error_mechanisms_to_exclude=error_mechanisms_to_exclude,
)
col_num_models = _build_column_number_of_models_dictionary(data=data, column_types=col_type, column_mechanisms=col_mechanisms)

if n_error_models_per_column > 0:
error_rate = error_rate / n_error_models_per_column
config_dictionary: dict[str | int, list[ErrorModel]] = {
column: [] for column in data.columns if col_num_models[column] > 0
} # Filter out those columns with no valid error models

if error_rate * len(data) < 1: # n_errors is computed in the _sample method of the error mechanism subclasses and rounds to 0 in this case
msg = f"With a per-model error rate of {error_rate} and {len(data)} rows, 0 errors will be introduced."
warnings.warn(msg, stacklevel=2)

for column, error_model_list in config_dictionary.items():
for _ in range(n_error_models_per_column):
error_model_list.append(
ErrorModel(
# NOTE: in python 3.9 mypy fails here but tests work
error_type=random_generator.choice(col_type[column]), # type: ignore[arg-type]
error_mechanism=random_generator.choice(col_mechanisms[column]), # type: ignore[arg-type]
error_rate=error_rate,
)
)
config = MidLevelConfig(config_dictionary)
else: # n_error_models_per_column is 0 or less.
msg = f"n_error_models_per_column is: {n_error_models_per_column} and should be a positive integer"
raise ValueError(msg)

# Create Errors & Return
dirty_data, error_mask = mid_level.create_errors(data_copy, config)
return dirty_data, error_mask, config
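
A minimal usage sketch of the new high-level function (the DataFrame contents, error rate, and seed below are illustrative and not part of this diff):

import pandas as pd

from tab_err.api.high_level import create_errors_with_config

df = pd.DataFrame({"age": [23, 35, 41, 58], "city": ["Berlin", "Paris", "Rome", "Oslo"]})

# Corrupt roughly 50% of each column and keep the MidLevelConfig that was generated.
# Note: if error_rate / n_error_models_per_column * len(df) < 1, a warning is raised and no errors are introduced.
dirty, mask, config = create_errors_with_config(df, error_rate=0.5, n_error_models_per_column=1, seed=42)

# `config` maps each column to the error models applied, so the identical corruption
# can be re-applied later via mid_level.create_errors(df.copy(), config).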
95 changes: 95 additions & 0 deletions tab_err/api/object_oriented.py
@@ -0,0 +1,95 @@
from __future__ import annotations

import pickle
from pathlib import Path
from typing import TYPE_CHECKING

from tab_err.api.high_level import create_errors, create_errors_with_config

if TYPE_CHECKING:
import pandas as pd
from typing_extensions import Self

from tab_err import ErrorMechanism, ErrorType
from tab_err.api import MidLevelConfig


class ErrorInjector:
"""Object-oriented wrapper around the high-level error creation API.

This class allows:
- Reproducible error injection
- Access to the configuration used
- Serialization of the configuration for later reuse
"""

def __init__( # noqa: PLR0913
self,
*,
error_rate: float,
n_error_models_per_column: int = 1,
error_types_to_include: list[ErrorType] | None = None,
error_types_to_exclude: list[ErrorType] | None = None,
error_mechanisms_to_include: list[ErrorMechanism] | None = None,
error_mechanisms_to_exclude: list[ErrorMechanism] | None = None,
seed: int | None = None,
) -> None:
"""Initialize the ErrorInjector object."""
self._error_rate = error_rate
self._n_error_models_per_column = n_error_models_per_column
self._error_types_to_include = error_types_to_include
self._error_types_to_exclude = error_types_to_exclude
self._error_mechanisms_to_include = error_mechanisms_to_include
self._error_mechanisms_to_exclude = error_mechanisms_to_exclude
self._seed = seed

self._last_config: MidLevelConfig | None = None

@property
def config(self) -> MidLevelConfig:
"""Return the configuration used in the most recent error creation."""
if self._last_config is None:
msg = "No configuration available. Call apply_with_config() first."
raise RuntimeError(msg)
return self._last_config

def apply(self, data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Apply errors to a DataFrame without retaining the configuration."""
dirty_data, error_mask = create_errors(
data=data,
error_rate=self._error_rate,
n_error_models_per_column=self._n_error_models_per_column,
error_types_to_include=self._error_types_to_include,
error_types_to_exclude=self._error_types_to_exclude,
error_mechanisms_to_include=self._error_mechanisms_to_include,
error_mechanisms_to_exclude=self._error_mechanisms_to_exclude,
seed=self._seed,
)
return dirty_data, error_mask

def apply_with_config(self, data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Apply errors to a DataFrame and store the configuration used."""
dirty_data, error_mask, config = create_errors_with_config(
data=data,
error_rate=self._error_rate,
n_error_models_per_column=self._n_error_models_per_column,
error_types_to_include=self._error_types_to_include,
error_types_to_exclude=self._error_types_to_exclude,
error_mechanisms_to_include=self._error_mechanisms_to_include,
error_mechanisms_to_exclude=self._error_mechanisms_to_exclude,
seed=self._seed,
)
self._last_config = config
return dirty_data, error_mask

def save_config(self, path: str | Path) -> None:
"""Serialize the last-used configuration to disk."""
with Path(path).open("wb") as f:
pickle.dump(self.config, f)

@classmethod
def from_config(cls, config: MidLevelConfig) -> Self:
"""Create an injector that reuses an existing configuration."""
injector = cls(error_rate=0.0)
injector._last_config = config
return injector
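
A minimal usage sketch of the ErrorInjector wrapper (the DataFrame, error rate, and file name are illustrative and not part of this diff):

import pandas as pd

from tab_err.api.object_oriented import ErrorInjector

df = pd.DataFrame({"age": [23, 35, 41, 58], "city": ["Berlin", "Paris", "Rome", "Oslo"]})

injector = ErrorInjector(error_rate=0.5, seed=42)
dirty, mask = injector.apply_with_config(df)  # stores the generated MidLevelConfig on the injector
injector.save_config("error_config.pkl")  # pickle the stored configuration for later reuse

# An injector can also be rebuilt around an existing configuration:
restored = ErrorInjector.from_config(injector.config)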
8 changes: 8 additions & 0 deletions tab_err/error_mechanism/_ear.py
@@ -79,3 +79,11 @@ def _sample(self: EAR, data: pd.DataFrame, column: str | int, error_rate: float,
se_mask.loc[selected_rows.index] = True

return error_mask

def __str__(self) -> str:
"""Return a human-readable string for the object."""
return f"EAR(condition_to_column={self.condition_to_column}, seed={getattr(self, '_random_generator', None)})"

def __repr__(self) -> str:
"""Return a detailed string for debugging."""
return f"<EAR condition_to_column={self.condition_to_column}, random_generator={getattr(self, '_random_generator', None)}>"
8 changes: 8 additions & 0 deletions tab_err/error_mechanism/_ecar.py
@@ -60,3 +60,11 @@ def _sample(
error_indices = self._random_generator.choice(se_mask_error_free.index, n_errors, replace=False)
se_mask[error_indices] = True
return error_mask

def __str__(self) -> str:
"""Return a human-readable string for the object."""
return f"ECAR(random_generator={self._random_generator})"

def __repr__(self) -> str:
"""Return a detailed string for debugging."""
return f"<ECAR random_generator={self._random_generator}>"
6 changes: 6 additions & 0 deletions tab_err/error_mechanism/_enar.py
@@ -66,3 +66,9 @@ def _sample(self: ENAR, data: pd.DataFrame, column: str | int, error_rate: float
se_mask.loc[selected_rows.index] = True

return error_mask

def __str__(self) -> str:
"""Return a human-readable string for the object."""
return f"ENAR(random_generator={self._random_generator})"

def __repr__(self) -> str:
"""Return a detailed string for debugging."""
return f"<ENAR random_generator={self._random_generator}>"