Skip to content
3 changes: 3 additions & 0 deletions tab_err/error_type/_typo.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ def typo(input_text: str, typo_error_period: int = 10, layout: str = "ansi-qwert
message = "typo_error_period smaller than 1 is invalid, as multiple errors per word are not supported."
raise ValueError(message)

if input_text == "": # return random char if empty string
return random.choice(list(neighbors.keys()))

splits = input_text.split(" ")

# draw only from splits that have a content
Expand Down
Empty file added tests/api/__init__.py
Empty file.
106 changes: 106 additions & 0 deletions tests/api/test_high_level_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import numpy as np
import pandas as pd
import pytest

from tab_err.api.high_level import create_errors


class TestHighLevelAPI:
    """Tests the high-level API."""

    @staticmethod
    def _assert_valid_result(original: pd.DataFrame, modified: pd.DataFrame, error_mask: pd.DataFrame, error_rate: float) -> None:
        """Check the properties these tests expect of a ``create_errors`` result.

        Args:
            original: The DataFrame that was passed to ``create_errors``.
            modified: The first element returned by ``create_errors``.
            error_mask: The second element returned by ``create_errors``.
            error_rate: The error rate that was requested.
        """
        # Check that they are still dataframes
        assert isinstance(modified, pd.DataFrame)
        assert isinstance(error_mask, pd.DataFrame)

        # Check shapes
        assert modified.shape == original.shape
        assert error_mask.shape == original.shape

        # Assert the error mask contains only boolean columns
        assert error_mask.dtypes.apply(lambda dt: np.issubdtype(dt, np.bool_)).all()

        # Assert that the error mask has the correct proportion of True to False
        assert pytest.approx(error_rate) == error_mask.to_numpy().mean()

    def test_create_errors_basic(self, test_data: dict[str, pd.DataFrame]) -> None:
        """Test that create_errors returns two DataFrames with expected properties."""
        seed = 42
        error_rate = 0.5

        for name in ("data_4rows_5columns", "data_10rows_3columns"):
            original = test_data[name]
            modified, error_mask = create_errors(original, error_rate, seed=seed)
            self._assert_valid_result(original, modified, error_mask, error_rate)

    def test_create_errors_seed(self, test_data: dict[str, pd.DataFrame]) -> None:
        """Test that create_errors returns the same dataframes when the same seed is used twice."""
        seed = 42
        error_rate = 0.5
        data = test_data["data_10rows_3columns"]

        first_modified, first_mask = create_errors(data, error_rate=error_rate, seed=seed)
        second_modified, second_mask = create_errors(data, error_rate=error_rate, seed=seed)

        # Ensure same seed yields same dataframes
        pd.testing.assert_frame_equal(first_modified, second_modified)
        pd.testing.assert_frame_equal(first_mask, second_mask)

    def test_create_errors_error_rates(self, test_data: dict[str, pd.DataFrame]) -> None:
        """Test that the share of True cells in the error mask matches each requested error rate."""
        seed = 42
        dataset_names = ("data_100rows_3columns", "data_10rows_3columns", "data_10rows_3columns_with_datetime")

        # Sweep the error rate from 0.0 to 1.0 in steps of 0.1 (up to float rounding).
        for i in range(11):
            error_rate = 0.1 * float(i)
            for name in dataset_names:
                # Only the mask is inspected here; the modified data is intentionally discarded.
                _, error_mask = create_errors(test_data[name], error_rate, seed=seed)

                # Assert that the error mask has the correct proportion of True to False
                assert pytest.approx(error_rate) == error_mask.to_numpy().mean(), f"dataset={name!r}, error_rate={error_rate}"

    def test_create_errors_more_models(self, test_data: dict[str, pd.DataFrame]) -> None:
        """Test that when more error models are introduced, the create_errors method has expected DataFrame return."""
        error_rate = 1.0
        seed = 42
        n_error_models = 2

        for name in ("data_4rows_5columns", "data_10rows_3columns", "data_100rows_3columns"):
            original = test_data[name]
            modified, error_mask = create_errors(original, error_rate, n_error_models_per_column=n_error_models, seed=seed)
            self._assert_valid_result(original, modified, error_mask, error_rate)
24 changes: 24 additions & 0 deletions tests/api/test_low_level_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import pandas as pd
import pytest

from tab_err import error_mechanism, error_type
from tab_err.api.low_level import create_errors


class TestLowLevelAPI:
    """Tests the low-level API."""

    def test_create_errors_error_rates(self, test_data: dict[str, pd.DataFrame]) -> None:
        """Test that the error mask has the expected share of True cells when a single column is errored."""
        errored_column = "A"

        # Sweep the error rate from 0.0 to 1.0 in steps of 0.1 (up to float rounding).
        for i in range(11):
            error_rate = 0.1 * float(i)
            for name in ("data_100rows_3columns", "data_10rows_3columns"):
                data = test_data[name]
                # Only the mask is inspected here; the modified data is intentionally discarded.
                _, error_mask = create_errors(data, errored_column, error_rate, error_mechanism.ECAR(), error_type.AddDelta())

                # Only one column is errored, so averaged over the whole mask the expected
                # proportion of True cells is the error rate divided by the number of columns.
                expected_proportion = error_rate / data.shape[1]
                assert pytest.approx(expected_proportion) == error_mask.to_numpy().mean(), f"dataset={name!r}, error_rate={error_rate}"
19 changes: 19 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def test_data() -> dict[str, pd.DataFrame]:
    """Build the named DataFrames used as input by the tests.

    All random columns are drawn from one generator seeded with 42, and the frames are
    built in a fixed order, so every invocation of the fixture yields the same contents.

    Returns:
        A mapping from dataset name to DataFrame.
    """
    generator = np.random.default_rng(42)
    labels = ["X", "Y", "Z"]

    # NOTE: the frames below must be created in exactly this order (and their columns in
    # this order), because each draw advances the shared generator.
    frame_10x3 = pd.DataFrame(
        {
            "A": generator.integers(0, 100, 10),
            "B": generator.random(10),
            "C": generator.choice(labels, 10),
        }
    )
    frame_4x5 = pd.DataFrame(
        {
            "A": generator.integers(0, 100, 4),
            "B": generator.random(4),
            "C": generator.choice(labels, 4),
            "D": generator.integers(0, 100, 4),
            "E": generator.random(4),
        }
    )
    frame_100x3 = pd.DataFrame(
        {
            "A": generator.integers(0, 100, 100),
            "B": generator.random(100),
            "C": generator.choice(labels, 100),
        }
    )
    frame_10x3_datetime = pd.DataFrame(
        {
            "A": generator.integers(0, 100, 10),
            "B": pd.date_range(start="2025-03-04", periods=10, freq="2h"),
            "C": generator.choice(labels, 10),
        }
    )

    return {
        "data_10rows_3columns": frame_10x3,
        "data_4rows_5columns": frame_4x5,
        "data_100rows_3columns": frame_100x3,
        "data_10rows_3columns_with_datetime": frame_10x3_datetime,
    }
Empty file added tests/error_type/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions tests/error_type/test_typo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pandas as pd

from tab_err import error_mechanism, error_type
from tab_err.api.low_level import create_errors


def test_typo() -> None:
    """Check that applying Typo to every cell of a column leaves no empty string in the first cell."""
    # The first entry is deliberately empty: it is the case under test.
    names = ["", "Alice", "Bob", "Bob", "Clara", "David"]
    frame = pd.DataFrame({"A": names})

    # An error rate of 1 requests that every cell of column "A" is errored.
    result = create_errors(frame, "A", 1, error_mechanism.ECAR(), error_type.Typo())
    modified_df = result[0]

    first_cell = modified_df.iloc[0, 0]
    assert first_cell != ""
122 changes: 0 additions & 122 deletions tests/test_high_level.py

This file was deleted.

32 changes: 0 additions & 32 deletions tests/test_low_level.py

This file was deleted.