diff --git a/bulwark/checks.py b/bulwark/checks.py
index 2b52aba..6e2cc79 100644
--- a/bulwark/checks.py
+++ b/bulwark/checks.py
@@ -408,6 +408,41 @@ def is_same_as(df, df_to_compare, **kwargs):
     return df
 
 
+def matches_regex(df, pattern, columns=None, **kwargs):
+    """Asserts that the values in `df`'s `columns` match `pattern`.
+
+    Args:
+        df (pd.DataFrame): Any pd.DataFrame.
+        pattern (str): Pattern to match against.
+        columns (list): A subset of columns to check for matching `pattern`.
+            Defaults to all of `df`'s columns.
+        **kwargs (dict): Keyword arguments passed through to pandas' ``Series.str.match``.
+
+    Returns:
+        Original `df`.
+
+    Raises:
+        AssertionError: If any checked value does not match `pattern`.
+
+    """
+    columns = columns if columns is not None else df.columns
+
+    non_matches_df = pd.DataFrame()
+
+    for col in columns:
+        # Missing values count as non-matches unless `na` is passed; cast to bool
+        # so `~` is a logical negation even when the result is object dtype.
+        matches = df[col].str.match(pattern, **kwargs).fillna(False).astype(bool)
+        if not matches.all():
+            non_matches_df[col] = ~matches
+
+    if not non_matches_df.empty:
+        msg = bad_locations(non_matches_df)
+        raise AssertionError(msg)
+
+    return df
+
+
 def multi_check(df, checks, warn=False):
     """Asserts that all checks pass.
 
diff --git a/tests/test_checks.py b/tests/test_checks.py
index 552526d..13e6657 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import re
+
 import pytest
 import numpy as np
 import pandas as pd
@@ -395,6 +397,71 @@ def test_is_same_as_with_kwargs():
     tm.assert_frame_equal(df, result)
 
 
+def test_matches_regex():
+    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
+    pattern = r'a.'
+    result = ck.matches_regex(df, pattern)
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern)(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    pattern = r'aa'
+    # Separate blocks: anything after the first raising call would be skipped.
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern)
+    with pytest.raises(AssertionError):
+        dc.MatchesRegex(pattern)(_noop)(df)
+
+
+def test_matches_regex_with_columns():
+    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
+    pattern = r'a.'
+    result = ck.matches_regex(df, pattern, columns=['A'])
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, columns=['A'])(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    pattern = r'a[a-z]'
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern, columns=['B'])
+    with pytest.raises(AssertionError):
+        dc.MatchesRegex(pattern, columns=['B'])(_noop)(df)
+
+
+def test_matches_regex_with_kwargs():
+    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
+    pattern = 'A.'
+    result = ck.matches_regex(df, pattern, case=False)
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, case=False)(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    pattern = 'A.'
+    result = ck.matches_regex(df, pattern, flags=re.IGNORECASE)
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, flags=re.IGNORECASE)(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    df = pd.DataFrame({'A': ['aa', 'ab', np.nan]})
+    pattern = 'a.'
+
+    # should raise because of the nan
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern)
+    with pytest.raises(AssertionError):
+        dc.MatchesRegex(pattern)(_noop)(df)
+
+    result = ck.matches_regex(df, pattern, na='az')
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, na='az')(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+
 def test_multi_check():
     df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
     result = ck.multi_check(df,