From 3bf9cebc8977d1b5333dac8bdb7b2717cb5da496 Mon Sep 17 00:00:00 2001
From: William Horton
Date: Sat, 27 Jul 2019 20:53:58 -0400
Subject: [PATCH 1/2] Add new check: matches_regex

---
Review notes (this text sits between the three-dash marker and the diff, so
it is not part of the commit message):

* Each "with pytest.raises(AssertionError):" block in test_matches_regex holds
  two calls. The block is left as soon as the first call raises, so the
  dc.MatchesRegex(...) call underneath it is never executed. Give each call
  its own "with" block if the decorator form should be asserted as well.
* matches_regex delegates to Series.str.match, which anchors `pattern` at the
  start of each value only (r'a.' also accepts 'abc'). Values with trailing
  text are not rejected unless the pattern itself ends with '$'.
* Missing values count as non-matches because of .fillna(False), unless the
  caller overrides that through the pandas `na` keyword; the final assertions
  of test_matches_regex rely on this.

 bulwark/checks.py    | 29 +++++++++++++++++++++++
 tests/test_checks.py | 56 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/bulwark/checks.py b/bulwark/checks.py
index 2b52aba..6e2cc79 100644
--- a/bulwark/checks.py
+++ b/bulwark/checks.py
@@ -408,6 +408,35 @@ def is_same_as(df, df_to_compare, **kwargs):
     return df
 
 
+def matches_regex(df, pattern, columns=None, **kwargs):
+    """Asserts that the values in `df`'s `columns` match `pattern`.
+
+    Args:
+        df (pd.DataFrame): Any pd.DataFrame.
+        pattern (str): Pattern to match against.
+        columns (list): A subset of columns to check for matching `pattern`.
+        **kwargs (dict): Keyword arguments passed through to pandas' ``Series.str.match``.
+
+    Returns:
+        Original `df`.
+
+    """
+    columns = columns if columns is not None else df.columns
+
+    non_matches_df = pd.DataFrame()
+
+    for col in columns:
+        matches = df[col].str.match(pattern, **kwargs).fillna(False)
+        if not matches.all():
+            non_matches_df[col] = ~matches
+
+    if not non_matches_df.empty:
+        msg = bad_locations(non_matches_df)
+        raise AssertionError(msg)
+
+    return df
+
+
 def multi_check(df, checks, warn=False):
     """Asserts that all checks pass.
 
diff --git a/tests/test_checks.py b/tests/test_checks.py
index 552526d..1f438ea 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import re
+
 import pytest
 import numpy as np
 import pandas as pd
@@ -395,6 +397,60 @@ def test_is_same_as_with_kwargs():
     tm.assert_frame_equal(df, result)
 
 
+def test_matches_regex():
+    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
+    pattern = r'a.'
+    result = ck.matches_regex(df, pattern)
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern)(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    result = ck.matches_regex(df, pattern, columns=['A'])
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, columns=['A'])(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    pattern = r'a[a-z]'
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern, columns=['B'])
+        dc.MatchesRegex(pattern, columns=['B'])(_noop)(df)
+
+    pattern = r'aa'
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern)
+        dc.MatchesRegex(pattern)(_noop)(df)
+
+    pattern = 'A.'
+    result = ck.matches_regex(df, pattern, case=False)
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, case=False)(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    pattern = 'A.'
+    result = ck.matches_regex(df, pattern, flags=re.IGNORECASE)
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, flags=re.IGNORECASE)(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+    df = pd.DataFrame({'A': ['aa', 'ab', np.nan]})
+    pattern = 'a.'
+
+    # should raise because of the nan
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern)
+        dc.MatchesRegex(pattern)(_noop)(df)
+
+    result = ck.matches_regex(df, pattern, na='az')
+    tm.assert_frame_equal(df, result)
+
+    result = dc.MatchesRegex(pattern, na='az')(_noop)(df)
+    tm.assert_frame_equal(df, result)
+
+
 def test_multi_check():
     df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
     result = ck.multi_check(df,
From 723fc61a79e4955ebfbc562f4ee50a5a08fe7b1a Mon Sep 17 00:00:00 2001
From: William Horton
Date: Sat, 27 Jul 2019 21:06:32 -0400
Subject: [PATCH 2/2] More organization for tests.

---
Review note (not part of the commit message): the "pattern = r'aa'" block
moved by this patch still places the dc.MatchesRegex(...) call after a call
that raises inside the same "with pytest.raises" block, so that call remains
unexecuted after the reorganization.

 tests/test_checks.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/test_checks.py b/tests/test_checks.py
index 1f438ea..13e6657 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -406,6 +406,15 @@ def test_matches_regex():
     result = dc.MatchesRegex(pattern)(_noop)(df)
     tm.assert_frame_equal(df, result)
 
+    pattern = r'aa'
+    with pytest.raises(AssertionError):
+        ck.matches_regex(df, pattern)
+        dc.MatchesRegex(pattern)(_noop)(df)
+
+
+def test_matches_regex_with_columns():
+    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
+    pattern = r'a.'
     result = ck.matches_regex(df, pattern, columns=['A'])
     tm.assert_frame_equal(df, result)
 
@@ -417,11 +426,9 @@ def test_matches_regex():
         ck.matches_regex(df, pattern, columns=['B'])
         dc.MatchesRegex(pattern, columns=['B'])(_noop)(df)
 
-    pattern = r'aa'
-    with pytest.raises(AssertionError):
-        ck.matches_regex(df, pattern)
-        dc.MatchesRegex(pattern)(_noop)(df)
 
+def test_matches_regex_with_kwargs():
+    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
     pattern = 'A.'
     result = ck.matches_regex(df, pattern, case=False)
     tm.assert_frame_equal(df, result)