Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions bulwark/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,35 @@ def is_same_as(df, df_to_compare, **kwargs):
return df


def matches_regex(df, pattern, columns=None, **kwargs):
    """Asserts that the values in `df`'s `columns` match `pattern`.

    Each checked column is run through pandas' ``Series.str.match``, so
    `pattern` is matched from the start of every value. Missing values count
    as non-matches unless the caller supplies its own ``na`` fill value.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        pattern (str): Pattern to match against.
        columns (list): A subset of columns to check for matching `pattern`.
            Defaults to every column in `df`.
        **kwargs (dict): Keyword arguments passed through to pandas'
            ``Series.str.match``.

    Returns:
        Original `df`.

    Raises:
        AssertionError: If any checked value does not match `pattern`. The
            message is produced by ``bad_locations`` from a frame that flags
            the non-matching cells of each failing column.

    """
    # NOTE(review, @ZaxR): consider accepting a mapping of pattern -> columns so
    # that different columns can be checked against different patterns in one
    # call, instead of invoking this check once per pattern.
    columns = columns if columns is not None else df.columns

    # Let pandas fill missing results with False itself so the mask comes back
    # as a real boolean Series; a caller-supplied `na` still takes precedence.
    kwargs.setdefault("na", False)

    non_matches_df = pd.DataFrame()

    for col in columns:
        matches = df[col].str.match(pattern, **kwargs)
        # A caller-supplied `na` can leave an object-dtype result. Coerce it to
        # bool before negating: `~` on object dtype inverts the underlying
        # Python ints (~True == -2), leaving only truthy values in the mask.
        matches = matches.fillna(False).astype(bool)
        if not matches.all():
            non_matches_df[col] = ~matches

    if not non_matches_df.empty:
        msg = bad_locations(non_matches_df)
        raise AssertionError(msg)

    return df


def multi_check(df, checks, warn=False):
"""Asserts that all checks pass.

Expand Down
63 changes: 63 additions & 0 deletions tests/test_checks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
import re

import pytest
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -395,6 +397,67 @@ def test_is_same_as_with_kwargs():
tm.assert_frame_equal(df, result)


def test_matches_regex():
    """Check `matches_regex` and `MatchesRegex` against every column of a frame."""
    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})

    # Every value is an 'a' followed by one more character, so both the
    # function and the decorator form should hand the frame back untouched.
    pattern = r'a.'
    result = ck.matches_regex(df, pattern)
    tm.assert_frame_equal(df, result)

    result = dc.MatchesRegex(pattern)(_noop)(df)
    tm.assert_frame_equal(df, result)

    # Only one value is 'aa', so both forms must fail. Each call gets its own
    # `raises` block: inside a shared block, anything after the first raising
    # statement never runs and would go unchecked.
    pattern = r'aa'
    with pytest.raises(AssertionError):
        ck.matches_regex(df, pattern)
    with pytest.raises(AssertionError):
        dc.MatchesRegex(pattern)(_noop)(df)


def test_matches_regex_with_columns():
    """Check that `columns` restricts which columns are matched against the pattern."""
    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})

    # Restricting the check to column 'A' passes for both forms.
    pattern = r'a.'
    result = ck.matches_regex(df, pattern, columns=['A'])
    tm.assert_frame_equal(df, result)

    result = dc.MatchesRegex(pattern, columns=['A'])(_noop)(df)
    tm.assert_frame_equal(df, result)

    # Column 'B' contains 'a1', which a letters-only pattern rejects. Each call
    # gets its own `raises` block: inside a shared block, anything after the
    # first raising statement never runs and would go unchecked.
    pattern = r'a[a-z]'
    with pytest.raises(AssertionError):
        ck.matches_regex(df, pattern, columns=['B'])
    with pytest.raises(AssertionError):
        dc.MatchesRegex(pattern, columns=['B'])(_noop)(df)


def test_matches_regex_with_kwargs():
    """Check that extra keyword arguments are forwarded to the regex match."""
    df = pd.DataFrame({'A': ['aa', 'ab', 'ac'], 'B': ['ad', 'ae', 'a1']})
    pattern = 'A.'

    # The upper-case pattern only matches when case is ignored via `case=False`...
    result = ck.matches_regex(df, pattern, case=False)
    tm.assert_frame_equal(df, result)

    result = dc.MatchesRegex(pattern, case=False)(_noop)(df)
    tm.assert_frame_equal(df, result)

    # ...or via the equivalent `re` flag.
    result = ck.matches_regex(df, pattern, flags=re.IGNORECASE)
    tm.assert_frame_equal(df, result)

    result = dc.MatchesRegex(pattern, flags=re.IGNORECASE)(_noop)(df)
    tm.assert_frame_equal(df, result)

    df = pd.DataFrame({'A': ['aa', 'ab', np.nan]})
    pattern = 'a.'

    # should raise because of the nan
    # Each call gets its own `raises` block: inside a shared block, anything
    # after the first raising statement never runs and would go unchecked.
    with pytest.raises(AssertionError):
        ck.matches_regex(df, pattern)
    with pytest.raises(AssertionError):
        dc.MatchesRegex(pattern)(_noop)(df)

    # Supplying a fill value for missing entries makes the same frame pass.
    result = ck.matches_regex(df, pattern, na='az')
    tm.assert_frame_equal(df, result)

    result = dc.MatchesRegex(pattern, na='az')(_noop)(df)
    tm.assert_frame_equal(df, result)


def test_multi_check():
df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
result = ck.multi_check(df,
Expand Down