diff --git a/emm/preprocessing/functions.py b/emm/preprocessing/functions.py index fd9f2bf..b88c1c8 100644 --- a/emm/preprocessing/functions.py +++ b/emm/preprocessing/functions.py @@ -74,9 +74,9 @@ def map_shorthands(name): "remove_extra_space": F.regex_replace(r"""\s+""", " ", simple=True), # Map all the shorthands to the same format (stichting => stg) "map_shorthands": map_shorthands, - # Merge & separated abbreviations by removing & and the spaces between them + # Merge & or / separated abbreviations by removing & or / and the spaces between them "merge_&": F.regex_replace( - r"(\s|^)(\w)\s*&\s*(\w)(\s|$)", r"$1$2$3$4" if use_spark else r"\1\2\3\4", simple=True + r"(\s|^)(\w)\s*[&/]\s*(\w)(\s|$)", r"$1$2$3$4" if use_spark else r"\1\2\3\4", simple=True ), # remove legal form "remove_legal_form": F.run_custom_function( diff --git a/tests/unit/test_name_preprocessing.py b/tests/unit/test_name_preprocessing.py index df5b064..2f7235c 100644 --- a/tests/unit/test_name_preprocessing.py +++ b/tests/unit/test_name_preprocessing.py @@ -52,7 +52,7 @@ ["Tzu-Sun_BV.a;b,c_ä", "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ", "Café"], ["Tzu-Sun_BV.a;b,c_a", "acelnoszzACELNOSZZ", "Cafe"], ), - ("merge_&", ["xyz & abc C&D"], ["xyz & abc CD"]), + ("merge_&", ["xyz & abc C&D", "foobar S/A"], ["xyz & abc CD", "foobar SA"]), ( "preprocess_name", ["Tzu-Sun_BV.a;b,c_ä", "Tzu-Sun_BV morethan1space"],