From 4a6c8bd104e65f027a161cd5318155d3a821a1cb Mon Sep 17 00:00:00 2001
From: Andrew Lapp
Date: Wed, 22 May 2024 20:58:44 -0500
Subject: [PATCH] ensure upper() doesn't increase string length

---
 interegular/patterns.py | 13 +++++++++++--
 tests/test_patterns.py  |  8 ++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/interegular/patterns.py b/interegular/patterns.py
index bb09dbb..bc3f3e2 100644
--- a/interegular/patterns.py
+++ b/interegular/patterns.py
@@ -54,6 +54,15 @@ def _combine_flags(base: REFlags, added: REFlags, removed: REFlags):
     return base
 
 
+def _one_char_upper(char: str) -> str:
+    """Convert char to upper, if multiple chars, return original char"""
+    assert len(char) == 1
+    upper_char = char.upper()
+    if len(upper_char) > 1:
+        return char
+    return upper_char
+
+
 @dataclass(frozen=True)
 class _BasePattern(ABC):
     __slots__ = '_alphabet_cache', '_prefix_cache', '_lengths_cache'
@@ -115,7 +124,7 @@ class _CharGroup(_Repeatable):
 
     def _get_alphabet(self, flags: REFlags) -> Alphabet:
         if flags & REFlags.CASE_INSENSITIVE:
-            relevant = {*map(str.lower, self.chars), *map(str.upper, self.chars)}
+            relevant = {*map(str.lower, self.chars), *map(_one_char_upper, self.chars)}
         else:
             relevant = self.chars
         return Alphabet.from_groups(relevant, {anything_else})
@@ -139,7 +148,7 @@ def to_fsm(self, alphabet=None, prefix_postfix=None, flags=REFlags(0)) -> FSM:
         if flags:
             raise Unsupported(flags)
         if insensitive:
-            chars = frozenset({*(c.lower() for c in self.chars), *(c.upper() for c in self.chars)})
+            chars = frozenset({*(c.lower() for c in self.chars), *(_one_char_upper(c) for c in self.chars)})
         else:
             chars = self.chars
 
diff --git a/tests/test_patterns.py b/tests/test_patterns.py
index 74b3dc7..8650ae6 100644
--- a/tests/test_patterns.py
+++ b/tests/test_patterns.py
@@ -97,6 +97,14 @@ def test_flags(self):
         self.parse_unsupported("(?u)")
         self.parse_unsupported("(?x)")
 
+    def test_multichar(self):
+        fsm = parse_pattern("(?i:ß)").to_fsm()
+        strings = set([s[0] for s in fsm.strings()])
+        assert fsm.accepts("ß")
+        assert not fsm.accepts("SS")
+        assert "ß" in strings
+        assert "SS" not in strings
+
 
 if __name__ == '__main__':
     unittest.main()