diff --git a/src/linse/typedsequence.py b/src/linse/typedsequence.py
index b719a65..915a94b 100644
--- a/src/linse/typedsequence.py
+++ b/src/linse/typedsequence.py
@@ -133,17 +133,33 @@ class Morpheme(TypedSequence):  # noqa: N801
     item_separator = ' '
 
     @classmethod
-    def from_string(cls, s):
-        if re.search(r'\s+', s):
-            # We assume that s is a whitespace-separated list of segments:
-            s = s.split()
+    def from_string(cls, s, separator=None):
+        """Split `s` into segments and return them as an instance of `cls`.
+
+        A `separator` containing non-whitespace characters is split on literally,
+        swallowing surrounding whitespace; otherwise `s` is split on whitespace
+        or, if it contains none, into single characters.
+        """
+        separator = separator or cls.item_separator
+        if separator.strip():  # separator contains non-whitespace characters
+            separator = r"\s*" + re.escape(separator) + r"\s*"
+            s = re.split(separator, s)
         else:
-            #
-            # FIXME: do segmentation here!
-            #
-            s = list(s)
+            if re.search(r'\s+', s):
+                # We assume that s is a whitespace-separated list of segments:
+                s = s.split()
+            else:
+                #
+                # FIXME: do segmentation here!
+                #
+                s = list(s)
         return cls(s)
 
+    @classmethod
+    def from_segments(cls, s, separator=None):
+        """Create an instance from `s` split on `separator` (default: whitespace)."""
+        return cls(s.split(separator) if separator else s.split())
+
     def to_text(self):
         return ''.join(self)
 
@@ -160,26 +176,40 @@ class Word(TypedSequence):
     item_separator = ' + '
 
     @classmethod
-    def from_string(cls, s: str, **kw):
+    def from_string(cls, s: str, separator=None, **kw):
+        separator = separator or cls.item_separator
         kw['type'] = Morpheme
         # We assume s is a list of morphemes separated by +:
         return cls(iterable=[
-            Morpheme.from_string(m.strip()) for m in s.split(cls.item_separator.strip())], **kw)
+            Morpheme.from_string(m.strip()) for m in s.split(separator.strip())], **kw)
+
+    @classmethod
+    def from_segments(cls, s, separator=None, **kw):
+        """Create a word from morphemes separated by whitespace-delimited `separator`."""
+        separator = separator or cls.item_separator
+        pattern = r"\s+" + re.escape(separator.strip()) + r"\s+"
+        return cls(iterable=[
+            Morpheme.from_segments(m) for m in re.split(pattern, s)], **kw)
 
     def to_text(self):
         return ''.join(m.to_text() for m in self)
 
+    def reversed_segments(self):
+        """Return a Word with the morphemes, and the items in each, in reverse order."""
+        return Word([m[::-1] for m in self[::-1]])
+
 
 class Phrase(TypedSequence):
     item_type = Word
     item_separator = ' _ '
 
     @classmethod
-    def from_string(cls, s: str, **kw):  # pragma: no cover
+    def from_string(cls, s: str, separator=None, **kw):  # pragma: no cover
+        separator = separator or cls.item_separator
         kw['type'] = Word
         # We assume s is a list of morphemes separated by +:
         return cls(iterable=[
-            Word.from_string(m.strip()) for m in s.split(cls.item_separator.strip())], **kw)
+            Word.from_string(m.strip()) for m in s.split(separator.strip())], **kw)
 
     @classmethod
     def from_text(cls, text):
diff --git a/tests/test_typedsequence.py b/tests/test_typedsequence.py
index e40d50f..d45524b 100644
--- a/tests/test_typedsequence.py
+++ b/tests/test_typedsequence.py
@@ -169,6 +169,16 @@ def test_Word():
     s[0] = Morpheme("b c d".split())
     assert str(s[0]) == "b c d"
 
+    # test for monosegmental multi-character morphemes
+    word = Word.from_segments("a b + cd")
+    assert len(word[1]) == 1
+    assert word[1][0] == "cd"
+
+    # test custom separator & whitespace trimming
+    word = Word.from_segments("a b = c", separator="=")
+    assert len(word) == 2
+    assert str(word) == "a b + c"
+
     word = Word.from_string("a + b + c")
 
     # make sure word can be hashed