From 0c281e33cb4f2276df244b44ac2fed0906b81f90 Mon Sep 17 00:00:00 2001 From: Arne Rubehn Date: Wed, 17 Jul 2024 12:16:15 +0200 Subject: [PATCH 1/4] test --- src/linse/typedsequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linse/typedsequence.py b/src/linse/typedsequence.py index b719a65..e66e051 100644 --- a/src/linse/typedsequence.py +++ b/src/linse/typedsequence.py @@ -3,7 +3,7 @@ __all__ = ['TypedSequence', 'Morpheme', 'Word', 'ints', 'floats'] - +# test class TypedSequence(list): """ A list of objects of the same type. From 44dc9f351d7fd4f9d602ec8627e57d0ddb2f9b29 Mon Sep 17 00:00:00 2001 From: Arne Rubehn Date: Wed, 17 Jul 2024 12:17:35 +0200 Subject: [PATCH 2/4] test complete --- src/linse/typedsequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linse/typedsequence.py b/src/linse/typedsequence.py index e66e051..b719a65 100644 --- a/src/linse/typedsequence.py +++ b/src/linse/typedsequence.py @@ -3,7 +3,7 @@ __all__ = ['TypedSequence', 'Morpheme', 'Word', 'ints', 'floats'] -# test + class TypedSequence(list): """ A list of objects of the same type. From fa11cde6a9083f35a85f7ec77c41691b98e81890 Mon Sep 17 00:00:00 2001 From: Arne Rubehn Date: Wed, 17 Jul 2024 13:30:00 +0200 Subject: [PATCH 3/4] implement parsing segmented sequences --- src/linse/typedsequence.py | 42 ++++++++++++++++++++++++++----------- tests/test_typedsequence.py | 10 +++++++++ 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/linse/typedsequence.py b/src/linse/typedsequence.py index b719a65..5190f35 100644 --- a/src/linse/typedsequence.py +++ b/src/linse/typedsequence.py @@ -133,17 +133,26 @@ class Morpheme(TypedSequence): # noqa: N801 item_separator = ' ' @classmethod - def from_string(cls, s): - if re.search(r'\s+', s): - # We assume that s is a whitespace-separated list of segments: - s = s.split() + def from_string(cls, s, separator=None): + separator = separator or cls.item_separator + if separator.strip(): # if the separator is something else than whitespaces in any form + separator = "\s*" + re.escape(separator) + "\s*" + s = re.split(separator, s) else: - # - # FIXME: do segmentation here! - # - s = list(s) + if re.search(r'\s+', s): + # We assume that s is a whitespace-separated list of segments: + s = s.split() + else: + # + # FIXME: do segmentation here! + # + s = list(s) return cls(s) + @classmethod + def from_segments(cls, s, separator=None): + return s.split(separator) if separator else s.split() + def to_text(self): return ''.join(self) @@ -160,11 +169,19 @@ class Word(TypedSequence): item_separator = ' + ' @classmethod - def from_string(cls, s: str, **kw): + def from_string(cls, s: str, separator=None, **kw): + separator = separator or cls.item_separator kw['type'] = Morpheme # We assume s is a list of morphemes separated by +: return cls(iterable=[ - Morpheme.from_string(m.strip()) for m in s.split(cls.item_separator.strip())], **kw) + Morpheme.from_string(m.strip()) for m in s.split(separator.strip())], **kw) + + @classmethod + def from_segments(cls, s, separator=None, **kw): + separator = separator or cls.item_separator + pattern = r"\s+" + re.escape(separator.strip()) + r"\s+" + return cls(iterable=[ + Morpheme.from_segments(m) for m in re.split(pattern, s)], **kw) def to_text(self): return ''.join(m.to_text() for m in self) @@ -175,11 +192,12 @@ class Phrase(TypedSequence): item_separator = ' _ ' @classmethod - def from_string(cls, s: str, **kw): # pragma: no cover + def from_string(cls, s: str, separator=None, **kw): # pragma: no cover + separator = separator or cls.item_separator kw['type'] = Word # We assume s is a list of morphemes separated by +: return cls(iterable=[ - Word.from_string(m.strip()) for m in s.split(cls.item_separator.strip())], **kw) + Word.from_string(m.strip()) for m in s.split(separator.strip())], **kw) @classmethod def from_text(cls, text): diff --git a/tests/test_typedsequence.py b/tests/test_typedsequence.py index e40d50f..d45524b 100644 --- a/tests/test_typedsequence.py +++ b/tests/test_typedsequence.py @@ -169,6 +169,16 @@ def test_Word(): s[0] = Morpheme("b c d".split()) assert str(s[0]) == "b c d" + # test for monosegmental multi-character morphemes + word = Word.from_segments("a b + cd") + assert len(word[1]) == 1 + assert word[1][0] == "cd" + + # test custom separator & whitespace trimming + word = Word.from_segments("a b = c", separator="=") + assert len(word) == 2 + assert str(word) == "a b + c" + word = Word.from_string("a + b + c") # make sure word can be hashed From 93485da16ab135603415f56f0259a3513b633d48 Mon Sep 17 00:00:00 2001 From: Arne Rubehn Date: Thu, 8 Aug 2024 11:00:04 +0200 Subject: [PATCH 4/4] implement Word.reversed_segments --- src/linse/typedsequence.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/linse/typedsequence.py b/src/linse/typedsequence.py index 5190f35..915a94b 100644 --- a/src/linse/typedsequence.py +++ b/src/linse/typedsequence.py @@ -186,6 +186,9 @@ def from_segments(cls, s, separator=None, **kw): def to_text(self): return ''.join(m.to_text() for m in self) + def reversed_segments(self): + return Word([m[::-1] for m in self[::-1]]) + class Phrase(TypedSequence): item_type = Word