lingpy · arubehn · Jul 17, 2024 · Jul 17, 2024 · Jul 17, 2024 · Aug 8, 2024
diff --git a/src/linse/typedsequence.py b/src/linse/typedsequence.py
@@ -133,17 +133,26 @@ class Morpheme(TypedSequence):  # noqa: N801
     item_separator = ' '
 
     @classmethod
-    def from_string(cls, s):
-        if re.search(r'\s+', s):
-            # We assume that s is a whitespace-separated list of segments:
-            s = s.split()
+    def from_string(cls, s, separator=None):
+        separator = separator or cls.item_separator
+        if separator.strip():  # non-whitespace separator: split on it, ignoring surrounding whitespace
+            separator = r"\s*" + re.escape(separator.strip()) + r"\s*"
+            s = re.split(separator, s)
         else:
-            #
-            # FIXME: do segmentation here!
-            #
-            s = list(s)
+            if re.search(r'\s+', s):
+                # We assume that s is a whitespace-separated list of segments:
+                s = s.split()
+            else:
+                #
+                # FIXME: do segmentation here!
+                #
+                s = list(s)
         return cls(s)
 
+    @classmethod
+    def from_segments(cls, s, separator=None):  # s: segments delimited by separator (default: whitespace)
+        return cls(s.split(separator) if separator else s.split())
+
     def to_text(self):
         return ''.join(self)
 
@@ -160,26 +169,38 @@ class Word(TypedSequence):
     item_separator = ' + '
 
     @classmethod
-    def from_string(cls, s: str, **kw):
+    def from_string(cls, s: str, separator=None, **kw):
+        separator = separator or cls.item_separator
         kw['type'] = Morpheme
         # We assume s is a list of morphemes separated by +:
         return cls(iterable=[
-            Morpheme.from_string(m.strip()) for m in s.split(cls.item_separator.strip())], **kw)
+            Morpheme.from_string(m.strip()) for m in s.split(separator.strip())], **kw)
+
+    @classmethod
+    def from_segments(cls, s, separator=None, **kw):
+        separator = separator or cls.item_separator
+        pattern = r"\s+" + re.escape(separator.strip()) + r"\s+"
+        return cls(iterable=[
+            Morpheme.from_segments(m) for m in re.split(pattern, s)], **kw)
 
     def to_text(self):
         return ''.join(m.to_text() for m in self)
 
+    def reversed_segments(self):
+        return Word([m[::-1] for m in self[::-1]])
+
 
 class Phrase(TypedSequence):
     item_type = Word
     item_separator = ' _ '
 
     @classmethod
-    def from_string(cls, s: str, **kw):  # pragma: no cover
+    def from_string(cls, s: str, separator=None, **kw):  # pragma: no cover
+        separator = separator or cls.item_separator
         kw['type'] = Word
         # We assume s is a list of morphemes separated by +:
         return cls(iterable=[
-            Word.from_string(m.strip()) for m in s.split(cls.item_separator.strip())], **kw)
+            Word.from_string(m.strip()) for m in s.split(separator.strip())], **kw)
 
     @classmethod
     def from_text(cls, text):

diff --git a/tests/test_typedsequence.py b/tests/test_typedsequence.py
@@ -169,6 +169,16 @@ def test_Word():
     s[0] = Morpheme("b c d".split())
     assert str(s[0]) == "b c d"
 
+    # test for monosegmental multi-character morphemes
+    word = Word.from_segments("a b + cd")
+    assert len(word[1]) == 1
+    assert word[1][0] == "cd"
+
+    # test custom separator & whitespace trimming
+    word = Word.from_segments("a b =  c", separator="=")
+    assert len(word) == 2
+    assert str(word) == "a b + c"
+
     word = Word.from_string("a + b + c")
 
     # make sure word can be hashed