From ae95db1eca9f0e35cdfa4c44559c7ac57b3d6693 Mon Sep 17 00:00:00 2001
From: Imran Ibrahimli <34434302+iibrahimli@users.noreply.github.com>
Date: Sun, 25 Apr 2021 12:39:12 +0400
Subject: [PATCH] Fix spacy component example

Update the example to work with the latest spaCy as installed by `pip install spacy` (version 3.0.6), and fix the failure to segment sentences caused by `doc.char_span` returning `None`. By default, `doc.char_span` uses `alignment_mode="strict"`, which returns `None` whenever a character span does not align with token boundaries, for example when a span in `sents_char_spans` includes trailing spaces. Change `alignment_mode` to `"contract"` so that each span is contracted to the tokens that lie completely within it and the correct spans are returned.
---
 examples/pysbd_as_spacy_component.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/examples/pysbd_as_spacy_component.py b/examples/pysbd_as_spacy_component.py
index bd28ac7..18797c9 100644
--- a/examples/pysbd_as_spacy_component.py
+++ b/examples/pysbd_as_spacy_component.py
@@ -7,21 +7,22 @@
 import pysbd
 import spacy
 
-def pysbd_sentence_boundaries(doc):
-    seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
-    sents_char_spans = seg.segment(doc.text)
-    char_spans = [doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans]
-    start_token_ids = [span[0].idx for span in char_spans if span is not None]
-    for token in doc:
-        token.is_sent_start = True if token.idx in start_token_ids else False
-    return doc
-
 if __name__ == "__main__":
     text = "My name is Jonas E. Smith.          Please turn to p. 55."
     nlp = spacy.blank('en')
+    
+    @nlp.component("sbd")
+    def pysbd_sentence_boundaries(doc):
+        seg = pysbd.Segmenter(language="en", clean=False, char_span=True)
+        sents_char_spans = seg.segment(doc.text)
+        char_spans = [doc.char_span(sent_span.start, sent_span.end, alignment_mode="contract") for sent_span in sents_char_spans]
+        start_token_ids = [span[0].idx for span in char_spans if span is not None]
+        for token in doc:
+            token.is_sent_start = True if token.idx in start_token_ids else False
+        return doc
 
     # add as a spacy pipeline component
-    nlp.add_pipe(pysbd_sentence_boundaries)
+    nlp.add_pipe("sbd", first=True)
 
     doc = nlp(text)
     print('sent_id', 'sentence', sep='\t|\t')