From ae95db1eca9f0e35cdfa4c44559c7ac57b3d6693 Mon Sep 17 00:00:00 2001 From: Imran Ibrahimli <34434302+iibrahimli@users.noreply.github.com> Date: Sun, 25 Apr 2021 12:39:12 +0400 Subject: [PATCH] Fix spacy component example Update the example to work with the latest spaCy as installed by `pip install spacy` (version 3.0.6), and fix the failure to segment sentences caused by `doc.char_span` returning `None`. `doc.char_span` uses `alignment_mode="strict"` by default, which returns `None` whenever a character span does not line up exactly with token boundaries, for example when an entry of `sents_char_spans` includes trailing spaces. Change `alignment_mode` to `"contract"` so that such a span is contracted to the tokens that lie completely inside it and a valid span is returned. --- examples/pysbd_as_spacy_component.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/pysbd_as_spacy_component.py b/examples/pysbd_as_spacy_component.py index bd28ac7..18797c9 100644 --- a/examples/pysbd_as_spacy_component.py +++ b/examples/pysbd_as_spacy_component.py @@ -7,21 +7,22 @@ import pysbd import spacy -def pysbd_sentence_boundaries(doc): - seg = pysbd.Segmenter(language="en", clean=False, char_span=True) - sents_char_spans = seg.segment(doc.text) - char_spans = [doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans] - start_token_ids = [span[0].idx for span in char_spans if span is not None] - for token in doc: - token.is_sent_start = True if token.idx in start_token_ids else False - return doc - if __name__ == "__main__": text = "My name is Jonas E. Smith. Please turn to p. 55." 
nlp = spacy.blank('en') + + @nlp.component("sbd") + def pysbd_sentence_boundaries(doc): + seg = pysbd.Segmenter(language="en", clean=False, char_span=True) + sents_char_spans = seg.segment(doc.text) + char_spans = [doc.char_span(sent_span.start, sent_span.end, alignment_mode="contract") for sent_span in sents_char_spans] + start_token_ids = [span[0].idx for span in char_spans if span is not None] + for token in doc: + token.is_sent_start = True if token.idx in start_token_ids else False + return doc # add as a spacy pipeline component - nlp.add_pipe(pysbd_sentence_boundaries) + nlp.add_pipe("sbd", first=True) doc = nlp(text) print('sent_id', 'sentence', sep='\t|\t')