From 3391a8cf0e446dc7a3899dc529350b615dae9abb Mon Sep 17 00:00:00 2001
From: olivier
Date: Tue, 2 Apr 2019 10:17:03 +0200
Subject: [PATCH 1/2] Added feature/endpoint allowing a genuine POS tagger
 response

---
 README.md                             | 108 +++++++++++++++-----
 displacy_service/parse.py             |  97 ++++++++++++++++++
 displacy_service/server.py            |  29 +++++-
 displacy_service_tests/test_server.py | 142 +++++++++++++++++++-------
 4 files changed, 308 insertions(+), 68 deletions(-)

diff --git a/README.md b/README.md
index f341e3f..1deb6a4 100644
--- a/README.md
+++ b/README.md
@@ -227,21 +227,25 @@ curl -s localhost:8000/dep -d '{"text":"Pastafarians are smarter than people wit
 
 ---
 
-### `POST` `/ent/`
+### `POST` `/tag/`
 
 Example request:
 
 ```json
 {
-    "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.",
-    "model": "en"
+    "text": "Fed raises interest rates 0.5 percent.",
+    "model": "en",
+    "include_sentences": false,
+    "attr_filter": ["text", "start", "end", "lemma", "pos"]
 }
 ```
 
-| Name    | Type   | Description                                            |
-| ------- | ------ | ------------------------------------------------------ |
-| `text`  | string | text to be parsed                                      |
-| `model` | string | identifier string for a model installed on the server  |
+| Name                | Type    | Description                                            |
+| ------------------- | ------- | ------------------------------------------------------ |
+| `text`              | string  | text to be parsed                                      |
+| `model`             | string  | identifier string for a model installed on the server  |
+| `include_sentences` | boolean | include sentence layer                                 |
+| `attr_filter`       | array   | array of token attributes to include in response       |
 
 Example request using the Python [Requests library](http://docs.python-requests.org/en/master/):
 
@@ -250,9 +254,9 @@ import json
 import requests
 
-url = "http://localhost:8000/ent"
-message_text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
+url = "http://localhost:8000/tag"
+message_text = "Fed raises interest rates 0.5 percent."
 headers = {'content-type': 'application/json'}
-d = {'text': message_text, 'model': 'en'}
+d = {'text': message_text, 'model': 'en', 'include_sentences': False, "attr_filter": ['text', 'start', 'end', 'lemma', 'pos']}
 
 response = requests.post(url, data=json.dumps(d), headers=headers)
 r = response.json()
@@ -262,34 +266,84 @@ Example response:
 
 ```json
 [
-  { "end": 20, "start": 5, "type": "PERSON" },
-  { "end": 67, "start": 61, "type": "ORG" },
-  { "end": 75, "start": 71, "type": "DATE" }
+{"start": 0, "end": 3, "text": "Fed", "lemma": "fed", "pos": "PROPN"},
+{"start": 4, "end": 10, "text": "raises", "lemma": "raise", "pos": "VERB"},
+{"start": 11, "end": 19, "text": "interest", "lemma": "interest", "pos": "NOUN"},
+{"start": 20, "end": 25, "text": "rates", "lemma": "rate", "pos": "NOUN"},
+{"start": 26, "end": 29, "text": "0.5", "lemma": "0.5", "pos": "NUM"},
+{"start": 30, "end": 37, "text": "percent", "lemma": "percent", "pos": "NOUN"},
+{"start": 37, "end": 38, "text": ".", "lemma": ".", "pos": "PUNCT"}
 ]
 ```
 
-| Name    | Type    | Description                                |
-| ------- | ------- | ------------------------------------------ |
-| `end`   | integer | character offset the entity ends **after** |
-| `start` | integer | character offset the entity starts **on**  |
-| `type`  | string  | entity type                                |
+| Name             | Type    | Description                                |
+| ---------------- | ------- | ------------------------------------------ |
+| `end`            | integer | character offset the token ends **after**  |
+| `start`          | integer | character offset the token starts **on**   |
+| `text`           | string  | verbatim token text                        |
+| `orth`           | string  | verbatim token text (orthography)          |
+| `lemma`          | string  | base form of the token                     |
+| `pos`            | string  | coarse-grained part-of-speech tag          |
+| `tag`            | string  | fine-grained part-of-speech tag            |
+| `dep`            | string  | syntactic dependency relation              |
+| `ent_type`       | string  | named entity type                          |
+| `ent_iob`        | string  | IOB code of the named entity tag           |
+| `norm`           | string  | normalised form of the token               |
+| `lower`          | string  | lowercase form of the token                |
+| `shape`          | string  | orthographic shape, e.g. `Xxx`, `dd`       |
+| `prefix`         | string  | leading character(s) of the token          |
+| `suffix`         | string  | trailing characters of the token           |
+| `is_alpha`       | boolean | token consists of alphabetic characters    |
+| `is_ascii`       | boolean | token consists of ASCII characters         |
+| `is_digit`       | boolean | token consists of digits                   |
+| `is_lower`       | boolean | token is lowercase                         |
+| `is_upper`       | boolean | token is uppercase                         |
+| `is_title`       | boolean | token is titlecase                         |
+| `is_punct`       | boolean | token is punctuation                       |
+| `is_left_punct`  | boolean | token is left punctuation, e.g. `(`        |
+| `is_right_punct` | boolean | token is right punctuation, e.g. `)`       |
+| `is_space`       | boolean | token consists of whitespace               |
+| `is_bracket`     | boolean | token is a bracket                         |
+| `is_quote`       | boolean | token is a quotation mark                  |
+| `is_currency`    | boolean | token is a currency symbol                 |
+| `like_url`       | boolean | token resembles a URL                      |
+| `like_num`       | boolean | token resembles a number                   |
+| `like_email`     | boolean | token resembles an email address           |
+| `is_oov`         | boolean | token is out of vocabulary                 |
+| `is_stop`        | boolean | token is part of the stop list             |
+| `cluster`        | integer | Brown cluster ID                           |
 
 ```
-curl -s localhost:8000/ent -d '{"text":"Pastafarians are smarter than people with Coca Cola bottles.", "model":"en"}'
+curl -s localhost:8000/tag -d '{"text":"This a test that should split into sentences!
This is the second.", "model":"en", "include_sentences": true, "attr_filter": ["text", "start", "end", "lemma", "pos"]}' ``` ```json [ - { - "end": 12, - "start": 0, - "type": "NORP" - }, - { - "end": 51, - "start": 42, - "type": "ORG" - } +{"text": "This a test that should split into sentences!", + "start": 0, + "end": 45, + "tokens": [ + {"text": "This", "start": 0, "end": 4, "lemma": "this", "pos": "DET"}, + {"text": "a", "start": 5, "end": 6, "lemma": "a", "pos": "DET"}, + {"text": "test", "start": 7, "end": 11, "lemma": "test", "pos": "NOUN"}, + {"text": "that", "start": 12, "end": 16, "lemma": "that", "pos": "ADJ"}, + {"text": "should", "start": 17, "end": 23, "lemma": "should", "pos": "VERB"}, + {"text": "split", "start": 24, "end": 29, "lemma": "split", "pos": "VERB"}, + {"text": "into", "start": 30, "end": 34, "lemma": "into", "pos": "ADP"}, + {"text": "sentences", "start": 35, "end": 44, "lemma": "sentence", "pos": "NOUN"}, + {"text": "!", "start": 44, "end": 45, "lemma": "!", "pos": "PUNCT"} + ]}, +{ + "text": "This is the second.", + "start": 46, + "end": 65, + "tokens": [ + {"text": "This", "start": 46, "end": 50, "lemma": "this", "pos": "DET"}, + {"text": "is", "start": 51, "end": 53, "lemma": "be", "pos": "VERB"}, + {"text": "the", "start": 54, "end": 57, "lemma": "the", "pos": "DET"}, + {"text": "second", "start": 58, "end": 64, "lemma": "second", "pos": "ADJ"}, + {"text": ".", "start": 64, "end": 65, "lemma": ".", "pos": "PUNCT"} + ]} ] ``` diff --git a/displacy_service/parse.py b/displacy_service/parse.py index eab6839..8cfbb44 100644 --- a/displacy_service/parse.py +++ b/displacy_service/parse.py @@ -63,6 +63,103 @@ def to_json(self): } for ent in self.doc.ents ] +class Tokens(object): + def __init__(self, nlp, text, include_sentences, attr_filter): + self.doc = nlp(text) + self.filter = attr_filter + self.inc_sents = include_sentences + + def to_json(self): + if self.inc_sents: + return [ self.sent_to_dict(sent) for sent in self.doc.sents] + else: + return [ self.token_to_dict(tok) for tok in self.doc ] + + def sent_to_dict(self, sent): + all = len(self.filter) == 0 + attrs = { + 'text': sent.text, + 'start': sent.start_char, + 'end': sent.end_char, + "tokens" : [ self.token_to_dict(tok) for tok in sent ] + } + # if all or 'vector' in self.filter: + # attrs['vector'] = sent.vector + return attrs + + def token_to_dict(self, tok): + all = len(self.filter) == 0 + attrs = { + 'start': tok.idx, + 'end': tok.idx + len(tok), + } + if all or 'text' in self.filter: + attrs['text'] = tok.text + if all or 'orth' in self.filter: + attrs['orth'] = tok.orth_ + if all or 'lemma' in self.filter: + attrs['lemma'] = tok.lemma_ + if all or 'pos' in self.filter: + attrs['pos'] = tok.pos_ + if all or 'tag' in self.filter: + attrs['tag'] = tok.tag_ + if all or 'dep' in self.filter: + attrs['dep'] = tok.dep_ + # if all or 'vector' in self.filter: + # attrs['vector'] = tok.vector.tolist() + if all or 'ent_type' in self.filter: + attrs['ent_type'] = tok.ent_type_ + if all or 'ent_iob_' in self.filter: + attrs['ent_iob'] = tok.ent_iob_ + if all or 'norm' in self.filter: + attrs['norm'] = tok.norm_ + if all or 'lower' in self.filter: + attrs['lower'] = tok.lower_ + if all or 'shape' in self.filter: + attrs['shape'] = tok.shape_ + if all or 'prefix' in self.filter: + attrs['prefix'] = tok.prefix_ + if all or 'suffix' in self.filter: + attrs['suffix'] = tok.suffix_ + if all or 'is_alpha' in self.filter: + attrs['is_alpha'] = tok.is_alpha + if all or 'is_ascii' in self.filter: + attrs['is_ascii'] = 
tok.is_ascii + if all or 'is_digit' in self.filter: + attrs['is_digit'] = tok.is_digit + if all or 'is_lower' in self.filter: + attrs['is_lower'] = tok.is_lower + if all or 'is_upper' in self.filter: + attrs['is_upper'] = tok.is_upper + if all or 'is_title' in self.filter: + attrs['is_title'] = tok.is_title + if all or 'is_punct' in self.filter: + attrs['is_punct'] = tok.is_punct + if all or 'is_left_punct' in self.filter: + attrs['is_left_punct'] = tok.is_left_punct + if all or 'is_right_punct' in self.filter: + attrs['is_right_punct'] = tok.is_right_punct + if all or 'is_space' in self.filter: + attrs['is_space'] = tok.is_space + if all or 'is_bracket' in self.filter: + attrs['is_bracket'] = tok.is_bracket + if all or 'is_quote' in self.filter: + attrs['is_quote'] = tok.is_quote + if all or 'is_currency' in self.filter: + attrs['is_currency'] = tok.is_currency + if all or 'like_url' in self.filter: + attrs['like_url'] = tok.like_url + if all or 'like_num' in self.filter: + attrs['like_num'] = tok.like_num + if all or 'like_email' in self.filter: + attrs['like_email'] = tok.like_email + if all or 'is_oov' in self.filter: + attrs['is_oov'] = tok.is_oov + if all or 'is_stop' in self.filter: + attrs['is_stop'] = tok.is_stop + if all or 'cluster' in self.filter: + attrs['cluster'] = tok.cluster + return attrs class Sentences(object): def __init__(self, nlp, text): diff --git a/displacy_service/server.py b/displacy_service/server.py index 6310797..33ef440 100644 --- a/displacy_service/server.py +++ b/displacy_service/server.py @@ -9,8 +9,8 @@ import spacy.about import spacy.util -from .parse import Parse, Entities, Sentences +from .parse import Parse, Entities, Sentences, Tokens MODELS = os.getenv("languages", "").split() @@ -155,6 +155,28 @@ def on_post(self, req, resp): resp.status = falcon.HTTP_500 +class TaggerResource(object): + """Returns tokens.""" + + def on_post(self, req, resp): + req_body = req.stream.read() + json_data = json.loads(req_body.decode('utf8')) + text = json_data.get('text') + model_name = json_data.get('model', 'en') + include_sentences = json_data.get('include_sentences', False) + attr_filter = json_data.get('attr_filter', []) + try: + model = get_model(model_name) + tokens = Tokens(model, text, include_sentences,attr_filter) + resp.body = json.dumps(tokens.to_json(), + indent=2) + resp.content_type = 'application/json' + resp.append_header('Access-Control-Allow-Origin', "*") + resp.status = falcon.HTTP_200 + except Exception as err: + resp.status = falcon.HTTP_500 + + class SentsResources(object): """Returns sentences""" @@ -169,16 +191,17 @@ def on_post(self, req, resp): sentences = Sentences(model, text) resp.body = json.dumps(sentences.to_json(), sort_keys=True, indent=2) - resp.content_type = 'text/string' + resp.content_type = 'application/json' resp.append_header('Access-Control-Allow-Origin', "*") resp.status = falcon.HTTP_200 - except Exception: + except Exception as err: resp.status = falcon.HTTP_500 APP = falcon.API() APP.add_route('/dep', DepResource()) APP.add_route('/ent', EntResource()) +APP.add_route('/tag', TaggerResource()) APP.add_route('/sents', SentsResources()) APP.add_route('/{model_name}/schema', SchemaResource()) APP.add_route('/models', ModelsResource()) diff --git a/displacy_service_tests/test_server.py b/displacy_service_tests/test_server.py index 0e145fb..13edffc 100644 --- a/displacy_service_tests/test_server.py +++ b/displacy_service_tests/test_server.py @@ -5,41 +5,107 @@ class TestAPI(falcon.testing.TestCase): - def __init__(self): - 
self.api = APP - - -def test_deps(): - test_api = TestAPI() - result = test_api.simulate_post( - path='/dep', - body='''{"text": "This is a test.", "model": "en", - "collapse_punctuation": false, - "collapse_phrases": false}''' - ) - result = json.loads(result.text) - words = [w['text'] for w in result['words']] - assert words == ["This", "is", "a", "test", "."] - - -def test_ents(): - test_api = TestAPI() - result = test_api.simulate_post( - path='/ent', - body='''{"text": "What a great company Google is.", - "model": "en"}''') - ents = json.loads(result.text) - assert ents == [ - {"start": 21, "end": 27, "type": "ORG", "text": "Google"}] - - -def test_sents(): - test_api = TestAPI() - sentences = test_api.simulate_post( - path='/sent', - body='''{"text": "This a test that should split into sentences! - This is the second. Is this the third?", "model": "en"}''' - ) - - assert sentences == ['This a test that should split into sentences!', - 'This is the second.', 'Is this the third?'] + def setUp(self): + super(TestAPI, self).setUp() + self.app = APP + + def test_deps(self): + result = self.simulate_post( + path='/dep', + body='''{"text": "This is a test.", "model": "en", + "collapse_punctuation": false, + "collapse_phrases": false}''' + ) + result = json.loads(result.text) + words = [w['text'] for w in result['words']] + assert words == ["This", "is", "a", "test", "."] + + def test_ents(self): + result = self.simulate_post( + path='/ent', + body='''{"text": "What a great company Google is.", + "model": "en"}''') + ents = json.loads(result.text) + assert ents == [ + {"start": 21, "end": 27, "type": "ORG", "text": "Google"}] + + def test_tag_full(self): + toks = self.simulate_post( + path='/tag', + json={ + "text": "Foo", + "model": "en", + }).json + assert toks[0] == {'start': 0, 'end': 3, 'text': 'Foo', 'orth' : 'Foo', 'lemma': 'foo', 'pos': 'PROPN', 'tag': 'NNP', + 'dep': 'ROOT', 'ent_type': '', 'ent_iob': 'O', 'norm': 'foo', + 'lower': 'foo', 'shape': 'Xxx', 'prefix': 'F', 'suffix': 'Foo', 'is_alpha': True, + 'is_ascii': True, 'is_digit': False, 'is_lower': False, 'is_upper': False, 'is_title': True, + 'is_punct': False, 'is_left_punct': False, 'is_right_punct': False, 'is_space': False, + 'is_bracket': False, 'is_quote': False, 'is_currency': False, 'like_url': False, + 'like_num': False, 'like_email': False, 'is_oov': True, 'is_stop': False, 'cluster': 0} + + def test_tag_with_filter(self): + toks = self.simulate_post( + path='/tag', + json={ + "text": "Fed raises interest rates 0.5 percent.", + "model": "en", + "attr_filter": ["text", "start", "end", "lemma", "pos"] + }).json + + assert toks == [{'start': 0, 'end': 3, 'text': 'Fed', 'lemma': 'fed', 'pos': 'PROPN'}, + {'start': 4, 'end': 10, 'text': 'raises', 'lemma': 'raise', 'pos': 'VERB'}, + {'start': 11, 'end': 19, 'text': 'interest', 'lemma': 'interest', 'pos': 'NOUN'}, + {'start': 20, 'end': 25, 'text': 'rates', 'lemma': 'rate', 'pos': 'NOUN'}, + {'start': 26, 'end': 29, 'text': '0.5', 'lemma': '0.5', 'pos': 'NUM'}, + {'start': 30, 'end': 37, 'text': 'percent', 'lemma': 'percent', 'pos': 'NOUN'}, + {'start': 37, 'end': 38, 'text': '.', 'lemma': '.', 'pos': 'PUNCT'}] + + def test_tag_with_sents(self): + sents = self.simulate_post( + path='/tag', + json={ + "text": "This a test that should split into sentences! 
This is the second.", + "model": "en", + "include_sentences": True, + "attr_filter": ["text", "start", "end", "lemma", "pos"] + }).json + assert sents == [ + {'text': 'This a test that should split into sentences!', + 'start': 0, + 'end': 45, + 'tokens': [ + {'text': 'This', 'start': 0, 'end': 4, 'lemma': 'this', 'pos': 'DET'}, + {'text': 'a', 'start': 5, 'end': 6, 'lemma': 'a', 'pos': 'DET'}, + {'text': 'test', 'start': 7, 'end': 11, 'lemma': 'test', 'pos': 'NOUN'}, + {'text': 'that', 'start': 12, 'end': 16, 'lemma': 'that', 'pos': 'ADJ'}, + {'text': 'should', 'start': 17, 'end': 23, 'lemma': 'should', 'pos': 'VERB'}, + {'text': 'split', 'start': 24, 'end': 29, 'lemma': 'split', 'pos': 'VERB'}, + {'text': 'into', 'start': 30, 'end': 34, 'lemma': 'into', 'pos': 'ADP'}, + {'text': 'sentences', 'start': 35, 'end': 44, 'lemma': 'sentence', 'pos': 'NOUN'}, + {'text': '!', 'start': 44, 'end': 45, 'lemma': '!', 'pos': 'PUNCT'} + ]}, + { + 'text': 'This is the second.', + 'start': 46, + 'end': 65, + 'tokens': [ + {'text': 'This', 'start': 46, 'end': 50, 'lemma': 'this', 'pos': 'DET'}, + {'text': 'is', 'start': 51, 'end': 53, 'lemma': 'be', 'pos': 'VERB'}, + {'text': 'the', 'start': 54, 'end': 57, 'lemma': 'the', 'pos': 'DET'}, + {'text': 'second', 'start': 58, 'end': 64, 'lemma': 'second', 'pos': 'ADJ'}, + {'text': '.', 'start': 64, 'end': 65, 'lemma': '.', 'pos': 'PUNCT'} + ]} + ] + + def test_sents(self): + sentences = self.simulate_post( + path='/sents', + json={ + "text": """This a test that should split into sentences! + This is the second. Is this the third?""", + "model": "en" + } + ) + assert sentences.json == ['This a test that should split into sentences!', + 'This is the second.', 'Is this the third?'] From d910cf324bfe9d73277dd7a99e35e7c90b989e62 Mon Sep 17 00:00:00 2001 From: olivier Date: Tue, 2 Apr 2019 11:28:52 +0200 Subject: [PATCH 2/2] Add a new sentence_filter argument --- README.md | 15 +++-- displacy_service/parse.py | 85 ++++++++++++++------------- displacy_service/scripts/app.py | 3 + displacy_service/server.py | 5 +- displacy_service_tests/test_server.py | 5 +- 5 files changed, 63 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 1deb6a4..ceaa999 100644 --- a/README.md +++ b/README.md @@ -236,7 +236,7 @@ Example request: "text": "Fed raises interest rates 0.5 percent.", "model": "en" "include_sentences": false, - "attr_filter": ["text", "start", "end", "lemma", "pos"] + "token_filter": ["text", "start", "end", "lemma", "pos"] } ``` @@ -245,7 +245,8 @@ Example request: | `text` | string | text to be parsed | | `model` | string | identifier string for a model installed on the server | | `include_sentences` | boolean | include sentence layer | -| `attr_filter` | array | array of token attributes to include in response | +| `token_filter` | array | array of token attributes to include in response | +| `sentence_filter` | array | array of sentence attributes to include in response | Example request using the Python [Requests library](http://docs.python-requests.org/en/master/): @@ -256,7 +257,7 @@ import requests url = "http://localhost:8000/ent" message_text = "Fed raises interest rates 0.5 percent." 
headers = {'content-type': 'application/json'} -d = {'text': message_text, 'model': 'en', 'include_sentences': False, "attr_filter": ['text', 'start', 'end', 'lemma', 'pos']} +d = {'text': message_text, 'model': 'en', 'include_sentences': False, "token_filter": ['text', 'start', 'end', 'lemma', 'pos']} response = requests.post(url, data=json.dumps(d), headers=headers) r = response.json() @@ -314,7 +315,7 @@ Example response: | `cluster` | string | | ``` -curl -s localhost:8000/tag -d '{"text":"This a test that should split into sentences! This is the second.", "model":"en", "include_sentences": true, "attr_filter": ["text", "start", "end", "lemma", "pos"]}' +curl -s localhost:8000/tag -d '{"text":"This a test that should split into sentences! This is the second.", "model":"en", "include_sentences": true, "token_filter": ["text", "start", "end", "lemma", "pos"], "sentence_filter": ["text", "start", "end", "tokens"]}' ``` ```json @@ -346,6 +347,12 @@ curl -s localhost:8000/tag -d '{"text":"This a test that should split into sente ]} ] ``` +| Name | Type | Description | +| ---------------- | ------- | -------------------------------------------- | +| `end` | integer | character offset the sentence ends **after** | +| `start` | integer | character offset the sentence starts **on** | +| `text` | string | | +| `tokens` | array | | --- diff --git a/displacy_service/parse.py b/displacy_service/parse.py index 8cfbb44..a4760e4 100644 --- a/displacy_service/parse.py +++ b/displacy_service/parse.py @@ -64,9 +64,10 @@ def to_json(self): ] class Tokens(object): - def __init__(self, nlp, text, include_sentences, attr_filter): + def __init__(self, nlp, text, include_sentences, token_filter, sentence_filter): self.doc = nlp(text) - self.filter = attr_filter + self.token_filter = token_filter + self.sentence_filter = sentence_filter self.inc_sents = include_sentences def to_json(self): @@ -76,88 +77,88 @@ def to_json(self): return [ self.token_to_dict(tok) for tok in self.doc ] def sent_to_dict(self, sent): - all = len(self.filter) == 0 + all = len(self.sentence_filter) == 0 attrs = { - 'text': sent.text, 'start': sent.start_char, - 'end': sent.end_char, - "tokens" : [ self.token_to_dict(tok) for tok in sent ] + 'end': sent.end_char } - # if all or 'vector' in self.filter: - # attrs['vector'] = sent.vector + if all or 'text' in self.sentence_filter: + attrs['text'] = sent.text + if all or 'tokens' in self.sentence_filter: + attrs['tokens'] = [ self.token_to_dict(tok) for tok in sent ] return attrs def token_to_dict(self, tok): - all = len(self.filter) == 0 + all = len(self.token_filter) == 0 attrs = { 'start': tok.idx, 'end': tok.idx + len(tok), } - if all or 'text' in self.filter: + if all or 'text' in self.token_filter: attrs['text'] = tok.text - if all or 'orth' in self.filter: + if all or 'orth' in self.token_filter: attrs['orth'] = tok.orth_ - if all or 'lemma' in self.filter: + if all or 'lemma' in self.token_filter: attrs['lemma'] = tok.lemma_ - if all or 'pos' in self.filter: + if all or 'pos' in self.token_filter: attrs['pos'] = tok.pos_ - if all or 'tag' in self.filter: + if all or 'tag' in self.token_filter: attrs['tag'] = tok.tag_ - if all or 'dep' in self.filter: + if all or 'dep' in self.token_filter: attrs['dep'] = tok.dep_ - # if all or 'vector' in self.filter: + # if all or 'vector' in self.token_filter: # attrs['vector'] = tok.vector.tolist() - if all or 'ent_type' in self.filter: + if all or 'ent_type' in self.token_filter: attrs['ent_type'] = tok.ent_type_ - if all or 'ent_iob_' in 
self.filter: + if all or 'ent_iob_' in self.token_filter: attrs['ent_iob'] = tok.ent_iob_ - if all or 'norm' in self.filter: + if all or 'norm' in self.token_filter: attrs['norm'] = tok.norm_ - if all or 'lower' in self.filter: + if all or 'lower' in self.token_filter: attrs['lower'] = tok.lower_ - if all or 'shape' in self.filter: + if all or 'shape' in self.token_filter: attrs['shape'] = tok.shape_ - if all or 'prefix' in self.filter: + if all or 'prefix' in self.token_filter: attrs['prefix'] = tok.prefix_ - if all or 'suffix' in self.filter: + if all or 'suffix' in self.token_filter: attrs['suffix'] = tok.suffix_ - if all or 'is_alpha' in self.filter: + if all or 'is_alpha' in self.token_filter: attrs['is_alpha'] = tok.is_alpha - if all or 'is_ascii' in self.filter: + if all or 'is_ascii' in self.token_filter: attrs['is_ascii'] = tok.is_ascii - if all or 'is_digit' in self.filter: + if all or 'is_digit' in self.token_filter: attrs['is_digit'] = tok.is_digit - if all or 'is_lower' in self.filter: + if all or 'is_lower' in self.token_filter: attrs['is_lower'] = tok.is_lower - if all or 'is_upper' in self.filter: + if all or 'is_upper' in self.token_filter: attrs['is_upper'] = tok.is_upper - if all or 'is_title' in self.filter: + if all or 'is_title' in self.token_filter: attrs['is_title'] = tok.is_title - if all or 'is_punct' in self.filter: + if all or 'is_punct' in self.token_filter: attrs['is_punct'] = tok.is_punct - if all or 'is_left_punct' in self.filter: + if all or 'is_left_punct' in self.token_filter: attrs['is_left_punct'] = tok.is_left_punct - if all or 'is_right_punct' in self.filter: + if all or 'is_right_punct' in self.token_filter: attrs['is_right_punct'] = tok.is_right_punct - if all or 'is_space' in self.filter: + if all or 'is_space' in self.token_filter: attrs['is_space'] = tok.is_space - if all or 'is_bracket' in self.filter: + if all or 'is_bracket' in self.token_filter: attrs['is_bracket'] = tok.is_bracket - if all or 'is_quote' in self.filter: + if all or 'is_quote' in self.token_filter: attrs['is_quote'] = tok.is_quote - if all or 'is_currency' in self.filter: + if all or 'is_currency' in self.token_filter: attrs['is_currency'] = tok.is_currency - if all or 'like_url' in self.filter: + if all or 'like_url' in self.token_filter: attrs['like_url'] = tok.like_url - if all or 'like_num' in self.filter: + if all or 'like_num' in self.token_filter: attrs['like_num'] = tok.like_num - if all or 'like_email' in self.filter: + if all or 'like_email' in self.token_filter: attrs['like_email'] = tok.like_email - if all or 'is_oov' in self.filter: + if all or 'is_oov' in self.token_filter: attrs['is_oov'] = tok.is_oov - if all or 'is_stop' in self.filter: + if all or 'is_stop' in self.token_filter: attrs['is_stop'] = tok.is_stop - if all or 'cluster' in self.filter: + if all or 'cluster' in self.token_filter: attrs['cluster'] = tok.cluster return attrs diff --git a/displacy_service/scripts/app.py b/displacy_service/scripts/app.py index 6d4371d..4a693dd 100644 --- a/displacy_service/scripts/app.py +++ b/displacy_service/scripts/app.py @@ -11,3 +11,6 @@ def run(): print("Loaded all models. 
Starting HTTP server.") httpd = simple_server.make_server('0.0.0.0', 8000, APP) httpd.serve_forever() + +if __name__ == '__main__': + run() \ No newline at end of file diff --git a/displacy_service/server.py b/displacy_service/server.py index 33ef440..f3e1360 100644 --- a/displacy_service/server.py +++ b/displacy_service/server.py @@ -164,10 +164,11 @@ def on_post(self, req, resp): text = json_data.get('text') model_name = json_data.get('model', 'en') include_sentences = json_data.get('include_sentences', False) - attr_filter = json_data.get('attr_filter', []) + token_filter = json_data.get('token_filter', []) + sentence_filter = json_data.get('sentence_filter', []) try: model = get_model(model_name) - tokens = Tokens(model, text, include_sentences,attr_filter) + tokens = Tokens(model, text, include_sentences,token_filter,sentence_filter) resp.body = json.dumps(tokens.to_json(), indent=2) resp.content_type = 'application/json' diff --git a/displacy_service_tests/test_server.py b/displacy_service_tests/test_server.py index 13edffc..3e97997 100644 --- a/displacy_service_tests/test_server.py +++ b/displacy_service_tests/test_server.py @@ -50,7 +50,7 @@ def test_tag_with_filter(self): json={ "text": "Fed raises interest rates 0.5 percent.", "model": "en", - "attr_filter": ["text", "start", "end", "lemma", "pos"] + "token_filter": ["text", "start", "end", "lemma", "pos"] }).json assert toks == [{'start': 0, 'end': 3, 'text': 'Fed', 'lemma': 'fed', 'pos': 'PROPN'}, @@ -68,7 +68,8 @@ def test_tag_with_sents(self): "text": "This a test that should split into sentences! This is the second.", "model": "en", "include_sentences": True, - "attr_filter": ["text", "start", "end", "lemma", "pos"] + "token_filter": ["text", "start", "end", "lemma", "pos"], + "sentence_filter": ["text", "start", "end", "tokens"], }).json assert sents == [ {'text': 'This a test that should split into sentences!',