115 changes: 88 additions & 27 deletions README.md
@@ -227,21 +227,26 @@ curl -s localhost:8000/dep -d '{"text":"Pastafarians are smarter than people wit

---

### `POST` `/ent/`
### `POST` `/tag/`

Example request:

```json
{
  "text": "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.",
  "text": "Fed raises interest rates 0.5 percent.",
  "model": "en",
  "include_sentences": false,
  "token_filter": ["text", "start", "end", "lemma", "pos"]
}
```

| Name | Type | Description |
| ------- | ------ | ----------------------------------------------------- |
| `text` | string | text to be parsed |
| `model` | string | identifier string for a model installed on the server |
| Name                | Type    | Description                                                  |
| ------------------- | ------- | ------------------------------------------------------------ |
| `text`              | string  | text to be parsed                                            |
| `model`             | string  | identifier string for a model installed on the server        |
| `include_sentences` | boolean | if `true`, group the returned tokens by sentence             |
| `token_filter`      | array   | token attributes to include in the response (empty = all)    |
| `sentence_filter`   | array   | sentence attributes to include in the response (empty = all) |

Example request using the Python [Requests library](http://docs.python-requests.org/en/master/):

@@ -250,9 +255,9 @@ import json
import requests

url = "http://localhost:8000/tag"
message_text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
message_text = "Fed raises interest rates 0.5 percent."
headers = {'content-type': 'application/json'}
d = {'text': message_text, 'model': 'en'}
d = {'text': message_text, 'model': 'en', 'include_sentences': False, 'token_filter': ['text', 'start', 'end', 'lemma', 'pos']}

response = requests.post(url, data=json.dumps(d), headers=headers)
r = response.json()
@@ -262,36 +267,92 @@ Example response:

```json
[
{ "end": 20, "start": 5, "type": "PERSON" },
{ "end": 67, "start": 61, "type": "ORG" },
{ "end": 75, "start": 71, "type": "DATE" }
{"start": 0, "end": 3, "text": "Fed", "lemma": "fed", "pos": "PROPN"},
{"start": 4, "end": 10, "text": "raises", "lemma": "raise", "pos": "VERB"},
{"start": 11, "end": 19, "text": "interest", "lemma": "interest", "pos": "NOUN"},
{"start": 20, "end": 25, "text": "rates", "lemma": "rate", "pos": "NOUN"},
{"start": 26, "end": 29, "text": "0.5", "lemma": "0.5", "pos": "NUM"},
{"start": 30, "end": 37, "text": "percent", "lemma": "percent", "pos": "NOUN"},
{"start": 37, "end": 38, "text": ".", "lemma": ".", "pos": "PUNCT"}
]
```

| Name | Type | Description |
| ------- | ------- | ------------------------------------------ |
| `end` | integer | character offset the entity ends **after** |
| `start` | integer | character offset the entity starts **on** |
| `type` | string | entity type |
| Name             | Type    | Description                                    |
| ---------------- | ------- | ---------------------------------------------- |
| `end`            | integer | character offset the token ends **after**      |
| `start`          | integer | character offset the token starts **on**       |
| `text`           | string  | verbatim token text                            |
| `orth`           | string  | verbatim token text (`orth_`)                  |
| `lemma`          | string  | base form of the token                         |
| `pos`            | string  | coarse-grained part-of-speech tag              |
| `tag`            | string  | fine-grained part-of-speech tag                |
| `dep`            | string  | syntactic dependency relation                  |
| `ent_type`       | string  | named entity type                              |
| `ent_iob`        | string  | IOB code of the named entity tag               |
| `norm`           | string  | normalised form of the token                   |
| `lower`          | string  | lowercase form of the token                    |
| `shape`          | string  | orthographic shape, e.g. `Xxxx`, `dd`          |
| `prefix`         | string  | leading characters of the token                |
| `suffix`         | string  | trailing characters of the token               |
| `is_alpha`       | boolean | token consists of alphabetic characters        |
| `is_ascii`       | boolean | token consists of ASCII characters             |
| `is_digit`       | boolean | token consists of digits                       |
| `is_lower`       | boolean | token is lowercase                             |
| `is_upper`       | boolean | token is uppercase                             |
| `is_title`       | boolean | token is titlecase                             |
| `is_punct`       | boolean | token is punctuation                           |
| `is_left_punct`  | boolean | token is left punctuation, e.g. `(`            |
| `is_right_punct` | boolean | token is right punctuation, e.g. `)`           |
| `is_space`       | boolean | token consists of whitespace                   |
| `is_bracket`     | boolean | token is a bracket                             |
| `is_quote`       | boolean | token is a quotation mark                      |
| `is_currency`    | boolean | token is a currency symbol                     |
| `like_url`       | boolean | token resembles a URL                          |
| `like_num`       | boolean | token resembles a number                       |
| `like_email`     | boolean | token resembles an email address               |
| `is_oov`         | boolean | token is out of vocabulary                     |
| `is_stop`        | boolean | token is a stop word                           |
| `cluster`        | integer | Brown cluster ID                               |
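
If `token_filter` is omitted or left empty, the endpoint returns every attribute listed above for each token. A minimal request sketch (assuming the service is running locally on port 8000, as in the examples above):

```python
import json

import requests

# An empty token_filter asks for every available token attribute.
url = "http://localhost:8000/tag"
payload = {"text": "Fed raises interest rates 0.5 percent.",
           "model": "en",
           "token_filter": []}
headers = {"content-type": "application/json"}

response = requests.post(url, data=json.dumps(payload), headers=headers)
for token in response.json():
    print(token["text"], token["pos"], token["is_stop"])
```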

```
curl -s localhost:8000/ent -d '{"text":"Pastafarians are smarter than people with Coca Cola bottles.", "model":"en"}'
curl -s localhost:8000/tag -d '{"text":"This a test that should split into sentences! This is the second.", "model":"en", "include_sentences": true, "token_filter": ["text", "start", "end", "lemma", "pos"], "sentence_filter": ["text", "start", "end", "tokens"]}'
```

```json
[
  {
    "end": 12,
    "start": 0,
    "type": "NORP"
  },
  {
    "end": 51,
    "start": 42,
    "type": "ORG"
  }
  {
    "text": "This a test that should split into sentences!",
    "start": 0,
    "end": 45,
    "tokens": [
      {"text": "This", "start": 0, "end": 4, "lemma": "this", "pos": "DET"},
      {"text": "a", "start": 5, "end": 6, "lemma": "a", "pos": "DET"},
      {"text": "test", "start": 7, "end": 11, "lemma": "test", "pos": "NOUN"},
      {"text": "that", "start": 12, "end": 16, "lemma": "that", "pos": "ADJ"},
      {"text": "should", "start": 17, "end": 23, "lemma": "should", "pos": "VERB"},
      {"text": "split", "start": 24, "end": 29, "lemma": "split", "pos": "VERB"},
      {"text": "into", "start": 30, "end": 34, "lemma": "into", "pos": "ADP"},
      {"text": "sentences", "start": 35, "end": 44, "lemma": "sentence", "pos": "NOUN"},
      {"text": "!", "start": 44, "end": 45, "lemma": "!", "pos": "PUNCT"}
    ]
  },
  {
    "text": "This is the second.",
    "start": 46,
    "end": 65,
    "tokens": [
      {"text": "This", "start": 46, "end": 50, "lemma": "this", "pos": "DET"},
      {"text": "is", "start": 51, "end": 53, "lemma": "be", "pos": "VERB"},
      {"text": "the", "start": 54, "end": 57, "lemma": "the", "pos": "DET"},
      {"text": "second", "start": 58, "end": 64, "lemma": "second", "pos": "ADJ"},
      {"text": ".", "start": 64, "end": 65, "lemma": ".", "pos": "PUNCT"}
    ]
  }
]
```

| Name     | Type    | Description                                                |
| -------- | ------- | ---------------------------------------------------------- |
| `end`    | integer | character offset the sentence ends **after**               |
| `start`  | integer | character offset the sentence starts **on**                |
| `text`   | string  | verbatim sentence text                                     |
| `tokens` | array   | token objects in the sentence, filtered by `token_filter`  |
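
When `include_sentences` is `true`, each element of the response is a sentence object and the token dictionaries are nested under its `tokens` key. A short sketch of walking that structure (assuming the same local server as above):

```python
import json

import requests

payload = {"text": "This a test that should split into sentences! This is the second.",
           "model": "en",
           "include_sentences": True,
           "token_filter": ["text", "pos"],
           "sentence_filter": ["text", "tokens"]}
headers = {"content-type": "application/json"}

response = requests.post("http://localhost:8000/tag",
                         data=json.dumps(payload), headers=headers)
for sentence in response.json():
    print(sentence["text"])
    for token in sentence["tokens"]:
        print("  {} ({})".format(token["text"], token["pos"]))
```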

---

98 changes: 98 additions & 0 deletions displacy_service/parse.py
@@ -63,6 +63,104 @@ def to_json(self):
            } for ent in self.doc.ents
        ]

class Tokens(object):
    """Tokenise a text and return per-token attributes, optionally grouped
    by sentence and filtered by attribute name."""

    def __init__(self, nlp, text, include_sentences, token_filter,
                 sentence_filter):
        self.doc = nlp(text)
        self.token_filter = token_filter
        self.sentence_filter = sentence_filter
        self.inc_sents = include_sentences

    def to_json(self):
        if self.inc_sents:
            return [self.sent_to_dict(sent) for sent in self.doc.sents]
        else:
            return [self.token_to_dict(tok) for tok in self.doc]

    def sent_to_dict(self, sent):
        # An empty sentence_filter means every sentence attribute is included.
        all = len(self.sentence_filter) == 0
        attrs = {
            'start': sent.start_char,
            'end': sent.end_char
        }
        if all or 'text' in self.sentence_filter:
            attrs['text'] = sent.text
        if all or 'tokens' in self.sentence_filter:
            attrs['tokens'] = [self.token_to_dict(tok) for tok in sent]
        return attrs

    def token_to_dict(self, tok):
        # An empty token_filter means every token attribute is included.
        all = len(self.token_filter) == 0
        attrs = {
            'start': tok.idx,
            'end': tok.idx + len(tok),
        }
        if all or 'text' in self.token_filter:
            attrs['text'] = tok.text
        if all or 'orth' in self.token_filter:
            attrs['orth'] = tok.orth_
        if all or 'lemma' in self.token_filter:
            attrs['lemma'] = tok.lemma_
        if all or 'pos' in self.token_filter:
            attrs['pos'] = tok.pos_
        if all or 'tag' in self.token_filter:
            attrs['tag'] = tok.tag_
        if all or 'dep' in self.token_filter:
            attrs['dep'] = tok.dep_
        # if all or 'vector' in self.token_filter:
        #     attrs['vector'] = tok.vector.tolist()
        if all or 'ent_type' in self.token_filter:
            attrs['ent_type'] = tok.ent_type_
        if all or 'ent_iob' in self.token_filter:
            attrs['ent_iob'] = tok.ent_iob_
        if all or 'norm' in self.token_filter:
            attrs['norm'] = tok.norm_
        if all or 'lower' in self.token_filter:
            attrs['lower'] = tok.lower_
        if all or 'shape' in self.token_filter:
            attrs['shape'] = tok.shape_
        if all or 'prefix' in self.token_filter:
            attrs['prefix'] = tok.prefix_
        if all or 'suffix' in self.token_filter:
            attrs['suffix'] = tok.suffix_
        if all or 'is_alpha' in self.token_filter:
            attrs['is_alpha'] = tok.is_alpha
        if all or 'is_ascii' in self.token_filter:
            attrs['is_ascii'] = tok.is_ascii
        if all or 'is_digit' in self.token_filter:
            attrs['is_digit'] = tok.is_digit
        if all or 'is_lower' in self.token_filter:
            attrs['is_lower'] = tok.is_lower
        if all or 'is_upper' in self.token_filter:
            attrs['is_upper'] = tok.is_upper
        if all or 'is_title' in self.token_filter:
            attrs['is_title'] = tok.is_title
        if all or 'is_punct' in self.token_filter:
            attrs['is_punct'] = tok.is_punct
        if all or 'is_left_punct' in self.token_filter:
            attrs['is_left_punct'] = tok.is_left_punct
        if all or 'is_right_punct' in self.token_filter:
            attrs['is_right_punct'] = tok.is_right_punct
        if all or 'is_space' in self.token_filter:
            attrs['is_space'] = tok.is_space
        if all or 'is_bracket' in self.token_filter:
            attrs['is_bracket'] = tok.is_bracket
        if all or 'is_quote' in self.token_filter:
            attrs['is_quote'] = tok.is_quote
        if all or 'is_currency' in self.token_filter:
            attrs['is_currency'] = tok.is_currency
        if all or 'like_url' in self.token_filter:
            attrs['like_url'] = tok.like_url
        if all or 'like_num' in self.token_filter:
            attrs['like_num'] = tok.like_num
        if all or 'like_email' in self.token_filter:
            attrs['like_email'] = tok.like_email
        if all or 'is_oov' in self.token_filter:
            attrs['is_oov'] = tok.is_oov
        if all or 'is_stop' in self.token_filter:
            attrs['is_stop'] = tok.is_stop
        if all or 'cluster' in self.token_filter:
            attrs['cluster'] = tok.cluster
        return attrs

class Sentences(object):
    def __init__(self, nlp, text):
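
For reference, the `Tokens` class added above can also be exercised outside the HTTP layer. A minimal sketch, assuming spaCy and its `en` model are installed locally:

```python
import spacy

from displacy_service.parse import Tokens

nlp = spacy.load('en')  # assumes the 'en' model has been downloaded

tokens = Tokens(nlp, "Fed raises interest rates 0.5 percent.",
                include_sentences=False,
                token_filter=['text', 'lemma', 'pos'],
                sentence_filter=[])
print(tokens.to_json())
```
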
3 changes: 3 additions & 0 deletions displacy_service/scripts/app.py
@@ -11,3 +11,6 @@ def run():
print("Loaded all models. Starting HTTP server.")
httpd = simple_server.make_server('0.0.0.0', 8000, APP)
httpd.serve_forever()

if __name__ == '__main__':
run()
30 changes: 27 additions & 3 deletions displacy_service/server.py
@@ -9,8 +9,8 @@
import spacy.about
import spacy.util

from .parse import Parse, Entities, Sentences

from .parse import Parse, Entities, Sentences, Tokens

MODELS = os.getenv("languages", "").split()

@@ -155,6 +155,29 @@ def on_post(self, req, resp):
            resp.status = falcon.HTTP_500


class TaggerResource(object):
    """Returns tokens."""

    def on_post(self, req, resp):
        req_body = req.stream.read()
        json_data = json.loads(req_body.decode('utf8'))
        text = json_data.get('text')
        model_name = json_data.get('model', 'en')
        include_sentences = json_data.get('include_sentences', False)
        token_filter = json_data.get('token_filter', [])
        sentence_filter = json_data.get('sentence_filter', [])
        try:
            model = get_model(model_name)
            tokens = Tokens(model, text, include_sentences, token_filter,
                            sentence_filter)
            resp.body = json.dumps(tokens.to_json(),
                                   indent=2)
            resp.content_type = 'application/json'
            resp.append_header('Access-Control-Allow-Origin', "*")
            resp.status = falcon.HTTP_200
        except Exception as err:
            resp.status = falcon.HTTP_500


class SentsResources(object):
    """Returns sentences"""

@@ -169,16 +192,17 @@ def on_post(self, req, resp):
            sentences = Sentences(model, text)
            resp.body = json.dumps(sentences.to_json(), sort_keys=True,
                                   indent=2)
            resp.content_type = 'text/string'
            resp.content_type = 'application/json'
            resp.append_header('Access-Control-Allow-Origin', "*")
            resp.status = falcon.HTTP_200
        except Exception:
        except Exception as err:
            resp.status = falcon.HTTP_500


APP = falcon.API()
APP.add_route('/dep', DepResource())
APP.add_route('/ent', EntResource())
APP.add_route('/tag', TaggerResource())
APP.add_route('/sents', SentsResources())
APP.add_route('/{model_name}/schema', SchemaResource())
APP.add_route('/models', ModelsResource())