-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_npr_ner_data.py
More file actions
79 lines (59 loc) · 2.78 KB
/
create_npr_ner_data.py
File metadata and controls
79 lines (59 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# read in the revision data (just used a random article dataset from a different course I had taken)
import pickle
import random
import re
import pandas as pd
import en_core_web_lg
# Load the NPR article dataset (a random article set from a different course).
npr_df = pd.read_csv("npr.csv")
# Peek at the first rows (a no-op in a script; useful when run in a notebook).
npr_df.head()
# Load the large English spaCy model; used for sentence splitting here and
# entity prediction later in the script.
nlp = en_core_web_lg.load()
revision_texts = []
# Convert the articles to spaCy objects to better identify the sentences.
# Unneeded pipeline components are disabled for speed. # takes ~ 4 minutes
for doc in nlp.pipe(npr_df["Article"][:6000], batch_size=30, disable=["tagger", "ner"]):
    for sentence in doc.sents:
        # Keep only medium-length sentences (40-80 characters).
        if 40 < len(sentence.text) < 80:
            # Some sentences had excessive whitespace between words, so collapse
            # runs of whitespace to a single space. str.split() with no argument
            # splits on any whitespace run and drops empty edge tokens, which
            # fixes two issues with the original re.split("\s+", ...):
            #   1. "\s" in a non-raw string is an invalid escape sequence
            #      (DeprecationWarning, a SyntaxError in future Pythons);
            #   2. leading/trailing whitespace produced empty split fields and
            #      therefore a stray leading/trailing space after the join.
            revision_texts.append(" ".join(sentence.text.split()))
revisions = []
# Run the pretrained pipeline over the cleaned sentences and collect training
# examples in spaCy's format: (text, {"entities": [(start, end, label), ...]}).
# Only NER output is needed here, so tagger and parser are disabled.
for parsed in nlp.pipe(revision_texts, batch_size=50, disable=["tagger", "parser"]):
    spans = [(ent.start_char, ent.end_char, ent.label_) for ent in parsed.ents]
    # Skip sentences where the model found nothing to label.
    if spans:
        revisions.append((parsed.text, {"entities": spans}))
# Accumulators for the train/test split built below.
TRAIN_REVISION_DATA = []
TEST_REVISION_DATA = []
# Per-label entity counts for each split (label -> number of occurrences),
# used to balance the distribution of entity types across examples.
TRAIN_ENTITY_COUNTER = {}
TEST_ENTITY_COUNTER = {}
# Soft cap on how many training examples may carry a given entity label.
# This will help distribute the entities (i.e. we don't want 1000 PERSON entities, but only 80 ORG entities)
REVISION_SENTENCE_SOFT_LIMIT = 100
def increment_revision_counters(entity_counter, entities):
    """Increment per-label occurrence counts for a batch of entities.

    Parameters
    ----------
    entity_counter : dict
        Mapping of entity label -> count; mutated in place.
    entities : iterable of tuple
        ``(start_char, end_char, label)`` triples as stored in the
        revision data; only the label (third element) is used.
    """
    for _, _, label in entities:
        # dict.get with a default of 0 replaces the explicit
        # membership-check/else branch of the original.
        entity_counter[label] = entity_counter.get(label, 0) + 1
# NOTE(review): the shuffle is unseeded, so the train/test split differs on
# every run — confirm whether reproducibility matters here.
random.shuffle(revisions)
for example in revisions:
    # Entity spans recorded for this sentence: (start_char, end_char, label).
    spans = example[1]["entities"]
    # Simple hack to keep the spaCy entity distribution from getting
    # one-sided: each entity votes -1 if its label is already over the soft
    # limit in the training set, +1 otherwise. Ties (vote >= 0, including
    # sentences with no entities) go to the training set.
    vote = sum(
        -1
        if label in TRAIN_ENTITY_COUNTER
        and TRAIN_ENTITY_COUNTER[label] > REVISION_SENTENCE_SOFT_LIMIT
        else 1
        for _, _, label in spans
    )
    if vote >= 0:
        TRAIN_REVISION_DATA.append(example)
        increment_revision_counters(TRAIN_ENTITY_COUNTER, spans)
    else:
        TEST_REVISION_DATA.append(example)
        increment_revision_counters(TEST_ENTITY_COUNTER, spans)
# Persist the training split.
# NOTE(review): TEST_REVISION_DATA is built but never written out — confirm
# that discarding the test split at the end of the script is intentional.
with open('revision_data.pkl', 'wb') as f:
    pickle.dump(TRAIN_REVISION_DATA, f)