-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdata_process.py
More file actions
50 lines (37 loc) · 1.38 KB
/
data_process.py
File metadata and controls
50 lines (37 loc) · 1.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#PREPROCESSING FUNCTIONS
def tokenize(x):
    """Fit a fresh Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: List of sentences/strings to be tokenized.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the output of
        ``texts_to_sequences(x)`` and the ``Tokenizer`` that produced it.
    """
    tokenizer = Tokenizer()
    # The vocabulary is learned from the very same texts that get encoded.
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer
def pad(x, length=32):
    """Pad a list of sequences to a common length.

    Args:
        x: List of sequences to pad.
        length: Value forwarded to ``pad_sequences`` as ``maxlen``
            (defaults to 32).

    Returns:
        Whatever ``pad_sequences`` returns for these arguments; the original
        docstring describes it as a padded numpy array of sequences.
    """
    # padding='post' asks Keras to add the padding after each sequence;
    # all other pad_sequences options are left at their defaults.
    padded = pad_sequences(x, padding='post', maxlen=length)
    return padded
def preprocess(x):
    """Tokenize and pad a feature list by chaining ``tokenize()`` and ``pad()``.

    Args:
        x: Feature list (sentences/strings) to preprocess.

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple: the padded output of
        ``pad()`` (using its default length) and the tokenizer returned by
        ``tokenize()``.
    """
    sequences, x_tk = tokenize(x)
    # Labels are not handled here. The previous (disabled) step reshaped y to
    # add a trailing dimension for Keras's sparse_categorical_crossentropy:
    #     preprocess_y = y.reshape(*y.shape, 1)
    return pad(sequences), x_tk
def logits_to_text(logits, tokenizer):
    """Turn logits from a neural network into text using the tokenizer.

    Args:
        logits: Logits from a neural network. The argmax is taken along
            axis 1, so the input must have at least two dimensions; for a
            2-D input each row yields one token index.
        tokenizer: Object exposing a ``word_index`` mapping of word to
            integer id, e.g. a Keras Tokenizer fit on the labels.

    Returns:
        A string with the word for each predicted index, joined by single
        spaces. Index 0 is rendered as ``'<PAD>'``.

    Raises:
        KeyError: If a predicted index is neither 0 nor present in
            ``tokenizer.word_index``.
    """
    # Invert word -> index; avoid naming the loop variable `id` (a builtin).
    index_to_words = {index: word for word, index in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    # Relies on the module-level `import numpy as np`.
    predictions = np.argmax(logits, axis=1)
    return ' '.join(index_to_words[prediction] for prediction in predictions)