gru2.py
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, Dropout, SpatialDropout1D, concatenate
from keras.layers import GRU, Conv1D, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Path to pre-trained word vectors
EMBEDDING = 'data/glove.840B.300d.txt'
# EMBEDDING = 'data/crawl-300d-2M.vec'
# Paths to the training, test, and sample-submission data
TRAIN_DATA = 'train.csv'
TEST_DATA = 'test.csv'
SAMPLE_SUB = 'sample_submission.csv'
embed_size = 300       # Size of each word vector, fixed by the pre-trained embeddings
max_features = 150000  # Maximum number of unique words to use (i.e. rows in the embedding matrix)
maxlen = 200           # Maximum number of words per comment to use
# Load data into pandas
train = pd.read_csv(TRAIN_DATA)
test = pd.read_csv(TEST_DATA)
submission = pd.read_csv(SAMPLE_SUB)
# Replace missing comment text in the training and test sets with a placeholder token
list_train = train["comment_text"].fillna("_na_").values
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[classes].values
list_test = test["comment_text"].fillna("_na_").values
# Build the vocabulary on the training comments and convert both sets to integer sequences
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(list_train))
tokenized_train = tok.texts_to_sequences(list_train)
tokenized_test = tok.texts_to_sequences(list_test)
# Zero-pad (and truncate) every sequence to exactly maxlen tokens
X_t = pad_sequences(tokenized_train, maxlen=maxlen)
X_te = pad_sequences(tokenized_test, maxlen=maxlen)
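# X_t and X_te are now integer matrices of shape (num_samples, maxlen)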
# Read pre-trained word vectors into a dictionary: word -> 300-d vector
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

with open(EMBEDDING, encoding='utf-8') as f:
    # The length check skips header/malformed lines (e.g. the first line of fastText .vec files)
    embeddings_index = dict(get_coefs(*line.rstrip().split(' ')) for line in f
                            if len(line.rstrip().split(' ')) > embed_size)
# Create the embedding matrix; row i holds the vector for the word with index i
# (Tokenizer indices start at 1, hence the +1 so the highest kept index fits)
word_index = tok.word_index
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
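# Words absent from the pre-trained vectors keep an all-zero row; with
# trainable=False below, the Embedding layer passes those zeros through unchanged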
# Model: frozen pre-trained embeddings -> bidirectional GRU -> two 1-D convolutions -> pooling
inp = Input(shape=(maxlen,))
# input_dim must match embedding_matrix.shape[0] for the initial weights to load
x = Embedding(nb_words, embed_size, weights=[embedding_matrix], trainable=False)(inp)
x = SpatialDropout1D(0.2)(x)
# return_sequences=True keeps the per-timestep outputs that the convolutions consume
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform", activation="relu")(x)
# avg_pool = GlobalAveragePooling1D()(x)
# max_pool = GlobalMaxPooling1D()(x)
# x = concatenate([avg_pool, max_pool])
x = Conv1D(64, kernel_size=6, padding="valid", kernel_initializer="he_uniform", activation="relu")(x)
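# Summarize the sequence with both average- and max-pooling over time and
# concatenate the two views before the dense head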
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
x = Dense(128, activation="relu")(x)
x = Dropout(0.1)(x)
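# One independent sigmoid per class: a comment can carry several toxicity labels at once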
output = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=output)
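# binary_crossentropy treats each of the six sigmoid outputs as its own binary
# problem, which matches the multi-label setup; accuracy is only a rough progress
# signal for labels this imbalanced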
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Stop training when validation loss stops improving, keeping the best weights on disk
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
best_model = 'models/model44.h5'
checkpoint = ModelCheckpoint(best_model, monitor='val_loss', verbose=0, save_best_only=True, mode='auto')
# Fit the model
model.fit(X_t, y, batch_size=1024, epochs=20, callbacks=[es, checkpoint], validation_split=0.1)
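# Reload the checkpointed weights: the final epoch is not necessarily the best one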
model = load_model(best_model)
print('**Predicting on test set**')
pred = model.predict(X_te, batch_size=1024, verbose=1)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = pred
submission.to_csv('preds/submission18.csv', index=False)