# spam.py -- from the GitHub repository Michalis-Managoudis/data_mining_ML (branch: main)
import numpy as np
import pandas as pd
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import keras

# LIMIT is the number of word vectors kept per e-mail (see the padding /
# truncation step further down); it is also referenced by the LSTM input_shape.
LIMIT = 200

# Load data
# Read the corpus, discard the row labelled 1466 and renumber the remaining
# rows from 0. NOTE(review): row 1466 is presumably a malformed/empty record
# in this particular CSV -- confirm against the data file.
df = (
    pd.read_csv('spam_or_not_spam.csv')
    .drop(index=1466)
    .reset_index(drop=True)
)

# Neural network model: one LSTM layer reading the embedded e-mail, two
# dense + dropout stages, and a single sigmoid unit for the spam / not-spam
# decision.
model = keras.Sequential(name="Spam_mails")

# Every sample fed to the network is a sequence of LIMIT word vectors, each
# with 200 components (glove.6B.200d), so the LSTM input is
# (timesteps, features) = (LIMIT, 200). The shape must agree with the array
# built from the e-mails further down in this script.
model.add(keras.layers.LSTM(60, name='LSTM', input_shape=(LIMIT, 200)))
model.add(keras.layers.Dense(32, name='Dense1', activation='relu'))
model.add(keras.layers.Dropout(0.3, name='Dropout1'))
model.add(keras.layers.Dense(16, name='Dense2', activation='relu'))
model.add(keras.layers.Dropout(0.1, name='Dropout2'))
model.add(keras.layers.Dense(1, name='Output', activation="sigmoid"))

# Binary cross-entropy pairs with the sigmoid output; accuracy is reported
# during training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["binary_accuracy"])

# Make list of words
# `words` holds every distinct lower-cased, whitespace-separated token that
# occurs in the e-mails, in order of first appearance. dict.fromkeys
# de-duplicates in a single pass while preserving that order, which avoids
# re-scanning the whole list for every token.
words = list(dict.fromkeys(
    token
    for mail in df.email
    for token in mail.lower().split()
))
# GloVe transform word to vector (word embeddings)
# Each line of the GloVe file is a word followed by the components of its
# vector. Only vectors for words that actually occur in the e-mails are kept.
vocabulary = set(words)  # constant-time membership test for every GloVe line
embeddings_dict = {}
with open("glove.6B.200d.txt", 'r', encoding="utf-8") as f:
    for line1 in f:
        fields = line1.split()
        if not fields:
            continue  # tolerate blank lines instead of failing on fields[0]
        word2 = fields[0]
        if word2 in vocabulary:
            embeddings_dict[word2] = np.asarray(fields[1:], "float32")

# Turn every e-mail into a fixed-size (LIMIT, EMBEDDING_DIM) matrix of word
# vectors and stack them into the network input `x`; `y` holds the labels.
EMBEDDING_DIM = 200  # number of components in one glove.6B.200d vector

# Starting from an all-zero array provides the padding for short e-mails.
# The matrices are written straight into `x` rather than back into df.email:
# element-wise `df.email[i] = ...` is chained assignment, which pandas does
# not guarantee to propagate to `df`.
x = np.zeros((len(df), LIMIT, EMBEDDING_DIM), dtype='float32')
for row, mail in enumerate(df.email):
    # Words without a GloVe vector are skipped; at most the first LIMIT
    # known words of the e-mail are kept.
    vectors = [
        embeddings_dict[token]
        for token in mail.lower().split()
        if token in embeddings_dict
    ][:LIMIT]
    if vectors:
        x[row, :len(vectors)] = vectors

y = np.array(df.label)

# Hold out a quarter of the samples; the same hold-out set is passed to fit()
# as validation data and used for the final scores.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

model.summary()

model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=60,
    batch_size=32,
    verbose=1,
)

# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions.
probabilities = model.predict(x_test)
y_pred = (probabilities > 0.5).astype('int32')

# Scores use average='weighted' and are printed as percentages.
report_template = '\tf1_score: {0:.2f}%,\tprecision: {1:.2f}%,\trecall: {2:.2f}%'
scores = [
    metric(y_test, y_pred, average='weighted') * 100
    for metric in (f1_score, precision_score, recall_score)
]
print(report_template.format(*scores))