# EmailClassificationModeling/model_visualization.py at main · eun2u/EmailClassificationModeling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib as mpl
from matplotlib import rc
import matplotlib.pyplot as plt
import pandas as pd
from gensim.models import KeyedVectors
# Global matplotlib font configuration applied to every plot drawn by this script.
mpl.rc('font', family='AppleGothic') # font family chosen so Korean text renders in the plots
# Workaround for the minus sign being rendered as a broken glyph in plots.
mpl.rcParams['axes.unicode_minus'] = False
# Alternative font setting, currently disabled:
#plt.rc('font', family='D2Coding')

def show_tsne():
    """Scatter-plot the selected word vectors after a 2-D t-SNE embedding.

    Reads the module-level ``X_show`` (vectors to embed) and ``vocab_show``
    (the label for each row of ``X_show``), reduces the vectors to two
    t-SNE components, draws one point per word and annotates each point with
    its word. The figure is displayed once with ``plt.show()``; nothing is
    returned.
    """
    tsne = TSNE(n_components=2)
    embedded = tsne.fit_transform(X_show)
    df = pd.DataFrame(embedded, index=vocab_show, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(30, 20)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(df['x'], df['y'])

    # Label every point first; itertuples yields (index, x, y) per row.
    for word, x, y in df.itertuples():
        ax.annotate(word, (x, y), fontsize=10)

    # Axis labels and the (blocking) show call belong after the loop so the
    # figure is displayed exactly once, with all words annotated.
    ax.set_xlabel("t-SNE feature 0")
    ax.set_ylabel("t-SNE feature 1")
    plt.show()

def show_pca():
    """Draw the selected words as text at their 2-D PCA coordinates.

    Reads the module-level ``X_show`` (vectors to project) and ``vocab_show``
    (the label for each row of ``X_show``). A two-component PCA is fitted on
    ``X_show`` and the same data is projected onto it; each word is then
    written in bold at its projected position. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    # Fit the PCA model, then project the data onto the first two components.
    reducer = PCA(n_components=2)
    projected = reducer.fit(X_show).transform(X_show)
    first_axis = projected[:, 0]
    second_axis = projected[:, 1]

    plt.figure(figsize=(30, 20))
    # Set the axis limits to the extent of the projected points.
    plt.xlim(first_axis.min(), first_axis.max())
    plt.ylim(second_axis.min(), second_axis.max())

    for row, (px, py) in enumerate(projected):
        plt.text(px, py, str(vocab_show[row]), fontdict={'weight': 'bold', 'size': 9})

    plt.xlabel("first label")
    plt.ylabel("second label")
    plt.show()


# Path of the word2vec-format vector file to visualize.
model_name = './training_data/vector_clean_data_final_ver2_iter1000'
model = KeyedVectors.load_word2vec_format(model_name)

# gensim >= 4 exposes the vocabulary as the `index_to_key` list; gensim 3.x
# only has the `vocab` dict. The old `model.wv.vocab` spelling relied on a
# deprecated `.wv` self-alias and fails on gensim 4, so pick whichever
# attribute this gensim version provides.
vocab = getattr(model, 'index_to_key', None)
if vocab is None:
    vocab = model.vocab
vocab = list(vocab)
X = model[vocab]

# Visualize only the first `sz` words.
sz = 1500
X_show = X[:sz, :]
vocab_show = vocab[:sz]

# Pick the projection to display; the t-SNE variant is currently disabled.
#show_tsne()
show_pca()