# cross_language/MatrixVector.py at master · tranchien2002/cross_language · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pdb
import spacy
import Mongodb as mongo
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from document2json import pickle_json, cosine_json
import _pickle as cPickle
from bson.objectid import ObjectId
# spaCy pipeline loaded once at import time; keyword2matrix() runs keyword
# text through it to obtain per-token vectors.
nlp = spacy.load('en_core_web_md')
# Handle for the "documents" collection, obtained through the project's
# Mongodb helper module (the helper's name is spelled `get_colection` there).
docs_col = mongo.get_colection("documents")

def keyword2matrix(keywords):
    """Convert a comma-separated keyword string into a matrix of token vectors.

    Commas are replaced by spaces, the text is processed by the module-level
    spaCy pipeline ``nlp``, and the ``.vector`` of each resulting token is
    collected and wrapped with ``np.asmatrix``.
    """
    text = keywords.replace(",", " ")
    token_vectors = []
    for token in nlp(text):
        token_vectors.append(token.vector)
    return np.asmatrix(token_vectors)

def similarity(matrix1, matrix2):
    """Return the pairwise cosine similarity between the rows of two matrices.

    Both inputs are converted to plain 2-D ``ndarray`` objects before being
    handed to ``cosine_similarity``.  The previous implementation wrapped them
    in ``np.matrix``, which current scikit-learn input validation rejects
    with a ``TypeError``.  ``np.atleast_2d`` keeps the old treatment of 1-D
    input as a single row.

    Args:
        matrix1: array-like (or ``np.matrix``) whose rows are vectors.
        matrix2: array-like (or ``np.matrix``) whose rows are vectors.

    Returns:
        The array produced by ``cosine_similarity``; entry ``[i, j]`` compares
        row ``i`` of ``matrix1`` with row ``j`` of ``matrix2``.
    """
    rows1 = np.atleast_2d(np.asarray(matrix1))
    rows2 = np.atleast_2d(np.asarray(matrix2))
    return cosine_similarity(rows1, rows2)

def square_sum(matrix):
    """Return the sum of the squared entries of ``matrix``."""
    return np.sum(np.square(np.asmatrix(matrix)))

def square_mean(matrix):
    """Return the mean of the squared entries of ``matrix``."""
    return np.mean(np.square(np.asmatrix(matrix)))

def mean_max_each_row(matrix):
    """Return the mean of the per-row maxima of ``matrix``.

    ``matrix`` must expose ``.shape`` and support row indexing; the largest
    value of every row is taken and those maxima are averaged.
    """
    row_count = matrix.shape[0]
    row_maxima = [np.max(matrix[row,]) for row in range(row_count)]
    return np.mean(row_maxima)

def store_matrix():
    """Build the keyword matrices for all English documents and pickle them.

    Every document returned by ``docs_col.find({"lang": 'english'})`` is
    turned into a ``pickle_json`` record holding its stringified ``_id`` and
    the matrix produced by ``keyword2matrix`` from its ``keyword`` field.
    The list of records is written to ``pickle_en.txt`` in the current
    working directory, replacing any existing file.
    """
    cursor = docs_col.find({"lang": 'english'})
    records = [
        pickle_json(str(doc["_id"]), keyword2matrix(doc["keyword"]))
        for doc in cursor
    ]
    # The context manager closes the file even if pickling fails part-way.
    with open('pickle_en.txt', 'wb') as ouf:
        cPickle.dump(records, ouf)

def get_matrix_vectors():
    """Load and return the records previously written by ``store_matrix``.

    Reads ``pickle_en.txt`` from the current working directory.

    Returns:
        The unpickled object stored in the file.

    Raises:
        FileNotFoundError: if ``pickle_en.txt`` does not exist.
    """
    # NOTE: unpickling can execute arbitrary code; this file must only ever
    # be one produced locally by store_matrix(), never untrusted input.
    # The context manager guarantees the handle is closed (it used to leak).
    with open('pickle_en.txt', 'rb') as inf:
        return cPickle.load(inf)

def get_list_similarity(matrix, matrices):
    """Score every stored matrix against ``matrix``.

    For each record the cosine-similarity table between ``matrix`` and the
    record's ``'matrix'`` is reduced with ``mean_max_each_row`` and wrapped
    with ``cosine_json`` together with the record's ``'id'``.

    Args:
        matrix: query matrix (rows are vectors).
        matrices: iterable of mappings with ``'id'`` and ``'matrix'`` keys.
            Records whose ``'matrix'`` is the empty string are skipped.

    Returns:
        A list with one ``cosine_json`` record per non-skipped input record,
        in input order.
    """
    scored = []
    for item in matrices:
        stored = item['matrix']
        # Only compare against '' when the value really is a string:
        # `ndarray == ''` can yield an element-wise array whose truth value
        # is ambiguous and raises ValueError inside an `if`.
        if isinstance(stored, str) and stored == '':
            continue
        score = mean_max_each_row(similarity(matrix, stored))
        scored.append(cosine_json(item['id'], score))
    return scored


def get_documents(list_docs):
    """Return a new list of ``list_docs`` ordered by descending similarity.

    Each entry's ``'similarity'`` value is converted with ``float`` for the
    comparison; the input sequence itself is left untouched.
    """
    def _score(entry):
        return float(entry['similarity'])

    ranked = list(list_docs)
    ranked.sort(key=_score, reverse=True)
    return ranked

def get_topn_similarity_documents(keywords, n=5):
    """Return the ``n`` stored records most similar to ``keywords``.

    The pickled matrices are loaded, each is scored against the matrix
    built from ``keywords``, and the scored records are ranked by
    descending similarity before the first ``n`` are returned.
    """
    stored = get_matrix_vectors()
    query = keyword2matrix(keywords)
    ranked = get_documents(get_list_similarity(query, stored))
    return ranked[:n]
#
# def get_json_docs(list_ids):
#     list_docs = []
#     for item in list_ids:
#         list_docs.append(docs_col.find_one({"_id": ObjectId(str(item['id']))}))
#     return list_docs

def get_json_docs(list_ids):
    """Fetch the stored documents for a list of scored ids.

    Args:
        list_ids: iterable of mappings with ``'id'`` (convertible to an
            ``ObjectId`` via ``str``) and ``'similarity'`` keys.

    Returns:
        A list of the documents found in ``docs_col``, each with the
        ``'similarity'`` value copied in and the ``'_id'`` field removed.
        Ids with no matching document are skipped.
    """
    list_docs = []
    for item in list_ids:
        doc = docs_col.find_one({"_id": ObjectId(str(item['id']))})
        if doc is None:
            # No document matched this id (e.g. it was deleted after the
            # pickle was built); skip it instead of failing with TypeError
            # on the item assignment below.
            continue
        doc['similarity'] = item['similarity']
        doc.pop('_id', None)
        list_docs.append(doc)
    return list_docs


# Script entry point: running this module directly rebuilds the pickle of
# English keyword matrices.
if __name__ == '__main__':
    # Writes 'pickle_en.txt' via store_matrix().
    store_matrix()
    # Disabled experiment kept for reference: for every Vietnamese article,
    # look up the most similar English documents and append their titles to
    # result_matrix.txt, timing the whole run.
    # articles_vi = docs_col.find({"lang": 'vietnamese'})
    # start_time = time.time()
    # for item in articles_vi:
    #     list_ids = get_topn_similarity_documents(item['keyword'])
    #     list_docs = get_json_docs(list_ids)
    #     list_titles = [item['title'] for item in list_docs]
    #     filename = "result_matrix.txt"
    #     myfile = open(filename, 'a+')
    #     myfile.write(item['title'] + "\n")
    #     for t in list_titles:
    #         myfile.write(t + "\n")
    #     myfile.write("============================" + "\n")
    #     print(list_titles)
    # elapsed_time = time.time() - start_time
    # print(elapsed_time)


    # docs = get_list_similarity(keyword2matrix(articles_vi[6]['keyword']), matrix_vectors)
    # sorted_docs = get_documents(docs)

    # test
    # title = articles_vi[6]['title']
    # pdb.set_trace()