KG-graph/textrank.py at main · kernel-loophole/KG-graph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import sys
import pandas as pd
from collections import defaultdict

class TextrankGraph:
    '''textrank graph'''
    def __init__(self):
        self.graph = defaultdict(list)
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 1000 # iteration steps
        self.weight_updates = defaultdict(list)

    def addEdge(self, start, end, weight):
        """Add edge between node"""
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))
    def rank_2(self,graph):
        weight_default = 1.0 / (len(graph) or 1.0)
        nodeweight_dict = defaultdict(float)
        outsum_node_dict = defaultdict(float)

        for node, out_edge in graph.items():
            nodeweight_dict[node] = weight_default
            outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0)

        sorted_keys = sorted(graph.keys())
        step_dict = [0]

        weight_updates_df = pd.DataFrame()

        for step in range(1, self.steps):
            for node in sorted_keys:
                s = 0
                for e in graph[node]:
                    s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]]
                new_weight = (1 - self.d) + self.d * s
                nodeweight_dict[node] = new_weight
                self.weight_updates[node].append(new_weight)

            step_dict.append(sum(nodeweight_dict.values()))

            if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff:
                break

            weight_updates_df = pd.concat([weight_updates_df, pd.DataFrame(nodeweight_dict, index=[step])])

        min_rank, max_rank = 0, 0

        for w in nodeweight_dict.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        for n, w in nodeweight_dict.items():
            nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0)

        # result_weights = {node: nodeweight_dict[node] for node in nodes_list}
        return nodeweight_dict

    def rank(self):
        weight_default = 1.0 / (len(self.graph) or 1.0)
        nodeweight_dict = defaultdict(float)
        outsum_node_dict = defaultdict(float)

        for node, out_edge in self.graph.items():
            nodeweight_dict[node] = weight_default
            outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0)

        sorted_keys = sorted(self.graph.keys())
        step_dict = [0]

        weight_updates_df = pd.DataFrame()  # Initialize an empty DataFrame

        for step in range(1, self.steps):
            for node in sorted_keys:
                s = 0

                for e in self.graph[node]:
                    s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]]

                new_weight = (1 - self.d) + self.d * s
                nodeweight_dict[node] = new_weight
                self.weight_updates[node].append(new_weight)

            step_dict.append(sum(nodeweight_dict.values()))

            if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff:
                break
                    # Append the current nodeweight_dict to the DataFrame
            weight_updates_df = pd.concat([weight_updates_df, pd.DataFrame(nodeweight_dict, index=[step])])

        min_rank, max_rank = 0, 0

        for w in nodeweight_dict.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w

        for n, w in nodeweight_dict.items():
            nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0)
        # print(nodeweight_dict)
        # print(weight_updates_df)
        # print(nodeweight_dict)
        return nodeweight_dict


class TextRank:
    """Extract keywords based on textrank graph algorithm"""
    def __init__(self):
        self.candi_pos = ['NOUN', 'PROPN', 'VERB']
        self.stop_pos = ['NUM', 'ADV']
        self.span = 10

    def extract_keywords(self, word_list, num_keywords):
        # print(word_list,num_keywords)
        g = TextrankGraph()
        cm = defaultdict(int)
        #The pairs are created as long as the part-of-speech
        # tag of the subsequent word is in the allowed list (self.candi_pos)
        for i, word in enumerate(word_list): # word_list = [['previous', 'ADJ'], ['rumor', 'NOUN']]

            if word[1] in self.candi_pos and len(word[0]) > 1: # word = ['previous', 'ADJ']
                for j in range(i + 1, i + self.span):
                    if j >= len(word_list):
                        break
                    if word_list[j][1] not in self.candi_pos or word_list[j][1] in self.stop_pos or len(word_list[j][0]) < 2:
                        continue
                    # print(word[0],word_list[j][0])
                    pair = tuple((word[0], word_list[j][0]))
                    cm[(pair)] +=  1


        # cm = {('was', 'prison'): 1, ('become', 'prison'): 1}

        # print("++++",self.candi_pos)
        # print(word_list)
        # print(cm)
        for terms, w in cm.items():
            # print(terms[0],terms[1],w)
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        print(nodes_rank)
        nodes_rank = sorted(nodes_rank.items(), key=lambda asd:asd[1], reverse=True)
        # print(nodes_rank)
        # print(nodes_rank[:num_keywords])
        return nodes_rank[:num_keywords]