Extractive-Summarization/sentencefeatures.py at master · stepstefan/Extractive-Summarization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from stanfordnlp import StanfordNLP
from stanfordcorenlp import StanfordCoreNLP
from nltk.tree import Tree
import numpy as np
from wordfeatures import Wordftrs
# Shared StanfordNLP instance, created once at import time. The Sentenceftrs
# methods below call its word_tokenize / pos / ner / parse methods.
sNLP = StanfordNLP()
# Word-level feature helper instance; not referenced by any code visible in
# this file (NOTE(review): possibly used by importers of this module - confirm).
wf=Wordftrs()

class Sentenceftrs:
    """Sentence-level features for extractive summarization.

    Methods that take a raw ``sentence`` string delegate tokenization, POS
    tagging, NER and parsing to the module-level ``sNLP`` instance.
    ``position``, ``subs`` and ``stopratio`` are placeholders that currently
    always return 0.
    """

    # Tag prefixes counted by ``posratio``, in output order:
    # nouns (N*), verbs (V*), adjectives (J*), adverbs (RB*).
    _POS_PREFIXES = ("N", "V", "J", "RB")

    @staticmethod
    def _ratio(count, total):
        """Return ``count / total``, or 0.0 when ``total`` is zero."""
        return count / total if total else 0.0

    @staticmethod
    def _fraction_matching(tags, predicate):
        """Return the share of ``tags`` whose label (``tag[1]``) satisfies ``predicate``.

        Returns 0.0 for an empty ``tags`` iterable instead of dividing by zero.
        """
        total = 0
        hits = 0
        for tag in tags:
            total += 1
            if predicate(tag[1]):
                hits += 1
        return Sentenceftrs._ratio(hits, total)

    @staticmethod
    def _pos_ratios(tags):
        """Return a 4-element array of noun/verb/adjective/adverb ratios.

        ``tags`` is an iterable of items whose second element is the POS label.
        An empty iterable yields an all-zero array.
        """
        counts = [0] * len(Sentenceftrs._POS_PREFIXES)
        total = 0
        for tag in tags:
            total += 1
            label = tag[1]
            for idx, prefix in enumerate(Sentenceftrs._POS_PREFIXES):
                if label.startswith(prefix):
                    counts[idx] += 1
        if total == 0:
            return np.zeros(len(counts))
        return np.array(counts) / total

    @staticmethod
    def _mean_first_over_length(sentence):
        """Return ``sum(word[0] for word in sentence) / len(sentence) ** 2``.

        That is the mean of the first component of every item, divided once
        more by the number of items. Returns 0.0 for an empty ``sentence``.
        """
        size = len(sentence)
        if size == 0:
            return 0.0
        return sum(word[0] for word in sentence) / size ** 2

    def position(self, slist):
        """Position feature of the sentences (placeholder, always returns 0).

        Intended formula: with M sentences in the document, the feature of the
        i-th sentence is 1 - (i - 1) / (M - 1).
        """
        return 0

    def length(self, sentence):
        """Return the number of tokens ``sNLP.word_tokenize`` yields for ``sentence``."""
        return len(sNLP.word_tokenize(sentence))

    def subs(self, sentence):
        """Sub-sentence count in the parsing tree (placeholder, always returns 0)."""
        return 0

    def depth(self, sentence):
        """Return the depth of the parse tree of ``sentence``.

        The depth is the length of the longest tree position reported by
        ``treepositions()``.
        """
        parsed = sNLP.parse(sentence)
        # NOTE(review): the stanfordcorenlp wrapper returns the parse as a
        # bracketed string; convert it so treepositions() is available. An
        # object that already offers treepositions() is used unchanged.
        if not hasattr(parsed, "treepositions"):
            parsed = Tree.fromstring(parsed)
        return max((len(pos) for pos in parsed.treepositions()), default=0)

    def atf(self, sentence, tf_dic):
        """Mean TF value of the words in the sentence, divided by sentence length.

        NOTE(review): ``tf_dic`` is not consulted; the value is read from
        ``word[0]`` of every item in ``sentence`` - confirm the input schema.
        Returns 0.0 for an empty sentence.
        """
        return self._mean_first_over_length(sentence)

    def aidf(self, sentence, idf_dic):
        """Mean IDF value of the words in the sentence, divided by sentence length.

        NOTE(review): ``idf_dic`` is not consulted; the value is read from
        ``word[0]`` of every item, exactly as in ``atf`` - confirm intent.
        Returns 0.0 for an empty sentence.
        """
        return self._mean_first_over_length(sentence)

    def acf(self, sentence):
        """Mean CF value of the words in the sentence, divided by sentence length.

        NOTE(review): reads ``word[0]`` of every item, exactly as in ``atf``
        and ``aidf`` - confirm intent. Returns 0.0 for an empty sentence.
        """
        return self._mean_first_over_length(sentence)

    def posratio(self, sentence):
        """Return noun, verb, adjective and adverb ratios as a 4-element array.

        Each entry is the number of tokens whose POS tag starts with
        N / V / J / RB respectively, divided by the number of tagged tokens.
        """
        return self._pos_ratios(sNLP.pos(sentence))

    def neration(self, sentence):
        """Return the fraction of tokens whose NER label is not ``"O"``."""
        return self._fraction_matching(
            sNLP.ner(sentence), lambda label: label != "O"
        )

    def numberratio(self, sentence):
        """Return the fraction of tokens whose POS tag is ``"CD"``."""
        return self._fraction_matching(
            sNLP.pos(sentence), lambda label: label == "CD"
        )

    def stopratio(self, sentence):
        """Stopword ratio using the ROUGE stopword list (placeholder, always returns 0)."""
        return 0