-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentencefeatures.py
More file actions
94 lines (80 loc) · 2.82 KB
/
sentencefeatures.py
File metadata and controls
94 lines (80 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from stanfordnlp import StanfordNLP
from stanfordcorenlp import StanfordCoreNLP
from nltk.tree import Tree
import numpy as np
from wordfeatures import Wordftrs
# Shared NLP client used by the Sentenceftrs methods below for tokenizing,
# POS tagging, NER tagging and constituency parsing.
# NOTE(review): constructed at import time; presumably this connects to a
# running CoreNLP backend -- confirm against the stanfordnlp module.
sNLP = StanfordNLP()
# Word-level feature helper; it is not referenced by the code visible in
# this file.
wf=Wordftrs()
class Sentenceftrs:
    '''Sentence-level feature extractors.

    Methods that need linguistic analysis delegate to the module-level
    ``sNLP`` client (tokenizer, POS tagger, NER tagger, parser).
    ``position``, ``subs`` and ``stopratio`` are unimplemented placeholders
    that always return 0.
    '''

    @staticmethod
    def _ratio(count, total):
        '''Return count / total, or 0.0 when total is zero.'''
        return count / total if total else 0.0

    @staticmethod
    def _mean_over_length(sentence):
        '''Return the sum of item[0] over sentence, divided by len(sentence)**2.

        That is the mean of the per-item values divided once more by the
        sentence length. An empty sentence yields 0.0 instead of raising
        ZeroDivisionError.
        '''
        size = len(sentence)
        if not size:
            return 0.0
        return sum(item[0] for item in sentence) / size ** 2

    def position(self, slist):
        '''The position of the sentences.

        Suppose there are M sentences in the document; for the i-th
        sentence, the position feature is computed as 1-(i-1)/(M-1).

        Not implemented yet: always returns 0.
        '''
        return 0

    def length(self, sentence):
        '''The number of tokens sNLP.word_tokenize produces for the sentence.'''
        return len(sNLP.word_tokenize(sentence))

    def subs(self, sentence):
        '''Sub-sentence count in the parsing tree.

        Not implemented yet: always returns 0.
        '''
        return 0

    def depth(self, sentence):
        '''The root depth of the parsing tree.

        Computed as the length of the longest tree position reported by the
        parse result.
        '''
        tree = sNLP.parse(sentence)
        # NOTE(review): relies on the parse result exposing
        # treepositions() (as nltk.tree.Tree does) -- confirm that
        # sNLP.parse returns a Tree and not a bracketed string.
        return max((len(pos) for pos in tree.treepositions()), default=0)

    def atf(self, sentence, tf_dic):
        '''The mean TF value of the words in the sentence, divided by the
        sentence length.

        sentence: a sized iterable of items; item[0] is summed.
            Presumably item[0] is the word's TF value -- TODO confirm
            against the caller.
        tf_dic: currently unused; kept for interface compatibility.

        Returns 0.0 for an empty sentence.
        '''
        return self._mean_over_length(sentence)

    def aidf(self, sentence, idf_dic):
        '''The mean IDF value of the words in the sentence, divided by the
        sentence length.

        sentence: a sized iterable of items; item[0] is summed.
            Presumably item[0] is the word's IDF value -- TODO confirm
            against the caller.
        idf_dic: currently unused; kept for interface compatibility.

        Returns 0.0 for an empty sentence.
        '''
        return self._mean_over_length(sentence)

    def acf(self, sentence):
        '''The mean CF value of the words in the sentence, divided by the
        sentence length.

        sentence: a sized iterable of items; item[0] is summed.
            Presumably item[0] is the word's CF value -- TODO confirm
            against the caller.

        Returns 0.0 for an empty sentence.
        '''
        return self._mean_over_length(sentence)

    def posratio(self, sentence):
        '''The number of nouns, verbs, adjectives and adverbs in the
        sentence, each divided by the sentence length.

        Returns a numpy array [noun, verb, adjective, adverb] of ratios,
        all zeros when the tagger returns no tokens.
        '''
        tags = list(sNLP.pos(sentence))
        total = len(tags)
        if not total:
            return np.zeros(4)
        # Tag prefixes: N* nouns, V* verbs, J* adjectives, RB* adverbs.
        # The adverb test needs a two-character prefix; comparing a
        # one-character slice with "RB" can never match.
        prefixes = ("N", "V", "J", "RB")
        counts = [
            sum(1 for tag in tags if tag[1].startswith(prefix))
            for prefix in prefixes
        ]
        return np.array([count / total for count in counts])

    def neration(self, sentence):
        '''The number of named entities, divided by the sentence length.

        A token counts as a named entity when its NER label is not "O".
        Returns 0.0 when the tagger returns no tokens.
        '''
        tags = list(sNLP.ner(sentence))
        entities = sum(1 for tag in tags if tag[1] != "O")
        return self._ratio(entities, len(tags))

    def numberratio(self, sentence):
        '''The number of digits, divided by the sentence length.

        A token counts as a number when its POS tag is "CD".
        Returns 0.0 when the tagger returns no tokens.
        '''
        tags = list(sNLP.pos(sentence))
        numbers = sum(1 for tag in tags if tag[1] == "CD")
        return self._ratio(numbers, len(tags))

    def stopratio(self, sentence):
        '''The number of stopwords, divided by the sentence length. Use the
        stopword list of ROUGE.

        Not implemented yet: always returns 0.
        '''
        return 0