-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode_without_puncstop.py
More file actions
84 lines (67 loc) · 2.36 KB
/
code_without_puncstop.py
File metadata and controls
84 lines (67 loc) · 2.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
__author__ = 'Qiji'
import nltk
from nltk.collocations import *
# ------------------ Task 2: without punctuation and stoplist ------------------
# Reads corpus.txt, punctuation.txt and stoplist.txt from the working
# directory and prints:
#   * the 20 most frequent bigrams that contain neither a punctuation mark
#     nor a stop word, together with their counts;
#   * the 20 best bigrams under PMI, Pearson's chi-squared and Dunning's
#     log-likelihood ratio, using the same exclusion rule.
# NOTE(review): the banner printed below says "Task 1" while this header says
# "Task 2"; the banner text is left untouched -- confirm which label is meant.


def _load_word_set(path):
    """Return the non-empty lines of the file at *path* as a set of strings.

    One entry per line is expected.  The line terminator is removed and
    blank lines are dropped.  A set is returned because the entries are
    only ever used for membership tests.
    """
    with open(path) as handle:
        entries = (line.rstrip('\r\n') for line in handle)
        return set(entry for entry in entries if entry)


print("Task 1: without punctuation and stoplist")
print("Start................")

# Load the corpus; the context manager closes the file even if reading fails.
with open('corpus.txt') as corpus_file:
    raw = corpus_file.read()

# Load the punctuation marks and the stop words.
punctuation = _load_word_set('punctuation.txt')
stoplist = _load_word_set('stoplist.txt')
excluded = punctuation | stoplist

# Tokenize the corpus and count every adjacent token pair.
tokens = nltk.word_tokenize(raw)
fdist_raw = nltk.FreqDist(nltk.bigrams(tokens))

# Keep only the bigrams in which neither token is punctuation or a stop word.
fdist = nltk.FreqDist(dict(
    (pair, count) for pair, count in fdist_raw.items()
    if pair[0] not in excluded and pair[1] not in excluded
))

# Rank explicitly by decreasing count (ties broken by the bigram itself)
# instead of relying on the ordering of FreqDist.keys(), which is not sorted
# in every NLTK release.
ranked = sorted(fdist, key=lambda pair: (-fdist[pair], pair))
top20_coll = ranked[:20]
print("The 20 most frequent collocations and their corresponding frequencies\n")
for pair in top20_coll:
    print("{0} {1}".format(pair, fdist[pair]))
print('\n')

# Build the collocation finder from the token stream so that the association
# measures see the real word and bigram frequencies of the corpus, then drop
# every candidate bigram that contains an excluded token.
bigram_measures = nltk.collocations.BigramAssocMeasures
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_word_filter(lambda word: word in excluded)

# Each report prints its title, the 20 best bigrams under the given
# association measure, and a blank separator.
reports = (
    ("The 20 highest scoring collocations according to point-wise mutual information\n",
     bigram_measures.pmi),
    ("The 20 highest scoring collocations according to Pearson's (Chi-squared) test\n",
     bigram_measures.chi_sq),
    ("The 20 highest scoring collocations according to Dunning's log-likelihood test.\n",
     bigram_measures.likelihood_ratio),
)
for title, measure in reports:
    print(title)
    for pair in finder.nbest(measure, 20):
        print(pair)
    print('\n')

print("----------------------------------------------------------------------------------")
print("end............")