-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode_with_puncstop.py
More file actions
55 lines (44 loc) · 1.73 KB
/
code_with_puncstop.py
File metadata and controls
55 lines (44 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
__author__ = 'Qiji'
import nltk
from nltk.collocations import *
# ------------------Task 1: with punctuation and stoplist ---------------------
# Announce the task. Single-argument print(...) emits the same text on
# Python 2 and Python 3.
print("Task 1: with punctuation and stoplist")
print("Start................")

# Load the whole corpus into memory as one string. The `with` block closes
# the file even if read() raises, which the manual open()/close() pair did
# not guarantee.
with open('corpus.txt') as file_corpus:
    raw = file_corpus.read()
# Split the raw text into tokens and pair up adjacent tokens into bigrams.
tokens = nltk.word_tokenize(raw)
bi_gram = nltk.bigrams(tokens)

# Store the bigrams in a hash-map (dict-like FreqDist) and compute their
# frequencies: each key is a bigram, each value is its count.
fdist = nltk.FreqDist(bi_gram)

# Rank the bigrams explicitly by count instead of relying on the order of
# fdist.keys(). Only old NLTK releases returned keys() sorted by decreasing
# frequency; newer releases give no such ordering (and on Python 3 keys()
# cannot be sliced), so slicing keys() would not yield the top 20.
# sorted() is stable, so ties keep whatever order the FreqDist iterates in.
collocation_key = sorted(fdist, key=fdist.get, reverse=True)
top20_coll = collocation_key[:20]

print("The 20 most frequent collocations and their corresponding frequencies\n")
for bigram in top20_coll:
    # Same output as the Python 2 statement `print bigram, count`.
    print("%s %s" % (bigram, fdist[bigram]))
print('\n')
# Outputs the 20 highest scoring collocations according to point-wise mutual information
print("The 20 highest scoring collocations according to point-wise mutual information\n")

# Association measures are looked up on the class itself (pmi, chi_sq and
# likelihood_ratio are all passed to finder.nbest as scoring functions).
bigram_mesures = nltk.collocations.BigramAssocMeasures

# Build the finder from the token stream so that word and bigram counts
# reflect the real corpus frequencies. The previous
# from_documents(fdist) call iterated over the FreqDist's keys and treated
# every distinct bigram as a separate two-word "document", which discards
# how often each bigram actually occurs and skews every score.
# NOTE: this same finder is reused by the chi-squared and log-likelihood
# sections further down the script.
finder = BigramCollocationFinder.from_words(tokens)

top20_pmi = finder.nbest(bigram_mesures.pmi, 20)
for bigram in top20_pmi:
    print(bigram)
print('\n')
# Report the 20 bigrams that score highest under Pearson's chi-squared
# association measure, one bigram per line, followed by a blank separator.
print("The 20 highest scoring collocations according to Pearson's (Chi-squared) test\n")

chi_sq_measure = bigram_mesures.chi_sq
top20_chi_sq = finder.nbest(chi_sq_measure, 20)
for bigram in top20_chi_sq:
    print(bigram)

print('\n')
# Report the 20 bigrams that score highest under Dunning's log-likelihood
# ratio, one bigram per line, followed by a blank separator.
print("The 20 highest scoring collocations according to Dunning's log-likelihood test.\n")

likelihood_measure = bigram_mesures.likelihood_ratio
top20_likelihood = finder.nbest(likelihood_measure, 20)
for bigram in top20_likelihood:
    print(bigram)

print('\n')

# Closing banner marking the end of Task 1's output.
print("----------------------------------------------------------------------------------")
print("end............")