forked from neuralinfo/Assignments
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAssignment3_2_2.py
More file actions
executable file
·80 lines (61 loc) · 1.87 KB
/
Assignment3_2_2.py
File metadata and controls
executable file
·80 lines (61 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# David Paculdo
# W205
# Assignment 3
import os
import pymongo
import string
import sys
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from nltk.tokenize import RegexpTokenizer
# Tokenizer used to lightly clean up tweet text. The pattern matches, in
# order: runs of word characters, dollar amounts (e.g. $9.99), or any other
# run of non-whitespace. Raw string fixes the invalid escape sequences
# (\w, \$, \d, \S) that warn/error on modern Python.
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
# MongoDB database and collection names used throughout the script.
db_name = "db_streamT"
coll1 = "tweets"       # raw tweets collected by the streaming listener
coll2 = "word_db"      # per-word occurrence counts
coll3 = "lexical_db"   # corpus-level lexical-diversity summary

# Connect to the local MongoDB instance and grab the tweets collection.
# NOTE(review): pymongo creates databases/collections lazily, so this block
# rarely raises here; kept as a best-effort guard against client errors.
conn = pymongo.MongoClient()
try:
    db = conn[db_name]
    collection = db[coll1]
except Exception as e:  # was a bare except:, which also swallows SystemExit
    print("dbname or collection does not exist")
    sys.exit(1)  # non-zero exit status signals failure (original exited 0)
# --- Lexical diversity of the collected tweets ---
total_words = 0   # running token count across all tweets
word_count = {}   # token -> number of occurrences

for data in collection.find():
    # Drop non-printable characters (emoji, control codes) from the tweet.
    # join() works on both Python 2 and 3; the original used filter(), which
    # returns an iterator on Python 3 and breaks the .replace() call below.
    text = ''.join(ch for ch in data['text'] if ch in string.printable)
    # str.lower() replaces string.lower(), which was removed in Python 3.
    text = text.replace("\n", "").lower()
    tokens = tokenizer.tokenize(text)
    total_words += len(tokens)
    for word in tokens:
        # dict.get() avoids the explicit membership test / two branches
        word_count[word] = word_count.get(word, 0) + 1
# --- Compute the lexical diversity and write the summary to a text file ---
# word_count's keys are already unique, so len() suffices (the original
# wrapped the dict in set() redundantly).
unique_words = len(word_count)
# Guard against an empty corpus to avoid ZeroDivisionError.
lexical_diversity = float(unique_words) / total_words if total_words else 0.0
# 'with' guarantees the file is flushed and closed (the original never
# closed the handle).
with open("lexical_diversity.txt", "w") as my_file:
    my_file.write("total number of words: %i\n" % (total_words))
    my_file.write("total number of unique words: %i\n" % (unique_words))
    my_file.write("lexical diversity of corpus: %f\n" % (lexical_diversity))
# --- Persist word counts and the lexical-diversity summary to MongoDB ---
# items() replaces the Python-2-only iteritems(); insert_many()/insert_one()
# replace the insert() method deprecated since pymongo 3.0, and a single
# batch insert is far cheaper than one round-trip per word.
words = db[coll2]
if word_count:  # insert_many raises on an empty document list
    words.insert_many([{"word": w, "count": c} for w, c in word_count.items()])
lexical = db[coll3]
lexical.insert_one({"total_words": total_words,
                    "unique_words": unique_words,
                    "lexical_diversity": lexical_diversity})