-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathslistener.py
More file actions
131 lines (114 loc) · 5.21 KB
/
slistener.py
File metadata and controls
131 lines (114 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# import packages
from tweepy.streaming import StreamListener
import json
import time
import sys
import pandas as pd
from sqlalchemy import create_engine
import pandas as pd
import preprocessor as p
import re
import emoji
import itertools
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def clean_method2(tweet):
    """Normalise the raw text of a tweet ahead of token filtering.

    The steps run in this order:

    1. Strip URLs, mentions, hashtags and numbers with ``preprocessor``.
    2. Turn curly apostrophes into straight ones.
    3. Convert emoji to their textual ``:name:`` form.
    4. Replace every remaining run of non-ASCII characters with one space.
    5. Replace colons with spaces and expand ``n't`` to `` not``.
    6. Cap every run of one repeated character at two ("sooooo" -> "soo").
    7. Collapse all whitespace to single spaces.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet text.
    """
    # Remove URLs, mentions, hashtags and numbers.
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.NUMBER)
    tweet = p.clean(tweet)

    # Curly apostrophes and emoji are non-ASCII, so both have to be handled
    # BEFORE the non-ASCII strip below. Previously the strip ran first, which
    # turned "don’t" into "don t" (never expanded) and deleted emoji instead
    # of converting them.
    tweet = tweet.replace('’', "'")
    tweet = emoji.demojize(tweet)

    # Replace consecutive non-ASCII characters with a space.
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)

    # Colons (including the ones delimiting emoji names) become spaces, then
    # contractions are expanded.
    tweet = tweet.replace(':', ' ')
    tweet = tweet.replace("n't", ' not')

    # Crude spelling fix: keep at most two of any consecutively repeated
    # character.
    tweet = ''.join(''.join(run)[:2] for _, run in itertools.groupby(tweet))

    # Collapse runs of whitespace and trim both ends.
    return ' '.join(tweet.split())
def filter_method2(tweet):
    """Strip punctuation, digits, stop words and very short words from a tweet.

    The text is lower-cased first so that every later check (stop words,
    short-word removal, "aadhaar" spelling normalisation) is
    case-insensitive. Punctuation and digits become spaces, common
    misspellings are rewritten to "aadhaar", standalone one- and two-letter
    words are dropped, and the remainder is tokenised with NLTK and filtered
    against the English stop-word list and a few stray tokens.

    Args:
        tweet: Tweet text, normally the output of ``clean_method2``.

    Returns:
        The surviving lower-case tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    # Left-over tokens that carry no meaning.
    stray_tokens = {'amp', '`', '``', "'", "''", '"', "n't", 'I', 'i', ',00'}

    # Lower-case up front: the stop-word list and the regexes below only
    # match lower-case text, so doing this after filtering (as before) let
    # "The", "IS" or "Aadhar" slip through.
    tweet = tweet.lower()

    # re.escape keeps every punctuation character literal inside the class;
    # unescaped, the "\]" pair was read as an escaped bracket and the
    # backslash itself was never stripped.
    tweet = re.sub('[{}]'.format(re.escape(string.punctuation)), ' ', tweet)
    tweet = re.sub(r'[0-9]', ' ', tweet)

    # Normalise the spelling variants to a single standalone word.
    tweet = re.sub(r'aadhar|adhaar|aadhaar|aadar|aadahar|aadaar|aadarcard|aadaahr|adhar',' aadhaar ',tweet)

    # Drop standalone one- and two-letter words. Lookarounds leave the
    # surrounding whitespace unconsumed, so adjacent short words are all
    # removed in a single pass.
    tweet = re.sub(r'(?<!\S)[a-z]{1,2}(?!\S)', ' ', tweet)

    kept = [
        token
        for token in word_tokenize(tweet)
        if token not in stop_words
        and token not in string.punctuation
        and token not in stray_tokens
        # The tokenizer may split words, so re-check for single letters and
        # a bare "aa" produced after the pass above.
        and not re.fullmatch(r'[a-z]|aa', token)
    ]
    return ' '.join(kept)
# Stream listener that cleans every incoming tweet and stores it in SQLite
class SListener(StreamListener):
    """Tweepy stream listener that writes cleaned tweets to a SQLite table.

    Every status received is reduced to its creation time and its longest
    available text, cleaned with ``clean_method2`` and ``filter_method2``,
    appended to the ``tweets`` table, and rows older than three hours are
    then deleted.

    Attributes set by ``__init__``:
        api: The tweepy API object (set by the base class).
        cnt: Number of statuses processed so far.
        engine: SQLAlchemy engine bound to ``tweets.sqlite``.
    """

    def __init__(self, api=None, fprefix='streamer'):
        """Set up the API handle, the tweet counter and the database engine.

        Args:
            api: Optional tweepy API instance.
            fprefix: Unused; kept so existing callers keep working.
        """
        # Let the base class store the API handle. The previous code did
        # `api or API()` here, but `API` was never imported, so constructing
        # the listener without an api raised NameError.
        super().__init__(api)
        self.cnt = 0
        # Engine for the local database file. When deploying under /app use
        # 'sqlite:///app/tweets.sqlite' instead.
        self.engine = create_engine('sqlite:///tweets.sqlite')

    def on_status(self, status):
        """Clean one streamed status, store it, and prune old rows.

        Args:
            status: Status object whose ``_json`` attribute holds the decoded
                tweet payload.
        """
        # Imported here so the block is self-contained; sqlalchemy is already
        # a dependency of this module.
        from sqlalchemy import text

        self.cnt += 1

        # `_json` is already the decoded payload, so no dumps/loads round
        # trip is needed.
        status_data = status._json

        # Collect every candidate for the full text: the top-level text, the
        # extended text, and the extended text of a retweeted/quoted status.
        candidates = [status_data['text']]
        if 'extended_tweet' in status_data:
            candidates.append(status_data['extended_tweet']['full_text'])
        for key in ('retweeted_status', 'quoted_status'):
            nested = status_data.get(key)
            # `nested` may be absent or None; only look inside a real payload.
            if nested and 'extended_tweet' in nested:
                candidates.append(nested['extended_tweet']['full_text'])
        # Only retain the longest candidate.
        full_text = max(candidates, key=len)

        tweets = {
            'created_at': status_data['created_at'],
            'text': full_text,
        }

        # Progress output for the console.
        print("Writing tweet # {} to the database".format(self.cnt))
        print("Tweet Created at: {}".format(tweets['created_at']))
        print("Tweet Content:{}".format(tweets['text']))
        print()

        # One-row dataframe: parse the timestamp, then clean and filter the
        # text.
        df = pd.DataFrame(tweets, index=[0])
        df['created_at'] = pd.to_datetime(df.created_at)
        df['text'] = df['text'].apply(clean_method2)
        df['text'] = df['text'].apply(filter_method2)

        # Push the tweet into the database.
        df.to_sql('tweets', con=self.engine, if_exists='append')

        # Delete rows older than 10800 seconds (3 hours). text() and
        # engine.begin() work on SQLAlchemy 1.x and 2.x alike: 2.x rejects
        # raw SQL strings and no longer autocommits, and begin() commits
        # when the block exits.
        with self.engine.begin() as con:
            con.execute(text("""
                DELETE FROM tweets
                WHERE created_at in(
                    SELECT created_at
                    FROM(
                        SELECT created_at, strftime('%s','now') - strftime('%s',created_at) AS time_passed
                        From tweets
                        WHERE time_passed >= 10800))"""))