Skip to content

Commit 147d776

Browse files
committed
add retrieval styles
1 parent c56f0fe commit 147d776

1 file changed

Lines changed: 101 additions & 21 deletions

File tree

v2_tweets_to_file.py

Lines changed: 101 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,13 @@
7777
help="Output file format, valid options are either pkl or csv. "
7878
"Default: pkl")
7979

80+
# get retrieval style
81+
ap.add_argument("-s", "--style", required=True, default='iterative',
82+
help="Set retrieval style. Options: bulk or iterative. Bulk collects "
83+
"tweets to one big file. Iterative collects each day separately")
84+
8085
# get wait time
81-
ap.add_argument("-w", "--wait", required=True, default=45,
86+
ap.add_argument("-w", "--wait", required=False, default=45,
8287
help="Set wait time between requests to avoid Twitter rate limits. "
8388
"Default: 45")
8489

@@ -88,6 +93,9 @@
8893
# get waittime
8994
waittime = int(args['wait'])
9095

96+
# get retrieval style
97+
rstyle = args['style']
98+
9199
# define date range function
92100
def daterange(start_date, end_date):
93101
for n in range(int((end_date - start_date).days)):
@@ -447,27 +455,102 @@ def v2parser(tweets, maxcalls):
447455
start_date = args['startdate'].date()
448456
end_date = args['enddate'].date()
449457

450-
# loop through dates
451-
for single_date in daterange(start_date, end_date):
458+
# check which retrieval style
459+
if rstyle == 'iterative':
452460

453-
# set start timestamp
454-
start_ts = single_date
461+
# loop through dates
462+
for single_date in daterange(start_date, end_date):
463+
464+
# set start timestamp
465+
start_ts = single_date
466+
467+
# set end timestamp
468+
end_ts = single_date + timedelta(days=1)
469+
470+
# payload rules for v2 api
471+
rule = gen_request_parameters(query = config['query'],
472+
results_per_call = config['results_per_call'],
473+
start_time = start_ts.isoformat(),
474+
end_time = end_ts.isoformat(),
475+
tweet_fields = tweetfields,
476+
user_fields = userfields,
477+
media_fields = mediafields,
478+
place_fields = placefields,
479+
expansions = expansions,
480+
stringify = False)
481+
482+
# result stream from twitter v2 api
483+
rs = ResultStream(request_parameters = rule,
484+
max_results=100000,
485+
max_pages=1,
486+
max_tweets = config['max_tweets'],
487+
**search_creds)
488+
489+
# indicate which day is getting retrieved
490+
print('[INFO] - Retrieving tweets from ' + str(start_ts))
455491

456-
# set end timestamp
457-
end_ts = single_date + timedelta(days=1)
492+
# get json response to list
493+
tweets = list(rs.stream())
494+
495+
# parse results to dataframe
496+
print('[INFO] - Parsing tweets from ' + str(start_ts))
497+
tweetdf = v2parser(tweets, config['results_per_call'])
498+
499+
# try to order columns semantically
500+
try:
501+
tweetdf = tweetdf[['id', 'author_id', 'created_at', 'reply_settings', 'conversation_id',
502+
'source', 'in_reply_to_user_id', 'text', 'possibly_sensitive',
503+
'lang', 'referenced_tweets', 'referenced_tweets.id',
504+
'referenced_tweets.author_id', 'referenced_tweets.type',
505+
'public_metrics.retweet_count', 'public_metrics.reply_count',
506+
'public_metrics.like_count', 'public_metrics.quote_count',
507+
'entities.mentions', 'entities.urls', 'entities.hashtags',
508+
'entities.annotations', 'attachments.media_keys',
509+
'attachments.media_types', 'user.description', 'user.verified', 'user.id', 'user.protected',
510+
'user.url', 'user.profile_image_url', 'user.location', 'user.name',
511+
'user.created_at', 'user.username', 'user.public_metrics.followers_count',
512+
'user.public_metrics.following_count', 'user.public_metrics.tweet_count',
513+
'user.public_metrics.listed_count', 'user.entities.description.hashtags',
514+
'user.entities.url.urls', 'user.entities.description.mentions',
515+
'user.entities.description.urls', 'geo.place_id', 'geo.coordinates.type',
516+
'geo.coordinates.coordinates', 'geo.coordinates.x', 'geo.coordinates.y',
517+
'geo.full_name', 'geo.name', 'geo.place_type', 'geo.country',
518+
'geo.country_code', 'geo.type', 'geo.bbox', 'geo.centroid',
519+
'geo.centroid.x', 'geo.centroid.y']]
520+
except:
521+
pass
522+
523+
# set up file prefix from config
524+
file_prefix_w_date = config['filename_prefix'] + start_ts.isoformat()
525+
outpickle = file_prefix_w_date + '.pkl'
526+
outcsv = file_prefix_w_date + '.csv'
527+
528+
# save to file
529+
if args['output'] == 'pickle':
530+
# save to pickle
531+
tweetdf.to_pickle(outpickle)
532+
elif args['output'] == 'csv':
533+
# save to csv
534+
tweetdf.to_csv(outcsv, sep=';', encoding='utf-8')
535+
536+
# sleeps to not hit request limit so soon
537+
time.sleep(waittime)
538+
539+
# check if retrieval style is bulk
540+
elif rstyle == 'bulk':
458541

459542
# payload rules for v2 api
460543
rule = gen_request_parameters(query = config['query'],
461-
results_per_call = config['results_per_call'],
462-
start_time = start_ts.isoformat(),
463-
end_time = end_ts.isoformat(),
464-
tweet_fields = tweetfields,
465-
user_fields = userfields,
466-
media_fields = mediafields,
467-
place_fields = placefields,
468-
expansions = expansions,
469-
stringify = False)
470-
544+
results_per_call = config['results_per_call'],
545+
start_time = start_ts.isoformat(),
546+
end_time = end_ts.isoformat(),
547+
tweet_fields = tweetfields,
548+
user_fields = userfields,
549+
media_fields = mediafields,
550+
place_fields = placefields,
551+
expansions = expansions,
552+
stringify = False)
553+
471554
# result stream from twitter v2 api
472555
rs = ResultStream(request_parameters = rule,
473556
max_results=100000,
@@ -477,7 +560,7 @@ def v2parser(tweets, maxcalls):
477560

478561
# indicate which day is getting retrieved
479562
print('[INFO] - Retrieving tweets from ' + str(start_ts))
480-
563+
481564
# get json response to list
482565
tweets = list(rs.stream())
483566

@@ -521,8 +604,5 @@ def v2parser(tweets, maxcalls):
521604
elif args['output'] == 'csv':
522605
# save to csv
523606
tweetdf.to_csv(outcsv, sep=';', encoding='utf-8')
524-
525-
# sleeps to not hit request limit so soon
526-
time.sleep(waittime)
527607

528608
print('[INFO] - ... done!')

0 commit comments

Comments
 (0)