                 help="Output file format, valid options are either pkl or csv. "
                      "Default: pkl")
 
+# get retrieval style
+ap.add_argument("-s", "--style", required=False, default='iterative',
+                help="Set retrieval style. Options: bulk or iterative. Bulk collects "
+                     "tweets into one big file, iterative collects each day separately. "
+                     "Default: iterative")
+
 # get wait time
-ap.add_argument("-w", "--wait", required=True, default=45,
+ap.add_argument("-w", "--wait", required=False, default=45,
                 help="Set wait time between requests to avoid Twitter rate limits. "
                      "Default: 45")
 
 # get waittime
 waittime = int(args['wait'])
 
+# get retrieval style
+rstyle = args['style']
+
 # define date range function
 def daterange(start_date, end_date):
     for n in range(int((end_date - start_date).days)):
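
The yield statement of daterange falls outside this hunk. For reference, a minimal sketch of the complete helper and its use, assuming the conventional day-stepping body implied by the loop header above:

    from datetime import date, timedelta

    def daterange(start_date, end_date):
        # yield each date from start_date up to, but not including, end_date
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)

    for single_date in daterange(date(2021, 1, 1), date(2021, 1, 4)):
        print(single_date)  # 2021-01-01, 2021-01-02, 2021-01-03

Because end_date is excluded, the per-day windows in the iterative branch below (start_ts to start_ts plus one day) tile the requested range without overlap.
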
@@ -447,27 +455,102 @@ def v2parser(tweets, maxcalls):
 start_date = args['startdate'].date()
 end_date = args['enddate'].date()
 
-# loop through dates
-for single_date in daterange(start_date, end_date):
+# check which retrieval style
+if rstyle == 'iterative':
 
-    # set start timestamp
-    start_ts = single_date
+    # loop through dates
+    for single_date in daterange(start_date, end_date):
+
+        # set start timestamp
+        start_ts = single_date
+
+        # set end timestamp
+        end_ts = single_date + timedelta(days=1)
+
+        # payload rules for v2 api
+        rule = gen_request_parameters(query=config['query'],
+                                      results_per_call=config['results_per_call'],
+                                      start_time=start_ts.isoformat(),
+                                      end_time=end_ts.isoformat(),
+                                      tweet_fields=tweetfields,
+                                      user_fields=userfields,
+                                      media_fields=mediafields,
+                                      place_fields=placefields,
+                                      expansions=expansions,
+                                      stringify=False)
+
+        # result stream from twitter v2 api
+        rs = ResultStream(request_parameters=rule,
+                          max_results=100000,
+                          max_pages=1,
+                          max_tweets=config['max_tweets'],
+                          **search_creds)
+
+        # indicate which day is getting retrieved
+        print('[INFO] - Retrieving tweets from ' + str(start_ts))
 
-    # set end timestamp
-    end_ts = single_date + timedelta(days=1)
+        # get json response to list
+        tweets = list(rs.stream())
+
+        # parse results to dataframe
+        print('[INFO] - Parsing tweets from ' + str(start_ts))
+        tweetdf = v2parser(tweets, config['results_per_call'])
+
+        # try to order columns semantically
+        try:
+            tweetdf = tweetdf[['id', 'author_id', 'created_at', 'reply_settings',
+                               'conversation_id', 'source', 'in_reply_to_user_id',
+                               'text', 'possibly_sensitive', 'lang',
+                               'referenced_tweets', 'referenced_tweets.id',
+                               'referenced_tweets.author_id', 'referenced_tweets.type',
+                               'public_metrics.retweet_count', 'public_metrics.reply_count',
+                               'public_metrics.like_count', 'public_metrics.quote_count',
+                               'entities.mentions', 'entities.urls', 'entities.hashtags',
+                               'entities.annotations', 'attachments.media_keys',
+                               'attachments.media_types', 'user.description', 'user.verified',
+                               'user.id', 'user.protected', 'user.url',
+                               'user.profile_image_url', 'user.location', 'user.name',
+                               'user.created_at', 'user.username',
+                               'user.public_metrics.followers_count',
+                               'user.public_metrics.following_count',
+                               'user.public_metrics.tweet_count',
+                               'user.public_metrics.listed_count',
+                               'user.entities.description.hashtags',
+                               'user.entities.url.urls',
+                               'user.entities.description.mentions',
+                               'user.entities.description.urls', 'geo.place_id',
+                               'geo.coordinates.type', 'geo.coordinates.coordinates',
+                               'geo.coordinates.x', 'geo.coordinates.y',
+                               'geo.full_name', 'geo.name', 'geo.place_type',
+                               'geo.country', 'geo.country_code', 'geo.type',
+                               'geo.bbox', 'geo.centroid', 'geo.centroid.x',
+                               'geo.centroid.y']]
+        except KeyError:
+            pass
+
+        # set up file prefix from config
+        file_prefix_w_date = config['filename_prefix'] + start_ts.isoformat()
+        outpickle = file_prefix_w_date + '.pkl'
+        outcsv = file_prefix_w_date + '.csv'
+
+        # save to file
+        if args['output'] == 'pkl':
+            # save to pickle
+            tweetdf.to_pickle(outpickle)
+        elif args['output'] == 'csv':
+            # save to csv
+            tweetdf.to_csv(outcsv, sep=';', encoding='utf-8')
+
+        # sleep to avoid hitting the request limit too soon
+        time.sleep(waittime)
+
+# check if retrieval style is bulk
+elif rstyle == 'bulk':
 
+    # set start and end timestamps to cover the whole date range
+    start_ts = start_date
+    end_ts = end_date
+
     # payload rules for v2 api
     rule = gen_request_parameters(query=config['query'],
-        results_per_call=config['results_per_call'],
-        start_time=start_ts.isoformat(),
-        end_time=end_ts.isoformat(),
-        tweet_fields=tweetfields,
-        user_fields=userfields,
-        media_fields=mediafields,
-        place_fields=placefields,
-        expansions=expansions,
-        stringify=False)
-
+                                  results_per_call=config['results_per_call'],
+                                  start_time=start_ts.isoformat(),
+                                  end_time=end_ts.isoformat(),
+                                  tweet_fields=tweetfields,
+                                  user_fields=userfields,
+                                  media_fields=mediafields,
+                                  place_fields=placefields,
+                                  expansions=expansions,
+                                  stringify=False)
+
     # result stream from twitter v2 api
     rs = ResultStream(request_parameters=rule,
                       max_results=100000,
@@ -477,7 +560,7 @@ def v2parser(tweets, maxcalls):
 
     # indicate which day is getting retrieved
     print('[INFO] - Retrieving tweets from ' + str(start_ts))
-
+
     # get json response to list
     tweets = list(rs.stream())
 
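
v2parser itself is not shown in this diff; only its signature, v2parser(tweets, maxcalls), appears in the hunk headers. Judging from the dotted column names selected above (e.g. 'public_metrics.retweet_count', 'geo.coordinates.x'), the parser flattens nested tweet JSON into a pandas DataFrame. A generic sketch of that flattening step, not the repository's actual implementation (flatten_tweets is a hypothetical name):

    import pandas as pd

    def flatten_tweets(tweets):
        # json_normalize expands nested dicts into dotted column names, e.g.
        # {'public_metrics': {'retweet_count': 3}} yields a column named
        # 'public_metrics.retweet_count'
        return pd.json_normalize(tweets)
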
@@ -521,8 +604,5 @@ def v2parser(tweets, maxcalls):
     elif args['output'] == 'csv':
         # save to csv
         tweetdf.to_csv(outcsv, sep=';', encoding='utf-8')
-
-    # sleeps to not hit request limit so soon
-    time.sleep(waittime)
 
 print('[INFO] - ... done!')
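
With the new option in place, a run of the script might look like the examples below. Only --style and --wait are defined in this diff; the script name and the remaining long-form flags are assumptions inferred from the args keys used above (startdate, enddate, output):

    # one file per day, waiting 60 seconds between daily requests
    python search_tweets.py --startdate 2021-01-01 --enddate 2021-02-01 \
        --output csv --style iterative --wait 60

    # one big file covering the whole range
    python search_tweets.py --startdate 2021-01-01 --enddate 2021-02-01 \
        --output pkl --style bulk
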