-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtool_scrape_movie_data.py
More file actions
188 lines (163 loc) · 6.79 KB
/
tool_scrape_movie_data.py
File metadata and controls
188 lines (163 loc) · 6.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
"""
This script scrapes The Movie Database
for movie data to use as a training set for a neural net.
(https://developers.themoviedb.org/3)
API keys:
You'll need to apply for keys (it's easy) and put them in
API_keys.txt like:
[Keys]
API_V3 = 0oYKFlAbGuBe35QI7hO2AT7IDj8tTEtMOvVBh2rS
API_V4 = gH0VcY3hv8U1wCtJhSWa15ainzgNI2XJBttY69HV1YxcSBl8G2qw3mRU41QUD5PzfxK0aYx0GIcamKC
External Dependencies:
tmdbsimple (pip install tmdbsimple) https://github.com/celiao/tmdbsimple/
Usage:
python tool_scrape_movie_data.py   (takes no arguments; writes ./movie_info.json)
@author: Brad Beechler (brad.e.beechler@gmail.com)
# Last Modification: 09/20/2017 (Brad Beechler)
"""
from uplog import log
import os
import configparser
import tmdbsimple as tmdb
import json
import time
from tqdm import tqdm
# Path of the INI-style file that holds the API keys (format shown in the
# module docstring); passed to read_api_keys() by the script entry point.
API_FILE = "./API_keys.txt"
# Module-level key holders. Both stay None until read_api_keys() assigns them.
API_V3_KEY = None
API_V4_KEY = None
def read_api_keys(key_filename):
    """
    Read the API keys from an INI-style file and publish them as globals.

    Side effect: assigns the module-level API_V3_KEY and API_V4_KEY from the
    "api_v3" and "api_v4" options (None when an option is absent).

    :param key_filename: (str) name of the file with your API keys
    :return config_dict: (dict) a flat dictionary of every option that was
                         read (keys of interest: api_v3 and api_v4), or None
                         when the file does not exist
    """
    global API_V3_KEY
    global API_V4_KEY

    if not os.path.isfile(key_filename):
        log.out.warn('API file ' + key_filename + ' does not exist.')
        return None
    log.out.debug('API file ' + key_filename + ' exists!')

    # The process environment is supplied as the parser defaults, so every
    # environment variable also shows up as an option of every section.
    config = configparser.ConfigParser(defaults=os.environ,
                                       interpolation=configparser.ExtendedInterpolation())
    log.out.info('Reading API file: ' + key_filename)
    config.read(key_filename)

    config_dict = {}
    for section in config.sections():
        # Kept for backward compatibility: each section name maps to an
        # empty dict, while the option values themselves are stored flat.
        config_dict[section] = {}
        for option in config.options(section):
            try:
                value = config.get(section, option)
            except configparser.Error:
                # configparser reports unreadable values (e.g. a bad
                # interpolation) with configparser.Error subclasses, never
                # with the built-in SyntaxError.
                log.out.warn('Exception on %s!' % option)
                value = None
            # The literal text "None" in the file means "no value".
            config_dict[option] = None if value == 'None' else value

    # .get() so that a file lacking one of the keys yields None, not KeyError.
    API_V3_KEY = config_dict.get("api_v3")
    API_V4_KEY = config_dict.get("api_v4")
    return config_dict
def _discover_movies_with_retry(discover, retry_limit, **query):
    """Call discover.movie(**query), retrying at most retry_limit times."""
    for attempt in range(retry_limit):
        try:
            return discover.movie(**query)
        except Exception:
            # Presumably an HTTP/network failure raised by the request.
            # Give up (re-raising the last error) once the budget is spent
            # instead of looping forever.
            if attempt + 1 >= retry_limit:
                raise
            log.out.debug("HTTP error, waiting and retrying.")
            # Wait a little longer after each failure (always under 1 second).
            time.sleep(attempt / retry_limit)


def request_v3_by_year(year, language="en", keys=None):
    """
    Fetch every "discover" result page for one release year.

    :param year: primary release year to query
    :param language: (str) language code sent with every request
    :param keys: (list of str) fields to copy from each result; None means
                 all of the known fields. "id" is always included in the
                 output. The caller's list is not modified.
    :return: (list of dict) one dictionary per movie, holding the requested
             fields plus "id"
    :raises Exception: the last request error, when a page still fails after
                       the retry limit is reached
    :raises KeyError: when a requested field is missing from a result
    """
    possible_keys = ["id", "genre_ids", "title", "original_title",
                     "video", "original_language", "release_date",
                     "vote_count", "vote_average", "popularity",
                     "adult", "poster_path", "backdrop_path", "overview"]
    retry_limit = 100

    # Work on a copy so the caller's list is never mutated.
    requested_keys = list(possible_keys if keys is None else keys)
    # Need id key (it's used as master key)
    if "id" not in requested_keys:
        requested_keys.append("id")

    discover = tmdb.Discover()

    # Get the number of pages (same language as the page requests below).
    first_page = _discover_movies_with_retry(discover, retry_limit,
                                             language=language,
                                             primary_release_year=year)
    total_pages = first_page["total_pages"]
    log.out.info("Adding year: " + str(year) + " (" +
                 str(total_pages) + " pages)")

    year_of_movies = []
    for page_num in tqdm(range(1, total_pages + 1)):
        movie_page = _discover_movies_with_retry(discover, retry_limit,
                                                 page=page_num,
                                                 language=language,
                                                 primary_release_year=year)
        for movie_dict in movie_page["results"]:
            year_of_movies.append({key: movie_dict[key]
                                   for key in requested_keys})
    return year_of_movies
def get_movie_image(image_string):
    """
    Build the w500 image URL for a movie image path and print it.

    :param image_string: (str) image path relative to the image host
                         (presumably a "poster_path" or "backdrop_path"
                         value, which normally starts with "/")
    :return: None (the URL is only printed for now)
    """
    image_base_url = "https://image.tmdb.org/t/p/w500/"
    # The base URL already ends in "/", so drop any leading slash from the
    # path to avoid producing ".../w500//name.jpg".
    image_url = image_base_url + image_string.lstrip("/")
    print("foo: " + image_url)
    # TODO Finish this up
def write_json_from_dict(input_dict, filepath=".", filename="out.json", overwrite=True):
    """
    Serialize a dictionary to a JSON file.

    :param input_dict: dictionary to serialize
    :param filepath: (str) directory to write into; "~", symbolic links and
                     relative components are resolved first
    :param filename: (str) name of the file inside that directory
    :param overwrite: (bool) whether an already existing file may be replaced
    :return: (bool) True when the file was written; False when the directory
             is not writable, or the file exists and overwrite is False
    """
    # Resolve "~", then symbolic links, then relative parts (../../etc).
    target_dir = os.path.abspath(os.path.realpath(os.path.expanduser(filepath)))
    target_file = os.path.join(target_dir, filename)

    # Guard: the directory has to be writable.
    if not os.access(target_dir, os.W_OK):
        log.out.error("Can not write to file with name: " + target_file)
        return False

    # Guard: respect the overwrite flag for an existing file.
    if os.path.isfile(target_file):
        if not overwrite:
            log.out.warning("File exists! Not overwriting.")
            return False
        log.out.warning("File exists! Overwriting.")

    with open(target_file, 'w') as out_handle:
        json.dump(input_dict, out_handle)
    return True
def movie_data_to_json(output_json, year_start=1950, year_end=2017):
    """
    Collect movie records for a range of years and save them as one JSON file.

    After each year the data gathered so far is also written to
    "<output_json>.tmp", so an interrupted run keeps its progress.

    :param output_json: (str) path of the JSON file to produce
    :param year_start: (int) first release year to request (inclusive)
    :param year_end: (int) last release year to request (inclusive)
    :return: (bool) True when the final file was written, False otherwise
    """
    keys_wanted = ("genre_ids", "title", "release_date", "original_language",
                   "vote_count", "vote_average", "popularity",
                   "poster_path", "backdrop_path", "overview")
    # The output location never changes, so split it once up front.
    pathname, filename = os.path.split(output_json)

    master_dict = {}
    for this_year in range(year_start, year_end + 1):
        log.out.info("Requesting year: " + str(this_year))
        # Hand over a fresh list so nothing the callee does to its argument
        # can leak back into keys_wanted.
        this_years_data = request_v3_by_year(this_year, keys=list(keys_wanted))
        for movie_dict in this_years_data:
            movie_id = movie_dict["id"]
            record = {"year": this_year}
            for key in keys_wanted:
                record[key] = movie_dict[key]
            # Keep the id inside the record as well as using it as the key.
            record["id"] = movie_id
            master_dict[movie_id] = record
        # Overwrite temp file with new data
        write_json_from_dict(master_dict, filepath=pathname,
                             filename=filename + '.tmp', overwrite=True)

    # Write final data and report whether it succeeded.
    return write_json_from_dict(master_dict, filepath=pathname,
                                filename=filename, overwrite=True)
if __name__ == '__main__':
    log.setLevel("INFO")
    # Load the keys into the module-level API_V3_KEY / API_V4_KEY.
    read_api_keys(API_FILE)
    # Without a v3 key every request would fail, so stop right away
    # instead of starting a scrape that cannot succeed.
    if not API_V3_KEY:
        raise SystemExit("No API v3 key available; check " + API_FILE)
    tmdb.API_KEY = API_V3_KEY
    movie_data_to_json("./movie_info.json", year_start=1950, year_end=2017)