-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcsv_scraper.py
More file actions
33 lines (26 loc) · 991 Bytes
/
csv_scraper.py
File metadata and controls
33 lines (26 loc) · 991 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# movie data demo
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
url = 'http://www.imdb.com/chart/top'

# Whitespace-normalised text of one title cell: "<rank>. <title> (<year>)".
# The title is matched lazily and the pattern is anchored at the end, so the
# captured parenthesised group is always the trailing one even when the title
# itself contains dots or parentheses.
_TITLE_CELL_RE = re.compile(r'^(\d+)\.\s*(.*?)\s*\(([^()]*)\)$')


def parse_title_cell(cell_text):
    """Split the text of one chart title cell into its three parts.

    The rank is read from the text itself rather than derived from the row's
    position, so multi-digit ranks (10, 100, ...) are extracted correctly, and
    dots inside the title (e.g. "Mr. Smith") are preserved.

    Args:
        cell_text: Raw text of the cell; any run of whitespace (including
            newlines) is collapsed to a single space before parsing.

    Returns:
        A ``(place, movie_title, year)`` tuple of strings.

    Raises:
        ValueError: If the text does not look like
            ``"<rank>. <title> (<year>)"``.
    """
    normalized = ' '.join(cell_text.split())
    match = _TITLE_CELL_RE.match(normalized)
    if match is None:
        raise ValueError(
            'unrecognised title cell text: {!r}'.format(cell_text)
        )
    place, movie_title, year = match.groups()
    return place, movie_title, year


def main():
    """Download the chart page and write its rows to a CSV file.

    Fetches ``url``, extracts place, title, rating, year and cast for each
    title cell, and writes them to ``imdb_top_250_movies.csv`` in the current
    working directory (without the DataFrame index column).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a title cell cannot be parsed.
        IndexError: If fewer ratings or cast entries than title cells are
            found on the page.
    """
    # A timeout keeps the script from hanging forever on a stalled
    # connection; 30 seconds is an arbitrary but generous bound.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    bs = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): these selectors assume the table-based chart markup
    # (td.titleColumn / td.posterColumn) -- confirm the page still serves it.
    movies = bs.select('td.titleColumn')
    crew = [a.attrs.get('title') for a in bs.select('td.titleColumn a')]
    ratings = [
        b.attrs.get('data-value')
        for b in bs.select('td.posterColumn span[name=ir]')
    ]

    movie_list = []
    for index, cell in enumerate(movies):
        place, movie_title, year = parse_title_cell(cell.get_text())
        movie_list.append({
            "place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
        })

    df = pd.DataFrame(movie_list)
    df.to_csv('imdb_top_250_movies.csv', index=False)


# Running the file as a script behaves as before; merely importing it no
# longer triggers the network request and the file write.
if __name__ == '__main__':
    main()