-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatalake.py
More file actions
121 lines (94 loc) · 4.22 KB
/
datalake.py
File metadata and controls
121 lines (94 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Import libraries
import requests
import pandas as pd
import re
from sqlalchemy import create_engine
import os
# The MySQL password is read from the environment so credentials never
# appear in source control.
sql_pass = os.environ["MYSQL_PASSWORD"]

# SQLAlchemy engine for the local "datalake" database.
# (Engines connect lazily; no connection is opened until first use.)
conn = create_engine(f"mysql+pymysql://root:{sql_pass}@localhost/datalake")

# Schools to scrape, mapped to their SwitchUp numeric bootcamp ids.
school_dict = {'ironhack': 10828}

# Per-school dataframes are accumulated here by the scraping loop below,
# then concatenated into single frames before being written to MySQL.
locations_list = []
courses_list = []
badges_list = []
school_list = []
def get_comments_school(school):
    """Scrape every SwitchUp review for *school* and return a DataFrame.

    The frame keeps the raw API columns and adds:
      - 'review_body': review text with HTML tags stripped
      - 'school': the school slug that was scraped
    """
    strip_tags = re.compile(r'<[^>]+>')
    # Review-list endpoint; perPage=10000 effectively fetches everything
    # in a single request. Swap the slug to scrape a competitor.
    url = (
        "https://www.switchup.org/chimera/v1/school-review-list"
        "?mainTemplate=school-review-list&path=%2Fbootcamps%2F"
        + school
        + "&isDataTarget=false&page=3&perPage=10000&simpleHtml=true&truncationLength=250"
    )
    # Single GET; the JSON payload carries the reviews under content.reviews.
    payload = requests.get(url).json()
    reviews = pd.DataFrame(payload['content']['reviews'])
    # Remove markup from the raw body and tag each row with its school.
    reviews['review_body'] = reviews['body'].apply(
        lambda text: strip_tags.sub('', text))
    reviews['school'] = school
    return reviews
def get_school_info(school, school_id):
    """Fetch SwitchUp bootcamp metadata for one school.

    Parameters
    ----------
    school : str
        School slug as used in the SwitchUp URL (e.g. 'ironhack').
    school_id : int
        SwitchUp's numeric bootcamp id.

    Returns
    -------
    tuple of pd.DataFrame
        (locations_df, courses_df, badges_df, school_df); every frame is
        tagged with 'school' and 'school_id' columns so the per-school
        results can be concatenated later.
    """
    url = ('https://www.switchup.org/chimera/v1/bootcamp-data'
           '?mainTemplate=bootcamp-data%2Fdescription&path=%2Fbootcamps%2F'
           + str(school) + '&isDataTarget=false&bootcampId=' + str(school_id)
           + '&logoTag=logo&truncationLength=250&readMoreOmission=...'
           '&readMoreText=Read%20More&readLessText=Read%20Less')
    # Everything we need lives under the 'content' key of the JSON payload.
    # (Removed a dead no-op `data.keys()` expression from the original.)
    content = requests.get(url).json()['content']

    courses_df = pd.DataFrame(content['courses'], columns=['courses'])
    # locations is a list of nested dicts; json_normalize flattens them.
    locations_df = pd.json_normalize(content['locations'])
    badges_df = pd.DataFrame(content['meritBadges'])

    # One-row frame with the school's basic profile.
    school_df = pd.DataFrame(
        [content['webaddr'], content['description'], content['logoUrl']]).T
    school_df.columns = ['website', 'description', 'LogoUrl']

    # Tag all four frames with the school slug and id.
    for frame in (locations_df, courses_df, badges_df, school_df):
        frame['school'] = school
        frame['school_id'] = school_id

    return locations_df, courses_df, badges_df, school_df
def create_datalake():
    """Create tables and populate them with data from the corresponding dataframe"""
    # NOTE(review): this function reads the module-level globals `comments`,
    # `locations`, `courses`, `badges` and `school`, so it must only run
    # after the scraping code below has built them (see the call at the
    # bottom of this file). It also mutates the global `comments` in place.
    # Create cursor and datalake db
    #cur = conn.cursor()
    #cur.execute("CREATE DATABASE datalake;")
    #cur.execute("USE datalake;")
    # Drop nested/raw columns that cannot be stored in a flat SQL table.
    comments.drop(['user', 'body', 'rawBody', 'comments'],
                  inplace=True, axis=1)
    # Write each dataframe to its own table, replacing previous contents.
    comments.to_sql('comments', con=conn,
                    if_exists='replace', chunksize=1000)
    locations.to_sql('locations', con=conn,
                     if_exists='replace', chunksize=1000)
    courses.to_sql('courses', con=conn, if_exists='replace', chunksize=1000)
    badges.to_sql('badges', con=conn, if_exists='replace', chunksize=1000)
    school.to_sql('school', con=conn, if_exists='replace', chunksize=1000)
# Scrape each school's info and accumulate one dataframe of each kind.
# (The loop variable was renamed from `id`, which shadowed the builtin.)
for school_name, school_id in school_dict.items():
    loc_df, course_df, badge_df, sch_df = get_school_info(
        school_name, school_id)
    locations_list.append(loc_df)
    courses_list.append(course_df)
    badges_list.append(badge_df)
    school_list.append(sch_df)

# Fetch the review comments for every school.
comments_list = [get_comments_school(s) for s in school_dict]

# Union the per-school frames into the module-level frames that
# create_datalake() writes to MySQL.
comments = pd.concat(comments_list)
locations = pd.concat(locations_list)
courses = pd.concat(courses_list)
badges = pd.concat(badges_list)
school = pd.concat(school_list)

# Call function to populate tables in datalake database
create_datalake()
print("Datalake created! Time to clean it (:")
#######################################################################################################################