-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract.py
More file actions
82 lines (71 loc) · 2.94 KB
/
extract.py
File metadata and controls
82 lines (71 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import csv
import tempfile
from zipfile import ZipFile
import requests
# Paths
# Raw string: in a normal literal, "\D" and "\W" are invalid escape sequences
# (deprecated, and flagged with a warning by recent Python versions).
base_path = r"C:\Users\user\Desktop\DataAnalysis\Workspace"
source_url = "https://assets.datacamp.com/production/repositories/5899/datasets/66691278303f789ca4acd3c6406baa5fc6adaf28/PPR-ALL.zip"
# Source path where the downloaded .zip snapshot is stored
source_path = f"{base_path}/data/source/downloaded_at=2021-01-01/ppr-all.zip"
# Raw path where we want to extract the new .csv data
# NOTE(review): the raw snapshot date (2021-02-01) differs from the source
# snapshot date (2021-01-01) - confirm this is intentional.
raw_path = f"{base_path}/data/raw/downloaded_at=2021-02-01/ppr-all.csv"
# Create the parent directory of the file `path` passed as an argument
def create_folder_if_not_exists(path):
    """
    Create the folder that will contain `path` if it doesn't exist yet.

    `path` is treated as a file path: only its directory part is created,
    together with any missing intermediate folders. Nothing happens when
    that folder already exists or when `path` has no directory part.
    """
    folder = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so skip bare file names
    if folder:
        os.makedirs(folder, exist_ok=True)
def download_snapshot():
    """
    Download the new dataset from the source

    Fetches `source_url` and stores the response body at `source_path`,
    creating the destination folder first when it is missing.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeouts.
    """
    create_folder_if_not_exists(source_path)
    # Without a timeout, requests may wait forever on an unresponsive server
    response = requests.get(source_url, verify=True, timeout=60)
    # Fail loudly instead of saving an HTTP error page as the .zip archive
    response.raise_for_status()
    # Open the target only after a successful download so a failed request
    # never truncates a previously downloaded snapshot
    with open(source_path, "wb") as source_ppr:
        source_ppr.write(response.content)
def save_new_raw_data():
    """
    Save new raw data from the source

    Extracts the first member of the archive at `source_path` into a
    temporary folder, reads it as a windows-1252 encoded CSV and writes
    every data row to `raw_path` under renamed column headers. The first
    data row is also printed as an example.

    Raises:
        ValueError: if a row has a column that is not part of the renaming
            table (default `csv.DictWriter` behaviour).
    """
    # Rename field names so they're ready for the next step
    fieldnames = {
        "Date of Sale (dd/mm/yyyy)": "date_of_sale",
        "Address": "address",
        "Postal Code": "postal_code",
        "County": "county",
        "Price (€)": "price",
        "Description of Property": "description",
    }

    create_folder_if_not_exists(raw_path)
    with tempfile.TemporaryDirectory() as dirpath:
        with ZipFile(source_path, "r") as archive:
            names_list = archive.namelist()
            csv_file_path = archive.extract(names_list[0], path=dirpath)

        # newline="" lets the csv module handle line endings itself; without
        # it, Windows output gets an extra blank line after every row
        with open(
            csv_file_path,
            mode="r",
            encoding="windows-1252",
            newline="",
        ) as source_file:
            reader = csv.DictReader(source_file)

            with open(
                raw_path,
                mode="w",
                encoding="windows-1252",
                newline="",
            ) as raw_file:
                # Iterating the dict yields the original column names, which
                # become the writer's field order
                writer = csv.DictWriter(raw_file, fieldnames=fieldnames)
                # Write headers as first line: passing the mapping itself as
                # a row emits the new name for each original column
                writer.writerow(fieldnames)

                # Peek at the first row for the log message, but still write
                # it so no record is lost
                first_row = next(reader, None)
                if first_row is None:
                    # Empty source file: keep the header-only output
                    return
                print("[Extract] First row example:", first_row)
                writer.writerow(first_row)

                # Write all remaining rows in file
                for row in reader:
                    writer.writerow(row)
# Main function called inside the execute.py script
def main():
    """
    Run the extract step: download the snapshot, then save it as raw CSV.

    Progress messages are printed before each stage.
    """
    print("[Extract] Start")
    print("[Extract] Downloading snapshot")
    download_snapshot()
    print(f"[Extract] Saving data from '{source_path}' to '{raw_path}'")
    save_new_raw_data()
    # Plain string: no placeholders, so no f-prefix is needed
    print("[Extract] End")