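"""Scrape river station data from the South African Department of Water and
Sanitation and write one CSV per station table into an output directory."""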
import argparse
import json
import os

import scraper_functions
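
# Command-line options: where the station list lives, where to write the CSVs,
# and whether to reuse or refresh locally cached copies of the downloaded pages.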
parser = argparse.ArgumentParser(description="Scrape data from the South African Department of Water and Sanitation.")
parser.add_argument("--station_list_url", default="http://www.dwa.gov.za/Hydrology/Verified/HyStations.aspx?Region=A&StationType=rbRiver", help="The URL of the station list page. Defaults to the verified list for the Limpopo basin.")
parser.add_argument("--data_output_dir", default="csv_output", help="The directory where the CSV files for each station will be written.")
parser.add_argument("--always_download", action='store_true', help="Download web pages even if they are already cached locally. This is akin to refreshing a page in a web browser.")
parser.add_argument("--store_web_data", action='store_true', help="Store downloaded web data so the script does not have to re-download everything. Useful for debugging because the script will run much faster.")
args = parser.parse_args()
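
# Fetch and parse the station list page. documentDownloader is assumed to
# return a parsed document (BeautifulSoup-style, given the find()/find_all()
# calls below) and to honour the caching flags.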
stationListSoup = scraper_functions.documentDownloader(args.station_list_url, alwaysDownload=args.always_download, storeData=args.store_web_data)
stationListTable = stationListSoup.find(id="tableStations")
if stationListTable is None:
    raise ValueError("Could not find a table with the id 'tableStations' on the station list page.")
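
# Links in the station table are relative, so resolve them against the
# directory portion of the station list URL.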
baseURL = args.station_list_url[:args.station_list_url.rfind("/")+1]
stationListLinks = [i.get("href") for i in stationListTable.find_all('a')]
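
# Make sure the output directory exists before writing any CSVs.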
os.makedirs(args.data_output_dir, exist_ok=True)
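
# Scrape each station's page and write its primary and flow tables (when
# present) to CSV, each prefixed with a one-line JSON metadata header.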
for stationLink in stationListLinks:
    stationData = scraper_functions.getStationData(baseURL + stationLink, alwaysDownload=args.always_download, storeData=args.store_web_data)
    # Each CSV starts with a single JSON comment line holding the station's
    # coordinates and river name.
    metaDataStr = "#" + json.dumps({"lat": stationData["lat"], "long": stationData["long"], "river": stationData["river"]}) + "\n"
    if stationData["dfPrimary"] is not None:
        csvPath = os.path.join(args.data_output_dir, stationData["station"] + "_PRIMARY.csv")
        with open(csvPath, "w") as csvFile:
            csvFile.write(metaDataStr)
            stationData["dfPrimary"].to_csv(csvFile, sep=",", index=False)
    if stationData["dfFlow"] is not None:
        csvPath = os.path.join(args.data_output_dir, stationData["station"] + "_FLOW.csv")
        with open(csvPath, "w") as csvFile:
            csvFile.write(metaDataStr)
            stationData["dfFlow"].to_csv(csvFile, sep=",", index=False)
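
# Example invocation (flags as defined above; scraper_functions.py must be importable):
#     python scraper.py --store_web_data --data_output_dir csv_output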