Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
45e07cb
new region (GB) preprocess/export
tommylees112 Oct 15, 2020
94a97f0
fix scripts
tommylees112 Oct 15, 2020
ba9dd6e
update scripts
tommylees112 Oct 15, 2020
ae99b6f
update export
tommylees112 Oct 15, 2020
46c0e8a
update scripts to do year by year
tommylees112 Oct 15, 2020
0486122
update preprocess to not do merge unless asked
tommylees112 Oct 15, 2020
ed25ee9
black
tommylees112 Oct 15, 2020
d821eef
update scripts
tommylees112 Oct 15, 2020
d240341
update naming
tommylees112 Oct 15, 2020
be1680a
fix bug in era5 land
tommylees112 Oct 15, 2020
f76e065
update to resample BEFORE merge
tommylees112 Oct 15, 2020
7b0ef0a
update the preprocessor documentation
tommylees112 Oct 15, 2020
7e71e18
add resample_before_merge option to scripts
tommylees112 Oct 15, 2020
6f55a3d
update
tommylees112 Oct 15, 2020
6da893f
assign properly
tommylees112 Oct 15, 2020
7a5ec20
update todo list
tommylees112 Oct 15, 2020
2ab4de6
update get data script
tommylees112 Oct 15, 2020
a1480ba
path to base
tommylees112 Oct 15, 2020
254fe6e
update granularity arg
tommylees112 Oct 15, 2020
b98347c
update fname
tommylees112 Oct 15, 2020
116ccaa
try sm again
tommylees112 Oct 16, 2020
bc5677b
update script
tommylees112 Oct 21, 2020
077bdcb
nb
tommylees112 Oct 21, 2020
10b21e8
update gb space
tommylees112 Oct 29, 2020
202f127
Merge branch 'gb_soil_moisture' of https://github.com/esowc/ml_drough…
tommylees112 Oct 29, 2020
21e355d
add volumetric 1-4
tommylees112 Nov 4, 2020
50790ef
update export
tommylees112 Mar 4, 2021
7bde7bb
Merge branch 'gb_soil_moisture' of https://github.com/esowc/ml_drough…
tommylees112 Mar 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,954 changes: 2,954 additions & 0 deletions notebooks/draft/47_compare_to_FUSE.ipynb

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions scripts/drafts/cut_basins_from_soil_moisture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import sys
sys.path.append("../..")

import pandas as pd
from pathlib import Path
import xarray as xr
from scripts.utils import get_data_path
from src.preprocess.utils import SHPtoXarray


if __name__ == "__main__":
    # NOTE(review): data_dir is hard-coded; `get_data_path` is imported but
    # unused — consider `data_dir = get_data_path()` for portability.
    data_dir = Path("/cats/datastore/data")

    sm_path = data_dir / "RUNOFF/gb_soil_moisture_2000_2020.nc"
    shp_path = data_dir / "CAMELS_GB_DATASET/Catchment_Boundaries/CAMELS_GB_catchment_boundaries.shp"

    # use one soil-moisture layer as the reference grid for rasterisation
    da = xr.open_dataset(sm_path)["swvl1"]

    # rasterise the catchment polygons onto the soil-moisture grid
    # (one categorical value per basin, keyed by the "ID_STRING" column)
    converter = SHPtoXarray()
    shp_xr = converter.shapefile_to_xarray(
        da, shp_path, var_name="station_id", lookup_colname="ID_STRING"
    )

    # BUGFIX: the rasterised basin mask was computed but never written out,
    # so the script had no effect; persist it for downstream scripts.
    out_path = data_dir / "RUNOFF/camels_gb_basins_soil_moisture_grid.nc"
    shp_xr.to_netcdf(out_path)
    print(f"Saved basin mask to: {out_path}")
152 changes: 152 additions & 0 deletions scripts/drafts/get_basin_soil_moisture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""
ipython --pdb scripts/drafts/get_basin_soil_moisture.py
"""

import sys
from itertools import product
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import xarray as xr

sys.path.append("../..")

from scripts.export import export_era5_land
from scripts.preprocess import process_era5_land
from scripts.utils import get_data_path, _rename_directory
from src.preprocess import ERA5LandPreprocessor
from src.preprocess.utils import SHPtoXarray


def load_reference_nc(reference_nc_filepath: Path) -> xr.DataArray:
    """Load a reference grid from ``reference_nc_filepath``.

    Opens the netCDF file, replaces every value with ones (only the
    coordinates/shape matter) and returns the first data variable as a
    DataArray to be used as a rasterisation target.

    Args:
        reference_nc_filepath: path to any netCDF file on the target grid.

    Returns:
        A ones-filled ``xr.DataArray`` with the file's coords/dims.
    """
    target_ds = xr.ones_like(xr.open_dataset(reference_nc_filepath))
    # take the first data variable — assumes the file contains at least one
    data_var = list(target_ds.data_vars)[0]
    return target_ds[data_var]


def extract_time_series_of_soil_moisture() -> xr.Dataset:
    """Rasterise the CAMELS-GB catchment shapefile onto the ERA5-Land grid.

    Returns an ``xr.Dataset`` containing a categorical basin variable
    (like a landcover map) on the same grid as the preprocessed data.

    TODO: extract per-basin time series for each soil-moisture layer
    (swvl1-swvl4) and save as an xarray object with dims (time, basin).
    """
    # BUGFIX: a stray "x" after the call made this line a SyntaxError
    data_dir = get_data_path()

    # load in shapefile defining the basin polygons
    shp_filepath = Path(
        "/soge-home/projects/crop_yield/CAMELS/CAMELS_GB_DATASET"
        "/Catchment_Boundaries/CAMELS_GB_catchment_boundaries.shp"
    )

    # MUST have a target dataset to create the same shape
    # NOTE(review): ".nc" looks like a placeholder filename — confirm the
    # intended reference file before running.
    reference_nc_filepath = data_dir / "interim/reanalysis-era5-land_interim/.nc"
    da = load_reference_nc(reference_nc_filepath)

    # turn the shapefile into a categorical variable (like landcover)
    # BUGFIX: `lookup_colname` was undefined; "station_id"/"ID_STRING"
    # match the usage in scripts/drafts/cut_basins_from_soil_moisture.py
    shp_to_nc = SHPtoXarray()
    ds = shp_to_nc.shapefile_to_xarray(
        da=da,
        shp_path=shp_filepath,
        var_name="station_id",
        lookup_colname="ID_STRING",
    )
    # BUGFIX: previously the function ended with `pass`, discarding `ds`
    # despite the declared `-> xr.Dataset` return type
    return ds


def export_preprocess_one_year(
    year: int,
    variable: str,
    cleanup: bool = False,
    subset_str: str = "great_britain",
) -> None:
    """Download one year of hourly ERA5-Land data and preprocess to daily.

    Args:
        year: calendar year to download and process.
        variable: ERA5-Land variable name
            (e.g. "volumetric_soil_water_layer_1").
        cleanup: if True, delete the raw hourly netCDF after the
            preprocessed file has been verified to exist.
        subset_str: region bounding-box key (default "great_britain").
    """
    # Download ERA5-Land = HOURLY
    export_era5_land(
        region_str=subset_str,
        years=[year],
        variables=[variable],
        granularity="hourly",
    )
    # Preprocess ERA5-Land -> DAILY
    # NOTE: cleanup is deliberately False here — the raw file is asserted
    # to exist below, then removed by this function itself if requested.
    process_era5_land(
        subset_str=subset_str,
        monmean=False,
        resample_time="D",
        years=[year],
        cleanup=False,
        with_merge=False,
        resample_before_merge=True,
    )

    # -- Check that files correctly exported/processed -- #
    data_dir = get_data_path()
    raw_nc_file = data_dir / f"raw/reanalysis-era5-land/{variable}/{year}/01_12.nc"
    # has the raw file been downloaded?
    assert raw_nc_file.exists()
    # has the preprocessed file been created?
    # BUGFIX: the filename previously hard-coded "great_britain", ignoring
    # the subset_str parameter
    fname = f"{year}_01_12_{variable}_{subset_str}.nc"
    assert (data_dir / f"interim/reanalysis-era5-land_interim/{fname}").exists()

    # -- Remove the raw file -- #
    if cleanup:
        # delete the raw hourly file
        raw_nc_file.unlink()
        print(f"Removed File: {raw_nc_file}")

    print(f"\n-- Downloaded and preprocessed {variable} {year} --\n")


def merge_files(variable: str, subset_str: str = "great_britain") -> None:
    """Merge the per-year daily interim files for ``variable`` into one
    netCDF, then archive the interim files to an ``_OLD`` directory.

    Args:
        variable: ERA5-Land variable whose interim files are merged.
        subset_str: region key appended to the output filename.
    """
    data_dir = get_data_path()
    processor = ERA5LandPreprocessor(data_dir)
    filename = (
        f'{variable}_data{"_" + subset_str if subset_str is not None else ""}.nc'
    )

    processor.merge_files(
        subset_str=subset_str,
        resample_time="D",
        upsampling=False,
        filename=filename,
    )

    # move all of the interim files out of the way so the next variable
    # starts from an empty interim directory
    interim_dir = data_dir / "interim/reanalysis-era5-land_interim"
    old_dir = data_dir / "interim/reanalysis-era5-land_OLD"
    # BUGFIX: previously `to_paths[0].parents[0].mkdir(...)` raised
    # IndexError when no interim files existed; create the directory
    # unconditionally instead
    old_dir.mkdir(exist_ok=True, parents=True)

    for from_path in interim_dir.glob("*.nc"):
        _rename_directory(
            from_path=from_path,
            to_path=old_dir / from_path.name,
        )


if __name__ == "__main__":
    subset_str = "great_britain"
    variables = [
        # "volumetric_soil_water_layer_1",
        "volumetric_soil_water_layer_2",
        "volumetric_soil_water_layer_3",
        "volumetric_soil_water_layer_4",
    ]
    # years = np.arange(2004, 2016)
    years = np.arange(1993, 2021)

    # Due to memory constraints process hourly data into daily
    # after every Variable/Year then merge all of the variable files
    for variable, year in product(variables, years):
        export_preprocess_one_year(year=year, variable=variable, cleanup=True)

    # merge all of these daily files into one NETCDF file
    # merge_files(variable, subset_str=subset_str)

    # Do we need to unlink the interim files ???

    # Extract time series for each basin (defined in shapefile)
    # TODO: need to get this working
    # extract_time_series_of_soil_moisture()
32 changes: 19 additions & 13 deletions scripts/export.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import numpy as np
from typing import List

sys.path.append("..")
from src.exporters import (
Expand Down Expand Up @@ -73,26 +74,30 @@ def export_era5(region_str="kenya"):
exporter.export(variable=variable, granularity="monthly", region_str=region_str)


def export_era5_land(
    region_str="kenya",
    granularity="monthly",
    years=None,
    variables=None,
):
    """Export ERA5-Land data from the Climate Data Store.

    Args:
        region_str: bounding-box key (default "kenya").
        granularity: "monthly" or "hourly".
        years: iterable of years to request; defaults to 2000-2020.
        variables: ERA5-Land variable names; defaults to the four
            volumetric soil water layers.
    """
    # BUGFIX: np.arange(...) / list literals as default arguments are
    # mutable objects shared across calls; use None sentinels instead.
    if years is None:
        years = np.arange(2000, 2021)
    if variables is None:
        variables = [
            # "total_precipitation",
            # "2m_temperature",
            # "evapotranspiration",
            # "potential_evaporation",
            "volumetric_soil_water_layer_1",
            "volumetric_soil_water_layer_2",
            "volumetric_soil_water_layer_3",
            "volumetric_soil_water_layer_4",
        ]

    exporter = ERA5LandExporter(get_data_path())

    for variable in variables:
        exporter.export(
            variable=variable,
            break_up="yearly",
            region_str=region_str,
            granularity=granularity,
            selection_request=dict(year=years),
        )


Expand Down Expand Up @@ -190,7 +195,8 @@ def export_boku_ndvi():

if __name__ == "__main__":
print(f"Writing data to: {get_data_path()}")
export_era5_land(region_str="india")
region_str = "great_britain"
export_era5_land(region_str=region_str, granularity="hourly", years=np.arange(1970, 2016))
# export_era5(region_str="kenya")
# export_vhi()
# export_chirps()
Expand Down
31 changes: 22 additions & 9 deletions scripts/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ def process_era5_land(
variables: Optional[Union[List, str]] = None,
subset_str: str = "kenya",
monmean: bool = True,
resample_time: Optional[str] = "M",
years: Optional[List[int]] = None,
cleanup: bool = False,
with_merge: bool = True,
resample_before_merge: bool = False,
):
data_path = get_data_path()

Expand Down Expand Up @@ -111,10 +116,14 @@ def process_era5_land(
for variable in variables:
processor.preprocess(
subset_str=subset_str,
regrid=None,
resample_time="M",
regrid=regrid_path,
resample_time=resample_time,
upsampling=False,
variable=variable,
years=years,
cleanup=cleanup,
with_merge=with_merge,
resample_before_merge=resample_before_merge,
)


Expand All @@ -127,7 +136,7 @@ def process_gleam(subset_str: str = "kenya"):
assert regrid_path.exists(), f"{regrid_path} not available"


def process_gleam():
def process_gleam(resample_time: Optional[str] = "M"):
# if the working directory is alread ml_drought don't need ../data
if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
data_path = Path("data")
Expand All @@ -142,7 +151,10 @@ def process_gleam():
processor = GLEAMPreprocessor(data_path)

processor.preprocess(
subset_str=subset_str, regrid=regrid_path, resample_time="M", upsampling=False
subset_str=subset_str,
regrid=regrid_path,
resample_time=resample_time,
upsampling=False,
)


Expand Down Expand Up @@ -288,15 +300,16 @@ def preprocess_s5_ouce():


if __name__ == "__main__":
subset_str = "india"
subset_str = "great_britain"
# preprocess_era5(subset_str=subset_str)
process_era5_land(
subset_str=subset_str,
variables=[
"volumetric_soil_water_layer_1",
"potential_evaporation",
], #  total_precipitation 2m_temperature evapotranspiration
monmean=False,
resample_time="D",
years=[2000],
with_merge=False,
cleanup=False,
resample_before_merge=True,
)
# process_vci(subset_str=subset_str)
# process_precip_2018(subset_str=subset_str)
Expand Down
6 changes: 6 additions & 0 deletions src/exporters/era5_land.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ class ERA5LandExporter(CDSExporter):
dataset = "reanalysis-era5-land-monthly-means"
granularity = "monthly"

def __post_init__(self):
assert self.granularity in [
"hourly",
"monthly",
], "Only support two granularities: [ hourly monthly ]"

@staticmethod
def print_valid_vars():
print(VALID_ERA5_LAND_VARS)
Expand Down
Loading