Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
45e07cb
new region (GB) preprocess/export
tommylees112 Oct 15, 2020
94a97f0
fix scripts
tommylees112 Oct 15, 2020
ba9dd6e
update scripts
tommylees112 Oct 15, 2020
ae99b6f
update export
tommylees112 Oct 15, 2020
46c0e8a
update scripts to do year by year
tommylees112 Oct 15, 2020
0486122
update preprocess to not do merge unless asked
tommylees112 Oct 15, 2020
ed25ee9
black
tommylees112 Oct 15, 2020
d821eef
update scripts
tommylees112 Oct 15, 2020
d240341
update naming
tommylees112 Oct 15, 2020
be1680a
fix bug in era5 land
tommylees112 Oct 15, 2020
f76e065
update to resample BEFORE merge
tommylees112 Oct 15, 2020
7b0ef0a
update the preprocessor documentation
tommylees112 Oct 15, 2020
7e71e18
add resample_before_merge option to scripts
tommylees112 Oct 15, 2020
6f55a3d
update
tommylees112 Oct 15, 2020
6da893f
assign properly
tommylees112 Oct 15, 2020
7a5ec20
update todo list
tommylees112 Oct 15, 2020
2ab4de6
update get data script
tommylees112 Oct 15, 2020
a1480ba
path to base
tommylees112 Oct 15, 2020
254fe6e
update granularity arg
tommylees112 Oct 15, 2020
b98347c
update fname
tommylees112 Oct 15, 2020
116ccaa
try sm again
tommylees112 Oct 16, 2020
bc5677b
update script
tommylees112 Oct 21, 2020
077bdcb
nb
tommylees112 Oct 21, 2020
10b21e8
update gb space
tommylees112 Oct 29, 2020
202f127
Merge branch 'gb_soil_moisture' of https://github.com/esowc/ml_drough…
tommylees112 Oct 29, 2020
21e355d
add volumetric 1-4
tommylees112 Nov 4, 2020
50790ef
update export
tommylees112 Mar 4, 2021
7bde7bb
Merge branch 'gb_soil_moisture' of https://github.com/esowc/ml_drough…
tommylees112 Mar 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2,954 changes: 2,954 additions & 0 deletions notebooks/draft/47_compare_to_FUSE.ipynb

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions scripts/drafts/cut_basins_from_soil_moisture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import sys
sys.path.append("../..")

import pandas as pd
from pathlib import Path
import xarray as xr
from scripts.utils import get_data_path
from src.preprocess.utils import SHPtoXarray


if __name__ == "__main__":
    # NOTE(review): data_dir is hard-coded; `get_data_path` is imported but
    # unused — consider `data_dir = get_data_path()` for portability.
    data_dir = Path("/cats/datastore/data")

    sm_path = data_dir / "RUNOFF/gb_soil_moisture_2000_2020.nc"
    shp_path = data_dir / "CAMELS_GB_DATASET/Catchment_Boundaries/CAMELS_GB_catchment_boundaries.shp"

    # use one soil-moisture layer as the reference grid for rasterisation
    da = xr.open_dataset(sm_path)["swvl1"]

    # rasterise the catchment polygons onto the soil-moisture grid
    # (one categorical value per basin, keyed by the "ID_STRING" column)
    converter = SHPtoXarray()
    shp_xr = converter.shapefile_to_xarray(
        da, shp_path, var_name="station_id", lookup_colname="ID_STRING"
    )

    # BUGFIX: the rasterised basin mask was computed but never written out,
    # so the script had no effect; persist it for downstream scripts.
    out_path = data_dir / "RUNOFF/camels_gb_basins_soil_moisture_grid.nc"
    shp_xr.to_netcdf(out_path)
    print(f"Saved basin mask to: {out_path}")
152 changes: 152 additions & 0 deletions scripts/drafts/get_basin_soil_moisture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""
ipython --pdb scripts/drafts/get_basin_soil_moisture.py
"""

import sys
from itertools import product
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
import xarray as xr

sys.path.append("../..")

from scripts.export import export_era5_land
from scripts.preprocess import process_era5_land
from scripts.utils import get_data_path, _rename_directory
from src.preprocess import ERA5LandPreprocessor
from src.preprocess.utils import SHPtoXarray


def load_reference_nc(reference_nc_filepath: Path) -> xr.DataArray:
    """Load a reference grid from ``reference_nc_filepath``.

    Opens the netCDF file, replaces every value with ones (only the
    coordinates/shape matter) and returns the first data variable as a
    DataArray to be used as a rasterisation target.

    Args:
        reference_nc_filepath: path to any netCDF file on the target grid.

    Returns:
        A ones-filled ``xr.DataArray`` with the file's coords/dims.
    """
    target_ds = xr.ones_like(xr.open_dataset(reference_nc_filepath))
    # take the first data variable — assumes the file contains at least one
    data_var = list(target_ds.data_vars)[0]
    return target_ds[data_var]


def extract_time_series_of_soil_moisture() -> xr.Dataset:
    """Rasterise the CAMELS-GB catchment shapefile onto the ERA5-Land grid.

    Returns an ``xr.Dataset`` containing a categorical basin variable
    (like a landcover map) on the same grid as the preprocessed data.

    TODO: extract per-basin time series for each soil-moisture layer
    (swvl1-swvl4) and save as an xarray object with dims (time, basin).
    """
    # BUGFIX: a stray "x" after the call made this line a SyntaxError
    data_dir = get_data_path()

    # load in shapefile defining the basin polygons
    shp_filepath = Path(
        "/soge-home/projects/crop_yield/CAMELS/CAMELS_GB_DATASET"
        "/Catchment_Boundaries/CAMELS_GB_catchment_boundaries.shp"
    )

    # MUST have a target dataset to create the same shape
    # NOTE(review): ".nc" looks like a placeholder filename — confirm the
    # intended reference file before running.
    reference_nc_filepath = data_dir / "interim/reanalysis-era5-land_interim/.nc"
    da = load_reference_nc(reference_nc_filepath)

    # turn the shapefile into a categorical variable (like landcover)
    # BUGFIX: `lookup_colname` was undefined; "station_id"/"ID_STRING"
    # match the usage in scripts/drafts/cut_basins_from_soil_moisture.py
    shp_to_nc = SHPtoXarray()
    ds = shp_to_nc.shapefile_to_xarray(
        da=da,
        shp_path=shp_filepath,
        var_name="station_id",
        lookup_colname="ID_STRING",
    )
    # BUGFIX: previously the function ended with `pass`, discarding `ds`
    # despite the declared `-> xr.Dataset` return type
    return ds


def export_preprocess_one_year(
    year: int,
    variable: str,
    cleanup: bool = False,
    subset_str: str = "great_britain",
) -> None:
    """Download one year of hourly ERA5-Land data and preprocess to daily.

    Args:
        year: calendar year to download and process.
        variable: ERA5-Land variable name
            (e.g. "volumetric_soil_water_layer_1").
        cleanup: if True, delete the raw hourly netCDF after the
            preprocessed file has been verified to exist.
        subset_str: region bounding-box key (default "great_britain").
    """
    # Download ERA5-Land = HOURLY
    export_era5_land(
        region_str=subset_str,
        years=[year],
        variables=[variable],
        granularity="hourly",
    )
    # Preprocess ERA5-Land -> DAILY
    # NOTE: cleanup is deliberately False here — the raw file is asserted
    # to exist below, then removed by this function itself if requested.
    process_era5_land(
        subset_str=subset_str,
        monmean=False,
        resample_time="D",
        years=[year],
        cleanup=False,
        with_merge=False,
        resample_before_merge=True,
    )

    # -- Check that files correctly exported/processed -- #
    data_dir = get_data_path()
    raw_nc_file = data_dir / f"raw/reanalysis-era5-land/{variable}/{year}/01_12.nc"
    # has the raw file been downloaded?
    assert raw_nc_file.exists()
    # has the preprocessed file been created?
    # BUGFIX: the filename previously hard-coded "great_britain", ignoring
    # the subset_str parameter
    fname = f"{year}_01_12_{variable}_{subset_str}.nc"
    assert (data_dir / f"interim/reanalysis-era5-land_interim/{fname}").exists()

    # -- Remove the raw file -- #
    if cleanup:
        # delete the raw hourly file
        raw_nc_file.unlink()
        print(f"Removed File: {raw_nc_file}")

    print(f"\n-- Downloaded and preprocessed {variable} {year} --\n")


def merge_files(variable: str, subset_str: str = "great_britain") -> None:
    """Merge the per-year daily interim files for ``variable`` into one
    netCDF, then archive the interim files to an ``_OLD`` directory.

    Args:
        variable: ERA5-Land variable whose interim files are merged.
        subset_str: region key appended to the output filename.
    """
    data_dir = get_data_path()
    processor = ERA5LandPreprocessor(data_dir)
    filename = (
        f'{variable}_data{"_" + subset_str if subset_str is not None else ""}.nc'
    )

    processor.merge_files(
        subset_str=subset_str,
        resample_time="D",
        upsampling=False,
        filename=filename,
    )

    # move all of the interim files out of the way so the next variable
    # starts from an empty interim directory
    interim_dir = data_dir / "interim/reanalysis-era5-land_interim"
    old_dir = data_dir / "interim/reanalysis-era5-land_OLD"
    # BUGFIX: previously `to_paths[0].parents[0].mkdir(...)` raised
    # IndexError when no interim files existed; create the directory
    # unconditionally instead
    old_dir.mkdir(exist_ok=True, parents=True)

    for from_path in interim_dir.glob("*.nc"):
        _rename_directory(
            from_path=from_path,
            to_path=old_dir / from_path.name,
        )


if __name__ == "__main__":
    subset_str = "great_britain"
    variables = [
        # "volumetric_soil_water_layer_1",
        "volumetric_soil_water_layer_2",
        "volumetric_soil_water_layer_3",
        "volumetric_soil_water_layer_4",
    ]
    # years = np.arange(2004, 2016)
    years = np.arange(1993, 2021)

    # Due to memory constraints process hourly data into daily
    # after every Variable/Year then merge all of the variable files
    for variable, year in product(variables, years):
        export_preprocess_one_year(year=year, variable=variable, cleanup=True)

    # merge all of these daily files into one NETCDF file
    # merge_files(variable, subset_str=subset_str)

    # Do we need to unlink the interim files ???

    # Extract time series for each basin (defined in shapefile)
    # TODO: need to get this working
    # extract_time_series_of_soil_moisture()
32 changes: 19 additions & 13 deletions scripts/export.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import numpy as np
from typing import List

sys.path.append("..")
from src.exporters import (
Expand Down Expand Up @@ -73,26 +74,30 @@ def export_era5(region_str="kenya"):
exporter.export(variable=variable, granularity="monthly", region_str=region_str)


def export_era5_land(
    region_str="kenya",
    granularity="monthly",
    years=None,
    variables=None,
):
    """Export ERA5-Land data from the Climate Data Store.

    Args:
        region_str: bounding-box key (default "kenya").
        granularity: "monthly" or "hourly".
        years: iterable of years to request; defaults to 2000-2020.
        variables: ERA5-Land variable names; defaults to the four
            volumetric soil water layers.
    """
    # BUGFIX: np.arange(...) / list literals as default arguments are
    # mutable objects shared across calls; use None sentinels instead.
    if years is None:
        years = np.arange(2000, 2021)
    if variables is None:
        variables = [
            # "total_precipitation",
            # "2m_temperature",
            # "evapotranspiration",
            # "potential_evaporation",
            "volumetric_soil_water_layer_1",
            "volumetric_soil_water_layer_2",
            "volumetric_soil_water_layer_3",
            "volumetric_soil_water_layer_4",
        ]

    exporter = ERA5LandExporter(get_data_path())

    for variable in variables:
        exporter.export(
            variable=variable,
            break_up="yearly",
            region_str=region_str,
            granularity=granularity,
            selection_request=dict(year=years),
        )


Expand Down Expand Up @@ -190,7 +195,8 @@ def export_boku_ndvi():

if __name__ == "__main__":
print(f"Writing data to: {get_data_path()}")
export_era5_land(region_str="india")
region_str = "great_britain"
export_era5_land(region_str=region_str, granularity="hourly", years=np.arange(1970, 2016))
# export_era5(region_str="kenya")
# export_vhi()
# export_chirps()
Expand Down
31 changes: 22 additions & 9 deletions scripts/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ def process_era5_land(
variables: Optional[Union[List, str]] = None,
subset_str: str = "kenya",
monmean: bool = True,
resample_time: Optional[str] = "M",
years: Optional[List[int]] = None,
cleanup: bool = False,
with_merge: bool = True,
resample_before_merge: bool = False,
):
data_path = get_data_path()

Expand Down Expand Up @@ -111,10 +116,14 @@ def process_era5_land(
for variable in variables:
processor.preprocess(
subset_str=subset_str,
regrid=None,
resample_time="M",
regrid=regrid_path,
resample_time=resample_time,
upsampling=False,
variable=variable,
years=years,
cleanup=cleanup,
with_merge=with_merge,
resample_before_merge=resample_before_merge,
)


Expand All @@ -127,7 +136,7 @@ def process_gleam(subset_str: str = "kenya"):
assert regrid_path.exists(), f"{regrid_path} not available"


def process_gleam():
def process_gleam(resample_time: Optional[str] = "M"):
# if the working directory is alread ml_drought don't need ../data
if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
data_path = Path("data")
Expand All @@ -142,7 +151,10 @@ def process_gleam():
processor = GLEAMPreprocessor(data_path)

processor.preprocess(
subset_str=subset_str, regrid=regrid_path, resample_time="M", upsampling=False
subset_str=subset_str,
regrid=regrid_path,
resample_time=resample_time,
upsampling=False,
)


Expand Down Expand Up @@ -288,15 +300,16 @@ def preprocess_s5_ouce():


if __name__ == "__main__":
subset_str = "india"
subset_str = "great_britain"
# preprocess_era5(subset_str=subset_str)
process_era5_land(
subset_str=subset_str,
variables=[
"volumetric_soil_water_layer_1",
"potential_evaporation",
], #  total_precipitation 2m_temperature evapotranspiration
monmean=False,
resample_time="D",
years=[2000],
with_merge=False,
cleanup=False,
resample_before_merge=True,
)
# process_vci(subset_str=subset_str)
# process_precip_2018(subset_str=subset_str)
Expand Down
6 changes: 6 additions & 0 deletions src/exporters/era5_land.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ class ERA5LandExporter(CDSExporter):
dataset = "reanalysis-era5-land-monthly-means"
granularity = "monthly"

def __post_init__(self):
assert self.granularity in [
"hourly",
"monthly",
], "Only support two granularities: [ hourly monthly ]"

@staticmethod
def print_valid_vars():
print(VALID_ERA5_LAND_VARS)
Expand Down
Loading