diff --git a/src/climatebenchpress/data_loader/datasets/cams.py b/src/climatebenchpress/data_loader/datasets/cams.py
index c137993..57d273f 100644
--- a/src/climatebenchpress/data_loader/datasets/cams.py
+++ b/src/climatebenchpress/data_loader/datasets/cams.py
@@ -34,6 +34,10 @@ def download(download_path: Path, progress: bool = True):
     @staticmethod
     def open(download_path: Path) -> xr.Dataset:
         ds = xr.open_dataset(download_path / Path(NO2_FILE).name)
+
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = ds.sel(valid_time=slice("2023-06-15", "2023-06-15")).chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.longitude.attrs["axis"] = "X"
         ds.latitude.attrs["axis"] = "Y"
diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/abc.py b/src/climatebenchpress/data_loader/datasets/cmip6/abc.py
index b42855d..fc9ed38 100644
--- a/src/climatebenchpress/data_loader/datasets/cmip6/abc.py
+++ b/src/climatebenchpress/data_loader/datasets/cmip6/abc.py
@@ -48,8 +48,13 @@ def download_with(
         zstore = zstore.replace("gs://", "https://storage.googleapis.com/")
 
         ds = xr.open_zarr(fsspec.get_mapper(zstore), consolidated=True)
+        # Only select the year 2020 for the dataset. The exact choice of this
+        # year is arbitrary.
+        # .chunk(-1) ensures that we use only a single chunk for the entire dataset.
+        ds = ds.sel(time=slice("2020", "2020")).chunk(-1)
         if variable_selector is not None:
             ds = ds[variable_selector]
+
         with monitor.progress_bar(progress):
             ds.to_zarr(downloadfile, mode="w", encoding=dict(), compute=False).compute()
diff --git a/src/climatebenchpress/data_loader/datasets/era5.py b/src/climatebenchpress/data_loader/datasets/era5.py
index 2fbdc93..d9d0d88 100644
--- a/src/climatebenchpress/data_loader/datasets/era5.py
+++ b/src/climatebenchpress/data_loader/datasets/era5.py
@@ -26,13 +26,15 @@ def download(download_path: Path, progress: bool = True):
         era5 = xr.open_zarr(ERA5_GCP_PATH, chunks={"time": 48}, consolidated=True)
 
-        ds = era5.sel(time=slice("2020-03-01", "2020-03-07"))[
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = era5.sel(time=slice("2020-03-01", "2020-03-01"))[
             [
                 "mean_sea_level_pressure",
                 "10m_u_component_of_wind",
                 "10m_v_component_of_wind",
             ]
-        ]
+        ].chunk(-1)
         # Needed to make the dataset CF-compliant.
         ds.time.attrs["standard_name"] = "time"
         ds.longitude.attrs["axis"] = "X"
diff --git a/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py b/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py
index f86aa04..8120322 100644
--- a/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py
+++ b/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py
@@ -13,8 +13,13 @@ from .abc import Dataset
 
 NUM_RETRIES = 3
 
-# Bounding box for an area in mainland France
-FRANCE_BBOX = {"T": slice(0, 1), "X": slice(202531, 207531), "Y": slice(35469, 40469)}
+
+# Rough bounding box coordinates for mainland France.
+# Format: [min_longitude, min_latitude, max_longitude, max_latitude].
+FRANCE_BBOX = [-5.5, 42.3, 9.6, 51.1]
+
+# Biomass estimate for the year 2020.
+BIOMASS_URL = "https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-2020-fv5.01.nc"
 
 
 class EsaBiomassCciDataset(Dataset):
@@ -22,20 +27,14 @@ class EsaBiomassCciDataset(Dataset):
 
     @staticmethod
     def download(download_path: Path, progress: bool = True):
-        urls = [
-            f"https://dap.ceda.ac.uk/neodc/esacci/biomass/data/agb/maps/v5.01/netcdf/ESACCI-BIOMASS-L4-AGB-MERGED-100m-{year}-fv5.01.nc"
-            # Restrict to 2 years for now for smaller download.
-            for year in [2010, 2015]
-        ]
-        for url in urls:
-            output_path = download_path / Path(url).name
-            for _ in range(NUM_RETRIES):
-                success = _download_netcdf(url, output_path, progress)
-                if success:
-                    break
-            if not success:
-                logging.info(f"Failed to download {url}")
-                return
+        output_path = download_path / Path(BIOMASS_URL).name
+        for _ in range(NUM_RETRIES):
+            success = _download_netcdf(BIOMASS_URL, output_path, progress)
+            if success:
+                break
+        if not success:
+            logging.info(f"Failed to download {BIOMASS_URL}")
+            return
 
     @staticmethod
     def open(download_path: Path) -> xr.Dataset:
@@ -44,12 +43,28 @@ def open(download_path: Path) -> xr.Dataset:
         # Needed to make the dataset CF-compliant.
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
+        # We constrain the dataset to mainland France to reduce its overall size.
+        # The global snapshot would be around 20 GB, which is too large for our use case.
+        # We chose France because it should have a fairly diverse set of biomass
+        # estimates, but the choice is ultimately somewhat arbitrary.
+        ds = ds.sel(
+            lon=slice(FRANCE_BBOX[0], FRANCE_BBOX[2]),
+            lat=slice(FRANCE_BBOX[3], FRANCE_BBOX[1]),
+        ).chunk(-1)
         return ds[["agb"]]
 
 
 if __name__ == "__main__":
     ds = open_downloaded_canonicalized_dataset(EsaBiomassCciDataset)
-    open_downloaded_tiny_canonicalized_dataset(EsaBiomassCciDataset, slices=FRANCE_BBOX)
+    num_lon, num_lat = ds.lon.size, ds.lat.size
+    open_downloaded_tiny_canonicalized_dataset(
+        EsaBiomassCciDataset,
+        # Use a smaller spatial subset for the tiny dataset.
+        slices={
+            "X": slice(num_lon // 2, (num_lon // 2) + 500),
+            "Y": slice(num_lat // 2, (num_lat // 2) + 500),
+        },
+    )
     for v, da in ds.items():
         print(f"- {v}: {da.dims}")
diff --git a/src/climatebenchpress/data_loader/datasets/nextgems.py b/src/climatebenchpress/data_loader/datasets/nextgems.py
index c2d4b72..4d1713c 100644
--- a/src/climatebenchpress/data_loader/datasets/nextgems.py
+++ b/src/climatebenchpress/data_loader/datasets/nextgems.py
@@ -44,16 +44,19 @@ def download(download_path: Path, progress: bool = True):
             zoom=ZOOM, time=TIME_RESOLUTION, chunks=dict()
         ).to_dask()
 
-        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-07"))
+        # Restrict data to a single day.
+        # The specific day is arbitrary.
+        ds = icon[[PRECIP_KEY, OLR_KEY]].sel(time=slice("2020-03-01", "2020-03-01"))
 
         # Regrid the data to 0.125 degree resolution.
-        # NOTE: This is using nearest neighbour interpolation. We need to do some
-        #       quality checks to ensure we don't get any significant aliasing
-        #       artifacts as the result of interpolation. For more details:
-        #       https://easy.gems.dkrz.de/Processing/healpix/lonlat_remap.html.
+        # NOTE:
+        # This uses nearest-neighbour interpolation. The choice of interpolation
+        # method should not have a drastic effect on the intercomparison of
+        # different compressors. However, this should be studied in more detail,
+        # since regridding can often have unforeseen consequences.
         idx = _get_nn_lon_lat_index(
             2**ZOOM, np.linspace(-180, 180, NUM_LON), np.linspace(-90, 90, NUM_LAT)
         )
-        ds = ds.isel(cell=idx).chunk({"time": 1, "lat": NUM_LAT, "lon": NUM_LON})
+        ds = ds.isel(cell=idx).chunk(-1)
         ds.lon.attrs["axis"] = "X"
         ds.lat.attrs["axis"] = "Y"
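
For reference, the changes above lean on two xarray idioms that are easy to misread: slicing a latitude coordinate that is stored in descending order (hence lat=slice(FRANCE_BBOX[3], FRANCE_BBOX[1]) in esa_biomass_cci.py), and .chunk(-1) to collapse each variable into a single Dask chunk. The standalone sketch below demonstrates both on a synthetic dataset; it is illustrative only and not part of the repository. The toy grid and the bbox name are assumptions here; only the "agb" variable name and the FRANCE_BBOX values come from the diff.

    import numpy as np
    import xarray as xr

    # Toy stand-in for the ESA Biomass CCI grid: latitude is stored in
    # descending order, as in the real file. The grid itself is made up.
    lat = np.linspace(90.0, -90.0, 181)
    lon = np.linspace(-180.0, 180.0, 361)
    ds = xr.Dataset(
        {"agb": (("lat", "lon"), np.random.rand(lat.size, lon.size))},
        coords={"lat": lat, "lon": lon},
    )

    # [min_longitude, min_latitude, max_longitude, max_latitude].
    bbox = [-5.5, 42.3, 9.6, 51.1]
    # Because lat decreases along its axis, the slice must run from the
    # maximum to the minimum latitude to return a non-empty range.
    subset = ds.sel(lon=slice(bbox[0], bbox[2]), lat=slice(bbox[3], bbox[1]))

    # .chunk(-1) puts each variable into a single Dask chunk per dimension,
    # matching the single-chunk layout the diff enforces for every dataset.
    subset = subset.chunk(-1)
    assert all(len(blocks) == 1 for blocks in subset.agb.chunks)

Note that .chunk(-1) requires dask to be installed; the selection logic alone works on plain NumPy-backed arrays.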