diff --git a/README.md b/README.md index 0c2f75d..38096c3 100644 --- a/README.md +++ b/README.md @@ -1 +1,32 @@ -# data-loader \ No newline at end of file +# ClimateBenchPress Data Loader + +This repository contains the code to download the datasets for the ClimateBenchPress compression benchmark. + +## Getting Started + +This project uses the uv package manager to handle dependencies. If you don't already have it installed, follow the instructions at [https://docs.astral.sh/uv/getting-started/installation/](https://docs.astral.sh/uv/getting-started/installation/). + +Next, clone this repository and within the project directory install all the necessary dependencies with: +```bash +uv sync +uv pip install -e . +``` + +## Downloading the Data + +To download all the data used for the benchmark, run the following commands: +```bash +uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci +uv run python -m climatebenchpress.data_loader.datasets.cams +uv run python -m climatebenchpress.data_loader.datasets.era5 +uv run python -m climatebenchpress.data_loader.datasets.nextgems +uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_atmos +uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ocean +``` +This will download the data into a sub-directory named `datasets` within this repository. If you want to store the data in a different directory, you can use the `--basepath=${path/to/dir}` command line argument for the scripts, which will store the data at `${path/to/dir}/datasets` instead. + +## Funding + +ClimateBenchPress has been developed as part of [Embed2Scale](https://embed2scale.eu/) and [ESiWACE3](https://www.esiwace.eu/). + +Funded by the European Union. This work has received funding from the European High Performance Computing Joint Undertaking (JU) under grant agreement No 101093054 and the EU’s Horizon Europe program under grant agreement number 101131841. 
This work also received funding from [UK Research and Innovation (UKRI)](https://www.ukri.org/). \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 0c2f75d..e9047a6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1 +1,3 @@ -# data-loader \ No newline at end of file +{% + include-markdown "../README.md" +%} \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index f5663fd..1886a83 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,6 +2,7 @@ mkdocs mkdocstrings[python] mkdocs-exclude mkdocstrings-python-generator==1.0.0rc1 +mkdocs-include-markdown-plugin Pygments diff --git a/mkdocs.yml b/mkdocs.yml index 49c39cd..6c01157 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -34,13 +34,14 @@ plugins: - exclude: glob: - requirements.txt + - include-markdown - search - autorefs - mkdocstrings-python-generator: source_dirs: - nav_heading: [Documentation] base: src - ignore: ["cf.py", "download.py"] + ignore: ["cf.py", "download.py", "canon.py", "monitor.py", "all.py"] - mkdocstrings: enable_inventory: true handlers: @@ -49,7 +50,15 @@ plugins: docstring_section_style: list docstring_style: numpy show_if_no_docstring: true - filters: ["!^_$", "!^_[^_]", "!^__", "__init__", "!^cf$", "!^download$"] + filters: [ + "!^_$", + "!^_[^_]", + "!^__", + "__init__", + "!^cf$", + "!^registry$", + "!^get_stores$", + ] members_order: source group_by_category: false show_source: false diff --git a/src/climatebenchpress/data_loader/__init__.py b/src/climatebenchpress/data_loader/__init__.py index 7a98367..80a1a35 100644 --- a/src/climatebenchpress/data_loader/__init__.py +++ b/src/climatebenchpress/data_loader/__init__.py @@ -19,6 +19,22 @@ def open_downloaded_canonicalized_dataset( basepath: Path = Path(), progress: bool = True, ) -> xr.Dataset: + """Download a given dataset and canonicalize it, i.e. ensure that all the axis names are consistent between different datasets. 
+ + Parameters + ---------- + cls : type[Dataset] + The dataset class to download and open + basepath : Path, optional + The base path where the dataset should be stored, by default Path() + progress : bool, optional + Whether to show a progress bar during the download, by default True + + Returns + ------- + xr.Dataset + The canonicalized dataset as an xarray Dataset + """ datasets = basepath / "datasets" download = datasets / cls.name / "download" @@ -46,6 +62,26 @@ def open_downloaded_tiny_canonicalized_dataset( progress: bool = True, slices: Optional[dict[str, slice]] = None, ) -> xr.Dataset: + """Same as `open_downloaded_canonicalized_dataset`, but returns a subset of the dataset. + + These tiny datasets are mainly used for testing purposes. + + Parameters + ---------- + cls : type[Dataset] + The dataset class to download and open + basepath : Path, optional + The base path where the dataset should be stored, by default Path() + progress : bool, optional + Whether to show a progress bar during the download, by default True + slices : Optional[dict[str, slice]], optional + A dictionary of slices to apply to the dataset, by default None + + Returns + ------- + xr.Dataset + The canonicalized tiny dataset as an xarray Dataset + """ datasets = basepath / "datasets" download = datasets / f"{cls.name}" / "download" diff --git a/src/climatebenchpress/data_loader/datasets/abc.py b/src/climatebenchpress/data_loader/datasets/abc.py index 0075a25..6b83387 100644 --- a/src/climatebenchpress/data_loader/datasets/abc.py +++ b/src/climatebenchpress/data_loader/datasets/abc.py @@ -11,17 +11,50 @@ class Dataset(ABC): - # Abstract interface, must be implemented by sublcasses + """Abstract base class for datasets. + + Each dataset has a unique name associated + with it that will be used to name the directory where the dataset is stored. 
+ + The dataset should implement the `download` and `open` methods to handle downloading + the dataset (in whatever data format the original data comes) and opening it as an xarray + Dataset, respectively. + """ + name: str @staticmethod @abstractmethod def download(download_path: Path, progress: bool = True): + """Download the dataset to the specified path. The download function is responsible + for checking whether the download is complete or not. If the previous download was + interrupted, it will resume the download. If the download is complete, it will skip + the download. + + Parameters + ---------- + download_path : Path + The path where the dataset should be downloaded to + progress : bool, optional + Whether to show a progress bar during the download, by default True + """ pass @staticmethod @abstractmethod def open(download_path: Path) -> xr.Dataset: + """Open the dataset from the specified path as an xarray Dataset. + + Parameters + ---------- + download_path : Path + The path where the dataset is stored + + Returns + ------- + xr.Dataset + The dataset as an xarray Dataset + """ pass # Class interface diff --git a/src/climatebenchpress/data_loader/datasets/cams.py b/src/climatebenchpress/data_loader/datasets/cams.py index 12b3fbc..a176c8a 100644 --- a/src/climatebenchpress/data_loader/datasets/cams.py +++ b/src/climatebenchpress/data_loader/datasets/cams.py @@ -1,8 +1,8 @@ __all__ = ["CamsNitrogenDioxideDataset"] +import argparse import logging from pathlib import Path -import argparse import xarray as xr @@ -18,6 +18,13 @@ class CamsNitrogenDioxideDataset(Dataset): + """Dataset for CAMS Nitrogen Dioxide data. + + The dataset comes from the + [Copernicus Atmosphere Monitoring Service (CAMS)](https://atmosphere.copernicus.eu/). + This particular class downloads Nitrogen Dioxide reanalysis data. 
+ """ + name = "cams-nitrogen-dioxide" @staticmethod diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/abc.py b/src/climatebenchpress/data_loader/datasets/cmip6/abc.py index fc9ed38..f4c8a31 100644 --- a/src/climatebenchpress/data_loader/datasets/cmip6/abc.py +++ b/src/climatebenchpress/data_loader/datasets/cmip6/abc.py @@ -17,6 +17,12 @@ class Cmip6Dataset(Dataset): + """Abstract base class for CMIP6 datasets. + + Defines some shared functionality for downloading and opening CMIP6 datasets. + All data is downloaded as Zarr files from the [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/noaa-public/cmip6?inv=1&invt=Ab4N6A). + """ + model_id: str ssp_id: str variable_id: str diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/access_atmos.py b/src/climatebenchpress/data_loader/datasets/cmip6/access_atmos.py index 559f06a..fda28d4 100644 --- a/src/climatebenchpress/data_loader/datasets/cmip6/access_atmos.py +++ b/src/climatebenchpress/data_loader/datasets/cmip6/access_atmos.py @@ -1,7 +1,7 @@ __all__ = ["Cmip6AtmosphereAccessDataset"] -from pathlib import Path import argparse +from pathlib import Path from ... import ( open_downloaded_canonicalized_dataset, @@ -11,6 +11,8 @@ class Cmip6AtmosphereAccessDataset(Cmip6AtmosphereDataset): + """Air temperature dataset from the CMIP6 ACCESS-ESM1-5 model under the SSP585 scenario.""" + name = "cmip6-access-ta" model_id = "ACCESS-ESM1-5" diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/access_ocean.py b/src/climatebenchpress/data_loader/datasets/cmip6/access_ocean.py index 4dd40f6..95aee04 100644 --- a/src/climatebenchpress/data_loader/datasets/cmip6/access_ocean.py +++ b/src/climatebenchpress/data_loader/datasets/cmip6/access_ocean.py @@ -1,7 +1,7 @@ __all__ = ["Cmip6OceanAccessDataset"] -from pathlib import Path import argparse +from pathlib import Path from ... 
import ( open_downloaded_canonicalized_dataset, @@ -11,6 +11,8 @@ class Cmip6OceanAccessDataset(Cmip6OceanDataset): + """Sea surface temperature dataset from the CMIP6 ACCESS-ESM1-5 model under the SSP585 scenario.""" + name = "cmip6-access-tos" model_id = "ACCESS-ESM1-5" diff --git a/src/climatebenchpress/data_loader/datasets/era5.py b/src/climatebenchpress/data_loader/datasets/era5.py index 2570694..8de8583 100644 --- a/src/climatebenchpress/data_loader/datasets/era5.py +++ b/src/climatebenchpress/data_loader/datasets/era5.py @@ -1,7 +1,7 @@ __all__ = ["Era5Dataset"] -from pathlib import Path import argparse +from pathlib import Path import xarray as xr @@ -16,6 +16,15 @@ class Era5Dataset(Dataset): + """ERA5 reanalysis dataset. + + This dataset accesses the cloud optimized ERA5 reanalysis data published on the + Google Cloud Platform. See [https://github.com/google-research/arco-era5](https://github.com/google-research/arco-era5) + for more details. + + The original ERA5 dataset is generated and published by [ECMWF](https://www.ecmwf.int/). + """ + name = "era5" @staticmethod diff --git a/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py b/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py index c97d3a7..f05a887 100644 --- a/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py +++ b/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py @@ -1,8 +1,8 @@ __all__ = ["EsaBiomassCciDataset"] +import argparse import logging from pathlib import Path -import argparse import xarray as xr @@ -24,6 +24,13 @@ class EsaBiomassCciDataset(Dataset): + """ESA Biomass CCI dataset. + + This dataset provides above-ground biomass (AGB) estimates for the year 2020. + The data is provided by the European Space Agency's Climate Change Initiative (CCI). + For more details see [https://climate.esa.int/en/projects/biomass/](https://climate.esa.int/en/projects/biomass/). 
+ """ + name = "esa-biomass-cci" @staticmethod diff --git a/src/climatebenchpress/data_loader/datasets/nextgems.py b/src/climatebenchpress/data_loader/datasets/nextgems.py index bc8a3f0..631f17c 100644 --- a/src/climatebenchpress/data_loader/datasets/nextgems.py +++ b/src/climatebenchpress/data_loader/datasets/nextgems.py @@ -1,7 +1,7 @@ __all__ = ["NextGemsDataset"] -from pathlib import Path import argparse +from pathlib import Path import healpy import intake @@ -31,6 +31,15 @@ class NextGemsDataset(Dataset): + """NextGEMS ICON dataset. + + This dataset provides model output from ICON climate model runs, published as + part of the [NextGEMS project](https://nextgems-h2020.eu/). + + See [https://easy.gems.dkrz.de/index.html](https://easy.gems.dkrz.de/index.html) + for more details on the NextGEMS data. + """ + name = "nextgems-icon" @staticmethod