ClimateBenchPress · treigerm · Jul 31, 2025 · Jul 31, 2025 · Jul 31, 2025 · Jul 31, 2025
diff --git a/README.md b/README.md
@@ -1 +1,32 @@
-# data-loader
+# ClimateBenchPress Data Loader
+
+This repository contains the code to download the datasets for the ClimateBenchPress compression benchmark. 
+
+## Getting Started
+
+This project uses the uv package manager to handle dependencies. If you don't already have it installed, follow the instructions at [https://docs.astral.sh/uv/getting-started/installation/](https://docs.astral.sh/uv/getting-started/installation/).
+
+Next, clone this repository and within the project directory install all the necessary dependencies with:
+```bash
+uv sync
+uv pip install -e .
+```
+
+## Downloading the Data
+
+To download all the data used for the benchmark, run the following commands:
+```bash
+uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci
+uv run python -m climatebenchpress.data_loader.datasets.cams
+uv run python -m climatebenchpress.data_loader.datasets.era5
+uv run python -m climatebenchpress.data_loader.datasets.nextgems
+uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_atmos
+uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ocean
+```
+This will download the data into a sub-directory named `datasets` within this repository. If you want to store the data in a different directory, pass the `--basepath=${path/to/dir}` command line argument to the scripts; the data will then be stored at `${path/to/dir}/datasets` instead.
+
+## Funding 
+
+ClimateBenchPress has been developed as part of [Embed2Scale](https://embed2scale.eu/) and [ESiWACE3](https://www.esiwace.eu/).
+
+Funded by the European Union. This work has received funding from the European High Performance Computing Joint Undertaking (JU) under grant agreement No 101093054 and EU’s Horizon Europe program under grant agreement number 101131841. This work also received funding from [UK Research and Innovation (UKRI)](https://www.ukri.org/).
diff --git a/docs/index.md b/docs/index.md
@@ -1 +1,3 @@
-# data-loader
+{%
+    include-markdown "../README.md"
+%}
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -2,6 +2,7 @@ mkdocs
 mkdocstrings[python]
 mkdocs-exclude
 mkdocstrings-python-generator==1.0.0rc1
+mkdocs-include-markdown-plugin
 
 Pygments
 

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -34,13 +34,14 @@ plugins:
   - exclude:
       glob:
         - requirements.txt
+  - include-markdown
   - search
   - autorefs
   - mkdocstrings-python-generator:
       source_dirs:
         - nav_heading: [Documentation]
           base: src
-          ignore: ["cf.py", "download.py"]
+          ignore: ["cf.py", "download.py", "canon.py", "monitor.py", "all.py"]
   - mkdocstrings:
       enable_inventory: true
       handlers:
@@ -49,7 +50,15 @@ plugins:
             docstring_section_style: list
             docstring_style: numpy
             show_if_no_docstring: true
-            filters: ["!^_$", "!^_[^_]", "!^__", "__init__", "!^cf$", "!^download$"]
+            filters: [
+              "!^_$", 
+              "!^_[^_]", 
+              "!^__", 
+              "__init__", 
+              "!^cf$", 
+              "!^registry$",
+              "!^get_stores$",
+            ]
             members_order: source
             group_by_category: false
             show_source: false

diff --git a/src/climatebenchpress/data_loader/__init__.py b/src/climatebenchpress/data_loader/__init__.py
@@ -19,6 +19,22 @@ def open_downloaded_canonicalized_dataset(
     basepath: Path = Path(),
     progress: bool = True,
 ) -> xr.Dataset:
+    """Download a given dataset and canonicalize it, i.e. ensure that all the axis names are consistent between different datasets.
+
+    Parameters
+    ----------
+    cls : type[Dataset]
+        The dataset class to download and open
+    basepath : Path, optional
+        The base path where the dataset should be stored, by default Path()
+    progress : bool, optional
+        Whether to show a progress bar during the download, by default True
+
+    Returns
+    -------
+    xr.Dataset
+        The canonicalized dataset as an xarray Dataset
+    """
     datasets = basepath / "datasets"
 
     download = datasets / cls.name / "download"
@@ -46,6 +62,26 @@ def open_downloaded_tiny_canonicalized_dataset(
     progress: bool = True,
     slices: Optional[dict[str, slice]] = None,
 ) -> xr.Dataset:
+    """Same as `open_downloaded_canonicalized_dataset`, but returns a subset of the dataset.
+
+    These tiny datasets are mainly used for testing purposes.
+
+    Parameters
+    ----------
+    cls : type[Dataset]
+        The dataset class to download and open
+    basepath : Path, optional
+        The base path where the dataset should be stored, by default Path()
+    progress : bool, optional
+        Whether to show a progress bar during the download, by default True
+    slices : Optional[dict[str, slice]], optional
+        A dictionary of slices to apply to the dataset, by default None
+
+    Returns
+    -------
+    xr.Dataset
+        The canonicalized tiny dataset as an xarray Dataset
+    """
     datasets = basepath / "datasets"
 
     download = datasets / f"{cls.name}" / "download"

diff --git a/src/climatebenchpress/data_loader/datasets/abc.py b/src/climatebenchpress/data_loader/datasets/abc.py
@@ -11,17 +11,50 @@
 
 
 class Dataset(ABC):
-    # Abstract interface, must be implemented by sublcasses
+    """Abstract base class for datasets.
+
+    Each dataset has a unique name associated
+    with it that will be used to name the directory where the dataset is stored.
+
+    The dataset should implement the `download` and `open` methods to handle downloading
+    the dataset (in whatever data format the original data comes) and opening it as an xarray
+    Dataset, respectively.
+    """
+
     name: str
 
     @staticmethod
     @abstractmethod
     def download(download_path: Path, progress: bool = True):
+        """Download the dataset to the specified path. The download function is responsible
+        for checking whether the download is complete or not. If the previous download was
+        interrupted, it will resume the download. If the download is complete, it will skip
+        the download.
+
+        Parameters
+        ----------
+        download_path : Path
+            The path where the dataset should be downloaded to
+        progress : bool, optional
+            Whether to show a progress bar during the download, by default True
+        """
         pass
 
     @staticmethod
     @abstractmethod
     def open(download_path: Path) -> xr.Dataset:
+        """Open the dataset from the specified path as an xarray Dataset.
+
+        Parameters
+        ----------
+        download_path : Path
+            The path where the dataset is stored
+
+        Returns
+        -------
+        xr.Dataset
+            The dataset as an xarray Dataset
+        """
         pass
 
     # Class interface

diff --git a/src/climatebenchpress/data_loader/datasets/cams.py b/src/climatebenchpress/data_loader/datasets/cams.py
@@ -1,8 +1,8 @@
 __all__ = ["CamsNitrogenDioxideDataset"]
 
+import argparse
 import logging
 from pathlib import Path
-import argparse
 
 import xarray as xr
 
@@ -18,6 +18,13 @@
 
 
 class CamsNitrogenDioxideDataset(Dataset):
+    """Dataset for CAMS Nitrogen Dioxide data.
+
+    The dataset comes from the
+    [Copernicus Atmosphere Monitoring Service (CAMS)](https://atmosphere.copernicus.eu/).
+    This particular class downloads Nitrogen Dioxide reanalysis data.
+    """
+
     name = "cams-nitrogen-dioxide"
 
     @staticmethod

diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/abc.py b/src/climatebenchpress/data_loader/datasets/cmip6/abc.py
@@ -17,6 +17,12 @@
 
 
 class Cmip6Dataset(Dataset):
+    """Abstract base class for CMIP6 datasets.
+
+    Defines some shared functionality for downloading and opening CMIP6 datasets.
+    All data is downloaded as Zarr files from the [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/noaa-public/cmip6?inv=1&invt=Ab4N6A).
+    """
+
     model_id: str
     ssp_id: str
     variable_id: str

diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/access_atmos.py b/src/climatebenchpress/data_loader/datasets/cmip6/access_atmos.py
@@ -1,7 +1,7 @@
 __all__ = ["Cmip6AtmosphereAccessDataset"]
 
-from pathlib import Path
 import argparse
+from pathlib import Path
 
 from ... import (
     open_downloaded_canonicalized_dataset,
@@ -11,6 +11,8 @@
 
 
 class Cmip6AtmosphereAccessDataset(Cmip6AtmosphereDataset):
+    """Air temperature dataset from the CMIP6 ACCESS-ESM1-5 model under the SSP585 scenario."""
+
     name = "cmip6-access-ta"
 
     model_id = "ACCESS-ESM1-5"

diff --git a/src/climatebenchpress/data_loader/datasets/cmip6/access_ocean.py b/src/climatebenchpress/data_loader/datasets/cmip6/access_ocean.py
@@ -1,7 +1,7 @@
 __all__ = ["Cmip6OceanAccessDataset"]
 
-from pathlib import Path
 import argparse
+from pathlib import Path
 
 from ... import (
     open_downloaded_canonicalized_dataset,
@@ -11,6 +11,8 @@
 
 
 class Cmip6OceanAccessDataset(Cmip6OceanDataset):
+    """Sea surface temperature dataset from the CMIP6 ACCESS-ESM1-5 model under the SSP585 scenario."""
+
     name = "cmip6-access-tos"
 
     model_id = "ACCESS-ESM1-5"

diff --git a/src/climatebenchpress/data_loader/datasets/era5.py b/src/climatebenchpress/data_loader/datasets/era5.py
@@ -1,7 +1,7 @@
 __all__ = ["Era5Dataset"]
 
-from pathlib import Path
 import argparse
+from pathlib import Path
 
 import xarray as xr
 
@@ -16,6 +16,15 @@
 
 
 class Era5Dataset(Dataset):
+    """ERA5 reanalysis dataset.
+
+    This dataset accesses the cloud optimized ERA5 reanalysis data published on the
+    Google Cloud Platform. See [https://github.com/google-research/arco-era5](https://github.com/google-research/arco-era5)
+    for more details.
+
+    The original ERA5 dataset is generated and published by [ECMWF](https://www.ecmwf.int/).
+    """
+
     name = "era5"
 
     @staticmethod

diff --git a/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py b/src/climatebenchpress/data_loader/datasets/esa_biomass_cci.py
@@ -1,8 +1,8 @@
 __all__ = ["EsaBiomassCciDataset"]
 
+import argparse
 import logging
 from pathlib import Path
-import argparse
 
 import xarray as xr
 
@@ -24,6 +24,13 @@
 
 
 class EsaBiomassCciDataset(Dataset):
+    """ESA Biomass CCI dataset.
+
+    This dataset provides above-ground biomass (AGB) estimates for the year 2020.
+    The data is provided by the European Space Agency's Climate Change Initiative (CCI).
+    For more details see [https://climate.esa.int/en/projects/biomass/](https://climate.esa.int/en/projects/biomass/).
+    """
+
     name = "esa-biomass-cci"
 
     @staticmethod

diff --git a/src/climatebenchpress/data_loader/datasets/nextgems.py b/src/climatebenchpress/data_loader/datasets/nextgems.py
@@ -1,7 +1,7 @@
 __all__ = ["NextGemsDataset"]
 
-from pathlib import Path
 import argparse
+from pathlib import Path
 
 import healpy
 import intake
@@ -31,6 +31,15 @@
 
 
 class NextGemsDataset(Dataset):
+    """NextGEMS ICON dataset.
+
+    This dataset provides model output from ICON climate model runs, published as
+    part of the [NextGEMS project](https://nextgems-h2020.eu/).
+
+    See [https://easy.gems.dkrz.de/index.html](https://easy.gems.dkrz.de/index.html)
+    for more details on the NextGEMS data.
+    """
+
     name = "nextgems-icon"
 
     @staticmethod