From 81fb325f8367238c8f6e9b8c0f62883b3f93b064 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Wed, 8 Oct 2025 17:48:50 +0200 Subject: [PATCH 01/14] Remove bf16_mode and bring back autocast --- aurora/model/aurora.py | 68 ++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/aurora/model/aurora.py b/aurora/model/aurora.py index f3e3401..f3ae919 100644 --- a/aurora/model/aurora.py +++ b/aurora/model/aurora.py @@ -145,10 +145,8 @@ def __init__( surf_stats (dict[str, tuple[float, float]], optional): For these surface-level variables, adjust the normalisation to the given tuple consisting of a new location and scale. - bf16_mode (bool, optional): To reduce memory usage, convert the tokens to BF16, run - the backbone in pure BF16, and run the decoder in FP16 AMP. This should enable a - gradient computation. USE AT YOUR OWN RISK. THIS WAS NOT USED DURING THE DEVELOPMENT - OF AURORA AND IS PURELY PROVIDED AS A STARTING POINT FOR FINE-TUNING. + autocast (bool, optional): To reduce memory usage, `torch.autocast` only the backbone + to BF16. This reduced memory usage and is critical to enable fine-tuning. level_condition (tuple[int | float, ...], optional): Make the patch embeddings dependent on pressure level. If you want to enable this feature, provide a tuple of all possible pressure levels. @@ -252,18 +250,16 @@ def __init__( modulation_heads=modulation_heads, ) - if autocast and not bf16_mode: + if bf16_mode and not autocast: warnings.warn( - "The argument `autocast` no longer does anything due to limited utility. " - "Consider instead using `bf16_mode`.", + "`bf16_mode` was removed, because it caused serious issues for gradient " + "computation. `bf16_mode` now automatically activates `autocast`, which will not " + "save as much memory, but should be much more stable.", stacklevel=2, ) + autocast = True - self.bf16_mode = bf16_mode - - if self.bf16_mode: - # We run the backbone in pure BF16. 
- self.backbone.to(torch.bfloat16) + self.autocast = autocast def forward(self, batch: Batch) -> Batch: """Forward pass. @@ -327,44 +323,30 @@ def forward(self, batch: Batch) -> Batch: lead_time=self.timestep, ) - # In BF16 mode, the backbone is run in pure BF16. - if self.bf16_mode: - x = x.to(torch.bfloat16) - x = self.backbone( - x, - lead_time=self.timestep, - patch_res=patch_res, - rollout_step=batch.metadata.rollout_step, - ) - - # In BF16 mode, the decoder is run in AMP PF16, and the output is converted back to FP32. - # We run in PF16 as opposed to BF16 for improved relative precision. - if self.bf16_mode: - device_type = ( - "cuda" - if torch.cuda.is_available() - else "xpu" - if torch.xpu.is_available() - else "cpu" - ) - context = torch.autocast(device_type=device_type, dtype=torch.float16) - x = x.to(torch.float16) + if self.autocast: + if torch.cuda.is_available(): + device_type = "cuda" + elif torch.xpu.is_available(): + device_type = "xpu" + else: + device_type = "cpu" + context = torch.autocast(device_type=device_type, dtype=torch.bfloat16) else: context = contextlib.nullcontext() with context: - pred = self.decoder( + x = self.backbone( x, - batch, lead_time=self.timestep, patch_res=patch_res, + rollout_step=batch.metadata.rollout_step, ) - if self.bf16_mode: - pred = dataclasses.replace( - pred, - surf_vars={k: v.float() for k, v in pred.surf_vars.items()}, - static_vars={k: v.float() for k, v in pred.static_vars.items()}, - atmos_vars={k: v.float() for k, v in pred.atmos_vars.items()}, - ) + + pred = self.decoder( + x, + batch, + lead_time=self.timestep, + patch_res=patch_res, + ) # Remove batch and history dimension from static variables. pred = dataclasses.replace( From 195dfd130f0545f02fe0aa970544e2ef329c27c7 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 16:21:34 +0200 Subject: [PATCH 02/14] Make activation checkpointing configurable Also improve default checkpointing strategy. 
--- .pre-commit-config.yaml | 3 --- aurora/model/aurora.py | 44 ++++++++++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 365fe32..d3f71f4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,9 +2,6 @@ ci: autoupdate_commit_msg: "chore: Update pre-commit hooks" autofix_commit_msg: "style: Pre-commit fixes" -default_language_version: - python: python3.10 - repos: - repo: meta hooks: diff --git a/aurora/model/aurora.py b/aurora/model/aurora.py index f3ae919..ac7fa12 100644 --- a/aurora/model/aurora.py +++ b/aurora/model/aurora.py @@ -502,27 +502,49 @@ def adapt_checkpoint_max_history_size(self, checkpoint: dict[str, torch.Tensor]) checkpoint[name] = new_weight - def configure_activation_checkpointing(self): + def configure_activation_checkpointing( + self, + module_names: tuple[str, ...] = ( + "Basic3DDecoderLayer", + "Basic3DEncoderLayer", + "LinearPatchReconstruction", + "Perceiver3DDecoder", + "Perceiver3DEncoder", + "Swin3DTransformerBackbone", + "Swin3DTransformerBlock", + ), + ) -> None: """Configure activation checkpointing. This is required in order to compute gradients without running out of memory. + + Args: + module_names (tuple[str, ...], optional): Names of the modules to checkpoint + on. + + Raises: + RuntimeError: If any module specifies in `module_names` was not found and + thus could not be checkpointed. 
""" - # Checkpoint these modules: - module_names = ( - "Perceiver3DEncoder", - "Swin3DTransformerBackbone", - "Basic3DEncoderLayer", - "Basic3DDecoderLayer", - "Perceiver3DDecoder", - "LinearPatchReconstruction", - ) + + found: set[str] = set() def check(x: torch.nn.Module) -> bool: name = x.__class__.__name__ - return name in module_names + if name in module_names: + found.add(name) + return True + else: + return False apply_activation_checkpointing(self, check_fn=check) + if found != set(module_names): + raise RuntimeError( + f'Could not checkpoint on the following modules: ' + f'{", ".join(sorted(set(module_names) - found))}.' + ) + class AuroraPretrained(Aurora): """Pretrained version of Aurora.""" From 38d9b7e11e23f1c6e1398bb49bc45d43685b3679 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 16:32:47 +0200 Subject: [PATCH 03/14] Change dropout default and fix bug --- aurora/model/aurora.py | 1 + aurora/model/encoder.py | 4 ++-- aurora/model/swin3d.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/aurora/model/aurora.py b/aurora/model/aurora.py index ac7fa12..475fd35 100644 --- a/aurora/model/aurora.py +++ b/aurora/model/aurora.py @@ -226,6 +226,7 @@ def __init__( embed_dim=embed_dim, mlp_ratio=mlp_ratio, drop_path_rate=drop_path, + attn_drop_rate=drop_rate, drop_rate=drop_rate, use_lora=use_lora, lora_steps=lora_steps, diff --git a/aurora/model/encoder.py b/aurora/model/encoder.py index 84aa1d4..c4d1988 100644 --- a/aurora/model/encoder.py +++ b/aurora/model/encoder.py @@ -41,7 +41,7 @@ def __init__( embed_dim: int = 1024, num_heads: int = 16, head_dim: int = 64, - drop_rate: float = 0.1, + drop_rate: float = 0.0, depth: int = 2, mlp_ratio: float = 4.0, max_history_size: int = 2, @@ -66,7 +66,7 @@ def __init__( Defaults to `16`. head_dim (int, optional): Dimension of attention heads used in aggregation blocks. Defaults to `64`. - drop_rate (float, optional): Drop out rate for input patches. Defaults to `0.1`. 
+ drop_rate (float, optional): Drop out rate for input patches. Defaults to `0.0`. depth (int, optional): Number of Perceiver cross-attention and feed-forward blocks. Defaults to `2`. mlp_ratio (float, optional): Ratio of hidden dimensionality to embedding dimensionality diff --git a/aurora/model/swin3d.py b/aurora/model/swin3d.py index 4d4c084..cc83e72 100644 --- a/aurora/model/swin3d.py +++ b/aurora/model/swin3d.py @@ -762,8 +762,8 @@ def __init__( mlp_ratio: float = 4.0, qkv_bias: bool = True, drop_rate: float = 0.0, - attn_drop_rate: float = 0.1, - drop_path_rate: float = 0.1, + attn_drop_rate: float = 0.0, + drop_path_rate: float = 0.0, lora_steps: int = 40, lora_mode: LoRAMode = "single", use_lora: bool = False, @@ -785,8 +785,8 @@ def __init__( qkv_bias (bool): If `True`, add a learnable bias to the query, key, and value. Defaults to `True`. drop_rate (float): Drop-out rate. Defaults to `0.0`. - attn_drop_rate (float): Attention drop-out rate. Defaults to `0.1`. - drop_path_rate (float): Stochastic depth rate. Defaults to `0.1`. + attn_drop_rate (float): Attention drop-out rate. Defaults to `0.0`. + drop_path_rate (float): Stochastic depth rate. Defaults to `0.0`. lora_steps (int, optional): Maximum number of LoRA roll-out steps. Defaults to `40`. lora_mode (str, optional): LoRA mode. 
`"single"` uses the same LoRA for all roll-out steps, `"from_second"` uses the same LoRA from the second roll-out step on, From 963b5a9281152ac7a4f8336dbc857e4fc541a24b Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 16:43:39 +0200 Subject: [PATCH 04/14] Remove `bf16_mode` from the docs --- docs/finetuning.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/finetuning.md b/docs/finetuning.md index 3a1353a..07815d0 100644 --- a/docs/finetuning.md +++ b/docs/finetuning.md @@ -19,13 +19,7 @@ You can do this as follows: ```python from aurora import AuroraPretrained -model = AuroraPretrained( - # BF16 mode is an EXPERIMENTAL mode that saves memory by running the backbone in pure BF16 - # and the decoder in FP16 AMP. This should enable gradient computation. USE AT YOUR OWN RISK. - # THIS WAS NOT USED IN THE DEVELOPMENT OF AURORA AND IS PURELY PROVIDED AS A STARTING POINT - # FOR FINE-TUNING. - bf16_mode=True, -) +model = AuroraPretrained(autocast=True) model.load_checkpoint() batch = ... # Load some data. @@ -39,6 +33,9 @@ loss = ... loss.backward() ``` +Here `autocast` enables AMP with `bfloat16` for only the backbone. +This is necessary to be able to fit gradients in memory. + ## Exploding Gradients When fine-tuning, you may run into very large gradient values. 
From cb6080cf75eda11313aa8c556898cefc33b9efee Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 18:00:41 +0200 Subject: [PATCH 05/14] Add sample environment and fine-tuning loop --- finetuning/Dockerfile | 21 +++++++++++++++++++++ finetuning/finetune.py | 43 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 finetuning/Dockerfile create mode 100644 finetuning/finetune.py diff --git a/finetuning/Dockerfile b/finetuning/Dockerfile new file mode 100644 index 0000000..7434b0e --- /dev/null +++ b/finetuning/Dockerfile @@ -0,0 +1,21 @@ +FROM nvcr.io/nvidia/pytorch:25.08-py3 +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +WORKDIR /app +SHELL ["/bin/bash", "-c"] + +# Copy over source, create the environment, and install the repo in editable +# mode. +RUN mkdir -p /app/aurora/aurora +COPY pyproject.toml LICENSE.txt /app/aurora +RUN touch /app/aurora/__init__.py \ + && touch /app/aurora/README.md \ + && uv venv --python 3.13 \ + && SETUPTOOLS_SCM_PRETEND_VERSION=0.0.0 uv pip install -e /app/aurora + +# Use the environment automatically. +ENV VIRTUAL_ENV="/app/.venv/" +ENV PATH="/app/.venv/bin:$PATH" + +# Let the user enter at `/app/aurora`. +WORKDIR /app/aurora diff --git a/finetuning/finetune.py b/finetuning/finetune.py new file mode 100644 index 0000000..1ceb494 --- /dev/null +++ b/finetuning/finetune.py @@ -0,0 +1,43 @@ +from datetime import datetime + +import torch + +from aurora import AuroraPretrained, Batch, Metadata + + +def loss(pred: Batch) -> torch.Tensor: + """A sample loss function. 
You should replace this with your own loss function."""
+    surf_values = pred.surf_vars.values()
+    atmos_values = pred.atmos_vars.values()
+    return sum((x * x).sum() for x in tuple(surf_values) + tuple(atmos_values))
+
+
+model = AuroraPretrained()
+model.load_checkpoint()
+model.configure_activation_checkpointing()
+model.train()
+model = model.to("cuda")
+
+opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
+
+for i in range(10):
+    print(f"Step {i}")
+
+    # Train on random data. You should replace this with your own data.
+    batch = Batch(
+        surf_vars={k: torch.randn(1, 2, 721, 1440) for k in ("2t", "10u", "10v", "msl")},
+        static_vars={k: torch.randn(721, 1440) for k in ("lsm", "z", "slt")},
+        atmos_vars={k: torch.randn(1, 2, 13, 721, 1440) for k in ("z", "u", "v", "t", "q")},
+        metadata=Metadata(
+            lat=torch.linspace(90, -90, 721),
+            lon=torch.linspace(0, 360, 1440 + 1)[:-1],
+            time=(datetime(2020, 6, 1, 12, 0),),
+            atmos_levels=(50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 850, 925, 1000),
+        ),
+    )
+
+    opt.zero_grad()
+    prediction = model.forward(batch.to("cuda"))
+    loss_value = loss(prediction)
+    loss_value.backward()
+    opt.step()

From f398e9f0733da764774b1ec1b939d6db6897b2f1 Mon Sep 17 00:00:00 2001
From: Wessel Bruinsma
Date: Mon, 13 Oct 2025 18:04:18 +0200
Subject: [PATCH 06/14] Describe Docker image in docs

---
 docs/finetuning.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/docs/finetuning.md b/docs/finetuning.md
index 07815d0..184ba5a 100644
--- a/docs/finetuning.md
+++ b/docs/finetuning.md
@@ -10,6 +10,30 @@ model = AuroraPretrained()
 model.load_checkpoint()
 ```
 
+We provide a very basic Docker image and fine-tuning loop to get you started.
+The Docker image can be found at `finetuning/Dockerfile` and the fine-tuning
+loop at `finetuning/finetune.py`.
+You can build and run the image as follows:
+
+```bash
+docker build . 
-t aurora:latest -f finetuning/Dockerfile +docker run --rm -it \ + && --gpus all \ + && --ipc=host \ + && --ulimit memlock=-1 \ + && --ulimit stack=67108864 \ + && -v .:/app/aurora \ + aurora:latest +``` + +Then, within the image, execute + +```bash +python finetuning/finetune.py +``` + +to run the sample fine-tuning loop. + ## Computing Gradients To compute gradients, you will need an A100 with 80 GB of memory. From 80e279514f73819d781f421d718b84562e6b5f00 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 18:05:24 +0200 Subject: [PATCH 07/14] Enable autocast --- finetuning/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetuning/finetune.py b/finetuning/finetune.py index 1ceb494..bf6de2c 100644 --- a/finetuning/finetune.py +++ b/finetuning/finetune.py @@ -12,7 +12,7 @@ def loss(pred: Batch) -> torch.Tensor: return sum((x * x).sum() for x in tuple(surf_values) + tuple(atmos_values)) -model = AuroraPretrained() +model = AuroraPretrained(autocast=True) model.load_checkpoint() model.configure_activation_checkpointing() model.train() From 8c23afdd3516a5401a20d5d5fd5ca6c6636e0d15 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 18:12:06 +0200 Subject: [PATCH 08/14] Fix comment --- finetuning/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/finetuning/Dockerfile b/finetuning/Dockerfile index 7434b0e..1c29aab 100644 --- a/finetuning/Dockerfile +++ b/finetuning/Dockerfile @@ -4,8 +4,7 @@ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ WORKDIR /app SHELL ["/bin/bash", "-c"] -# Copy over source, create the environment, and install the repo in editable -# mode. +# Create the environment and install the repo in editable mode. 
RUN mkdir -p /app/aurora/aurora COPY pyproject.toml LICENSE.txt /app/aurora RUN touch /app/aurora/__init__.py \ From 071116f0c619f3316260f8d8e4c07b87e579bcc0 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Mon, 13 Oct 2025 18:24:39 +0200 Subject: [PATCH 09/14] Add notice --- finetuning/finetune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/finetuning/finetune.py b/finetuning/finetune.py index bf6de2c..4b299ef 100644 --- a/finetuning/finetune.py +++ b/finetuning/finetune.py @@ -1,3 +1,5 @@ +"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.""" + from datetime import datetime import torch From 54b91beae776596e2923902b6ca05fd077357f69 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Wed, 15 Oct 2025 13:05:21 +0200 Subject: [PATCH 10/14] Finalise instructions --- docs/finetuning.md | 26 ++++++++++++++++++++------ finetuning/Dockerfile | 2 +- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/docs/finetuning.md b/docs/finetuning.md index 184ba5a..bb1060e 100644 --- a/docs/finetuning.md +++ b/docs/finetuning.md @@ -10,6 +10,8 @@ model = AuroraPretrained() model.load_checkpoint() ``` +## Basic Fine-Tuning Environment + We provide a very basic Docker image and fine-tuning loop to get you started. The Docker image can be found at `finetuning/Dockerfile` and the fine-tuning loop at `finetuning/finetune.py`. @@ -17,12 +19,8 @@ You can build and run the image as follows: ```bash docker build . -t aurora:latest -f finetuning/Dockerfile -docker run --rm -it \ - && --gpus all \ - && --ipc=host \ - && --ulimit memlock=-1 \ - && --ulimit stack=67108864 \ - && -v .:/app/aurora \ +docker run --rm -it -v .:/app/aurora \ + --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ aurora:latest ``` @@ -34,6 +32,22 @@ python finetuning/finetune.py to run the sample fine-tuning loop. +For example, on Azure, launch a VM with size `Standard_NC24ads_A100_v4`, image +Ubuntu 24.04 LTS (x64), and 256 GB of disk space. 
+Then [install CUDA](https://learn.microsoft.com/en-us/azure/virtual-machines/linux/n-series-driver-setup). +Be sure to install the latest supported version of the CUDA Toolkit by +checking `nvidia-smi` after installing the drivers with +`sudo ubuntu-drivers autoinstall` and rebooting. +Best performance is achieved with CUDA Toolkit 13.0 or higher, which +requires drivers that support CUDA 13.0 or higher. +Then install Docker with `sudo apt install docker.io`, +set the right permissions for the current user with +`sudo usermod -a -G docker $USER`, +[install the NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), +and reboot. +You should now be able to clone the repo and build and run the image using +the instructions above. + ## Computing Gradients To compute gradients, you will need an A100 with 80 GB of memory. diff --git a/finetuning/Dockerfile b/finetuning/Dockerfile index 1c29aab..575138b 100644 --- a/finetuning/Dockerfile +++ b/finetuning/Dockerfile @@ -6,7 +6,7 @@ SHELL ["/bin/bash", "-c"] # Create the environment and install the repo in editable mode. RUN mkdir -p /app/aurora/aurora -COPY pyproject.toml LICENSE.txt /app/aurora +COPY pyproject.toml LICENSE.txt /app/aurora/ RUN touch /app/aurora/__init__.py \ && touch /app/aurora/README.md \ && uv venv --python 3.13 \ From 1c5b6e1f5c6657d1ca2eaf37cd98dde24fcf931a Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Wed, 15 Oct 2025 13:07:17 +0200 Subject: [PATCH 11/14] Fix wording --- aurora/model/aurora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aurora/model/aurora.py b/aurora/model/aurora.py index 475fd35..db3a5d2 100644 --- a/aurora/model/aurora.py +++ b/aurora/model/aurora.py @@ -146,7 +146,7 @@ def __init__( variables, adjust the normalisation to the given tuple consisting of a new location and scale. autocast (bool, optional): To reduce memory usage, `torch.autocast` only the backbone - to BF16. 
This reduced memory usage and is critical to enable fine-tuning. + to BF16. This is critical to enable fine-tuning. level_condition (tuple[int | float, ...], optional): Make the patch embeddings dependent on pressure level. If you want to enable this feature, provide a tuple of all possible pressure levels. From 4eb006804cb1f99a6ef8c4cd107dde47c30b17d0 Mon Sep 17 00:00:00 2001 From: Wessel Date: Fri, 17 Oct 2025 16:46:23 +0200 Subject: [PATCH 12/14] Update finetuning/finetune.py Co-authored-by: Ryan Chan --- finetuning/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetuning/finetune.py b/finetuning/finetune.py index 4b299ef..74233ab 100644 --- a/finetuning/finetune.py +++ b/finetuning/finetune.py @@ -39,7 +39,7 @@ def loss(pred: Batch) -> torch.Tensor: ) opt.zero_grad() - prediction = model.forward(batch.to("cuda")) + prediction = model(batch.to("cuda") loss_value = loss(prediction) loss_value.backward() opt.step() From 755bcb9f97ede61948b7291aedb1fbb563f4a456 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 17 Oct 2025 16:46:31 +0200 Subject: [PATCH 13/14] Added clarifications suggested by Ryan --- docs/finetuning.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/finetuning.md b/docs/finetuning.md index bb1060e..0ce9ab1 100644 --- a/docs/finetuning.md +++ b/docs/finetuning.md @@ -13,9 +13,12 @@ model.load_checkpoint() ## Basic Fine-Tuning Environment We provide a very basic Docker image and fine-tuning loop to get you started. -The Docker image can be found at `finetuning/Dockerfile` and the fine-tuning +This Docker image is built from a NVIDIA PyTorch base image, +so is tailored to work for NVIDIA GPUs, and has been tested on an 80 GB A100. +The image can be found at `finetuning/Dockerfile` and the fine-tuning loop at `finetuning/finetune.py`. 
-You can build and run the image as follows: +Assuming that you have cloned the Aurora repository, you can build and run +the image by running the following from the root of the repository: ```bash docker build . -t aurora:latest -f finetuning/Dockerfile From bd1f9797a1e390e1f0b9ea0edbd8f5ef717e4f58 Mon Sep 17 00:00:00 2001 From: Wessel Bruinsma Date: Fri, 17 Oct 2025 16:50:02 +0200 Subject: [PATCH 14/14] Add missing parenthesis --- finetuning/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetuning/finetune.py b/finetuning/finetune.py index 74233ab..c862c42 100644 --- a/finetuning/finetune.py +++ b/finetuning/finetune.py @@ -39,7 +39,7 @@ def loss(pred: Batch) -> torch.Tensor: ) opt.zero_grad() - prediction = model(batch.to("cuda") + prediction = model(batch.to("cuda")) loss_value = loss(prediction) loss_value.backward() opt.step()