NVIDIA · pstjohn · Mar 4, 2026 · Mar 4, 2026
@@ -34,7 +34,9 @@
 from torch.distributed.checkpoint.state_dict_saver import async_save as dcp_async_save
 from torch.distributed.checkpoint.state_dict_saver import save as dcp_save
 from torch.distributed.checkpoint.stateful import Stateful
+from torch.distributed.tensor import DTensor
 from torchdata.stateful_dataloader import StatefulDataLoader
+from transformer_engine.pytorch.quantized_tensor import QuantizedTensor
 
 from distributed_config import DistributedConfig
 
@@ -115,8 +117,20 @@ def load_checkpoint_ddp(
     ckpt_path: str | os.PathLike,
     dist_config: DistributedConfig,
     dataloader: StatefulDataLoader | None = None,
+    weights_only: bool = True,
 ) -> CheckpointOutput:
-    """Load DDP checkpoint."""
+    """Load DDP checkpoint.
+
+    Args:
+        model: The model to load.
+        optimizer: The optimizer to load.
+        scheduler: The LR scheduler to load.
+        ckpt_path: The path to the checkpoint.
+        dist_config: The distributed configuration.
+        dataloader: The dataloader to load.
+        weights_only: Whether to load the checkpoint weights only (forwarded to ``torch.load``). We have to set
+            this to False when loading FP8 checkpoints.
+    """
     checkpoint_path, _ = get_latest_checkpoint(ckpt_path)
 
     if not checkpoint_path:
@@ -126,7 +140,7 @@ def load_checkpoint_ddp(
     checkpoint = torch.load(
         checkpoint_path / "checkpoint.pt",
         map_location=f"cuda:{dist_config.local_rank}",
-        weights_only=True,
+        weights_only=weights_only,
     )
 
     model.load_state_dict(checkpoint["model"])
@@ -221,6 +235,7 @@ class AppState(Stateful):
     def state_dict(self):
         """Get the state dict for the model, optimizer, scheduler, and step."""
         model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
+        model_state_dict = {k: v for k, v in model_state_dict.items() if not k.endswith("_extra_state")}
         return {
             "model": model_state_dict,
             "optim": optimizer_state_dict,
@@ -236,6 +251,7 @@ def load_state_dict(self, state_dict: dict):
             self.optimizer,
             model_state_dict=state_dict["model"],
             optim_state_dict=state_dict["optim"],
+            options=StateDictOptions(strict=False),
         )
         self.scheduler.load_state_dict(state_dict["scheduler"])
         self.step = state_dict["step"]
@@ -322,6 +338,13 @@ def save_checkpoint_fsdp2(
     checkpoint_path = ckpt_path / f"step_{step}"
     checkpoint_path.mkdir(parents=True, exist_ok=True)
 
+    model_params = (p.to_local() if isinstance(p, DTensor) else p for p in model.parameters())
+    if async_save and any((isinstance(p, QuantizedTensor) for p in model_params)):
+        logger.warning(
+            "Async checkpointing is not supported for FP8 models, falling back to synchronous checkpointing."
+        )
+        async_save = False
+
     if dataloader is not None:
         save_dataloader(
             dataloader=dataloader,

@@ -44,7 +44,7 @@ class PerfLogger:
         min_loss: The minimum loss seen so far.
     """
 
-    def __init__(self, dist_config: DistributedConfig, args: DictConfig):
+    def __init__(self, dist_config: DistributedConfig, args: DictConfig, start_step: int):
         """Initialize the logger."""
         self._dist_config = dist_config
         self._run_config = OmegaConf.to_container(args, resolve=True, throw_on_missing=True)
@@ -75,7 +75,7 @@ def __init__(self, dist_config: DistributedConfig, args: DictConfig):
         if self._dist_config.is_main_process():
             # Log the entire args object to wandb for experiment tracking and reproducibility.
             self._wandb_run = wandb.init(**args.wandb, config=self._run_config)
-            self._progress_bar = tqdm(total=args.num_train_steps, desc="Training")
+            self._progress_bar = tqdm(initial=start_step, total=args.num_train_steps, desc="Training")
 
             if args.profiler.enabled:
                 self._profiler = NsightProfiler(

@@ -19,6 +19,7 @@
 
 import pytest
 import torch
+from transformer_engine.pytorch import fp8 as te_fp8
 
 
 sys.path.append(Path(__file__).parent.parent.as_posix())
@@ -61,6 +62,56 @@ def pytest_collection_modifyitems(items):
     items[:] = stats_tests + other_tests
 
 
+# ---------------------------------------------------------------------------
+# FP8 recipe parametrization
+# ---------------------------------------------------------------------------
+
+# Each entry: (recipe_class_name, hydra_overrides, check_fn)
+#   recipe_class_name: used as the pytest parameter id.
+#   hydra_overrides:   list of Hydra override strings handed to the test as the fixture value.
+#   check_fn:          called with no arguments; returns a (supported, reason) pair.
+_FP8_RECIPE_CONFIGS = [
+    (
+        "DelayedScaling",
+        ["fp8_config.fp8_recipe=transformer_engine.common.recipe.DelayedScaling"],
+        te_fp8.check_fp8_support,
+    ),
+    (
+        "Float8CurrentScaling",
+        ["fp8_config.fp8_recipe=transformer_engine.common.recipe.Float8CurrentScaling"],
+        te_fp8.check_fp8_support,
+    ),
+    (
+        "Float8BlockScaling",
+        ["fp8_config.fp8_recipe=transformer_engine.common.recipe.Float8BlockScaling"],
+        te_fp8.check_fp8_block_scaling_support,
+    ),
+    (
+        "MXFP8BlockScaling",
+        ["fp8_config.fp8_recipe=transformer_engine.common.recipe.MXFP8BlockScaling"],
+        te_fp8.check_mxfp8_support,
+    ),
+]
+
+
+def _parametrize_fp8_recipes():
+    """Generate pytest.param objects with xfail marks for unsupported FP8 recipes.
+
+    Every entry of ``_FP8_RECIPE_CONFIGS`` produces one parameter: its value is the entry's list of Hydra
+    overrides and its id is the recipe name. Each entry's ``check_fn`` is called here, so the support check
+    runs when this function is invoked, not when the test body executes.
+
+    Returns:
+        A list of ``pytest.param`` objects. Each carries an ``xfail`` mark whose condition is true only when
+        ``check_fn`` reported the recipe as unsupported; the reported reason is reused as the xfail reason.
+    """
+    params = []
+    for name, overrides, check_fn in _FP8_RECIPE_CONFIGS:
+        # check_fn returns a (supported, reason) pair.
+        supported, reason = check_fn()
+        params.append(
+            pytest.param(
+                overrides,
+                id=name,
+                # The mark is attached unconditionally; it only takes effect when the recipe is unsupported.
+                marks=pytest.mark.xfail(condition=not supported, reason=reason),
+            )
+        )
+    return params
+
+
+# NOTE(review): the fixture is named ``fp_recipe`` although everything around it says "fp8" -- confirm the
+# name is intended before tests start depending on it.
+@pytest.fixture(params=_parametrize_fp8_recipes())
+def fp_recipe(request):
+    """Parametrized fixture providing Hydra overrides for every recipe in ``_FP8_RECIPE_CONFIGS``.
+
+    Unsupported recipes are not dropped from the parametrization; ``_parametrize_fp8_recipes`` marks them
+    ``xfail`` instead.
+
+    Returns:
+        The list of Hydra override strings for the current recipe.
+    """
+    return request.param
+
+
 @pytest.fixture(scope="session", autouse=True)
 def device_mesh():
     """Create a re-usable torch process group for testing.