ddmms · zwei-beiner · Jan 29, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 3, 2026
diff --git a/docs/source/user_guide/benchmarks/nebs.rst b/docs/source/user_guide/benchmarks/nebs.rst
@@ -46,3 +46,60 @@ Reference data:
 
 * Manually taken from https://doi.org/10.1149/1.1633511.
 * Meta-GGA (Perdew-Wang) exchange correlation functional
+
+
+Si defects
+==========
+
+Summary
+-------
+
+Performance in predicting DFT singlepoint energies and forces along fixed nudged-elastic-band
+(NEB) images for a silicon interstitial migration pathway.
+
+Metrics
+-------
+
+For each of the three NEB datasets (64 atoms, 216 atoms, and 216 atoms di-to-single), MLIPs are
+evaluated on the same ordered NEB images as the reference.
+
+1. Energy MAE
+
+Mean absolute error (MAE) of *relative* energies along the NEB, shifting image 0 to 0 eV
+for both the DFT reference and the MLIP predictions.
+
+2. Force MAE
+
+Mean absolute error (MAE) of forces across all atoms and images along the NEB.
+
+Computational cost
+------------------
+
+Medium: tests are likely to take several minutes to run on CPU.
+
+Data availability
+-----------------
+
+Input/reference data:
+
+* Reference extxyz trajectories (including per-image DFT energies and forces) are distributed as a
+  separate zip archive and downloaded on-demand from the ML-PEG data store.
+  The calculation script uses the public ML-PEG S3 bucket to retrieve these inputs.
+* The reference DFT energies/forces come from Quantum ESPRESSO (PWscf) single-point calculations
+  with:
+
+  - Code/version: Quantum ESPRESSO PWSCF v.7.0
+  - XC functional: ``input_dft='PBE'``
+  - Cutoffs: ``ecutwfc=30.0`` Ry, ``ecutrho=240.0`` Ry
+  - Smearing: ``occupations='smearing'``, ``smearing='mv'``, ``degauss=0.01`` Ry
+  - SCF convergence/mixing: ``conv_thr=1.0d-6``, ``electron_maxstep=250``, ``mixing_beta=0.2``,
+    ``mixing_mode='local-TF'``
+  - Diagonalization: ``diagonalization='david'``
+  - Symmetry: ``nosym=.false.``, ``noinv=.false.`` (symmetry enabled)
+  - Pseudopotential: ``Si.pbe-n-kjpaw_psl.1.0.0.UPF`` (PSLibrary)
+
+  K-points by case:
+
+  - 64 atoms: Γ-only (``K_POINTS automatic 1 1 1 0 0 0``)
+  - 216 atoms: Γ-only (``K_POINTS gamma``)
+  - 216 atoms di-to-single: Γ-only (``K_POINTS gamma``)
diff --git a/ml_peg/analysis/nebs/si_defects/analyse_si_defects.py b/ml_peg/analysis/nebs/si_defects/analyse_si_defects.py
@@ -0,0 +1,294 @@
+"""Analyse Si defects DFT singlepoints (energies + forces)."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+
+from ase.atoms import Atoms
+from ase.io import read, write
+import numpy as np
+import pandas as pd
+import pytest
+
+from ml_peg.analysis.utils.decorators import build_table, plot_scatter
+from ml_peg.analysis.utils.utils import load_metrics_config, mae
+from ml_peg.app import APP_ROOT
+from ml_peg.calcs import CALCS_ROOT
+from ml_peg.models.get_models import get_model_names
+from ml_peg.models.models import current_models
+
+MODELS = get_model_names(current_models)
+
+BENCHMARK_DIR = "si_defects"
+CALC_PATH = CALCS_ROOT / "nebs" / BENCHMARK_DIR / "outputs"
+OUT_PATH = APP_ROOT / "data" / "nebs" / BENCHMARK_DIR
+
+METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml")
+DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config(
+    METRICS_CONFIG_PATH
+)
+
+
+@dataclass(frozen=True)
+class _Case:
+    """Definition of a single Si defects NEB dataset."""
+
+    key: str
+    label: str
+
+
+CASES: tuple[_Case, ...] = (
+    _Case(key="64_atoms", label="64"),
+    _Case(key="216_atoms", label="216"),
+    _Case(key="216_atoms_di_to_single", label="216 di-to-single"),
+)
+
+
+@dataclass(frozen=True)
+class _Results:
+    """Computed errors and assets for one (case, model) pair."""
+
+    energy_mae: float
+    force_mae: float
+    x: list[float]
+    energy_error: list[float]
+    force_rms: list[float]
+    structs: list[Atoms]
+
+
+def _load_structs(case_key: str, model: str) -> list[Atoms] | None:
+    """
+    Load MLIP-evaluated NEB frames (with DFT reference stored).
+
+    Parameters
+    ----------
+    case_key
+        Case key.
+    model
+        Model name.
+
+    Returns
+    -------
+    list[ase.atoms.Atoms] | None
+        Frames if the calculation output exists, otherwise ``None``.
+    """
+    path = CALC_PATH / case_key / model / f"{BENCHMARK_DIR}.extxyz"
+    if not path.exists():
+        return None
+    structs = read(path, index=":")
+    if not isinstance(structs, list) or not structs:
+        raise ValueError(f"Unexpected output content: {path}")
+    return structs
+
+
+def _compute(structs: list[Atoms]) -> _Results:
+    """
+    Compute per-image errors and global MAEs.
+
+    Parameters
+    ----------
+    structs
+        Ordered NEB frames with ``ref_energy_ev``, ``ref_forces``, ``pred_energy_ev``,
+        and ``pred_forces``.
+
+    Returns
+    -------
+    _Results
+        Computed errors and frames for downstream plotting/app.
+    """
+    ref_e = np.asarray([float(s.info["ref_energy_ev"]) for s in structs])
+    pred_e = np.asarray([float(s.info["pred_energy_ev"]) for s in structs])
+    # Compare *relative* energies along the NEB (shift image 0 to 0 eV for both
+    # reference and predictions). This removes any constant energy offset.
+    ref_e_rel = ref_e - ref_e[0]
+    pred_e_rel = pred_e - pred_e[0]
+    energy_error = pred_e_rel - ref_e_rel
+
+    ref_f = np.asarray([s.arrays["ref_forces"] for s in structs], dtype=float)
+    pred_f = np.asarray([s.arrays["pred_forces"] for s in structs], dtype=float)
+    force_error = pred_f - ref_f
+
+    x = [float(s.info.get("image_index", idx)) for idx, s in enumerate(structs)]
+    return _Results(
+        energy_mae=mae(ref_e_rel, pred_e_rel),
+        force_mae=mae(ref_f.reshape(-1), pred_f.reshape(-1)),
+        x=x,
+        energy_error=[float(v) for v in energy_error],
+        force_rms=[float(v) for v in np.sqrt(np.mean(force_error**2, axis=(1, 2)))],
+        structs=structs,
+    )
+
+
+def _write_assets(case_key: str, model: str, results: _Results) -> None:
+    """
+    Write structure trajectory and per-image CSV for the app.
+
+    Parameters
+    ----------
+    case_key
+        Case key.
+    model
+        Model name.
+    results
+        Computed errors and frames.
+    """
+    out_dir = OUT_PATH / case_key / model
+    out_dir.mkdir(parents=True, exist_ok=True)
+    write(out_dir / f"{model}-neb-band.extxyz", results.structs)
+    pd.DataFrame(
+        {
+            "image": results.x,
+            "energy_error_ev": results.energy_error,
+            "force_rms_ev_per_ang": results.force_rms,
+        }
+    ).to_csv(out_dir / f"{model}-per_image_errors.csv", index=False)
+
+
+def _write_plots(case_key: str, case_label: str, model: str, results: _Results) -> None:
+    """
+    Write Dash/Plotly JSON plots for one (case, model) pair.
+
+    Parameters
+    ----------
+    case_key
+        Case key.
+    case_label
+        Human-friendly case label.
+    model
+        Model name.
+    results
+        Computed errors and frames.
+    """
+    OUT_PATH.mkdir(parents=True, exist_ok=True)
+
+    @plot_scatter(
+        filename=OUT_PATH / f"figure_{model}_{case_key}_energy_error.json",
+        title=f"Energy error along NEB ({case_label})",
+        x_label="Image",
+        y_label="Energy error / eV",
+        show_line=True,
+    )
+    def plot_energy() -> dict[str, tuple[list[float], list[float]]]:
+        """
+        Plot per-image energy errors for a single model and case.
+
+        Returns
+        -------
+        dict[str, tuple[list[float], list[float]]]
+            Mapping of model name to x/y arrays.
+        """
+        return {model: [results.x, results.energy_error]}
+
+    @plot_scatter(
+        filename=OUT_PATH / f"figure_{model}_{case_key}_force_rms.json",
+        title=f"Force RMS error along NEB ({case_label})",
+        x_label="Image",
+        y_label="Force RMS error / (eV/Å)",
+        show_line=True,
+    )
+    def plot_forces() -> dict[str, tuple[list[float], list[float]]]:
+        """
+        Plot per-image force RMS errors for a single model and case.
+
+        Returns
+        -------
+        dict[str, tuple[list[float], list[float]]]
+            Mapping of model name to x/y arrays.
+        """
+        return {model: [results.x, results.force_rms]}
+
+    plot_energy()
+    plot_forces()
+
+
+@pytest.fixture
+def case_results() -> dict[tuple[str, str], _Results]:
+    """
+    Compute results for all available (case, model) combinations.
+
+    Returns
+    -------
+    dict[tuple[str, str], _Results]
+        Results indexed by ``(case_key, model_name)``.
+    """
+    OUT_PATH.mkdir(parents=True, exist_ok=True)
+    results: dict[tuple[str, str], _Results] = {}
+    for case in CASES:
+        for model in MODELS:
+            structs = _load_structs(case.key, model)
+            if structs is None:
+                continue
+            computed = _compute(structs)
+            _write_assets(case.key, model, computed)
+            _write_plots(case.key, case.label, model, computed)
+            results[(case.key, model)] = computed
+    return results
+
+
+@pytest.fixture
+def metrics_dict(
+    case_results: dict[tuple[str, str], _Results],
+) -> dict[str, dict[str, float]]:
+    """
+    Build raw metric dict for the benchmark table.
+
+    Parameters
+    ----------
+    case_results
+        Results indexed by ``(case_key, model_name)``.
+
+    Returns
+    -------
+    dict[str, dict[str, float]]
+        Metric values for all models.
+    """
+    metrics: dict[str, dict[str, float]] = {}
+    for case in CASES:
+        energy_key = f"Energy MAE ({case.label})"
+        force_key = f"Force MAE ({case.label})"
+        metrics[energy_key] = {}
+        metrics[force_key] = {}
+        for model in MODELS:
+            result = case_results.get((case.key, model))
+            if result is None:
+                continue
+            metrics[energy_key][model] = result.energy_mae
+            metrics[force_key][model] = result.force_mae
+    return metrics
+
+
+@pytest.fixture
+@build_table(
+    filename=OUT_PATH / f"{BENCHMARK_DIR}_metrics_table.json",
+    metric_tooltips=DEFAULT_TOOLTIPS,
+    thresholds=DEFAULT_THRESHOLDS,
+    weights=DEFAULT_WEIGHTS,
+)
+def metrics(metrics_dict: dict[str, dict[str, float]]) -> dict[str, dict]:
+    """
+    Build the benchmark table JSON.
+
+    Parameters
+    ----------
+    metrics_dict
+        Metric values for all models.
+
+    Returns
+    -------
+    dict[str, dict]
+        Metric names and values for all models.
+    """
+    return metrics_dict
+
+
+def test_si_defects(metrics: dict[str, dict]) -> None:
+    """
+    Run analysis for Si defects DFT singlepoints.
+
+    Parameters
+    ----------
+    metrics
+        Benchmark metrics table.
+    """
+    return
diff --git a/ml_peg/analysis/nebs/si_defects/metrics.yml b/ml_peg/analysis/nebs/si_defects/metrics.yml
@@ -0,0 +1,37 @@
+metrics:
+  Energy MAE (64):
+    good: 0.02
+    bad: 0.2
+    unit: eV
+    tooltip: "MAE between MLIP and DFT (PBE) relative single-point energies along the NEB (image 0 shifted to 0 eV)."
+    level_of_theory: PBE
+  Force MAE (64):
+    good: 0.05
+    bad: 0.5
+    unit: eV/Å
+    tooltip: "MAE between MLIP and DFT (PBE) single-point forces across all atoms and images along the NEB."
+    level_of_theory: PBE
+  Energy MAE (216):
+    good: 0.02
+    bad: 0.2
+    unit: eV
+    tooltip: "MAE between MLIP and DFT (PBE) relative single-point energies along the NEB (image 0 shifted to 0 eV)."
+    level_of_theory: PBE
+  Force MAE (216):
+    good: 0.05
+    bad: 0.5
+    unit: eV/Å
+    tooltip: "MAE between MLIP and DFT (PBE) single-point forces across all atoms and images along the NEB."
+    level_of_theory: PBE
+  Energy MAE (216 di-to-single):
+    good: 0.02
+    bad: 0.2
+    unit: eV
+    tooltip: "MAE between MLIP and DFT (PBE) relative single-point energies along the NEB (image 0 shifted to 0 eV)."
+    level_of_theory: PBE
+  Force MAE (216 di-to-single):
+    good: 0.05
+    bad: 0.5
+    unit: eV/Å
+    tooltip: "MAE between MLIP and DFT (PBE) single-point forces across all atoms and images along the NEB."
+    level_of_theory: PBE