From e1bcd884c7ab9cc79a40543011c2d61626025d9f Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Wed, 1 Oct 2025 06:06:36 +1000 Subject: [PATCH 01/24] Initial implementation --- pyproject.toml | 2 + src/rompy/backends/__init__.py | 3 +- src/rompy/backends/config.py | 111 ++++- src/rompy/backends/config_slurm_fixed.py | 103 +++++ src/rompy/run/slurm.py | 274 ++++++++++++ tests/backends/test_slurm_backend.py | 507 +++++++++++++++++++++++ 6 files changed, 998 insertions(+), 2 deletions(-) create mode 100644 src/rompy/backends/config_slurm_fixed.py create mode 100644 src/rompy/run/slurm.py create mode 100644 tests/backends/test_slurm_backend.py diff --git a/pyproject.toml b/pyproject.toml index c2f1de2..90683f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ rompy = "rompy.cli:main" [project.entry-points."rompy.config"] base = "rompy.core.config:BaseConfig" +slurm = "rompy.backends.config:SlurmConfig" [project.entry-points."rompy.source"] file = "rompy.core.source:SourceFile" @@ -97,6 +98,7 @@ rompy_data = "rompy:cat" [project.entry-points."rompy.run"] local = "rompy.run:LocalRunBackend" docker = "rompy.run.docker:DockerRunBackend" +slurm = "rompy.run.slurm:SlurmRunBackend" [project.entry-points."rompy.postprocess"] noop = "rompy.postprocess:NoopPostprocessor" diff --git a/src/rompy/backends/__init__.py b/src/rompy/backends/__init__.py index a5d990e..637fa4d 100644 --- a/src/rompy/backends/__init__.py +++ b/src/rompy/backends/__init__.py @@ -5,11 +5,12 @@ execution backends, enabling type-safe and validated backend configurations. """ -from .config import BackendConfig, BaseBackendConfig, DockerConfig, LocalConfig +from .config import BackendConfig, BaseBackendConfig, DockerConfig, LocalConfig, SlurmConfig __all__ = [ "BackendConfig", "BaseBackendConfig", "DockerConfig", "LocalConfig", + "SlurmConfig", ] diff --git a/src/rompy/backends/config.py b/src/rompy/backends/config.py index 23b0e19..43c8b95 100644 --- a/src/rompy/backends/config.py +++ b/src/rompy/backends/config.py @@ -284,5 +284,114 @@ def model_post_init(self, __context) -> None: ) +class SlurmConfig(BaseBackendConfig): + """Configuration for SLURM cluster execution.""" + + queue: str = Field( + ..., + description="SLURM partition name (equivalent to queue)" + ) + + command: Optional[str] = Field( + None, description="Optional shell command to run instead of config.run()" + ) + nodes: int = Field( + 1, + ge=1, + le=100, + description="Number of nodes to allocate" + ) + ntasks: int = Field( + 1, + ge=1, + description="Number of tasks (processes) to run" + ) + cpus_per_task: int = Field( + 1, + ge=1, + le=128, + description="Number of CPU cores per task" + ) + time_limit: str = Field( + "1:00:00", + description="Time limit in format HH:MM:SS" + ) + account: Optional[str] = Field( + None, + description="Account for billing/resource tracking" + ) + qos: Optional[str] = Field( + None, + description="Quality of Service for the job" + ) + reservation: Optional[str] = Field( + None, + description="Reservation name to run job under" + ) + output_file: Optional[str] = Field( + None, + description="Output file path for job output" + ) + error_file: Optional[str] = Field( + None, + description="Error file path for job errors" + ) + job_name: Optional[str] = Field( + None, + description="Name for the SLURM job" + ) + mail_type: Optional[str] = Field( + None, + description="Type of mail to send (BEGIN, END, FAIL, ALL, etc.)" + ) + mail_user: Optional[str] = Field( + None, + description="Email address for notifications" + ) + 
additional_options: List[str] = Field( + default_factory=list, + description="Additional SLURM options (e.g., '--gres=gpu:1')" + ) + + @field_validator('time_limit') + @classmethod + def validate_time_limit(cls, v): + """Validate time limit format (HH:MM:SS).""" + import re + if not re.match(r'^\d{1,4}:\d{2}:\d{2}$', v): + raise ValueError("Time limit must be in format HH:MM:SS") + return v + + def get_backend_class(self): + """Return the SlurmRunBackend class.""" + from rompy.run.slurm import SlurmRunBackend + return SlurmRunBackend + + model_config = ConfigDict( + json_schema_extra={ + "examples": [ + { + "queue": "general", + "nodes": 1, + "ntasks": 1, + "cpus_per_task": 4, + "time_limit": "02:00:00", + "account": "myproject", + "timeout": 7200, + }, + { + "queue": "gpu", + "nodes": 2, + "ntasks": 8, + "cpus_per_task": 2, + "time_limit": "24:00:00", + "reservation": "special_reservation", + "additional_options": ["--gres=gpu:v100:2"], + }, + ] + } + ) + + # Type alias for all backend configurations -BackendConfig = Union[LocalConfig, DockerConfig] +BackendConfig = Union[LocalConfig, DockerConfig, SlurmConfig] \ No newline at end of file diff --git a/src/rompy/backends/config_slurm_fixed.py b/src/rompy/backends/config_slurm_fixed.py new file mode 100644 index 0000000..39a36d2 --- /dev/null +++ b/src/rompy/backends/config_slurm_fixed.py @@ -0,0 +1,103 @@ +class SlurmConfig(BaseBackendConfig): + """Configuration for SLURM cluster execution.""" + + queue: Optional[str] = Field( + None, + description="SLURM partition name (equivalent to queue)" + ) + nodes: int = Field( + 1, + ge=1, + le=100, + description="Number of nodes to allocate" + ) + ntasks: int = Field( + 1, + ge=1, + description="Number of tasks (processes) to run" + ) + cpus_per_task: int = Field( + 1, + ge=1, + le=128, + description="Number of CPU cores per task" + ) + time_limit: str = Field( + "1:00:00", + description="Time limit in format HH:MM:SS" + ) + account: Optional[str] = Field( + None, + description="Account for billing/resource tracking" + ) + qos: Optional[str] = Field( + None, + description="Quality of Service for the job" + ) + reservation: Optional[str] = Field( + None, + description="Reservation name to run job under" + ) + output_file: Optional[str] = Field( + None, + description="Output file path for job output" + ) + error_file: Optional[str] = Field( + None, + description="Error file path for job errors" + ) + job_name: Optional[str] = Field( + None, + description="Name for the SLURM job" + ) + mail_type: Optional[str] = Field( + None, + description="Type of mail to send (BEGIN, END, FAIL, ALL, etc.)" + ) + mail_user: Optional[str] = Field( + None, + description="Email address for notifications" + ) + additional_options: List[str] = Field( + default_factory=list, + description="Additional SLURM options (e.g., '--gres=gpu:1')" + ) + + @field_validator('time_limit') + @classmethod + def validate_time_limit(cls, v): + """Validate time limit format (HH:MM:SS).""" + import re + if not re.match(r'^\d{1,4}:\d{2}:\d{2}$', v): + raise ValueError("Time limit must be in format HH:MM:SS") + return v + + def get_backend_class(self): + """Return the SlurmRunBackend class.""" + from rompy.run.slurm import SlurmRunBackend + return SlurmRunBackend + + model_config = ConfigDict( + json_schema_extra={ + "examples": [ + { + "queue": "general", + "nodes": 1, + "ntasks": 1, + "cpus_per_task": 4, + "time_limit": "02:00:00", + "account": "myproject", + "timeout": 7200, + }, + { + "queue": "gpu", + "nodes": 2, + "ntasks": 8, + 
"cpus_per_task": 2, + "time_limit": "24:00:00", + "reservation": "special_reservation", + "additional_options": ["--gres=gpu:v100:2"], + }, + ] + } + ) \ No newline at end of file diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py new file mode 100644 index 0000000..97ccd90 --- /dev/null +++ b/src/rompy/run/slurm.py @@ -0,0 +1,274 @@ +""" +SLURM backend for running models. + +This module provides a SLURM-based execution backend for rompy models. +""" + +import logging +import os +import subprocess +import tempfile +import time +from pathlib import Path +from typing import TYPE_CHECKING, Dict, List, Optional, Union + +if TYPE_CHECKING: + from rompy.backends import SlurmConfig + +logger = logging.getLogger(__name__) + + +class SlurmRunBackend: + """Execute models on SLURM clusters. + + This backend submits model runs to a SLURM-managed HPC cluster + for execution. + """ + + def run( + self, model_run, config: "SlurmConfig", workspace_dir: Optional[str] = None + ) -> bool: + """Submit model run to SLURM queue. + + Args: + model_run: The ModelRun instance to execute + config: SlurmConfig instance with execution parameters + workspace_dir: Path to the generated workspace directory (if None, will generate) + + Returns: + True if execution was successful, False otherwise + """ + logger.debug(f"Using SlurmConfig: nodes={config.nodes}, ntasks={config.ntasks}") + + # Use provided workspace or generate if not provided (for backwards compatibility) + if workspace_dir is None: + logger.warning( + "No workspace_dir provided, generating files (this may cause double generation in pipeline)" + ) + staging_dir = model_run.generate() + logger.info(f"Model inputs generated in: {staging_dir}") + else: + logger.info(f"Using provided workspace directory: {workspace_dir}") + staging_dir = workspace_dir + + try: + # Create and submit SLURM job script + job_script = self._create_job_script(model_run, config, staging_dir) + job_id = self._submit_job(job_script) + + if job_id: + logger.info(f"SLURM job submitted successfully with ID: {job_id}") + return self._wait_for_completion(job_id, config) + else: + logger.error("Failed to submit SLURM job") + return False + + except Exception as e: + logger.exception(f"SLURM execution failed: {e}") + return False + + def _create_job_script( + self, model_run, config: "SlurmConfig", staging_dir: str + ) -> str: + """Create SLURM job script. 
+ + Args: + model_run: The ModelRun instance + config: SlurmConfig with execution parameters + staging_dir: Path to workspace directory + + Returns: + Path to the created job script + """ + # Determine the working directory for the job + work_dir = config.working_dir if config.working_dir else staging_dir + + # Create the job script content + script_lines = [ + "#!/bin/bash", + "# SLURM job script generated by rompy", + ] + + # Add SBATCH directives from configuration + if config.job_name: + script_lines.append(f"#SBATCH --job-name={config.job_name}") + + if config.output_file: + script_lines.append(f"#SBATCH --output={config.output_file}") + else: + # Default output file with job ID + script_lines.append(f"#SBATCH --output={work_dir}/slurm-%j.out") + + if config.error_file: + script_lines.append(f"#SBATCH --error={config.error_file}") + else: + # Default error file with job ID + script_lines.append(f"#SBATCH --error={work_dir}/slurm-%j.err") + + if config.queue: + script_lines.append(f"#SBATCH --partition={config.queue}") + + script_lines.append(f"#SBATCH --nodes={config.nodes}") + script_lines.append(f"#SBATCH --ntasks={config.ntasks}") + script_lines.append(f"#SBATCH --cpus-per-task={config.cpus_per_task}") + script_lines.append(f"#SBATCH --time={config.time_limit}") + + if config.account: + script_lines.append(f"#SBATCH --account={config.account}") + + if config.qos: + script_lines.append(f"#SBATCH --qos={config.qos}") + + if config.reservation: + script_lines.append(f"#SBATCH --reservation={config.reservation}") + + if config.mail_type and config.mail_user: + script_lines.append(f"#SBATCH --mail-type={config.mail_type}") + script_lines.append(f"#SBATCH --mail-user={config.mail_user}") + + # Add additional options + for option in config.additional_options: + script_lines.append(f"#SBATCH {option}") + + script_lines.extend([ + "", + "# Change to working directory", + f"cd {work_dir}", + "", + "# Set environment variables", + ]) + + # Add environment variables + for key, value in config.env_vars.items(): + script_lines.append(f"export {key}={value}") + + # Add the actual command to run the model + # First, check if there's a specific command in config, otherwise use the model's run method + if hasattr(config, 'command') and config.command: + script_lines.extend([ + "", + "# Execute custom command", + config.command, + ]) + else: + script_lines.extend([ + "", + "# Execute model using model_run.config.run() method", + "python -c \"", + "import sys", + "import os", + "sys.path.insert(0, os.getcwd())", + "from rompy.model import ModelRun", + f"model_run = ModelRun.from_dict({model_run.model_dump()})", + "model_run.config.run(model_run)", + "\"", + ]) + + # Create temporary job script file + script_content = '\n'.join(script_lines) + with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + f.write(script_content) + script_path = f.name + + logger.debug(f"SLURM job script created at: {script_path}") + logger.debug(f"Job script content:\n{script_content}") + + return script_path + + def _submit_job(self, job_script: str) -> Optional[str]: + """Submit job to SLURM.
+ + Args: + job_script: Path to the job script to submit + + Returns: + Job ID if submission successful, None otherwise + """ + try: + # Submit the job using sbatch + result = subprocess.run( + ["sbatch", job_script], + capture_output=True, + text=True, + check=True + ) + + # Extract job ID from sbatch output (format: "Submitted batch job ") + output = result.stdout.strip() + if "Submitted batch job" in output: + job_id = output.split()[-1] + logger.info(f"Submitted SLURM job with ID: {job_id}") + return job_id + else: + logger.error(f"Unexpected sbatch output format: {output}") + return None + + except subprocess.CalledProcessError as e: + logger.error(f"Failed to submit SLURM job: {e.stderr}") + return None + except Exception as e: + logger.error(f"Error submitting SLURM job: {e}") + return None + finally: + # Clean up the temporary job script + try: + os.remove(job_script) + logger.debug(f"Cleaned up temporary job script: {job_script}") + except OSError: + logger.warning(f"Could not remove temporary job script: {job_script}") + + def _wait_for_completion(self, job_id: str, config: "SlurmConfig") -> bool: + """Wait for job completion. + + Args: + job_id: SLURM job ID to monitor + config: SlurmConfig with timeout parameters + + Returns: + True if job completed successfully, False otherwise + """ + logger.info(f"Waiting for SLURM job {job_id} to complete...") + + # Terminal states that indicate job completion (successful or failed) + terminal_states = {'CD', 'CA', 'F', 'TO', 'NF', 'OOM', 'BF', 'DL', 'PR'} + + # Start time for timeout check + start_time = time.time() + + while True: + # Check if we've exceeded the timeout + elapsed_time = time.time() - start_time + if elapsed_time > config.timeout: + logger.error(f"Timeout waiting for job {job_id} after {config.timeout} seconds") + + # Try to cancel the job + try: + subprocess.run(['scancel', job_id], check=True, capture_output=True) + logger.info(f"Cancelled job {job_id} due to timeout") + except subprocess.CalledProcessError: + logger.warning(f"Could not cancel job {job_id} due to timeout") + + return False + + # Get job status + try: + result = subprocess.run( + ['squeue', '-j', job_id, '-h', '-o', '%T'], + capture_output=True, + text=True, + check=True + ) + + state = result.stdout.strip() + + if not state: # If job is not found, it may have completed and been purged + logger.info(f"Job {job_id} not found in queue - likely completed") + return True # Assume successful completion if not in queue + + if state in terminal_states: + if state == 'CD': # Completed + logger.info(f"SLURM job {job_id} completed successfully") + return True + elif state == 'CA': # Cancelled + logger.warning(f"SLURM job {job_id} was cancelled") + return False + elif state == 'F': # Failed + logger.error(f"SLURM job {job_id} failed") + return False + elif state == 'TO': # Timeout + logger.error(f"SLURM job {job_id} timed out") + return False + else: + logger.error(f"SLURM job {job_id} ended with state: {state}") + return False + + # Job is still running or pending, wait before checking again + logger.debug(f"Job {job_id} still in state: {state}, waiting...") + time.sleep(30) # Wait 30 seconds before next check + + except subprocess.CalledProcessError as e: + logger.error(f"Error checking job status for {job_id}: {e.stderr}") + # If we can't check the status, we consider it a failure + return False + except Exception as e: + logger.error(f"Unexpected error while monitoring job {job_id}: {e}") + return False \ No newline at end of file diff --git 
a/tests/backends/test_slurm_backend.py b/tests/backends/test_slurm_backend.py new file mode 100644 index 0000000..a91414a --- /dev/null +++ b/tests/backends/test_slurm_backend.py @@ -0,0 +1,507 @@ +""" +Unit tests for the SLURM backend configuration and execution. + +Tests verify that the SLURM backend configuration class works correctly, +provides proper validation, and integrates with the SLURM execution backend. +""" + +from pathlib import Path +from tempfile import TemporaryDirectory +from unittest.mock import MagicMock, patch, mock_open +import tempfile +import os +import pytest +from pydantic import ValidationError + +from rompy.backends import SlurmConfig + + +class TestSlurmConfig: + """Test the SlurmConfig class.""" + + def test_default_values(self): + """Test default values for SlurmConfig.""" + config = SlurmConfig( + queue="general", # Required field + ) + + assert config.timeout == 3600 + assert config.env_vars == {} + assert config.working_dir is None + assert config.queue == "general" + assert config.nodes == 1 + assert config.ntasks == 1 + assert config.cpus_per_task == 1 + assert config.time_limit == "1:00:00" + assert config.account is None + assert config.qos is None + assert config.reservation is None + assert config.output_file is None + assert config.error_file is None + assert config.job_name is None + assert config.mail_type is None + assert config.mail_user is None + assert config.additional_options == [] + + def test_custom_values(self): + """Test setting custom values.""" + with TemporaryDirectory() as tmp_dir: + config = SlurmConfig( + queue="compute", + nodes=2, + ntasks=4, + cpus_per_task=8, + time_limit="24:00:00", + account="myproject", + qos="priority", + reservation="special_reservation", + output_file="slurm-%j.out", + error_file="slurm-%j.err", + job_name="test_job", + mail_type="END", + mail_user="test@example.com", + additional_options=["--gres=gpu:1", "--exclusive"], + timeout=7200, + env_vars={"OMP_NUM_THREADS": "8"}, + working_dir=Path(tmp_dir), + ) + + assert config.queue == "compute" + assert config.nodes == 2 + assert config.ntasks == 4 + assert config.cpus_per_task == 8 + assert config.time_limit == "24:00:00" + assert config.account == "myproject" + assert config.qos == "priority" + assert config.reservation == "special_reservation" + assert config.output_file == "slurm-%j.out" + assert config.error_file == "slurm-%j.err" + assert config.job_name == "test_job" + assert config.mail_type == "END" + assert config.mail_user == "test@example.com" + assert config.additional_options == ["--gres=gpu:1", "--exclusive"] + assert config.timeout == 7200 + assert config.env_vars == {"OMP_NUM_THREADS": "8"} + assert config.working_dir == Path(tmp_dir) + + def test_time_limit_validation(self): + """Test time limit validation.""" + # Valid time limits + valid_time_limits = [ + "01:00:00", + "00:30:00", + "23:59:59", + "100:00:00", # Allow longer times for long jobs + ] + + for time_limit in valid_time_limits: + config = SlurmConfig(queue="test", time_limit=time_limit) + assert config.time_limit == time_limit + + # Invalid time limits (format-based validation) + invalid_time_limits = [ + "00:00", # Missing seconds + "invalid", # Not matching format + "1:1:1", # Not in HH:MM:SS format (only 1 digit for each part) + "25-00-00", # Wrong separator + "12345:00:00", # Too many digits for hours (5 digits instead of max 4) + "23:5", # Missing seconds part + ":23:59", # Missing hours + "23::59", # Missing minutes + ] + + for time_limit in invalid_time_limits: + with 
pytest.raises(ValidationError): + SlurmConfig(queue="test", time_limit=time_limit) + + def test_additional_options_validation(self): + """Test additional options validation.""" + # Valid additional options + config = SlurmConfig( + queue="test", + additional_options=["--gres=gpu:1", "--exclusive", "--mem-per-cpu=2048"] + ) + assert config.additional_options == ["--gres=gpu:1", "--exclusive", "--mem-per-cpu=2048"] + + # Empty list should be valid + config = SlurmConfig(queue="test", additional_options=[]) + assert config.additional_options == [] + + def test_get_backend_class(self): + """Test that get_backend_class returns the correct class.""" + config = SlurmConfig(queue="test") + backend_class = config.get_backend_class() + + # Should return SlurmRunBackend class + assert backend_class.__name__ == "SlurmRunBackend" + + def test_config_examples(self): + """Test that the schema examples are valid.""" + schema = SlurmConfig.model_json_schema() + examples = schema.get("examples", []) + + for example in examples: + # Should be able to create config from example + config = SlurmConfig(**example) + assert isinstance(config, SlurmConfig) + + def test_required_queue_field(self): + """Test that queue field is required.""" + # Should fail without queue + with pytest.raises(ValidationError, match="Field required"): + SlurmConfig() + + # Should work with queue + config = SlurmConfig(queue="general") + assert config.queue == "general" + + def test_field_boundaries(self): + """Test field boundary values.""" + # Test minimum values + config = SlurmConfig( + queue="test", + nodes=1, + ntasks=1, + cpus_per_task=1, + ) + assert config.nodes == 1 + assert config.ntasks == 1 + assert config.cpus_per_task == 1 + + # Test maximum values + config = SlurmConfig( + queue="test", + nodes=100, # Max nodes + cpus_per_task=128, # Max cpus per task + ) + assert config.nodes == 100 + assert config.cpus_per_task == 128 + + # Test out of bounds + with pytest.raises(ValidationError): + SlurmConfig(queue="test", nodes=0) # Min nodes is 1 + + with pytest.raises(ValidationError): + SlurmConfig(queue="test", nodes=101) # Max nodes is 100 + + with pytest.raises(ValidationError): + SlurmConfig(queue="test", cpus_per_task=0) # Min cpus_per_task is 1 + + with pytest.raises(ValidationError): + SlurmConfig(queue="test", cpus_per_task=129) # Max cpus_per_task is 128 + + +class TestSlurmRunBackend: + """Test the SlurmRunBackend class.""" + + @pytest.fixture + def mock_model_run(self): + """Create a mock ModelRun instance.""" + model_run = MagicMock() + model_run.run_id = "test_run_123" + model_run.output_dir = Path("/tmp/test_output") + + # Create a temporary directory for staging + import tempfile + + temp_dir = tempfile.mkdtemp() + model_run.generate.return_value = temp_dir + model_run.config.run.return_value = True + model_run.model_dump.return_value = {"test": "data"} # Mock for serialization + return model_run + + @pytest.fixture + def basic_config(self): + """Create a basic SlurmConfig.""" + return SlurmConfig( + queue="general", + timeout=3600, + nodes=1, + ntasks=1, + cpus_per_task=2, + time_limit="01:00:00", + ) + + def test_create_job_script(self, mock_model_run, basic_config): + """Test the _create_job_script method.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + with TemporaryDirectory() as staging_dir: + # Create the job script + script_path = backend._create_job_script(mock_model_run, basic_config, staging_dir) + + # Verify the file was created + assert os.path.exists(script_path) + 
+ # Read and check the contents + with open(script_path, 'r') as f: + content = f.read() + + # Check for SLURM directives + assert "#!/bin/bash" in content + assert "#SBATCH --partition=general" in content + assert "#SBATCH --nodes=1" in content + assert "#SBATCH --ntasks=1" in content + assert "#SBATCH --cpus-per-task=2" in content + assert "#SBATCH --time=01:00:00" in content + + # Clean up + if os.path.exists(script_path): + os.remove(script_path) + + def test_create_job_script_with_all_options(self, mock_model_run): + """Test the _create_job_script method with all options.""" + from rompy.run.slurm import SlurmRunBackend + + config = SlurmConfig( + queue="priority", + nodes=2, + ntasks=4, + cpus_per_task=8, + time_limit="24:00:00", + account="myproject", + qos="high", + reservation="special", + output_file="output_%j.txt", + error_file="error_%j.txt", + job_name="test_job", + mail_type="BEGIN,END,FAIL", + mail_user="test@example.com", + additional_options=["--gres=gpu:1", "--exclusive"], + timeout=86400, + env_vars={"OMP_NUM_THREADS": "8", "MY_VAR": "value"}, + ) + + backend = SlurmRunBackend() + + with TemporaryDirectory() as staging_dir: + script_path = backend._create_job_script(mock_model_run, config, staging_dir) + + with open(script_path, 'r') as f: + content = f.read() + + # Check for all SBATCH directives + assert "#SBATCH --partition=priority" in content + assert "#SBATCH --nodes=2" in content + assert "#SBATCH --ntasks=4" in content + assert "#SBATCH --cpus-per-task=8" in content + assert "#SBATCH --time=24:00:00" in content + assert "#SBATCH --account=myproject" in content + assert "#SBATCH --qos=high" in content + assert "#SBATCH --reservation=special" in content + assert "#SBATCH --output=output_%j.txt" in content + assert "#SBATCH --error=error_%j.txt" in content + assert "#SBATCH --job-name=test_job" in content + assert "#SBATCH --mail-type=BEGIN,END,FAIL" in content + assert "#SBATCH --mail-user=test@example.com" in content + assert "#SBATCH --gres=gpu:1" in content + assert "#SBATCH --exclusive" in content + + # Check for environment variables + assert "export OMP_NUM_THREADS=8" in content + assert "export MY_VAR=value" in content + + # Clean up + if os.path.exists(script_path): + os.remove(script_path) + + def test_submit_job(self, basic_config): + """Test the _submit_job method.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + # Create a simple job script + with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + f.write("#!/bin/bash\n#SBATCH --job-name=test\n") + script_path = f.name + + try: + # Mock subprocess.run to return a successful job submission + with patch("subprocess.run") as mock_run: + mock_run.return_value.stdout = "Submitted batch job 12345" + mock_run.return_value.stderr = "" + mock_run.return_value.returncode = 0 + + job_id = backend._submit_job(script_path) + + assert job_id == "12345" + mock_run.assert_called_once() + + finally: + # Clean up + if os.path.exists(script_path): + os.remove(script_path) + + def test_submit_job_failure(self, basic_config): + """Test the _submit_job method with failure.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + # Create a simple job script + with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: + f.write("#!/bin/bash\n#SBATCH --job-name=test\n") + script_path = f.name + + try: + # Mock subprocess.run to return a failure + with patch("subprocess.run") as mock_run: + mock_run.side_effect = 
Exception("Submission failed") + + job_id = backend._submit_job(script_path) + + assert job_id is None + mock_run.assert_called_once() + + finally: + # Clean up + if os.path.exists(script_path): + os.remove(script_path) + + def test_wait_for_completion_completed(self, basic_config): + """Test _wait_for_completion method for completed job.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + # Mock subprocess.run for squeue to return completed state + with patch("subprocess.run") as mock_run: + # First call returns running, second returns completed + mock_run.side_effect = [ + # Running + MagicMock( + stdout="R\n", + stderr="", + returncode=0 + ), + # Completed + MagicMock( + stdout="CD\n", + stderr="", + returncode=0 + ) + ] + + result = backend._wait_for_completion("12345", basic_config) + + assert result is True + assert mock_run.call_count == 2 + + def test_wait_for_completion_failed(self, basic_config): + """Test _wait_for_completion method for failed job.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + # Mock subprocess.run for squeue to return failed state + with patch("subprocess.run") as mock_run: + mock_result = MagicMock(stdout="F\n", stderr="", returncode=0) + mock_run.return_value = mock_result + + result = backend._wait_for_completion("12345", basic_config) + + assert result is False + + def test_wait_for_completion_timeout(self): + """Test _wait_for_completion method with timeout.""" + from rompy.run.slurm import SlurmRunBackend + import time + from unittest.mock import ANY + + config = SlurmConfig( + queue="test", + timeout=60, # Minimum valid timeout value + nodes=1, + ntasks=1, + cpus_per_task=1, + time_limit="01:00:00", + ) + + backend = SlurmRunBackend() + + # Use a more advanced approach with time mocking + initial_time = time.time() + def time_side_effect(): + # Return an increasing time value to simulate timeout + return initial_time + 120 # More than 60s timeout + + with patch("subprocess.run") as mock_run: + with patch("time.time", side_effect=time_side_effect): + # Return running state to avoid early exit due to job completion + mock_result = MagicMock(stdout="R\n", stderr="", returncode=0) + mock_run.return_value = mock_result + + result = backend._wait_for_completion("12345", config) + + # Should return False due to timeout + assert result is False + + # Verify that scancel was called during timeout handling + mock_run.assert_any_call(['scancel', '12345'], check=True, capture_output=True) + + def test_run_method_success(self, mock_model_run, basic_config): + """Test the full run method with success.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + with TemporaryDirectory() as staging_dir: + # Mock the internal methods + with patch.object(backend, '_create_job_script') as mock_create_script, \ + patch.object(backend, '_submit_job') as mock_submit, \ + patch.object(backend, '_wait_for_completion') as mock_wait: + + # Mock the methods to return expected values + mock_create_script.return_value = "/tmp/job_script.sh" + mock_submit.return_value = "12345" + mock_wait.return_value = True # Job completed successfully + + # Set up the mock model run to return the staging directory + mock_model_run.generate.return_value = staging_dir + + result = backend.run(mock_model_run, basic_config) + + assert result is True + mock_create_script.assert_called_once() + mock_submit.assert_called_once() + mock_wait.assert_called_once_with("12345", basic_config) + + def 
test_run_method_job_submit_failure(self, mock_model_run, basic_config): + """Test the run method when job submission fails.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + with TemporaryDirectory() as staging_dir: + # Mock the internal methods + with patch.object(backend, '_create_job_script') as mock_create_script, \ + patch.object(backend, '_submit_job') as mock_submit: + + # Mock the methods + mock_create_script.return_value = "/tmp/job_script.sh" + mock_submit.return_value = None # Submission failed + + # Set up the mock model run + mock_model_run.generate.return_value = staging_dir + + result = backend.run(mock_model_run, basic_config) + + assert result is False + mock_create_script.assert_called_once() + mock_submit.assert_called_once() + + def test_run_method_generation_failure(self, mock_model_run, basic_config): + """Test the run method when model generation fails.""" + from rompy.run.slurm import SlurmRunBackend + + backend = SlurmRunBackend() + + # Configure mock to raise an exception during generation + mock_model_run.generate.side_effect = Exception("Generation failed") + + result = backend.run(mock_model_run, basic_config) + + assert result is False \ No newline at end of file From 6722b640d418228a27bfbdd6639c319b09af21d6 Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Wed, 1 Oct 2025 09:57:35 +1000 Subject: [PATCH 02/24] Polish and tests --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 90683f2..e3e78d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,6 @@ rompy = "rompy.cli:main" [project.entry-points."rompy.config"] base = "rompy.core.config:BaseConfig" -slurm = "rompy.backends.config:SlurmConfig" [project.entry-points."rompy.source"] file = "rompy.core.source:SourceFile" From 3537db07a0b99541a89ab282b6c54a77bd1c91bf Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Thu, 11 Sep 2025 14:16:51 +1000 Subject: [PATCH 03/24] Replaced subprocess docker calls with docker python library --- pyproject.toml | 1 + src/rompy/run/docker.py | 165 +++++++++++------------ tests/integration/test_docker_backend.py | 151 ++++++++++++--------- 3 files changed, 171 insertions(+), 146 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e3e78d6..3691a85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "cloudpathlib", "cookiecutter>=2.6", "dask", + "docker", "fsspec", "geopandas", "h5py", diff --git a/src/rompy/run/docker.py b/src/rompy/run/docker.py index d860505..20101dc 100644 --- a/src/rompy/run/docker.py +++ b/src/rompy/run/docker.py @@ -12,6 +12,9 @@ import time from typing import TYPE_CHECKING, Dict, List, Optional +import docker +from docker.errors import APIError, BuildError, ContainerError, ImageNotFound + if TYPE_CHECKING: from rompy.backends import DockerConfig @@ -138,39 +141,36 @@ def _prepare_image( logger.info(f"Using existing Docker image: {image_name}") return image_name - # Build arguments - build_args_list = [] - if build_args: - for key, value in build_args.items(): - build_args_list.extend(["--build-arg", f"{key}={value}"]) - - # Build the Docker image + # Build the Docker image using docker-py logger.info( f"Building Docker image {image_name} from {dockerfile} (context: {context_path})" ) - build_cmd = [ - "docker", - "build", - "-t", - image_name, - "-f", - str(dockerfile_path), # Use full path for -f flag - *build_args_list, - str(context_path), - ] - + try: - result = subprocess.run( - build_cmd, - check=True, - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, + client = docker.from_env() + image_obj, build_logs = client.images.build( + path=str(context_path), + dockerfile=str(dockerfile_path.relative_to(context_path)), + tag=image_name, + buildargs=build_args or {}, + rm=True, ) - logger.debug(f"Docker build output: {result.stdout}") + + # Log build output + for line in build_logs: + if 'stream' in line: + logger.debug(line['stream'].strip()) + + logger.info(f"Successfully built Docker image: {image_name}") return image_name - except subprocess.CalledProcessError as e: - logger.error(f"Docker build failed: {e.stderr}") + except BuildError as e: + logger.error(f"Docker build failed: {e.msg}") + for line in e.build_log: + if 'error' in line: + logger.error(f"Build error: {line['error']}") + return None + except APIError as e: + logger.error(f"Docker API error during build: {e}") return None # If neither is provided, use a default image @@ -253,62 +253,61 @@ def _run_container( Returns: True if execution was successful, False otherwise """ - # Set up the Docker command - docker_cmd = [ - "docker", - "run", - "--rm", # Remove container after run - "--user", - "root", # Run as root to avoid permission issues - ] - - # Add environment variables - for key, value in env_vars.items(): - docker_cmd.extend(["-e", f"{key}={value}"]) - - # Add volume mounts - for volume in volume_mounts: - docker_cmd.extend(["-v", volume]) - - # Add the image name and command - docker_cmd.append(image_name) - - # Add bash and -c as separate arguments - docker_cmd.append("bash") - docker_cmd.append("-c") - - # Add the run command as a separate argument - docker_cmd.append(run_command) - try: - logger.info(f"Executing: {' '.join(docker_cmd)}") - # Don't use check=True, so we can see the output even if it fails - result = subprocess.run( - docker_cmd, - check=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - # Always log the output regardless of success/failure - if result.stdout: - logger.info(f"Docker stdout: \n{result.stdout}") - if result.stderr: - logger.warning(f"Docker stderr: \n{result.stderr}") - - # Check return code manually - if result.returncode == 0: - logger.info("Model run completed successfully with exit code 0") + client = docker.from_env() + + # Convert volume mounts to docker-py format + volumes = {} + for volume in volume_mounts: + parts = volume.split(':') + if len(parts) >= 2: + host_path, container_path = parts[0], parts[1] + mode = 'rw' # default mode + if len(parts) > 2: + mode = parts[2] if parts[2] in ['ro', 'rw', 'Z'] else 'rw' + volumes[host_path] = {'bind': container_path, 'mode': mode} + + # Prepare container configuration + container_config = { + 'image': image_name, + 'command': ['bash', '-c', run_command], + 'environment': env_vars, + 'volumes': volumes, + 'user': 'root', + 'remove': True, # Remove container after run + 'stdout': True, + 'stderr': True, + } + + logger.info(f"Running Docker container with image: {image_name}") + logger.debug(f"Command: {run_command}") + logger.debug(f"Volumes: {volumes}") + logger.debug(f"Environment: {env_vars}") + + # Run the container + container = client.containers.run(**container_config) + + # Log output + if container: + logger.info("Model run completed successfully") return True else: - logger.error(f"Model run failed with exit code {result.returncode}") - logger.error(f"Command: {' '.join(docker_cmd)}") + logger.error("Model run failed - no output from container") return False + except ContainerError as e: + 
logger.error(f"Container error: {e}") + if e.stderr: + logger.error(f"Container stderr: {e.stderr}") + return False + except ImageNotFound: + logger.error(f"Docker image not found: {image_name}") + return False + except APIError as e: + logger.error(f"Docker API error: {e}") + return False except Exception as e: logger.error(f"Docker run error: {str(e)}") - logger.error(f"Command: {' '.join(docker_cmd)}") return False def _generate_image_name( @@ -364,15 +363,13 @@ def _image_exists(self, image_name: str) -> bool: True if image exists, False otherwise """ try: - subprocess.run( - ["docker", "image", "inspect", image_name], - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) + client = docker.from_env() + client.images.get(image_name) logger.debug(f"Image {image_name} already exists") return True - except subprocess.CalledProcessError: + except ImageNotFound: logger.debug(f"Image {image_name} does not exist") return False + except APIError as e: + logger.error(f"Error checking for image {image_name}: {e}") + return False diff --git a/tests/integration/test_docker_backend.py b/tests/integration/test_docker_backend.py index 39be1e6..9d2bcdb 100644 --- a/tests/integration/test_docker_backend.py +++ b/tests/integration/test_docker_backend.py @@ -11,6 +11,8 @@ from unittest.mock import patch import pytest +import docker +from docker.errors import APIError from rompy.backends.config import DockerConfig from rompy.core.config import BaseConfig @@ -22,11 +24,10 @@ def docker_available(): """Check if Docker is available and running.""" try: - result = subprocess.run( - ["docker", "info"], capture_output=True, text=True, timeout=10 - ) - return result.returncode == 0 - except (subprocess.TimeoutExpired, FileNotFoundError): + client = docker.from_env() + client.ping() + return True + except (APIError, docker.errors.DockerException): return False @@ -190,11 +191,10 @@ def test_prepare_image_with_dockerfile(self, docker_backend, tmp_path): """ ) - # Mock subprocess.run to avoid actually building - with patch("subprocess.run") as mock_run: - mock_run.return_value.returncode = 0 - mock_run.return_value.stdout = "Successfully built image" - mock_run.return_value.stderr = "" + # Mock docker.from_env to avoid actually building + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.images.build.return_value = ("image_object", [{"stream": "Successfully built image"}]) # Mock _image_exists to return False (image doesn't exist) with patch.object(docker_backend, "_image_exists", return_value=False): @@ -204,7 +204,7 @@ def test_prepare_image_with_dockerfile(self, docker_backend, tmp_path): # Should return a generated image name assert result.startswith("rompy-") - mock_run.assert_called_once() + mock_client.images.build.assert_called_once() def test_prepare_image_with_dockerfile_build_failure( self, docker_backend, tmp_path @@ -215,10 +215,11 @@ def test_prepare_image_with_dockerfile_build_failure( dockerfile = context_dir / "Dockerfile" dockerfile.write_text("INVALID DOCKERFILE CONTENT") - # Mock subprocess.run to simulate build failure - with patch("subprocess.run") as mock_run: - mock_run.side_effect = subprocess.CalledProcessError( - 1, "docker build", stderr="Build failed" + # Mock docker.from_env to simulate build failure + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.images.build.side_effect = docker.errors.BuildError( + "Build failed", [] ) # Mock _image_exists to return 
False (image doesn't exist) @@ -247,11 +248,10 @@ def test_prepare_image_with_build_context(self, docker_backend, tmp_path): test_file = context_dir / "test.txt" test_file.write_text("test content") - # Mock subprocess.run to avoid actually building - with patch("subprocess.run") as mock_run: - mock_run.return_value.returncode = 0 - mock_run.return_value.stdout = "Successfully built image" - mock_run.return_value.stderr = "" + # Mock docker.from_env to avoid actually building + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.images.build.return_value = ("image_object", [{"stream": "Successfully built image"}]) # Mock _image_exists to return False (image doesn't exist) with patch.object(docker_backend, "_image_exists", return_value=False): @@ -262,10 +262,10 @@ def test_prepare_image_with_build_context(self, docker_backend, tmp_path): # Should return a generated image name assert result.startswith("rompy-") - # Check that docker build was called with correct context - mock_run.assert_called_once() - call_args = mock_run.call_args[0][0] - assert str(context_dir) in call_args # Build context should be included + # Check that docker.images.build was called with correct context + mock_client.images.build.assert_called_once() + call_kwargs = mock_client.images.build.call_args[1] + assert call_kwargs["path"] == str(context_dir) def test_prepare_image_with_existing_image(self, docker_backend, tmp_path): """Test _prepare_image with an image that already exists.""" @@ -281,8 +281,8 @@ def test_prepare_image_with_existing_image(self, docker_backend, tmp_path): # Mock _image_exists to return True (image already exists) with patch.object(docker_backend, "_image_exists", return_value=True): - # Mock subprocess.run to ensure it's NOT called - with patch("subprocess.run") as mock_run: + # Mock docker.from_env to ensure it's NOT called + with patch("docker.from_env") as mock_docker: result = docker_backend._prepare_image( None, "Dockerfile", str(context_dir) ) @@ -290,7 +290,7 @@ def test_prepare_image_with_existing_image(self, docker_backend, tmp_path): # Should return the existing image name assert result.startswith("rompy-") # Build should not be called since image exists - mock_run.assert_not_called() + mock_docker.assert_not_called() def test_generate_image_name_deterministic(self, docker_backend, tmp_path): """Test that _generate_image_name produces deterministic results.""" @@ -371,32 +371,28 @@ def test_generate_image_name_unreadable_dockerfile(self, docker_backend, tmp_pat def test_image_exists_true(self, docker_backend): """Test _image_exists when image exists.""" - with patch("subprocess.run") as mock_run: - mock_run.return_value.returncode = 0 - mock_run.return_value.stdout = "image exists" - mock_run.return_value.stderr = "" + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.images.get.return_value = "image_object" result = docker_backend._image_exists("test:image") assert result is True - # Check that docker image inspect was called - mock_run.assert_called_once() - call_args = mock_run.call_args[0][0] - assert "docker" in call_args - assert "image" in call_args - assert "inspect" in call_args - assert "test:image" in call_args + # Check that docker.images.get was called with correct image name + mock_client.images.get.assert_called_once_with("test:image") def test_image_exists_false(self, docker_backend): """Test _image_exists when image doesn't exist.""" - with patch("subprocess.run") as 
mock_run: - mock_run.side_effect = subprocess.CalledProcessError( - 1, "docker image inspect", stderr="No such image" - ) + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.images.get.side_effect = docker.errors.ImageNotFound("No such image") result = docker_backend._image_exists("nonexistent:image") assert result is False + # Check that docker.images.get was called with correct image name + mock_client.images.get.assert_called_once_with("nonexistent:image") + def test_get_run_command_simple(self, docker_backend): """Test _get_run_command with simple parameters.""" result = docker_backend._get_run_command( @@ -447,10 +443,9 @@ class TestDockerBackendMocked: def test_run_container_success(self, docker_backend): """Test _run_container with successful execution.""" - with patch("subprocess.run") as mock_run: - mock_run.return_value.returncode = 0 - mock_run.return_value.stdout = "Container executed successfully" - mock_run.return_value.stderr = "" + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.containers.run.return_value = "container_output" result = docker_backend._run_container( image_name="test:image", @@ -460,21 +455,52 @@ def test_run_container_success(self, docker_backend): ) assert result is True - mock_run.assert_called_once() - - # Check that the docker command was constructed correctly - call_args = mock_run.call_args[0][0] - assert "docker" in call_args - assert "run" in call_args - assert "--rm" in call_args - assert "test:image" in call_args + mock_client.containers.run.assert_called_once() + + # Check that the container was run with correct parameters + call_kwargs = mock_client.containers.run.call_args[1] + assert call_kwargs["image"] == "test:image" + assert call_kwargs["command"] == ["bash", "-c", "echo test"] + assert call_kwargs["environment"] == {"TEST": "value"} + assert call_kwargs["remove"] is True def test_run_container_failure(self, docker_backend): - """Test _run_container with failed execution.""" - with patch("subprocess.run") as mock_run: - mock_run.return_value.returncode = 1 - mock_run.return_value.stdout = "" - mock_run.return_value.stderr = "Container failed" + """Test _run_container with container error.""" + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.containers.run.side_effect = docker.errors.ContainerError( + "container_id", 1, "echo test", "test:image", "Container failed" + ) + + result = docker_backend._run_container( + image_name="test:image", + run_command="echo test", + volume_mounts=[], + env_vars={}, + ) + + assert result is False + + def test_run_container_image_not_found(self, docker_backend): + """Test _run_container with image not found.""" + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.containers.run.side_effect = docker.errors.ImageNotFound("No such image") + + result = docker_backend._run_container( + image_name="nonexistent:image", + run_command="echo test", + volume_mounts=[], + env_vars={}, + ) + + assert result is False + + def test_run_container_api_error(self, docker_backend): + """Test _run_container with Docker API error.""" + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.containers.run.side_effect = docker.errors.APIError("API error") result = docker_backend._run_container( image_name="test:image", @@ -486,9 +512,10 @@ def test_run_container_failure(self, 
docker_backend): assert result is False def test_run_container_exception(self, docker_backend): - """Test _run_container with subprocess exception.""" - with patch("subprocess.run") as mock_run: - mock_run.side_effect = Exception("Docker not available") + """Test _run_container with generic exception.""" + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.containers.run.side_effect = Exception("Docker not available") result = docker_backend._run_container( image_name="test:image", From dbdc2be7d36cce538653e97a4623789e1396b163 Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Thu, 11 Sep 2025 14:22:50 +1000 Subject: [PATCH 04/24] Update tests/integration/test_docker_backend.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/test_docker_backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_docker_backend.py b/tests/integration/test_docker_backend.py index 9d2bcdb..6083bdd 100644 --- a/tests/integration/test_docker_backend.py +++ b/tests/integration/test_docker_backend.py @@ -468,8 +468,10 @@ def test_run_container_failure(self, docker_backend): """Test _run_container with container error.""" with patch("docker.from_env") as mock_docker: mock_client = mock_docker.return_value + from unittest.mock import Mock + mock_container = Mock() mock_client.containers.run.side_effect = docker.errors.ContainerError( - "container_id", 1, "echo test", "test:image", "Container failed" + mock_container, 1, "echo test", "test:image", "Container failed" ) result = docker_backend._run_container( From 9d357b6a164237930f38d12acad88406d418e2ac Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Thu, 11 Sep 2025 14:27:40 +1000 Subject: [PATCH 05/24] Fixed failing test --- tests/backends/test_pydantic_backends.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/backends/test_pydantic_backends.py b/tests/backends/test_pydantic_backends.py index aafba08..42ffa13 100644 --- a/tests/backends/test_pydantic_backends.py +++ b/tests/backends/test_pydantic_backends.py @@ -406,6 +406,7 @@ def test_local_config_integration(self, mock_model_run): def test_docker_config_integration(self, mock_model_run): """Test DockerConfig integration with DockerRunBackend.""" import tempfile + import docker config = DockerConfig( image="test:latest", @@ -423,17 +424,16 @@ def test_docker_config_integration(self, mock_model_run): # Update mock to return existing directory mock_model_run.generate.return_value = temp_dir - # Mock docker subprocess call - with patch("subprocess.run") as mock_run: - mock_run.return_value.returncode = 0 - mock_run.return_value.stdout = "docker output" - mock_run.return_value.stderr = "" + # Mock docker-py calls + with patch("docker.from_env") as mock_docker: + mock_client = mock_docker.return_value + mock_client.containers.run.return_value = "container_output" # Run with config result = backend.run(mock_model_run, config=config) assert result is True - mock_run.assert_called_once() + mock_client.containers.run.assert_called_once() def test_pydantic_config_integration(self, mock_model_run): """Test that backends work with Pydantic config objects only.""" From e953b15cf49885fc76d792b1ac84a628b62f99e6 Mon Sep 17 00:00:00 2001 From: rafa-guedes Date: Wed, 1 Oct 2025 17:47:48 +1300 Subject: [PATCH 06/24] log_box imported twice --- src/rompy/model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/rompy/model.py b/src/rompy/model.py index 
fd65fd4..ae5e874 100644 --- a/src/rompy/model.py +++ b/src/rompy/model.py @@ -226,10 +226,6 @@ def generate(self) -> str: logger.debug(f"Configuration string formatting error: {str(e)}") logger.info("") - - # Use the log_box utility function - from rompy.formatting import log_box - log_box( title="STARTING MODEL GENERATION", logger=logger, From 7e691c12280c4f59ca1b558bdb164b292493a31f Mon Sep 17 00:00:00 2001 From: rafa-guedes Date: Wed, 1 Oct 2025 18:21:06 +1300 Subject: [PATCH 07/24] Replace deprecated utcnow --- src/rompy/model.py | 4 ++-- tests/test_intake_driver.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/rompy/model.py b/src/rompy/model.py index ae5e874..a42257c 100644 --- a/src/rompy/model.py +++ b/src/rompy/model.py @@ -9,7 +9,7 @@ import platform import shutil import zipfile as zf -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, Literal, Optional, Union @@ -131,7 +131,7 @@ def _create_staging_dir(self): @property def _generation_medatadata(self): return dict( - _generated_at=str(datetime.utcnow()), + _generated_at=str(datetime.now(timezone.utc)), _generated_by=os.environ.get("USER"), _generated_on=platform.node(), ) diff --git a/tests/test_intake_driver.py b/tests/test_intake_driver.py index f353388..84118b7 100644 --- a/tests/test_intake_driver.py +++ b/tests/test_intake_driver.py @@ -1,8 +1,8 @@ import os - -# Import test utilities +from datetime import timezone from test_utils.logging import get_test_logger + # Initialize logger logger = get_test_logger(__name__) @@ -14,7 +14,7 @@ from rompy.core.data import DataGrid # round now to the nearest 6 hours -cycle = datetime.utcnow().replace( +cycle = datetime.now(timezone.utc).replace( hour=0, minute=0, second=0, microsecond=0 ) - timedelta(days=2) From ac7290495fc4099123ead81938bc4ac3687abed3 Mon Sep 17 00:00:00 2001 From: rafa-guedes Date: Fri, 3 Oct 2025 09:56:38 +1300 Subject: [PATCH 08/24] Suppressing numpy incompatibility warnings --- pytest.ini | 2 ++ tests/conftest.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/pytest.ini b/pytest.ini index c7cb2d7..f728df6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -5,3 +5,5 @@ addopts = --tb=short -v markers = slow: marks tests as slow (deselect with '-m "not slow"') integration: marks tests as integration tests +filterwarnings = + ignore:numpy.ndarray size changed:RuntimeWarning diff --git a/tests/conftest.py b/tests/conftest.py index a570992..9274596 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,12 @@ import subprocess import sys import tempfile +import warnings import zipfile +# Suppress numpy binary incompatibility warning +warnings.filterwarnings("ignore", message="numpy.ndarray size changed", category=RuntimeWarning) + import pytest import requests From 7432dcc913b19218a4adcaa5e2a4c9bca06cf981 Mon Sep 17 00:00:00 2001 From: rafa-guedes Date: Fri, 3 Oct 2025 10:04:53 +1300 Subject: [PATCH 09/24] Definitive fix for the numpy warning in the tests --- tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 9274596..6b03b46 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -60,6 +60,9 @@ def pytest_configure(config): """Configure pytest with plugins and settings, and ensure test data is present.""" import logging + # Suppress numpy binary incompatibility warning + warnings.filterwarnings("ignore", message=".*numpy.ndarray size changed.*", category=RuntimeWarning) + # Get log 
level from command line or use default log_level_str = config.getoption("--rompy-log-level") getattr(logging, log_level_str) From 2499cf0d1786f9638bca815e8b8e56f1692078ef Mon Sep 17 00:00:00 2001 From: rafa-guedes Date: Fri, 3 Oct 2025 13:49:13 +1300 Subject: [PATCH 10/24] Run ruff across the repo --- examples/configs/validate_configs.py | 2 +- pyproject.toml | 9 +++++ src/rompy/core/grid.py | 1 - src/rompy/model.py | 4 +- src/rompy/run/docker.py | 40 +++++++++---------- tests/backends/test_enhanced_backends.py | 3 +- .../test_modelrun_pydantic_integration.py | 3 +- tests/conftest.py | 8 +++- tests/integration/test_docker_backend.py | 21 +++++++--- tests/test_data.py | 9 ++++- 10 files changed, 66 insertions(+), 34 deletions(-) diff --git a/examples/configs/validate_configs.py b/examples/configs/validate_configs.py index f43ca3f..1b96864 100644 --- a/examples/configs/validate_configs.py +++ b/examples/configs/validate_configs.py @@ -184,7 +184,7 @@ def validate_yaml_file(file_path: Path) -> bool: if doc is None: continue - doc_name = f"document {i+1}" if len(documents) > 1 else "document" + doc_name = f"document {i + 1}" if len(documents) > 1 else "document" # Determine configuration type and validate if "pipeline_backend" in doc: diff --git a/pyproject.toml b/pyproject.toml index 3691a85..73d2a7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,5 +156,14 @@ log_cli_date_format = "%Y-%m-%d %H:%M:%S" [tool.black] line-length = 88 +[tool.ruff] +line-length = 88 + +[tool.ruff.format] +# Use Black-compatible formatting +quote-style = "double" +indent-style = "space" +line-ending = "auto" + [tool.setuptools_scm] write_to = "src/rompy/_version.py" diff --git a/src/rompy/core/grid.py b/src/rompy/core/grid.py index 39dbc81..9ca25f7 100644 --- a/src/rompy/core/grid.py +++ b/src/rompy/core/grid.py @@ -294,7 +294,6 @@ def __str__(self): if __name__ == "__main__": - grid0 = RegularGrid(x0=-1, y0=1, rot=35, nx=10, ny=10, dx=1, dy=2) grid1 = RegularGrid(x=grid0.x, y=grid0.y) diff --git a/src/rompy/model.py b/src/rompy/model.py index a42257c..61dab17 100644 --- a/src/rompy/model.py +++ b/src/rompy/model.py @@ -205,7 +205,9 @@ def generate(self) -> str: # Log the bottom of the box log_box( - title=None, logger=logger, add_empty_line=True # Just the bottom border + title=None, + logger=logger, + add_empty_line=True, # Just the bottom border ) # Display detailed configuration info using the new formatting framework diff --git a/src/rompy/run/docker.py b/src/rompy/run/docker.py index 20101dc..5308936 100644 --- a/src/rompy/run/docker.py +++ b/src/rompy/run/docker.py @@ -145,7 +145,7 @@ def _prepare_image( logger.info( f"Building Docker image {image_name} from {dockerfile} (context: {context_path})" ) - + try: client = docker.from_env() image_obj, build_logs = client.images.build( @@ -155,18 +155,18 @@ def _prepare_image( buildargs=build_args or {}, rm=True, ) - + # Log build output for line in build_logs: - if 'stream' in line: - logger.debug(line['stream'].strip()) - + if "stream" in line: + logger.debug(line["stream"].strip()) + logger.info(f"Successfully built Docker image: {image_name}") return image_name except BuildError as e: logger.error(f"Docker build failed: {e.msg}") for line in e.build_log: - if 'error' in line: + if "error" in line: logger.error(f"Build error: {line['error']}") return None except APIError as e: @@ -255,28 +255,28 @@ def _run_container( """ try: client = docker.from_env() - + # Convert volume mounts to docker-py format volumes = {} for volume in volume_mounts: - parts = 
volume.split(':') + parts = volume.split(":") if len(parts) >= 2: host_path, container_path = parts[0], parts[1] - mode = 'rw' # default mode + mode = "rw" # default mode if len(parts) > 2: - mode = parts[2] if parts[2] in ['ro', 'rw', 'Z'] else 'rw' - volumes[host_path] = {'bind': container_path, 'mode': mode} + mode = parts[2] if parts[2] in ["ro", "rw", "Z"] else "rw" + volumes[host_path] = {"bind": container_path, "mode": mode} # Prepare container configuration container_config = { - 'image': image_name, - 'command': ['bash', '-c', run_command], - 'environment': env_vars, - 'volumes': volumes, - 'user': 'root', - 'remove': True, # Remove container after run - 'stdout': True, - 'stderr': True, + "image": image_name, + "command": ["bash", "-c", run_command], + "environment": env_vars, + "volumes": volumes, + "user": "root", + "remove": True, # Remove container after run + "stdout": True, + "stderr": True, } logger.info(f"Running Docker container with image: {image_name}") @@ -286,7 +286,7 @@ def _run_container( # Run the container container = client.containers.run(**container_config) - + # Log output if container: logger.info("Model run completed successfully") diff --git a/tests/backends/test_enhanced_backends.py b/tests/backends/test_enhanced_backends.py index b5a970a..46a99e8 100644 --- a/tests/backends/test_enhanced_backends.py +++ b/tests/backends/test_enhanced_backends.py @@ -101,7 +101,8 @@ def test_run_with_command_failure(self, model_run, tmp_path): output_dir.mkdir(parents=True, exist_ok=True) config = LocalConfig( - command="exit 1", working_dir=output_dir # Command that will fail + command="exit 1", + working_dir=output_dir, # Command that will fail ) with patch("rompy.model.ModelRun.generate", return_value=str(output_dir)): diff --git a/tests/backends/test_modelrun_pydantic_integration.py b/tests/backends/test_modelrun_pydantic_integration.py index 29c3285..d4e56a8 100644 --- a/tests/backends/test_modelrun_pydantic_integration.py +++ b/tests/backends/test_modelrun_pydantic_integration.py @@ -162,7 +162,8 @@ def test_run_backend_failure_propagation(self, model_run, tmp_path): # Create LocalConfig with failing command config = LocalConfig( - command="exit 1", working_dir=output_dir # Command that will fail + command="exit 1", + working_dir=output_dir, # Command that will fail ) with patch("rompy.model.ModelRun.generate", return_value=str(output_dir)): diff --git a/tests/conftest.py b/tests/conftest.py index 6b03b46..f1dd24a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,9 @@ import zipfile # Suppress numpy binary incompatibility warning -warnings.filterwarnings("ignore", message="numpy.ndarray size changed", category=RuntimeWarning) +warnings.filterwarnings( + "ignore", message="numpy.ndarray size changed", category=RuntimeWarning +) import pytest import requests @@ -61,7 +63,9 @@ def pytest_configure(config): import logging # Suppress numpy binary incompatibility warning - warnings.filterwarnings("ignore", message=".*numpy.ndarray size changed.*", category=RuntimeWarning) + warnings.filterwarnings( + "ignore", message=".*numpy.ndarray size changed.*", category=RuntimeWarning + ) # Get log level from command line or use default log_level_str = config.getoption("--rompy-log-level") diff --git a/tests/integration/test_docker_backend.py b/tests/integration/test_docker_backend.py index 6083bdd..ad0d5a3 100644 --- a/tests/integration/test_docker_backend.py +++ b/tests/integration/test_docker_backend.py @@ -194,7 +194,10 @@ def 
test_prepare_image_with_dockerfile(self, docker_backend, tmp_path): # Mock docker.from_env to avoid actually building with patch("docker.from_env") as mock_docker: mock_client = mock_docker.return_value - mock_client.images.build.return_value = ("image_object", [{"stream": "Successfully built image"}]) + mock_client.images.build.return_value = ( + "image_object", + [{"stream": "Successfully built image"}], + ) # Mock _image_exists to return False (image doesn't exist) with patch.object(docker_backend, "_image_exists", return_value=False): @@ -251,7 +254,10 @@ def test_prepare_image_with_build_context(self, docker_backend, tmp_path): # Mock docker.from_env to avoid actually building with patch("docker.from_env") as mock_docker: mock_client = mock_docker.return_value - mock_client.images.build.return_value = ("image_object", [{"stream": "Successfully built image"}]) + mock_client.images.build.return_value = ( + "image_object", + [{"stream": "Successfully built image"}], + ) # Mock _image_exists to return False (image doesn't exist) with patch.object(docker_backend, "_image_exists", return_value=False): @@ -385,7 +391,9 @@ def test_image_exists_false(self, docker_backend): """Test _image_exists when image doesn't exist.""" with patch("docker.from_env") as mock_docker: mock_client = mock_docker.return_value - mock_client.images.get.side_effect = docker.errors.ImageNotFound("No such image") + mock_client.images.get.side_effect = docker.errors.ImageNotFound( + "No such image" + ) result = docker_backend._image_exists("nonexistent:image") assert result is False @@ -456,7 +464,7 @@ def test_run_container_success(self, docker_backend): assert result is True mock_client.containers.run.assert_called_once() - + # Check that the container was run with correct parameters call_kwargs = mock_client.containers.run.call_args[1] assert call_kwargs["image"] == "test:image" @@ -469,6 +477,7 @@ def test_run_container_failure(self, docker_backend): with patch("docker.from_env") as mock_docker: mock_client = mock_docker.return_value from unittest.mock import Mock + mock_container = Mock() mock_client.containers.run.side_effect = docker.errors.ContainerError( mock_container, 1, "echo test", "test:image", "Container failed" @@ -487,7 +496,9 @@ def test_run_container_image_not_found(self, docker_backend): """Test _run_container with image not found.""" with patch("docker.from_env") as mock_docker: mock_client = mock_docker.return_value - mock_client.containers.run.side_effect = docker.errors.ImageNotFound("No such image") + mock_client.containers.run.side_effect = docker.errors.ImageNotFound( + "No such image" + ) result = docker_backend._run_container( image_name="nonexistent:image", diff --git a/tests/test_data.py b/tests/test_data.py index 9aa850d..7d10713 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -191,9 +191,14 @@ def test_source_datamesh(): datasource="era5_wind10m", token=DATAMESH_TOKEN ) filters = Filter() - filters.crop.update(dict(time=Slice(start="2000-01-01T00:00:00", stop="2000-01-01T03:00:00"))) filters.crop.update( - dict(longitude=Slice(start=115.5, stop=116.0), latitude=Slice(start=-33.0, stop=-32.5)) + dict(time=Slice(start="2000-01-01T00:00:00", stop="2000-01-01T03:00:00")) + ) + filters.crop.update( + dict( + longitude=Slice(start=115.5, stop=116.0), + latitude=Slice(start=-33.0, stop=-32.5), + ) ) dset = dataset.open( variables=["u10"], From ccde632d4650d8cdcfbf1fab3a1bb8385b10df54 Mon Sep 17 00:00:00 2001 From: rafa-guedes Date: Thu, 9 Oct 2025 14:00:29 +1300 Subject: [PATCH 
11/24] Add to extra dependencies the remote dependencies for cloudpathlib --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 73d2a7e..c9f573a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -115,6 +115,7 @@ test = [ extra = [ "gcsfs", "zarr", + "cloudpathlib[s3,gs,azure]", ] dev = [ "pytest", From 5367959b967698e8618d381779b809e054001edb Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Mon, 20 Oct 2025 16:57:34 +1100 Subject: [PATCH 12/24] Added slurm examples --- examples/backends/05_slurm_backend_run.py | 352 ++++++++++++++++++++ examples/backends/README.md | 164 +++------ examples/configs/README.md | 45 +++ examples/configs/slurm_backend.yml | 18 + examples/configs/slurm_backend_examples.yml | 81 +++++ 5 files changed, 551 insertions(+), 109 deletions(-) create mode 100644 examples/backends/05_slurm_backend_run.py create mode 100644 examples/configs/slurm_backend.yml create mode 100644 examples/configs/slurm_backend_examples.yml diff --git a/examples/backends/05_slurm_backend_run.py b/examples/backends/05_slurm_backend_run.py new file mode 100644 index 0000000..2aae157 --- /dev/null +++ b/examples/backends/05_slurm_backend_run.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python3 +""" +ROMPY SLURM Backend Example + +This example demonstrates how to use the SLURM backend to run models on HPC clusters. +The SLURM backend enables resource management and job scheduling for high-performance +computing environments. + +Run this example: + python 05_slurm_backend_run.py + +Note: This example requires access to a SLURM-managed HPC cluster. +""" + +import logging +import tempfile +from datetime import datetime +from pathlib import Path + +from rompy.backends import SlurmConfig +from rompy.core.time import TimeRange +from rompy.model import ModelRun + +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + + +def example_slurm_basic(): + """ + Example 1: Basic SLURM execution + + This example demonstrates the simplest configuration for running a model + on a SLURM cluster with minimal parameters. 
+ """ + logger.info("=" * 60) + logger.info("Example 1: Basic SLURM Execution") + logger.info("=" * 60) + logger.info("This example demonstrates the simplest SLURM backend configuration.") + logger.info("") + + with tempfile.TemporaryDirectory() as temp_dir: + # Create a basic model run + model = ModelRun( + run_id="slurm_basic_example", + period=TimeRange( + start=datetime(2023, 1, 1), + end=datetime(2023, 1, 2), + interval="1H", + ), + output_dir=Path(temp_dir), + delete_existing=True, + ) + + # Basic SLURM configuration + config = SlurmConfig( + queue="general", # SLURM partition name + timeout=1800, # Max execution time in seconds (30 minutes) + nodes=1, # Number of nodes to allocate + ntasks=1, # Number of tasks (processes) to run + cpus_per_task=2, # Number of CPU cores per task + time_limit="00:30:00", # Time limit in HH:MM:SS format + ) + + logger.info(f"SlurmConfig created: {config}") + logger.info("Running model with basic SLURM configuration...") + + try: + # This would submit the job to SLURM (in a real environment) + # success = model.run(backend=config) + # Since we're not in a real SLURM environment, we'll just show the config + logger.info("✅ SlurmConfig validated successfully") + logger.info("Key concepts: SlurmConfig, queue, nodes, ntasks, cpus_per_task") + logger.info("Note: In a real environment, this would submit to SLURM") + except Exception as e: + logger.error(f"❌ SLURM model run failed: {e}") + + +def example_slurm_advanced(): + """ + Example 2: Advanced SLURM execution with multiple parameters + + This example shows how to configure complex SLURM jobs with multiple + resource allocations, environment variables, and custom options. + """ + logger.info("=" * 60) + logger.info("Example 2: Advanced SLURM Configuration") + logger.info("=" * 60) + logger.info("This example demonstrates advanced SLURM backend configuration.") + logger.info("") + + with tempfile.TemporaryDirectory() as temp_dir: + model = ModelRun( + run_id="slurm_advanced_example", + period=TimeRange( + start=datetime(2023, 1, 1), + end=datetime(2023, 1, 3), + interval="1H", + ), + output_dir=Path(temp_dir), + delete_existing=True, + ) + + # Advanced SLURM configuration with many parameters + config = SlurmConfig( + queue="gpu", # GPU partition + timeout=7200, # 2 hours timeout + nodes=2, # 2 compute nodes + ntasks=8, # 8 tasks total + cpus_per_task=4, # 4 CPUs per task + time_limit="02:00:00", # 2 hours time limit + account="research_project", # Account for billing + qos="high", # Quality of Service + reservation="special_reservation", # Reservation name + output_file="slurm-%j.out", # Output file pattern (job ID) + error_file="slurm-%j.err", # Error file pattern + job_name="advanced_simulation", # Name of the SLURM job + mail_type="BEGIN,END,FAIL", # Types of notifications + mail_user="researcher@domain.com", # Email for notifications + additional_options=["--gres=gpu:v100:2", "--exclusive"], # GPU resources + env_vars={ # Environment variables + "OMP_NUM_THREADS": "4", + "MODEL_DEBUG": "true", + "DATA_PATH": "/shared/data", + "RESULTS_PATH": "/shared/results", + }, + ) + + logger.info(f"Advanced SlurmConfig created: {config}") + logger.info("Running model with advanced SLURM configuration...") + + try: + # Show validation success + logger.info("✅ Advanced SlurmConfig validated successfully") + logger.info("Key concepts: account, qos, reservations, GRES, environment variables") + logger.info("Note: In a real environment, this would submit a complex job to SLURM") + except Exception as e: + 
logger.error(f"❌ Advanced SLURM configuration failed: {e}") + + +def example_slurm_with_custom_command(): + """ + Example 3: SLURM execution with custom command + + This example shows how to run a custom command on the SLURM cluster, + useful for executing different types of jobs or calling external binaries. + """ + logger.info("=" * 60) + logger.info("Example 3: SLURM with Custom Command") + logger.info("=" * 60) + logger.info("This example demonstrates running custom commands on SLURM.") + logger.info("") + + with tempfile.TemporaryDirectory() as temp_dir: + model = ModelRun( + run_id="slurm_custom_command_example", + period=TimeRange( + start=datetime(2023, 1, 1), + end=datetime(2023, 1, 2), + interval="1H", + ), + output_dir=Path(temp_dir), + delete_existing=True, + ) + + # SLURM configuration with a custom command + config = SlurmConfig( + queue="general", + timeout=3600, # 1 hour timeout + nodes=1, + ntasks=1, + cpus_per_task=2, + time_limit="01:00:00", + command="echo 'Running custom SLURM job' && date && pwd && ls -la", # Custom command + env_vars={"CUSTOM_VAR": "value"}, + ) + + logger.info(f"SlurmConfig with custom command: {config}") + logger.info("Running custom command on SLURM...") + + try: + logger.info("✅ SlurmConfig with custom command validated successfully") + logger.info("Key concepts: command parameter, custom execution") + logger.info("Note: In a real environment, this would execute the custom command on SLURM") + except Exception as e: + logger.error(f"❌ SLURM custom command configuration failed: {e}") + + +def example_slurm_from_dict(): + """ + Example 4: Creating SLURM configuration from dictionary + + This example shows how to create SLURM configurations from dictionaries, + which is useful when loading from configuration files (YAML/JSON). + """ + logger.info("=" * 60) + logger.info("Example 4: SLURM Configuration from Dictionary") + logger.info("=" * 60) + logger.info("This example demonstrates creating SLURM configs from dictionaries.") + logger.info("") + + # Simulate loading from YAML/JSON file + slurm_config_data = { + "queue": "compute", + "timeout": 7200, + "nodes": 1, + "ntasks": 4, + "cpus_per_task": 2, + "time_limit": "02:00:00", + "account": "myproject", + "env_vars": { + "OMP_NUM_THREADS": "2", + "MODEL_PRECISION": "double", + "DATA_DIR": "/shared/data" + }, + "job_name": "yaml_configured_job", + "additional_options": ["--mem-per-cpu=2048"] + } + + try: + # Create configuration from dictionary + config = SlurmConfig(**slurm_config_data) + + logger.info("✅ SLURM configuration created from dictionary:") + logger.info(f" Queue: {config.queue}") + logger.info(f" Nodes: {config.nodes}") + logger.info(f" Total CPU cores: {config.ntasks * config.cpus_per_task}") + logger.info(f" Time limit: {config.time_limit}") + logger.info(f" Environment variables: {len(config.env_vars)}") + logger.info("Key concepts: dictionary unpacking, YAML/JSON compatibility") + logger.info("Note: This is how configuration files are loaded in production") + except Exception as e: + logger.error(f"❌ SLURM dictionary configuration failed: {e}") + + +def example_slurm_validation(): + """ + Example 5: SLURM configuration validation + + This example demonstrates ROMPY's built-in validation for SLURM configurations. + The Pydantic model catches configuration errors before runtime. 
+ """ + logger.info("=" * 60) + logger.info("Example 5: SLURM Configuration Validation") + logger.info("=" * 60) + logger.info("This example shows how ROMPY validates SLURM configurations automatically.") + logger.info("") + + from pydantic import ValidationError + + # Valid SLURM configuration + try: + valid_config = SlurmConfig( + queue="general", + timeout=3600, + nodes=1, + ntasks=1, + cpus_per_task=2, + time_limit="01:00:00", + env_vars={"TEST_VAR": "value"} + ) + logger.info("✅ Valid SlurmConfig created successfully") + except Exception as e: + logger.error(f"❌ Valid SLURM config validation failed unexpectedly: {e}") + + # Invalid time limit format + logger.info("Testing invalid time limit format...") + try: + invalid_config = SlurmConfig( + queue="general", + time_limit="25:00", # Invalid format - missing seconds + ) + logger.info("❌ This should not succeed") + except ValidationError as e: + logger.info(f"✅ Validation correctly caught time limit error: {e.errors()[0]['msg']}") + + # Invalid number of nodes (too high) + logger.info("Testing invalid number of nodes...") + try: + invalid_config = SlurmConfig( + queue="general", + nodes=101, # Max is 100 + time_limit="01:00:00" + ) + logger.info("❌ This should not succeed") + except ValidationError as e: + logger.info(f"✅ Validation correctly caught nodes error: {e.errors()[0]['msg']}") + + # Invalid cpus_per_task (too high) + logger.info("Testing invalid CPUs per task...") + try: + invalid_config = SlurmConfig( + queue="general", + cpus_per_task=129, # Max is 128 + time_limit="01:00:00" + ) + logger.info("❌ This should not succeed") + except ValidationError as e: + logger.info(f"✅ Validation correctly caught cpus_per_task error: {e.errors()[0]['msg']}") + + logger.info("Key concepts: Pydantic validation, error handling, configuration safety") + + +def main(): + """Run all SLURM backend examples.""" + logger.info("🚀 ROMPY SLURM Backend Examples") + logger.info("================================") + logger.info("These examples demonstrate how to use ROMPY with SLURM clusters for HPC jobs.") + logger.info("Each example builds on the previous one to show increasingly sophisticated usage.") + logger.info("") + + # Run examples + examples = [ + example_slurm_basic, + example_slurm_advanced, + example_slurm_with_custom_command, + example_slurm_from_dict, + example_slurm_validation, + ] + + completed_examples = 0 + for i, example in enumerate(examples, 1): + try: + logger.info(f"Running example {i}/{len(examples)}...") + example() + completed_examples += 1 + logger.info("") + except Exception as e: + logger.error(f"❌ Example {example.__name__} failed: {e}") + logger.info("") + + logger.info("=" * 60) + logger.info( + f"🎉 SLURM examples completed! ({completed_examples}/{len(examples)} examples ran successfully)" + ) + logger.info("=" * 60) + logger.info("What you learned:") + logger.info("• Basic SLURM execution with SlurmConfig") + logger.info("• Advanced SLURM parameters: queues, nodes, tasks, resources") + logger.info("• Custom commands and environment variables") + logger.info("• Configuration from dictionaries") + logger.info("• Built-in validation for SLURM configurations") + logger.info("") + logger.info("Next steps:") + logger.info("1. Review the SlurmConfig documentation for all available parameters") + logger.info("2. Try these configurations in your actual SLURM environment") + logger.info("3. Create your own SLURM configuration files for your models") + logger.info("4. 
Combine with other ROMPY features like postprocessing and pipelines") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/backends/README.md b/examples/backends/README.md index d5f2362..074cce5 100644 --- a/examples/backends/README.md +++ b/examples/backends/README.md @@ -1,129 +1,75 @@ -# Backend Examples +# ROMPY SLURM Backend Examples -This directory contains examples demonstrating how to use ROMPY's backend configuration system to execute models in different environments. +This directory contains examples of how to use ROMPY with SLURM for HPC cluster execution. -## Overview +## Examples -ROMPY uses Pydantic-based backend configurations to provide type-safe, validated execution parameters for different environments. This system enables precise control over model execution while maintaining flexibility and extensibility. +### 05_slurm_backend_run.py +A comprehensive tutorial showing different ways to configure and use the SLURM backend: -## Available Examples - -### 1. Basic Local Run (`01_basic_local_run.py`) -Demonstrates the simplest use case: -- Local execution with `LocalConfig` -- Basic timeout and command configuration -- No-op postprocessing - -### 2. Docker Run (`02_docker_run.py`) -Shows Docker container execution: -- Using pre-built Docker images -- Volume mounting for data access -- Environment variable configuration -- Resource limits (CPU, memory) - -### 3. Custom Postprocessor (`03_custom_postprocessor.py`) -Illustrates custom postprocessing: -- Creating custom postprocessor classes -- Processing model outputs after execution -- Error handling and result reporting - -### 4. Complete Workflow (`04_complete_workflow.py`) -Demonstrates a full workflow: -- Model execution with local backend -- Custom postprocessing with file analysis -- Comprehensive logging and error handling - -## Backend Configuration Types - -### LocalConfig -For execution on the local system: -```python -from rompy.backends import LocalConfig - -config = LocalConfig( - timeout=3600, # 1 hour - command="python run_model.py", - env_vars={"OMP_NUM_THREADS": "4"}, - shell=True, - capture_output=True -) -``` - -### DockerConfig -For execution in Docker containers: -```python -from rompy.backends import DockerConfig - -config = DockerConfig( - image="python:3.9-slim", - cpu=2, - memory="2g", - timeout=7200, - volumes=["/data:/app/data:rw"], - env_vars={"MODEL_CONFIG": "production"} -) -``` - -## Running the Examples - -Each example can be run directly: +- Basic SLURM execution +- Advanced SLURM configuration with multiple parameters +- Custom commands on SLURM +- Creating configurations from dictionaries +- Configuration validation +Run the example: ```bash -# Basic local execution -python 01_basic_local_run.py +python 05_slurm_backend_run.py +``` -# Docker execution (requires Docker) -python 02_docker_run.py +## Configuration Files -# Custom postprocessing -python 03_custom_postprocessor.py +### slurm_backend.yml +A basic configuration file for running jobs on SLURM with minimal parameters. 
-# Complete workflow -python 04_complete_workflow.py -``` +### slurm_backend_examples.yml +A collection of different SLURM configuration examples: +- Basic SLURM configuration +- Advanced GPU job configuration +- High-memory job configuration +- Custom working directory configuration ## Key Features -- **Type Safety**: All configurations are validated using Pydantic -- **IDE Support**: Full autocompletion and inline documentation -- **Flexibility**: Easy to extend with custom backends and postprocessors -- **Error Handling**: Clear validation errors and execution feedback -- **Serialization**: Configurations can be saved/loaded as YAML/JSON +The ROMPY SLURM backend supports: -## Configuration Validation +- **Resource allocation**: Specify nodes, tasks, and CPU cores +- **Queue/partition selection**: Run on different SLURM partitions +- **Time limits**: Set job time limits in HH:MM:SS format +- **Environment variables**: Set environment variables for your job +- **Job notifications**: Email notifications on job start/end/failure +- **Custom commands**: Run custom commands instead of the default model run +- **Additional SLURM options**: Pass any additional SLURM options via `additional_options` +- **GPU resources**: Support for GPU allocation via `--gres` options -Backend configurations provide comprehensive validation: -- Timeout values must be between 60 and 86400 seconds -- Working directories must exist if specified -- Docker image names must follow valid conventions -- Volume mounts must reference existing host paths +## Usage -## Best Practices +To use the SLURM backend in your application: -1. **Set appropriate timeouts** based on your model complexity -2. **Use environment variables** for sensitive configuration -3. **Validate configurations** before execution -4. **Handle errors gracefully** in your postprocessors -5. **Use resource limits** appropriately in Docker configurations - -## Output Structure +```python +from rompy.backends import SlurmConfig +from rompy.model import ModelRun + +# Create SLURM configuration +config = SlurmConfig( + queue="gpu", # SLURM partition + nodes=2, # Number of nodes + ntasks=8, # Number of tasks + cpus_per_task=4, # CPU cores per task + time_limit="02:00:00", # Time limit + account="research_project", # Account for billing + additional_options=["--gres=gpu:v100:2"], # GPU allocation +) -All examples create output in the `./output` directory with the following structure: +# Create and run your model +model = ModelRun(...) +model.run(backend=config) ``` -output/ -├── / -│ ├── INPUT # Generated model input file -│ ├── datasets/ # Placeholder for input datasets -│ ├── outputs/ # Placeholder for model outputs -│ └── # Any files created during execution -``` - -## Extending the Examples -You can extend these examples by: -- Creating custom backend configurations -- Implementing custom postprocessors -- Adding new execution environments -- Integrating with workflow orchestration systems +## Validation -For more detailed information, see the [Backend Configurations documentation](../../docs/source/backend_configurations.rst). \ No newline at end of file +The SLURM backend includes comprehensive validation: +- Time limit format validation (HH:MM:SS) +- Bounds checking for nodes, CPUs, etc. 
+- Required field validation \ No newline at end of file diff --git a/examples/configs/README.md b/examples/configs/README.md index 2ec538c..0d9d352 100644 --- a/examples/configs/README.md +++ b/examples/configs/README.md @@ -8,8 +8,10 @@ This directory contains example configuration files for ROMPY backend systems. T - **`local_backend.yml`** - Single-document local backend configuration - **`docker_backend.yml`** - Single-document Docker backend configuration +- **`slurm_backend.yml`** - Single-document SLURM backend configuration - **`local_backend_examples.yml`** - Multi-document local backend examples - **`docker_backend_examples.yml`** - Multi-document Docker backend examples +- **`slurm_backend_examples.yml`** - Multi-document SLURM backend examples - **`pipeline_config.yml`** - Complete pipeline configuration examples - **`validate_configs.py`** - Validation script for configuration files @@ -92,6 +94,29 @@ rompy pipeline --config pipeline_config.yml | `user` | string | "root" | Container user | | `remove_container` | bool | true | Remove after execution | +### SLURM Backend Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `queue` | string | - | SLURM partition name (required) | +| `nodes` | int | 1 | Number of compute nodes to allocate (1-100) | +| `ntasks` | int | 1 | Number of tasks (processes) to run | +| `cpus_per_task` | int | 1 | Number of CPU cores per task (1-128) | +| `time_limit` | string | "1:00:00" | Time limit in HH:MM:SS format | +| `account` | string | null | Account for billing/resource tracking | +| `qos` | string | null | Quality of Service for the job | +| `reservation` | string | null | Reservation name to run job under | +| `output_file` | string | null | Output file path for job output | +| `error_file` | string | null | Error file path for job errors | +| `job_name` | string | null | Name for the SLURM job | +| `mail_type` | string | null | Type of mail to send (BEGIN, END, FAIL, etc.) 
| +| `mail_user` | string | null | Email address for notifications | +| `additional_options` | list | [] | Additional SLURM options (e.g., ['--gres=gpu:1']) | +| `timeout` | int | 3600 | Maximum execution time in seconds (1 minute to 24 hours) | +| `env_vars` | dict | {} | Environment variables for execution | +| `working_dir` | string | null | Working directory for execution | +| `command` | string | null | Optional shell command to run instead of config.run() | + ## Example Configurations ### Local Backend @@ -119,6 +144,26 @@ env_vars: MODEL_THREADS: "4" ``` +### SLURM Backend + +```yaml +backend_type: slurm +config: + queue: "general" + timeout: 7200 + nodes: 2 + ntasks: 8 + cpus_per_task: 4 + time_limit: "02:00:00" + account: "myproject" + additional_options: + - "--gres=gpu:v100:2" + job_name: "simulation_job" + env_vars: + OMP_NUM_THREADS: "4" + MODEL_CONFIG: "production" +``` + ### Pipeline Configuration ```yaml diff --git a/examples/configs/slurm_backend.yml b/examples/configs/slurm_backend.yml new file mode 100644 index 0000000..ef8945b --- /dev/null +++ b/examples/configs/slurm_backend.yml @@ -0,0 +1,18 @@ +# Basic SLURM Backend Configuration +# This is a minimal configuration for running a model on a SLURM cluster + +backend_type: "slurm" +config: + queue: "general" # SLURM partition name + timeout: 3600 # Max execution time in seconds (1 hour) + nodes: 1 # Number of compute nodes to allocate + ntasks: 1 # Number of tasks (processes) to run + cpus_per_task: 2 # Number of CPU cores per task + time_limit: "01:00:00" # Time limit in HH:MM:SS format + job_name: "rompy_basic_job" # Name for the SLURM job + output_file: "slurm-%j.out" # Output file pattern using job ID + error_file: "slurm-%j.err" # Error file pattern using job ID + env_vars: # Environment variables for the job + OMP_NUM_THREADS: "2" + MODEL_DEBUG: "false" + command: "python -c \"print('SLURM job executed successfully')\"" # Command to run \ No newline at end of file diff --git a/examples/configs/slurm_backend_examples.yml b/examples/configs/slurm_backend_examples.yml new file mode 100644 index 0000000..35eecaa --- /dev/null +++ b/examples/configs/slurm_backend_examples.yml @@ -0,0 +1,81 @@ +# SLURM Backend Configuration Examples +# These examples show various ways to configure SLURM jobs for different scenarios + +# Basic SLURM configuration +basic_slurm: + backend_type: "slurm" + config: + queue: "general" + timeout: 3600 + nodes: 1 + ntasks: 1 + cpus_per_task: 2 + time_limit: "01:00:00" + command: "echo 'Running basic SLURM job' && sleep 10" + +# Advanced SLURM configuration for GPU jobs +advanced_gpu_slurm: + backend_type: "slurm" + config: + queue: "gpu" + timeout: 7200 + nodes: 2 + ntasks: 8 + cpus_per_task: 4 + time_limit: "02:00:00" + account: "research_project" + qos: "high" + reservation: "special_reservation" + output_file: "slurm-%j.out" + error_file: "slurm-%j.err" + job_name: "gpu_simulation" + mail_type: "BEGIN,END,FAIL" + mail_user: "researcher@domain.com" + additional_options: + - "--gres=gpu:v100:2" + - "--exclusive" + env_vars: + OMP_NUM_THREADS: "4" + MODEL_DEBUG: "true" + DATA_PATH: "/shared/data" + RESULTS_PATH: "/shared/results" + command: "python /app/run_simulation.py --config config.json" + +# SLURM configuration for high-memory jobs +high_memory_slurm: + backend_type: "slurm" + config: + queue: "memory" + timeout: 14400 + nodes: 1 + ntasks: 2 + cpus_per_task: 8 + time_limit: "04:00:00" + account: "bigmem_project" + additional_options: + - "--mem=64G" + job_name: "high_memory_analysis" + 
output_file: "output_%j.log"
+    error_file: "error_%j.log"
+    env_vars:
+      MEMORY_LIMIT: "64G"
+      ANALYSIS_TYPE: "deep"
+    command: "Rscript analysis.R"
+
+# SLURM configuration with custom working directory
+custom_workdir_slurm:
+  backend_type: "slurm"
+  config:
+    queue: "compute"
+    timeout: 7200
+    nodes: 1
+    ntasks: 4
+    cpus_per_task: 2
+    time_limit: "02:00:00"
+    account: "analysis_project"
+    working_dir: "/shared/workspaces/my_project"
+    job_name: "workspace_analysis"
+    env_vars:
+      WORKSPACE: "/shared/workspaces/my_project"
+      TOOLS_PATH: "/shared/tools"
+    command: "./run_analysis.sh"
\ No newline at end of file

From 516d1e81f2ad3fc12aed83bcf9aeefcd71671365 Mon Sep 17 00:00:00 2001
From: Tom Durrant
Date: Mon, 20 Oct 2025 17:50:22 +1100
Subject: [PATCH 13/24] Cleaned up example backends

---
 examples/backends/README.md | 11 ++++++
 examples/configs/README.md | 40 +++++++++++++++++++++
 examples/configs/docker_backend.yml | 14 ++++----
 examples/configs/local_backend.yml | 14 ++++----
 examples/configs/slurm_backend.yml | 29 ++++++++-------
 examples/configs/slurm_backend_examples.yml | 12 ++++---
 6 files changed, 86 insertions(+), 34 deletions(-)

diff --git a/examples/backends/README.md b/examples/backends/README.md
index 074cce5..3d5d3e3 100644
--- a/examples/backends/README.md
+++ b/examples/backends/README.md
@@ -18,6 +18,17 @@ Run the example:
 python 05_slurm_backend_run.py
 ```
 
+### basic_model_run.py
+Creates a basic ModelRun configuration that can be used to test different backend configurations. This provides a consistent model configuration that works across all backends.
+
+### test_backends_with_modelrun.py
+Demonstrates using the basic ModelRun with different backend configurations (Local, Docker, SLURM). This example shows how the same model run can be configured to work across different execution environments.
+
+Run the example:
+```bash
+python test_backends_with_modelrun.py
+```
+
 ## Configuration Files
 
 ### slurm_backend.yml
diff --git a/examples/configs/README.md b/examples/configs/README.md
index 0d9d352..0265b3f 100644
--- a/examples/configs/README.md
+++ b/examples/configs/README.md
@@ -9,6 +9,8 @@ This directory contains example configuration files for ROMPY backend systems.
T - **`local_backend.yml`** - Single-document local backend configuration - **`docker_backend.yml`** - Single-document Docker backend configuration - **`slurm_backend.yml`** - Single-document SLURM backend configuration +- **`basic_modelrun.yml`** - Basic model run configuration for CLI testing +- **`basic_pipeline.yml`** - Basic pipeline configuration for CLI testing - **`local_backend_examples.yml`** - Multi-document local backend examples - **`docker_backend_examples.yml`** - Multi-document Docker backend examples - **`slurm_backend_examples.yml`** - Multi-document SLURM backend examples @@ -164,6 +166,44 @@ config: MODEL_CONFIG: "production" ``` +### Basic ModelRun Configuration + +```yaml +run_id: "cli_test_backend_run" +period: + start: "2023-01-01T00:00:00" + end: "2023-01-02T00:00:00" + interval: "1H" +output_dir: "./output/cli_test" +delete_existing: true +``` + +### Basic Pipeline Configuration + +```yaml +pipeline_backend: local + +model_run: + run_id: "cli_test_backend_run" + output_dir: "./output/cli_test" + delete_existing: true + period: + start: "2023-01-01T00:00:00" + end: "2023-01-02T00:00:00" + interval: "1H" + +run_backend: + backend_type: local + timeout: 3600 + command: "echo 'Running basic model test'" + env_vars: + MODEL_TYPE: "test" + ENVIRONMENT: "cli" + +postprocessing: + processor: "noop" +``` + ### Pipeline Configuration ```yaml diff --git a/examples/configs/docker_backend.yml b/examples/configs/docker_backend.yml index 4faa138..3eff3cc 100644 --- a/examples/configs/docker_backend.yml +++ b/examples/configs/docker_backend.yml @@ -1,20 +1,18 @@ # Docker Backend Configuration # Configuration for executing models in Docker containers -backend_type: docker +type: docker image: "python:3.9-slim" timeout: 7200 # 2 hours cpu: 4 memory: "2g" -executable: "python" +executable: 'bash -c "echo ''Hello from Docker!''"' mpiexec: "" volumes: - - "/tmp:/tmp:rw" - - ".:/app/workspace:ro" + - "/tmp:/tmp:rw" env_vars: - PYTHONUNBUFFERED: "1" - MODEL_THREADS: "4" - DATA_DIR: "/app/data" - RESULTS_DIR: "/app/results" + PYTHONUNBUFFERED: "1" + MODEL_THREADS: "4" + DATA_DIR: "/app/data" remove_container: true user: "root" diff --git a/examples/configs/local_backend.yml b/examples/configs/local_backend.yml index 8fc9e3d..3dc19e0 100644 --- a/examples/configs/local_backend.yml +++ b/examples/configs/local_backend.yml @@ -2,13 +2,13 @@ # Configuration for executing models on the local system # Backend type specification -backend_type: local +type: local # Configuration parameters timeout: 7200 # 2 hours - Maximum execution time in seconds (60-86400) # Optional shell command to run instead of config.run() -command: "python run_model.py" +command: "ls -l" # Whether to execute commands through the shell (default: true) shell: true @@ -22,8 +22,8 @@ capture_output: true # Additional environment variables to set during execution env_vars: - OMP_NUM_THREADS: "4" - MODEL_CONFIG: "production" - DATA_DIR: "/data" - PYTHONPATH: "/app/lib" - LOG_LEVEL: "INFO" + OMP_NUM_THREADS: "4" + MODEL_CONFIG: "production" + DATA_DIR: "/data" + PYTHONPATH: "/app/lib" + LOG_LEVEL: "INFO" diff --git a/examples/configs/slurm_backend.yml b/examples/configs/slurm_backend.yml index ef8945b..1a906e3 100644 --- a/examples/configs/slurm_backend.yml +++ b/examples/configs/slurm_backend.yml @@ -1,18 +1,17 @@ # Basic SLURM Backend Configuration # This is a minimal configuration for running a model on a SLURM cluster -backend_type: "slurm" -config: - queue: "general" # SLURM partition name - timeout: 3600 # Max execution 
time in seconds (1 hour) - nodes: 1 # Number of compute nodes to allocate - ntasks: 1 # Number of tasks (processes) to run - cpus_per_task: 2 # Number of CPU cores per task - time_limit: "01:00:00" # Time limit in HH:MM:SS format - job_name: "rompy_basic_job" # Name for the SLURM job - output_file: "slurm-%j.out" # Output file pattern using job ID - error_file: "slurm-%j.err" # Error file pattern using job ID - env_vars: # Environment variables for the job - OMP_NUM_THREADS: "2" - MODEL_DEBUG: "false" - command: "python -c \"print('SLURM job executed successfully')\"" # Command to run \ No newline at end of file +type: "slurm" +queue: "general" # SLURM partition name +timeout: 3600 # Max execution time in seconds (1 hour) +nodes: 1 # Number of compute nodes to allocate +ntasks: 1 # Number of tasks (processes) to run +cpus_per_task: 2 # Number of CPU cores per task +time_limit: "01:00:00" # Time limit in HH:MM:SS format +job_name: "rompy_basic_job" # Name for the SLURM job +output_file: "slurm-%j.out" # Output file pattern using job ID +error_file: "slurm-%j.err" # Error file pattern using job ID +env_vars: # Environment variables for the job + OMP_NUM_THREADS: "2" + MODEL_DEBUG: "false" +command: "python -c \"print('SLURM job executed successfully')\"" # Command to run \ No newline at end of file diff --git a/examples/configs/slurm_backend_examples.yml b/examples/configs/slurm_backend_examples.yml index 35eecaa..6812c6a 100644 --- a/examples/configs/slurm_backend_examples.yml +++ b/examples/configs/slurm_backend_examples.yml @@ -1,9 +1,13 @@ # SLURM Backend Configuration Examples # These examples show various ways to configure SLURM jobs for different scenarios +# +# NOTE: This format (with named sections) is different from the single-document +# backend config format used with the CLI command. For CLI usage, use the format +# in slurm_backend.yml with 'type' field at the root level. 
# Basic SLURM configuration
 basic_slurm:
-  backend_type: "slurm"
+  type: "slurm"
   config:
     queue: "general"
     timeout: 3600
@@ -15,7 +19,7 @@ basic_slurm:
 
 # Advanced SLURM configuration for GPU jobs
 advanced_gpu_slurm:
-  backend_type: "slurm"
+  type: "slurm"
   config:
     queue: "gpu"
     timeout: 7200
@@ -43,7 +47,7 @@ advanced_gpu_slurm:
 
 # SLURM configuration for high-memory jobs
 high_memory_slurm:
-  backend_type: "slurm"
+  type: "slurm"
   config:
     queue: "memory"
     timeout: 14400
@@ -64,7 +68,7 @@ high_memory_slurm:
 
 # SLURM configuration with custom working directory
 custom_workdir_slurm:
-  backend_type: "slurm"
+  type: "slurm"
   config:
     queue: "compute"
     timeout: 7200

From c0e18987c1778e82ba18ab80e9154d98893540de Mon Sep 17 00:00:00 2001
From: Tom Durrant
Date: Mon, 20 Oct 2025 17:51:28 +1100
Subject: [PATCH 14/24] Fixed logging in Docker backend

---
 pyproject.toml | 55 +++++++-----------------------
 src/rompy/backends/config.py | 6 +++-
 src/rompy/run/docker.py | 19 ++++++-------
 3 files changed, 24 insertions(+), 56 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c9f573a..19ae65c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,27 +1,14 @@
 [build-system]
-requires = [
-    "setuptools",
-    "versioneer[toml]",
-]
+requires = ["setuptools", "versioneer[toml]"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "rompy"
 description = "Core rompy library for ocean wave modeling with plugin system"
 readme = "README.md"
-keywords = [
-    "relocatable",
-    "ocean",
-    "modelling",
-    "python",
-    "csiro",
-]
+keywords = ["relocatable", "ocean", "modelling", "python", "csiro"]
-authors = [
-    { name = "CSIRO", email = "paul.branson@csiro.au" },
-]
-maintainers = [
-    {name = "Rompy Contributors", email = "developers@rompy.com"}
-]
+authors = [{ name = "CSIRO", email = "paul.branson@csiro.au" }]
+maintainers = [{ name = "Rompy Contributors", email = "developers@rompy.com" }]
 classifiers = [
     "Development Status :: 3 - Alpha",
     "Intended Audience :: Science/Research",
@@ -66,9 +53,7 @@ dependencies = [
     "isodate",
     "appdirs",
 ]
-dynamic = [
-    "version",
-]
+dynamic = ["version"]
 
 [project.license]
 file = "LICENSE"
@@ -107,23 +92,9 @@ noop = "rompy.postprocess:NoopPostprocessor"
 local = "rompy.pipeline:LocalPipelineBackend"
 
 [project.optional-dependencies]
-test = [
-    "pytest",
-    "envyaml",
-    "coverage",
-]
-extra = [
-    "gcsfs",
-    "zarr",
-    "cloudpathlib[s3,gs,azure]",
-]
-dev = [
-    "pytest",
-    "envyaml",
-    "coverage",
-    "ruff",
-    "black",
-]
+test = ["pytest", "envyaml", "coverage"]
+extra = ["gcsfs", "zarr", "cloudpathlib[s3,gs,azure]"]
+dev = ["pytest", "envyaml", "coverage", "ruff", "black"]
 docs = [
     "autodoc_pydantic",
     "ipython",
@@ -134,16 +105,10 @@ docs = [
 ]
 
 [tool.setuptools.packages.find]
-where = [
-    "src",
-]
+where = ["src"]
 
 [tool.setuptools.package-data]
-"*" = [
-    "*.y*ml",
-    "*.csv",
-    "*.html",
-]
+"*" = ["*.y*ml", "*.csv", "*.html"]
 
 [tool.setuptools.dynamic.version]
 attr = "rompy.__version__"
diff --git a/src/rompy/backends/config.py b/src/rompy/backends/config.py
index 43c8b95..eae2d43 100644
--- a/src/rompy/backends/config.py
+++ b/src/rompy/backends/config.py
@@ -8,7 +8,7 @@
 
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
@@ -287,6 +287,10 @@ def model_post_init(self, __context) -> None:
 class SlurmConfig(BaseBackendConfig):
     """Configuration for SLURM cluster execution."""
 
+    model_type: Literal["slurm"] = Field(
+        "slurm",
+        description="The backend type."
+    )
     queue: str = Field(
         ...,
         description="SLURM partition name (equivalent to queue)"
diff --git a/src/rompy/run/docker.py b/src/rompy/run/docker.py
index 5308936..41026e8 100644
--- a/src/rompy/run/docker.py
+++ b/src/rompy/run/docker.py
@@ -284,16 +284,15 @@ def _run_container(
         logger.debug(f"Volumes: {volumes}")
         logger.debug(f"Environment: {env_vars}")
 
-        # Run the container
-        container = client.containers.run(**container_config)
-
-        # Log output
-        if container:
-            logger.info("Model run completed successfully")
-            return True
-        else:
-            logger.error("Model run failed - no output from container")
-            return False
+        # Run the container and capture output
+        container_output = client.containers.run(**container_config)
+
+        # Log the container output
+        if container_output:
+            logger.info(f"Container output:\n{container_output.decode('utf-8')}")
+
+        logger.info("Model run completed successfully")
+        return True
 
     except ContainerError as e:
         logger.error(f"Container error: {e}")

From d2260e52ec205295699e45efd09ea06adb876552 Mon Sep 17 00:00:00 2001
From: Tom Durrant
Date: Mon, 20 Oct 2025 17:52:18 +1100
Subject: [PATCH 15/24] Added basic backend run examples

---
 examples/backends/basic_model_run.py | 57 +++++++
 .../backends/test_backends_with_modelrun.py | 151 ++++++++++++++++++
 examples/configs/basic_modelrun.yml | 10 ++
 examples/configs/basic_pipeline.yml | 37 +++++
 4 files changed, 255 insertions(+)
 create mode 100644 examples/backends/basic_model_run.py
 create mode 100644 examples/backends/test_backends_with_modelrun.py
 create mode 100644 examples/configs/basic_modelrun.yml
 create mode 100644 examples/configs/basic_pipeline.yml

diff --git a/examples/backends/basic_model_run.py b/examples/backends/basic_model_run.py
new file mode 100644
index 0000000..780fb6a
--- /dev/null
+++ b/examples/backends/basic_model_run.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""
+Basic ModelRun Configuration for Backend Testing
+
+This script creates a simple ModelRun configuration that can be used to test
+different backend configurations (local, docker, slurm).
+"""
+
+import tempfile
+from datetime import datetime
+from pathlib import Path
+
+from rompy.core.time import TimeRange
+from rompy.model import ModelRun
+
+
+def create_basic_model_run():
+    """
+    Create a basic model run configuration for testing backends. This creates a minimal model run that can execute a simple command
+    using different backends.
+ """ + # Create a temporary directory for output + temp_dir = Path(tempfile.mkdtemp(prefix="rompy_test_")) + + # Create a basic model run + model_run = ModelRun( + run_id="test_backend_run", + period=TimeRange( + start=datetime(2023, 1, 1), + end=datetime(2023, 1, 2), + interval="1H", + ), + output_dir=temp_dir, + delete_existing=True, + ) + + return model_run + + +if __name__ == "__main__": + # Create the basic model run + model = create_basic_model_run() + + print("Basic ModelRun Configuration Created") + print("="*40) + print(f"Run ID: {model.run_id}") + print(f"Output Directory: {model.output_dir}") + print(f"Time Period: {model.period.start} to {model.period.end}") + print(f"Time Interval: {model.period.interval}") + print(f"Delete Existing: {model.delete_existing}") + print() + print("This basic configuration can be used to test different backends.") + print("For example:") + print(" - Local backend: Executes commands on the local machine") + print(" - Docker backend: Runs commands in Docker containers") + print(" - SLURM backend: Submits jobs to HPC clusters") \ No newline at end of file diff --git a/examples/backends/test_backends_with_modelrun.py b/examples/backends/test_backends_with_modelrun.py new file mode 100644 index 0000000..f3002d7 --- /dev/null +++ b/examples/backends/test_backends_with_modelrun.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +""" +Test Backend Configurations with Basic ModelRun + +This script demonstrates how to use the basic ModelRun configuration +with different backend configurations. +""" + +import logging +import tempfile +from datetime import datetime +from pathlib import Path + +from rompy.backends import DockerConfig, LocalConfig, SlurmConfig +from rompy.core.time import TimeRange +from rompy.model import ModelRun + + +# Configure logging +logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") +logger = logging.getLogger(__name__) + + +def create_basic_model_run(): + """ + Create a basic model run configuration for testing backends. 
+ """ + temp_dir = Path(tempfile.mkdtemp(prefix="rompy_test_")) + + model_run = ModelRun( + run_id="test_backend_run", + period=TimeRange( + start=datetime(2023, 1, 1), + end=datetime(2023, 1, 2), + interval="1H", + ), + output_dir=temp_dir, + delete_existing=True, + ) + + return model_run + + +def test_local_backend(): + """Test the local backend with basic configuration.""" + logger.info("Testing Local Backend Configuration") + logger.info("-" * 40) + + model = create_basic_model_run() + + # Create local backend configuration + config = LocalConfig( + timeout=1800, # 30 minutes + command="echo 'Running model on local backend' && pwd && date", + env_vars={ + "MODEL_TYPE": "test", + "ENVIRONMENT": "local" + }, + shell=True, + capture_output=True + ) + + logger.info(f"LocalConfig: {config}") + + # Note: In a real environment, you would run: + # success = model.run(backend=config) + # For this example, we'll just validate the configuration works + logger.info("Local backend configuration validated successfully") + logger.info(f"Working directory: {model.output_dir}") + +def test_docker_backend(): + """Test the Docker backend with basic configuration.""" + logger.info("Testing Docker Backend Configuration") + logger.info("-" * 40) + + model = create_basic_model_run() + + # Create Docker backend configuration + config = DockerConfig( + image="python:3.9-slim", + timeout=1800, + cpu=2, + memory="1g", + executable="python -c \"print('Running model in Docker'); import os; print(f'Working in: {os.getcwd()}')\"", + volumes=[f"{model.output_dir}:/app/work:rw"], + env_vars={ + "MODEL_TYPE": "test", + "ENVIRONMENT": "docker", + "PYTHONUNBUFFERED": "1" + } + ) + + logger.info(f"DockerConfig: {config}") + + # Validate the configuration + logger.info("Docker backend configuration validated successfully") + logger.info(f"Working directory: {model.output_dir}") + +def test_slurm_backend(): + """Test the SLURM backend with basic configuration.""" + logger.info("Testing SLURM Backend Configuration") + logger.info("-" * 40) + + model = create_basic_model_run() + + # Create SLURM backend configuration + config = SlurmConfig( + queue="general", + timeout=1800, + nodes=1, + ntasks=1, + cpus_per_task=2, + time_limit="00:30:00", + job_name="test_backend_job", + output_file=f"{model.output_dir}/slurm-%j.out", + error_file=f"{model.output_dir}/slurm-%j.err", + env_vars={ + "MODEL_TYPE": "test", + "ENVIRONMENT": "slurm" + }, + command="echo 'Running model on SLURM backend' && pwd && date && env | grep MODEL" + ) + + logger.info(f"SlurmConfig: {config}") + + # Validate the configuration + logger.info("SLURM backend configuration validated successfully") + logger.info(f"Working directory: {model.output_dir}") + +def main(): + """Run all backend tests.""" + logger.info("Testing Backend Configurations with Basic ModelRun") + logger.info("=" * 50) + logger.info("This script demonstrates how to configure different backends") + logger.info("for the same basic ModelRun configuration.") + # Test all backends + test_local_backend() + test_docker_backend() + test_slurm_backend() + + logger.info("=" * 50) + logger.info("All backend configurations validated successfully!") + logger.info("Next steps:") + logger.info("1. Try running these configurations on actual backend systems") + logger.info("2. Adjust resource requirements based on your needs") + logger.info("3. Add more complex commands or model executables") + logger.info("4. 
Use the YAML configuration files in examples/configs/") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/configs/basic_modelrun.yml b/examples/configs/basic_modelrun.yml new file mode 100644 index 0000000..52e3d8a --- /dev/null +++ b/examples/configs/basic_modelrun.yml @@ -0,0 +1,10 @@ +# Basic ModelRun Configuration for CLI Testing +# This configuration can be used with the ROMPY CLI to test different backends + +run_id: "cli_test_backend_run" +period: + start: "2023-01-01T00:00:00" + end: "2023-01-02T00:00:00" + interval: "1H" +output_dir: "./output/cli_test" +delete_existing: true \ No newline at end of file diff --git a/examples/configs/basic_pipeline.yml b/examples/configs/basic_pipeline.yml new file mode 100644 index 0000000..1249f38 --- /dev/null +++ b/examples/configs/basic_pipeline.yml @@ -0,0 +1,37 @@ +# Complete Pipeline Configuration for CLI Testing +# This demonstrates how to use the basic model run with different backends via CLI + +pipeline_backend: local # or 'docker', 'slurm' depending on your system + +model_run: + run_id: "cli_test_backend_run" + output_dir: "./output/cli_test" + delete_existing: true + period: + start: "2023-01-01T00:00:00" + end: "2023-01-02T00:00:00" + interval: "1H" + +# This would be the backend for the actual model run execution +# Uncomment the appropriate section based on your system: + +# Local backend configuration +run_backend: + backend_type: local + timeout: 3600 + command: "echo 'Running basic model test'" + env_vars: + MODEL_TYPE: "test" + ENVIRONMENT: "cli" + +# To run with local backend: +# rompy run --config basic_modelrun.yml --backend-config local_backend.yml + +# To run with Docker backend: +# rompy run --config basic_modelrun.yml --backend-config docker_backend.yml + +# To run with SLURM backend: +# rompy run --config basic_modelrun.yml --backend-config slurm_backend.yml + +postprocessing: + processor: "noop" # or other available processors \ No newline at end of file From d6c9d93aab395d3dcd52d0b539c22b8d1c3ef949 Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Mon, 20 Oct 2025 22:36:13 +1100 Subject: [PATCH 16/24] fixed testing --- src/rompy/cli.py | 25 +++------------------ src/rompy/run/docker.py | 11 +++++----- src/rompy/run/slurm.py | 24 ++++++++++++++++++++ tests/backends/test_slurm_backend.py | 33 +++++++++++++++++++++++++--- 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/rompy/cli.py b/src/rompy/cli.py index 8dad334..ad25ddd 100644 --- a/src/rompy/cli.py +++ b/src/rompy/cli.py @@ -18,7 +18,7 @@ import yaml import rompy -from rompy.backends import DockerConfig, LocalConfig +from rompy.backends import DockerConfig, LocalConfig, SlurmConfig from rompy.logging import LogFormat, LoggingConfig, LogLevel, get_logger from rompy.model import PIPELINE_BACKENDS, POSTPROCESSORS, RUN_BACKENDS, ModelRun @@ -291,31 +291,12 @@ def _get_backend_config_registry(): Build a registry of backend config classes from entry points and built-ins. 
Returns: dict mapping backend type name to config class """ + # TODO Remove hardcoding registry = { "local": LocalConfig, "docker": DockerConfig, + "slurm": SlurmConfig, # Add SLURM backend config } - # Try to load from entry points (rompy.config and rompy.backend_config) - try: - eps = importlib.metadata.entry_points() - # Support both 'rompy.config' and 'rompy.backend_config' for flexibility - for group in ["rompy.config", "rompy.backend_config"]: - if hasattr(eps, "select"): # Python 3.10+ - entries = eps.select(group=group) - elif hasattr(eps, "get"): # Python 3.8-3.9 - entries = eps.get(group, []) - else: - entries = [] - for ep in entries: - try: - cls = ep.load() - registry[ep.name] = cls - except Exception as e: - logger.warning( - f"Failed to load backend config entry point {ep.name}: {e}" - ) - except Exception as e: - logger.warning(f"Could not load backend config entry points: {e}") return registry diff --git a/src/rompy/run/docker.py b/src/rompy/run/docker.py index 41026e8..c9df7b0 100644 --- a/src/rompy/run/docker.py +++ b/src/rompy/run/docker.py @@ -268,6 +268,7 @@ def _run_container( volumes[host_path] = {"bind": container_path, "mode": mode} # Prepare container configuration + # Note: We can't capture output when remove=True, so we'll handle that case container_config = { "image": image_name, "command": ["bash", "-c", run_command], @@ -284,12 +285,10 @@ def _run_container( logger.debug(f"Volumes: {volumes}") logger.debug(f"Environment: {env_vars}") - # Run the container and capture output - container_output = client.containers.run(**container_config) - - # Log the container output - if container_output: - logger.info(f"Container output:\n{container_output.decode('utf-8')}") + # Run the container + # Note: When remove=True, client.containers.run() returns None + # If you need to capture output, you'd need to set remove=False and manually remove + client.containers.run(**container_config) logger.info("Model run completed successfully") return True diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py index 97ccd90..3824eb0 100644 --- a/src/rompy/run/slurm.py +++ b/src/rompy/run/slurm.py @@ -164,6 +164,27 @@ def _submit_job(self, job_script: str) -> Optional[str]: Job ID if submission successful, None otherwise """ try: + # Check if sbatch command is available + result = subprocess.run( + ["which", "sbatch"], + capture_output=True, + text=True + ) + if result.returncode != 0 or not result.stdout.strip(): + logger.error("sbatch command not found. SLURM may not be installed or in PATH.") + return None + + # Check if SLURM controller is responsive + result = subprocess.run( + ["squeue", "--help"], + capture_output=True, + text=True, + timeout=10 # Don't wait too long + ) + if result.returncode != 0: + logger.error("SLURM controller is not responsive. squeue command failed.") + return None + # Submit the job using sbatch result = subprocess.run( ["sbatch", job_script], @@ -182,6 +203,9 @@ def _submit_job(self, job_script: str) -> Optional[str]: logger.error(f"Unexpected sbatch output format: {output}") return None + except subprocess.TimeoutExpired: + logger.error("SLURM controller check timed out. 
SLURM may not be properly configured.") + return None except subprocess.CalledProcessError as e: logger.error(f"Failed to submit SLURM job: {e.stderr}") return None diff --git a/tests/backends/test_slurm_backend.py b/tests/backends/test_slurm_backend.py index a91414a..5ef2da8 100644 --- a/tests/backends/test_slurm_backend.py +++ b/tests/backends/test_slurm_backend.py @@ -5,17 +5,40 @@ provides proper validation, and integrates with the SLURM execution backend. """ +import shutil +import subprocess +import sys from pathlib import Path from tempfile import TemporaryDirectory -from unittest.mock import MagicMock, patch, mock_open -import tempfile -import os +from unittest.mock import MagicMock, mock_open, patch + import pytest from pydantic import ValidationError from rompy.backends import SlurmConfig +def is_slurm_available(): + """Check if SLURM is available on the system.""" + try: + result = subprocess.run( + ["which", "sbatch"], + capture_output=True, + text=True, + timeout=5 + ) + return result.returncode == 0 and bool(result.stdout.strip()) + except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.SubprocessError): + return False + + +# Skip tests that require SLURM if it's not available +requires_slurm = pytest.mark.skipif( + not is_slurm_available(), + reason="SLURM is not available on this system" +) + + class TestSlurmConfig: """Test the SlurmConfig class.""" @@ -191,6 +214,7 @@ def test_field_boundaries(self): SlurmConfig(queue="test", cpus_per_task=129) # Max cpus_per_task is 128 +@requires_slurm class TestSlurmRunBackend: """Test the SlurmRunBackend class.""" @@ -442,6 +466,7 @@ def time_side_effect(): # Verify that scancel was called during timeout handling mock_run.assert_any_call(['scancel', '12345'], check=True, capture_output=True) + @requires_slurm def test_run_method_success(self, mock_model_run, basic_config): """Test the full run method with success.""" from rompy.run.slurm import SlurmRunBackend @@ -469,6 +494,7 @@ def test_run_method_success(self, mock_model_run, basic_config): mock_submit.assert_called_once() mock_wait.assert_called_once_with("12345", basic_config) + @requires_slurm def test_run_method_job_submit_failure(self, mock_model_run, basic_config): """Test the run method when job submission fails.""" from rompy.run.slurm import SlurmRunBackend @@ -493,6 +519,7 @@ def test_run_method_job_submit_failure(self, mock_model_run, basic_config): mock_create_script.assert_called_once() mock_submit.assert_called_once() + @requires_slurm def test_run_method_generation_failure(self, mock_model_run, basic_config): """Test the run method when model generation fails.""" from rompy.run.slurm import SlurmRunBackend From 5bedde01a50efd4b78ba63a1767b496f5931e6b3 Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Wed, 29 Oct 2025 11:13:04 +1100 Subject: [PATCH 17/24] Address comments in PR --- src/rompy/backends/config_slurm_fixed.py | 103 ----------------------- src/rompy/run/slurm.py | 56 +++++++----- 2 files changed, 36 insertions(+), 123 deletions(-) delete mode 100644 src/rompy/backends/config_slurm_fixed.py diff --git a/src/rompy/backends/config_slurm_fixed.py b/src/rompy/backends/config_slurm_fixed.py deleted file mode 100644 index 39a36d2..0000000 --- a/src/rompy/backends/config_slurm_fixed.py +++ /dev/null @@ -1,103 +0,0 @@ -class SlurmConfig(BaseBackendConfig): - """Configuration for SLURM cluster execution.""" - - queue: Optional[str] = Field( - None, - description="SLURM partition name (equivalent to queue)" - ) - nodes: int = Field( - 1, - ge=1, - le=100, - 
description="Number of nodes to allocate" - ) - ntasks: int = Field( - 1, - ge=1, - description="Number of tasks (processes) to run" - ) - cpus_per_task: int = Field( - 1, - ge=1, - le=128, - description="Number of CPU cores per task" - ) - time_limit: str = Field( - "1:00:00", - description="Time limit in format HH:MM:SS" - ) - account: Optional[str] = Field( - None, - description="Account for billing/resource tracking" - ) - qos: Optional[str] = Field( - None, - description="Quality of Service for the job" - ) - reservation: Optional[str] = Field( - None, - description="Reservation name to run job under" - ) - output_file: Optional[str] = Field( - None, - description="Output file path for job output" - ) - error_file: Optional[str] = Field( - None, - description="Error file path for job errors" - ) - job_name: Optional[str] = Field( - None, - description="Name for the SLURM job" - ) - mail_type: Optional[str] = Field( - None, - description="Type of mail to send (BEGIN, END, FAIL, ALL, etc.)" - ) - mail_user: Optional[str] = Field( - None, - description="Email address for notifications" - ) - additional_options: List[str] = Field( - default_factory=list, - description="Additional SLURM options (e.g., '--gres=gpu:1')" - ) - - @field_validator('time_limit') - @classmethod - def validate_time_limit(cls, v): - """Validate time limit format (HH:MM:SS).""" - import re - if not re.match(r'^\d{1,4}:\d{2}:\d{2}$', v): - raise ValueError("Time limit must be in format HH:MM:SS") - return v - - def get_backend_class(self): - """Return the SlurmRunBackend class.""" - from rompy.run.slurm import SlurmRunBackend - return SlurmRunBackend - - model_config = ConfigDict( - json_schema_extra={ - "examples": [ - { - "queue": "general", - "nodes": 1, - "ntasks": 1, - "cpus_per_task": 4, - "time_limit": "02:00:00", - "account": "myproject", - "timeout": 7200, - }, - { - "queue": "gpu", - "nodes": 2, - "ntasks": 8, - "cpus_per_task": 2, - "time_limit": "24:00:00", - "reservation": "special_reservation", - "additional_options": ["--gres=gpu:v100:2"], - }, - ] - } - ) \ No newline at end of file diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py index 3824eb0..c08c0d5 100644 --- a/src/rompy/run/slurm.py +++ b/src/rompy/run/slurm.py @@ -176,13 +176,13 @@ def _submit_job(self, job_script: str) -> Optional[str]: # Check if SLURM controller is responsive result = subprocess.run( - ["squeue", "--help"], + ["scontrol", "--help"], capture_output=True, text=True, timeout=10 # Don't wait too long ) if result.returncode != 0: - logger.error("SLURM controller is not responsive. squeue command failed.") + logger.error("SLURM controller is not responsive. 
scontrol command failed.") return None # Submit the job using sbatch @@ -233,7 +233,9 @@ def _wait_for_completion(self, job_id: str, config: "SlurmConfig") -> bool: logger.info(f"Waiting for SLURM job {job_id} to complete...") # Terminal states that indicate job completion (successful or failed) - terminal_states = {'CD', 'CA', 'F', 'TO', 'NF', 'OOM', 'BF', 'DL', 'PR'} + # Using SLURM job states: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES + terminal_states = {'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED', + 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', 'TIMEOUT'} # Start time for timeout check start_time = time.time() @@ -244,43 +246,57 @@ def _wait_for_completion(self, job_id: str, config: "SlurmConfig") -> bool: if elapsed_time > config.timeout: logger.error(f"Timeout waiting for job {job_id} after {config.timeout} seconds") - # Try to cancel the job - try: - subprocess.run(['scancel', job_id], check=True, capture_output=True) - logger.info(f"Cancelled job {job_id} due to timeout") - except subprocess.CalledProcessError: - logger.warning(f"Could not cancel job {job_id} due to timeout") - + # Let SLURM handle job cancellation according to its configured policies return False - # Get job status + # Get job status using scontrol for more reliable detection try: result = subprocess.run( - ['squeue', '-j', job_id, '-h', '-o', '%T'], + ['scontrol', 'show', 'job', job_id], capture_output=True, text=True, check=True ) - state = result.stdout.strip() + # Parse the output to get the job state + output = result.stdout + if 'JobState=' in output: + state = output.split('JobState=')[1].split()[0].split('_')[0] # Extract state like 'RUNNING', 'COMPLETED', etc. + else: + # If JobState is not found, we might have an issue with parsing + logger.warning(f"Could not determine job state from output for job {job_id}") + state = None - if not state: # If job is not found, it may have completed and been purged - logger.info(f"Job {job_id} not found in queue - likely completed") - return True # Assume successful completion if not in queue + if state is None: # If job state can't be determined, check if job is not found + if 'slurm_load_jobs error' in output or 'Invalid job id' in output.lower(): + logger.info(f"Job {job_id} not found - likely completed") + return True # Assume successful completion if job ID is invalid if state in terminal_states: - if state == 'CD': # Completed + if state == 'COMPLETED': # Completed successfully logger.info(f"SLURM job {job_id} completed successfully") return True - elif state == 'CA': # Cancelled + elif state == 'CANCELLED': # Cancelled logger.warning(f"SLURM job {job_id} was cancelled") return False - elif state == 'F': # Failed + elif state == 'FAILED': # Failed logger.error(f"SLURM job {job_id} failed") return False - elif state == 'TO': # Timeout + elif state == 'TIMEOUT': # Timeout logger.error(f"SLURM job {job_id} timed out") return False + elif state == 'BOOT_FAIL': # Boot failure + logger.error(f"SLURM job {job_id} failed to boot") + return False + elif state == 'NODE_FAIL': # Node failure + logger.error(f"SLURM job {job_id} failed due to node failure") + return False + elif state == 'OUT_OF_MEMORY': # Out of memory + logger.error(f"SLURM job {job_id} ran out of memory") + return False + elif state == 'PREEMPTED': # Preempted + logger.error(f"SLURM job {job_id} was preempted") + return False else: logger.error(f"SLURM job {job_id} ended with state: {state}") return False From 2000dd535739ba673b3acdfe99b9af25e12effdb Mon Sep 17 00:00:00 
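As a side note on the JobState handling introduced above: a small, self-contained sketch of the same parsing idea, assuming scontrol show job output in the usual space-separated Key=Value token form; parse_job_state is an illustrative helper, not code from this patch.

# Illustrative sketch of extracting JobState from `scontrol show job <id>` output.
def parse_job_state(scontrol_output: str) -> str | None:
    for token in scontrol_output.split():
        if token.startswith("JobState="):
            return token.split("=", 1)[1]
    return None

TERMINAL_STATES = {"BOOT_FAIL", "CANCELLED", "COMPLETED", "DEADLINE", "FAILED",
                   "NODE_FAIL", "OUT_OF_MEMORY", "PREEMPTED", "TIMEOUT"}

state = parse_job_state("JobId=12345 JobName=test JobState=COMPLETED Reason=None")
assert state == "COMPLETED" and state in TERMINAL_STATES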
2001 From: Tom Durrant Date: Fri, 5 Dec 2025 16:54:52 +1100 Subject: [PATCH 18/24] Address incomplete implementation of command in slurm config --- examples/backends/05_slurm_backend_run.py | 148 +++++++++++++++------- examples/configs/docker_backend.yml | 1 - src/rompy/backends/config.py | 16 ++- src/rompy/run/slurm.py | 7 +- tests/backends/test_slurm_backend.py | 86 +++++++++++-- 5 files changed, 188 insertions(+), 70 deletions(-) diff --git a/examples/backends/05_slurm_backend_run.py b/examples/backends/05_slurm_backend_run.py index 2aae157..d829ff5 100644 --- a/examples/backends/05_slurm_backend_run.py +++ b/examples/backends/05_slurm_backend_run.py @@ -3,7 +3,7 @@ ROMPY SLURM Backend Example This example demonstrates how to use the SLURM backend to run models on HPC clusters. -The SLURM backend enables resource management and job scheduling for high-performance +The SLURM backend enables resource management and job scheduling for high-performance computing environments. Run this example: @@ -29,7 +29,7 @@ def example_slurm_basic(): """ Example 1: Basic SLURM execution - + This example demonstrates the simplest configuration for running a model on a SLURM cluster with minimal parameters. """ @@ -55,9 +55,10 @@ def example_slurm_basic(): # Basic SLURM configuration config = SlurmConfig( queue="general", # SLURM partition name - timeout=1800, # Max execution time in seconds (30 minutes) - nodes=1, # Number of nodes to allocate - ntasks=1, # Number of tasks (processes) to run + command="python run_model.py", # Command to run in the workspace + timeout=1800, # Max execution time in seconds (30 minutes) + nodes=1, # Number of nodes to allocate + ntasks=1, # Number of tasks (processes) to run cpus_per_task=2, # Number of CPU cores per task time_limit="00:30:00", # Time limit in HH:MM:SS format ) @@ -66,20 +67,31 @@ def example_slurm_basic(): logger.info("Running model with basic SLURM configuration...") try: - # This would submit the job to SLURM (in a real environment) - # success = model.run(backend=config) - # Since we're not in a real SLURM environment, we'll just show the config - logger.info("✅ SlurmConfig validated successfully") - logger.info("Key concepts: SlurmConfig, queue, nodes, ntasks, cpus_per_task") - logger.info("Note: In a real environment, this would submit to SLURM") + # Submit the job to SLURM (in a real environment) + success = model.run(backend=config) + if success: + logger.info("✅ SLURM job submitted successfully") + else: + logger.info( + "⚠️ SLURM job submission completed but may have failed (e.g., in test environment)" + ) + logger.info( + "Key concepts: SlurmConfig, queue, nodes, ntasks, cpus_per_task" + ) + logger.info( + "Note: In a real SLURM environment, this would submit the job for execution" + ) except Exception as e: logger.error(f"❌ SLURM model run failed: {e}") + logger.info( + "Note: This may fail in non-SLURM environments, which is expected" + ) def example_slurm_advanced(): """ Example 2: Advanced SLURM execution with multiple parameters - + This example shows how to configure complex SLURM jobs with multiple resource allocations, environment variables, and custom options. 
""" @@ -103,22 +115,23 @@ def example_slurm_advanced(): # Advanced SLURM configuration with many parameters config = SlurmConfig( - queue="gpu", # GPU partition - timeout=7200, # 2 hours timeout - nodes=2, # 2 compute nodes - ntasks=8, # 8 tasks total - cpus_per_task=4, # 4 CPUs per task - time_limit="02:00:00", # 2 hours time limit - account="research_project", # Account for billing - qos="high", # Quality of Service + queue="gpu", # GPU partition + command="python run_model.py --gpu", # Command to run in the workspace + timeout=7200, # 2 hours timeout + nodes=2, # 2 compute nodes + ntasks=8, # 8 tasks total + cpus_per_task=4, # 4 CPUs per task + time_limit="02:00:00", # 2 hours time limit + account="research_project", # Account for billing + qos="high", # Quality of Service reservation="special_reservation", # Reservation name - output_file="slurm-%j.out", # Output file pattern (job ID) - error_file="slurm-%j.err", # Error file pattern - job_name="advanced_simulation", # Name of the SLURM job - mail_type="BEGIN,END,FAIL", # Types of notifications + output_file="slurm-%j.out", # Output file pattern (job ID) + error_file="slurm-%j.err", # Error file pattern + job_name="advanced_simulation", # Name of the SLURM job + mail_type="BEGIN,END,FAIL", # Types of notifications mail_user="researcher@domain.com", # Email for notifications additional_options=["--gres=gpu:v100:2", "--exclusive"], # GPU resources - env_vars={ # Environment variables + env_vars={ # Environment variables "OMP_NUM_THREADS": "4", "MODEL_DEBUG": "true", "DATA_PATH": "/shared/data", @@ -130,18 +143,24 @@ def example_slurm_advanced(): logger.info("Running model with advanced SLURM configuration...") try: - # Show validation success - logger.info("✅ Advanced SlurmConfig validated successfully") - logger.info("Key concepts: account, qos, reservations, GRES, environment variables") - logger.info("Note: In a real environment, this would submit a complex job to SLURM") + success = model.run(backend=config) + if success: + logger.info("✅ Advanced SLURM job submitted successfully") + else: + logger.info( + "⚠️ Advanced SLURM job submission completed but may have failed" + ) except Exception as e: logger.error(f"❌ Advanced SLURM configuration failed: {e}") + logger.info( + "Note: This may fail in non-SLURM environments, which is expected" + ) def example_slurm_with_custom_command(): """ Example 3: SLURM execution with custom command - + This example shows how to run a custom command on the SLURM cluster, useful for executing different types of jobs or calling external binaries. 
""" @@ -179,17 +198,28 @@ def example_slurm_with_custom_command(): logger.info("Running custom command on SLURM...") try: - logger.info("✅ SlurmConfig with custom command validated successfully") + success = model.run(backend=config) + if success: + logger.info("✅ SLURM job with custom command submitted successfully") + else: + logger.info( + "⚠️ SLURM job with custom command completed but may have failed" + ) logger.info("Key concepts: command parameter, custom execution") - logger.info("Note: In a real environment, this would execute the custom command on SLURM") + logger.info( + "Note: In a real SLURM environment, this would execute the custom command" + ) except Exception as e: logger.error(f"❌ SLURM custom command configuration failed: {e}") + logger.info( + "Note: This may fail in non-SLURM environments, which is expected" + ) def example_slurm_from_dict(): """ Example 4: Creating SLURM configuration from dictionary - + This example shows how to create SLURM configurations from dictionaries, which is useful when loading from configuration files (YAML/JSON). """ @@ -202,6 +232,7 @@ def example_slurm_from_dict(): # Simulate loading from YAML/JSON file slurm_config_data = { "queue": "compute", + "command": "python run_model.py", "timeout": 7200, "nodes": 1, "ntasks": 4, @@ -211,10 +242,10 @@ def example_slurm_from_dict(): "env_vars": { "OMP_NUM_THREADS": "2", "MODEL_PRECISION": "double", - "DATA_DIR": "/shared/data" + "DATA_DIR": "/shared/data", }, "job_name": "yaml_configured_job", - "additional_options": ["--mem-per-cpu=2048"] + "additional_options": ["--mem-per-cpu=2048"], } try: @@ -236,14 +267,16 @@ def example_slurm_from_dict(): def example_slurm_validation(): """ Example 5: SLURM configuration validation - + This example demonstrates ROMPY's built-in validation for SLURM configurations. The Pydantic model catches configuration errors before runtime. """ logger.info("=" * 60) logger.info("Example 5: SLURM Configuration Validation") logger.info("=" * 60) - logger.info("This example shows how ROMPY validates SLURM configurations automatically.") + logger.info( + "This example shows how ROMPY validates SLURM configurations automatically." 
+ ) logger.info("") from pydantic import ValidationError @@ -252,12 +285,13 @@ def example_slurm_validation(): try: valid_config = SlurmConfig( queue="general", + command="python run_model.py", timeout=3600, nodes=1, ntasks=1, cpus_per_task=2, time_limit="01:00:00", - env_vars={"TEST_VAR": "value"} + env_vars={"TEST_VAR": "value"}, ) logger.info("✅ Valid SlurmConfig created successfully") except Exception as e: @@ -268,45 +302,60 @@ def example_slurm_validation(): try: invalid_config = SlurmConfig( queue="general", + command="python run_model.py", time_limit="25:00", # Invalid format - missing seconds ) logger.info("❌ This should not succeed") except ValidationError as e: - logger.info(f"✅ Validation correctly caught time limit error: {e.errors()[0]['msg']}") + logger.info( + f"✅ Validation correctly caught time limit error: {e.errors()[0]['msg']}" + ) # Invalid number of nodes (too high) logger.info("Testing invalid number of nodes...") try: invalid_config = SlurmConfig( queue="general", + command="python run_model.py", nodes=101, # Max is 100 - time_limit="01:00:00" + time_limit="01:00:00", ) logger.info("❌ This should not succeed") except ValidationError as e: - logger.info(f"✅ Validation correctly caught nodes error: {e.errors()[0]['msg']}") + logger.info( + f"✅ Validation correctly caught nodes error: {e.errors()[0]['msg']}" + ) # Invalid cpus_per_task (too high) logger.info("Testing invalid CPUs per task...") try: invalid_config = SlurmConfig( queue="general", + command="python run_model.py", cpus_per_task=129, # Max is 128 - time_limit="01:00:00" + time_limit="01:00:00", ) logger.info("❌ This should not succeed") except ValidationError as e: - logger.info(f"✅ Validation correctly caught cpus_per_task error: {e.errors()[0]['msg']}") + logger.info( + f"✅ Validation correctly caught cpus_per_task error: {e.errors()[0]['msg']}" + ) - logger.info("Key concepts: Pydantic validation, error handling, configuration safety") + logger.info( + "Key concepts: Pydantic validation, error handling, configuration safety" + ) def main(): """Run all SLURM backend examples.""" logger.info("🚀 ROMPY SLURM Backend Examples") logger.info("================================") - logger.info("These examples demonstrate how to use ROMPY with SLURM clusters for HPC jobs.") - logger.info("Each example builds on the previous one to show increasingly sophisticated usage.") + logger.info( + "These examples demonstrate how to use ROMPY with SLURM clusters for HPC jobs." + ) + logger.info( + "Each example builds on the previous one to show increasingly sophisticated usage." + ) logger.info("") # Run examples @@ -345,8 +394,11 @@ def main(): logger.info("1. Review the SlurmConfig documentation for all available parameters") logger.info("2. Try these configurations in your actual SLURM environment") logger.info("3. Create your own SLURM configuration files for your models") - logger.info("4. Combine with other ROMPY features like postprocessing and pipelines") + logger.info( + "4. 
Combine with other ROMPY features like postprocessing and pipelines" + ) if __name__ == "__main__": - main() \ No newline at end of file + main() + diff --git a/examples/configs/docker_backend.yml b/examples/configs/docker_backend.yml index 3eff3cc..f14d2cc 100644 --- a/examples/configs/docker_backend.yml +++ b/examples/configs/docker_backend.yml @@ -11,7 +11,6 @@ mpiexec: "" volumes: - "/tmp:/tmp:rw" env_vars: - PYTHONUNBUFFERED: "1" MODEL_THREADS: "4" DATA_DIR: "/app/data" remove_container: true diff --git a/src/rompy/backends/config.py b/src/rompy/backends/config.py index eae2d43..a8694c6 100644 --- a/src/rompy/backends/config.py +++ b/src/rompy/backends/config.py @@ -288,16 +288,18 @@ class SlurmConfig(BaseBackendConfig): """Configuration for SLURM cluster execution.""" model_type: Literal["slurm"] = Field( - "slurm", + "slurm", description="The backend type." ) - queue: str = Field( - ..., + queue: Optional[str] = Field( + None, description="SLURM partition name (equivalent to queue)" ) - - command: Optional[str] = Field( - None, description="Optional shell command to run instead of config.run()" + + command: str = Field( + ..., + description="Shell command to run in the workspace directory", + min_length=1 ) nodes: int = Field( 1, @@ -376,6 +378,7 @@ def get_backend_class(self): "examples": [ { "queue": "general", + "command": "python run_model.py", "nodes": 1, "ntasks": 1, "cpus_per_task": 4, @@ -385,6 +388,7 @@ def get_backend_class(self): }, { "queue": "gpu", + "command": "python run_model.py --gpu", "nodes": 2, "ntasks": 8, "cpus_per_task": 2, diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py index c08c0d5..8ba354b 100644 --- a/src/rompy/run/slurm.py +++ b/src/rompy/run/slurm.py @@ -142,7 +142,12 @@ def _create_job_script( for key, value in config.env_vars.items(): script_lines.append(f"export {key}={value}") - # Add the actual command to run the model\n # First, check if there's a specific command in config, otherwise use the model's run method\n if hasattr(config, 'command') and config.command:\n script_lines.extend([\n \"\",\n \"# Execute custom command\",\n config.command,\n ])\n else:\n script_lines.extend([\n \"\",\n \"# Execute model using model_run.config.run() method\",\n \"python -c \\\"\",\n \"import sys\",\n \"import os\",\n \"sys.path.insert(0, os.getcwd())\",\n \"from rompy.model import ModelRun\",\n f\"model_run = ModelRun.from_dict({model_run.model_dump()})\",\n \"model_run.config.run(model_run)\",\n \"\\\"\",\n ]) + # Add the actual command to run the model + script_lines.extend([ + "", + "# Execute command in the workspace", + config.command, + ]) # Create temporary job script file with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: diff --git a/tests/backends/test_slurm_backend.py b/tests/backends/test_slurm_backend.py index 5ef2da8..e502b23 100644 --- a/tests/backends/test_slurm_backend.py +++ b/tests/backends/test_slurm_backend.py @@ -46,12 +46,14 @@ def test_default_values(self): """Test default values for SlurmConfig.""" config = SlurmConfig( queue="general", # Required field + command="python run_model.py", # Required field ) assert config.timeout == 3600 assert config.env_vars == {} assert config.working_dir is None assert config.queue == "general" + assert config.command == "python run_model.py" assert config.nodes == 1 assert config.ntasks == 1 assert config.cpus_per_task == 1 @@ -71,6 +73,7 @@ def test_custom_values(self): with TemporaryDirectory() as tmp_dir: config = SlurmConfig( queue="compute", + command="python 
run_model.py --param value", nodes=2, ntasks=4, cpus_per_task=8, @@ -118,7 +121,7 @@ def test_time_limit_validation(self): ] for time_limit in valid_time_limits: - config = SlurmConfig(queue="test", time_limit=time_limit) + config = SlurmConfig(queue="test", command="python run_model.py", time_limit=time_limit) assert config.time_limit == time_limit # Invalid time limits (format-based validation) @@ -135,24 +138,25 @@ def test_time_limit_validation(self): for time_limit in invalid_time_limits: with pytest.raises(ValidationError): - SlurmConfig(queue="test", time_limit=time_limit) + SlurmConfig(queue="test", command="python run_model.py", time_limit=time_limit) def test_additional_options_validation(self): """Test additional options validation.""" # Valid additional options config = SlurmConfig( queue="test", + command="python run_model.py", additional_options=["--gres=gpu:1", "--exclusive", "--mem-per-cpu=2048"] ) assert config.additional_options == ["--gres=gpu:1", "--exclusive", "--mem-per-cpu=2048"] # Empty list should be valid - config = SlurmConfig(queue="test", additional_options=[]) + config = SlurmConfig(queue="test", command="python run_model.py", additional_options=[]) assert config.additional_options == [] def test_get_backend_class(self): """Test that get_backend_class returns the correct class.""" - config = SlurmConfig(queue="test") + config = SlurmConfig(queue="test", command="python run_model.py") backend_class = config.get_backend_class() # Should return SlurmRunBackend class @@ -168,14 +172,14 @@ def test_config_examples(self): config = SlurmConfig(**example) assert isinstance(config, SlurmConfig) - def test_required_queue_field(self): - """Test that queue field is required.""" - # Should fail without queue - with pytest.raises(ValidationError, match="Field required"): - SlurmConfig() + def test_queue_field_is_optional(self): + """Test that queue field is optional.""" + # Should work without queue (None) + config = SlurmConfig(command="python run_model.py") + assert config.queue is None # Should work with queue - config = SlurmConfig(queue="general") + config = SlurmConfig(queue="general", command="python run_model.py") assert config.queue == "general" def test_field_boundaries(self): @@ -183,6 +187,7 @@ def test_field_boundaries(self): # Test minimum values config = SlurmConfig( queue="test", + command="python run_model.py", nodes=1, ntasks=1, cpus_per_task=1, @@ -194,6 +199,7 @@ def test_field_boundaries(self): # Test maximum values config = SlurmConfig( queue="test", + command="python run_model.py", nodes=100, # Max nodes cpus_per_task=128, # Max cpus per task ) @@ -202,16 +208,33 @@ def test_field_boundaries(self): # Test out of bounds with pytest.raises(ValidationError): - SlurmConfig(queue="test", nodes=0) # Min nodes is 1 + SlurmConfig(queue="test", command="python run_model.py", nodes=0) # Min nodes is 1 with pytest.raises(ValidationError): - SlurmConfig(queue="test", nodes=101) # Max nodes is 100 + SlurmConfig(queue="test", command="python run_model.py", nodes=101) # Max nodes is 100 with pytest.raises(ValidationError): - SlurmConfig(queue="test", cpus_per_task=0) # Min cpus_per_task is 1 + SlurmConfig(queue="test", command="python run_model.py", cpus_per_task=0) # Min cpus_per_task is 1 with pytest.raises(ValidationError): - SlurmConfig(queue="test", cpus_per_task=129) # Max cpus_per_task is 128 + SlurmConfig(queue="test", command="python run_model.py", cpus_per_task=129) # Max cpus_per_task is 128 + + def test_command_field(self): + """Test the command field 
validation and functionality.""" + # Test with a custom command + config = SlurmConfig( + queue="test", + command="python my_script.py --param value", + ) + assert config.command == "python my_script.py --param value" + + # Test with no command provided - this should now raise an error since command is required + with pytest.raises(ValidationError): + SlurmConfig(queue="test") + + # Test with empty command - this should now raise an error since command is required + with pytest.raises(ValidationError): + SlurmConfig(queue="test", command="") @requires_slurm @@ -239,6 +262,7 @@ def basic_config(self): """Create a basic SlurmConfig.""" return SlurmConfig( queue="general", + command="python run_model.py", timeout=3600, nodes=1, ntasks=1, @@ -331,6 +355,40 @@ def test_create_job_script_with_all_options(self, mock_model_run): if os.path.exists(script_path): os.remove(script_path) + def test_create_job_script_with_command(self, mock_model_run): + """Test the _create_job_script method with command.""" + from rompy.run.slurm import SlurmRunBackend + + # Create a config with a command + config = SlurmConfig( + queue="general", + command="python my_script.py --param value", + nodes=1, + ntasks=1, + cpus_per_task=1, + time_limit="01:00:00", + ) + + backend = SlurmRunBackend() + + with TemporaryDirectory() as staging_dir: + script_path = backend._create_job_script(mock_model_run, config, staging_dir) + + with open(script_path, 'r') as f: + content = f.read() + + # Check that the command is in the script + assert "python my_script.py --param value" in content + # Check that it's properly marked as command execution + assert "# Execute command in the workspace" in content + # Make sure the old model execution is not present + assert "# Execute model using model_run.config.run() method" not in content + + # Clean up + if os.path.exists(script_path): + os.remove(script_path) + + def test_submit_job(self, basic_config): """Test the _submit_job method.""" from rompy.run.slurm import SlurmRunBackend From 1a9f3dd6a6d1c77cbf0500f4682a130c30edb312 Mon Sep 17 00:00:00 2001 From: Ben Leighton Date: Thu, 11 Dec 2025 12:42:09 +1100 Subject: [PATCH 19/24] added missing imports, command parameter is required --- tests/backends/test_slurm_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/backends/test_slurm_backend.py b/tests/backends/test_slurm_backend.py index e502b23..534e896 100644 --- a/tests/backends/test_slurm_backend.py +++ b/tests/backends/test_slurm_backend.py @@ -11,7 +11,8 @@ from pathlib import Path from tempfile import TemporaryDirectory from unittest.mock import MagicMock, mock_open, patch - +import os +import tempfile import pytest from pydantic import ValidationError @@ -309,6 +310,7 @@ def test_create_job_script_with_all_options(self, mock_model_run): ntasks=4, cpus_per_task=8, time_limit="24:00:00", + command="echo 'Test'", account="myproject", qos="high", reservation="special", @@ -589,4 +591,4 @@ def test_run_method_generation_failure(self, mock_model_run, basic_config): result = backend.run(mock_model_run, basic_config) - assert result is False \ No newline at end of file + assert result is False From 7c530c1800e05e691f5ce23bda266c06d72d8780 Mon Sep 17 00:00:00 2001 From: Ben Leighton Date: Thu, 11 Dec 2025 12:51:11 +1100 Subject: [PATCH 20/24] remove duplicates in pyproj toml --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 70a5380..5df356c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-89,10 +89,6 @@ noop = "rompy.postprocess:NoopPostprocessor" local = "rompy.pipeline:LocalPipelineBackend" [project.optional-dependencies] -test = ["pytest", "envyaml", "coverage"] -extra = ["gcsfs", "zarr", "cloudpathlib[s3,gs,azure]"] -dev = ["pytest", "envyaml", "coverage", "ruff", "black"] - test = ["pytest", "envyaml", "coverage"] extra = ["gcsfs", "zarr", "cloudpathlib[s3,gs,azure]"] dev = ["pytest", "envyaml", "coverage", "ruff", "black"] From 594c801b06346be412408ad622e2d9cea75f3e01 Mon Sep 17 00:00:00 2001 From: Ben Leighton Date: Thu, 11 Dec 2025 13:12:47 +1100 Subject: [PATCH 21/24] fixed warning / error on slashes in f string --- src/rompy/run/slurm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py index 8ba354b..6bac941 100644 --- a/src/rompy/run/slurm.py +++ b/src/rompy/run/slurm.py @@ -155,7 +155,7 @@ def _create_job_script( script_path = f.name logger.debug(f"SLURM job script created at: {script_path}") - logger.debug(f"Job script content:\n{'\n'.join(script_lines)}") + logger.debug("Job script content:\n%s", "\n".join(script_lines)) return script_path @@ -316,4 +316,4 @@ def _wait_for_completion(self, job_id: str, config: "SlurmConfig") -> bool: return False except Exception as e: logger.error(f"Unexpected error while monitoring job {job_id}: {e}") - return False \ No newline at end of file + return False From c711630a50d526276ba47eadee7f0f757582d2ba Mon Sep 17 00:00:00 2001 From: Ben Leighton Date: Thu, 11 Dec 2025 13:24:19 +1100 Subject: [PATCH 22/24] fixed tabs to spaces --- src/rompy/run/slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py index 6bac941..42e3d96 100644 --- a/src/rompy/run/slurm.py +++ b/src/rompy/run/slurm.py @@ -155,7 +155,7 @@ def _create_job_script( script_path = f.name logger.debug(f"SLURM job script created at: {script_path}") - logger.debug("Job script content:\n%s", "\n".join(script_lines)) + logger.debug("Job script content:\n%s", "\n".join(script_lines)) return script_path From c2531535279e8f72f3d9e62d00d27df5640a87c9 Mon Sep 17 00:00:00 2001 From: Tom Durrant Date: Fri, 12 Dec 2025 14:59:42 +1100 Subject: [PATCH 23/24] Changed integration tests to unit tests with mocks --- src/rompy/run/slurm.py | 14 ++- tests/backends/test_slurm_backend.py | 180 +++++++++++++++------------ 2 files changed, 111 insertions(+), 83 deletions(-) diff --git a/src/rompy/run/slurm.py b/src/rompy/run/slurm.py index 42e3d96..dea6de4 100644 --- a/src/rompy/run/slurm.py +++ b/src/rompy/run/slurm.py @@ -42,11 +42,15 @@ def run( # Use provided workspace or generate if not provided (for backwards compatibility) if workspace_dir is None: - logger.warning( - "No workspace_dir provided, generating files (this may cause double generation in pipeline)" - ) - staging_dir = model_run.generate() - logger.info(f"Model inputs generated in: {staging_dir}") + try: + logger.warning( + "No workspace_dir provided, generating files (this may cause double generation in pipeline)" + ) + staging_dir = model_run.generate() + logger.info(f"Model inputs generated in: {staging_dir}") + except Exception as e: + logger.exception(f"Model generation failed: {e}") + return False else: logger.info(f"Using provided workspace directory: {workspace_dir}") staging_dir = workspace_dir diff --git a/tests/backends/test_slurm_backend.py b/tests/backends/test_slurm_backend.py index 534e896..40ab471 100644 --- a/tests/backends/test_slurm_backend.py +++ 
b/tests/backends/test_slurm_backend.py @@ -238,7 +238,6 @@ def test_command_field(self): SlurmConfig(queue="test", command="") -@requires_slurm class TestSlurmRunBackend: """Test the SlurmRunBackend class.""" @@ -249,11 +248,8 @@ def mock_model_run(self): model_run.run_id = "test_run_123" model_run.output_dir = Path("/tmp/test_output") - # Create a temporary directory for staging - import tempfile - - temp_dir = tempfile.mkdtemp() - model_run.generate.return_value = temp_dir + # Will be set to a temporary directory by individual tests as needed + # This avoids creating directories that aren't cleaned up model_run.config.run.return_value = True model_run.model_dump.return_value = {"test": "data"} # Mock for serialization return model_run @@ -394,26 +390,34 @@ def test_create_job_script_with_command(self, mock_model_run): def test_submit_job(self, basic_config): """Test the _submit_job method.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - + # Create a simple job script with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: f.write("#!/bin/bash\n#SBATCH --job-name=test\n") script_path = f.name - + try: # Mock subprocess.run to return a successful job submission + # We need to mock multiple subprocess calls: which sbatch, scontrol, and sbatch with patch("subprocess.run") as mock_run: - mock_run.return_value.stdout = "Submitted batch job 12345" - mock_run.return_value.stderr = "" - mock_run.return_value.returncode = 0 - + # Configure the side effect to simulate the sequence of calls in _submit_job + mock_run.side_effect = [ + # First call: which sbatch - return success + MagicMock(returncode=0, stdout="/usr/bin/sbatch"), + # Second call: scontrol --help - return success + MagicMock(returncode=0, stdout="scontrol help text"), + # Third call: sbatch command - return success + MagicMock(returncode=0, stdout="Submitted batch job 12345", stderr="") + ] + job_id = backend._submit_job(script_path) - + assert job_id == "12345" - mock_run.assert_called_once() - + # Check that subprocess.run was called exactly 3 times + assert mock_run.call_count == 3 + finally: # Clean up if os.path.exists(script_path): @@ -422,24 +426,32 @@ def test_submit_job(self, basic_config): def test_submit_job_failure(self, basic_config): """Test the _submit_job method with failure.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - + # Create a simple job script with tempfile.NamedTemporaryFile(mode='w', suffix='.sh', delete=False) as f: f.write("#!/bin/bash\n#SBATCH --job-name=test\n") script_path = f.name - + try: - # Mock subprocess.run to return a failure + # Mock subprocess.run to return a failure during sbatch command with patch("subprocess.run") as mock_run: - mock_run.side_effect = Exception("Submission failed") - + # Mock the sequence of calls but make sbatch fail + mock_run.side_effect = [ + # First call: which sbatch - return success + MagicMock(returncode=0, stdout="/usr/bin/sbatch"), + # Second call: scontrol --help - return success + MagicMock(returncode=0, stdout="scontrol help text"), + # Third call: sbatch command - return failure + subprocess.CalledProcessError(1, "sbatch", stderr="SLURM submission failed") + ] + job_id = backend._submit_job(script_path) - + assert job_id is None - mock_run.assert_called_once() - + assert mock_run.call_count == 3 # All three calls attempted + finally: # Clean up if os.path.exists(script_path): @@ -448,147 +460,159 @@ def test_submit_job_failure(self, basic_config): def 
test_wait_for_completion_completed(self, basic_config): """Test _wait_for_completion method for completed job.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - - # Mock subprocess.run for squeue to return completed state + + # Mock subprocess.run for scontrol to return completed state with patch("subprocess.run") as mock_run: # First call returns running, second returns completed mock_run.side_effect = [ - # Running + # Running state from scontrol MagicMock( - stdout="R\n", + stdout="JobState=RUNNING\nOtherInfo=...", stderr="", returncode=0 ), - # Completed + # Completed state from scontrol MagicMock( - stdout="CD\n", + stdout="JobState=COMPLETED\nOtherInfo=...", stderr="", returncode=0 ) ] - + result = backend._wait_for_completion("12345", basic_config) - + assert result is True assert mock_run.call_count == 2 def test_wait_for_completion_failed(self, basic_config): """Test _wait_for_completion method for failed job.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - - # Mock subprocess.run for squeue to return failed state + + # Mock subprocess.run for scontrol to return failed state with patch("subprocess.run") as mock_run: - mock_result = MagicMock(stdout="F\n", stderr="", returncode=0) + mock_result = MagicMock( + stdout="JobState=FAILED\nOtherInfo=...", + stderr="", + returncode=0 + ) mock_run.return_value = mock_result - + result = backend._wait_for_completion("12345", basic_config) - + assert result is False def test_wait_for_completion_timeout(self): """Test _wait_for_completion method with timeout.""" from rompy.run.slurm import SlurmRunBackend import time - from unittest.mock import ANY - + config = SlurmConfig( queue="test", + command="python run_model.py", # Added required command field timeout=60, # Minimum valid timeout value nodes=1, ntasks=1, cpus_per_task=1, time_limit="01:00:00", ) - + backend = SlurmRunBackend() - - # Use a more advanced approach with time mocking + + # Track the call count to simulate time progression with each call + call_count = 0 initial_time = time.time() + def time_side_effect(): - # Return an increasing time value to simulate timeout - return initial_time + 120 # More than 60s timeout - + # Simulate time progressing 10 seconds per call to trigger timeout faster + nonlocal call_count + call_count += 1 + return initial_time + (call_count * 10) # Increment time by 10s per call + with patch("subprocess.run") as mock_run: with patch("time.time", side_effect=time_side_effect): - # Return running state to avoid early exit due to job completion - mock_result = MagicMock(stdout="R\n", stderr="", returncode=0) - mock_run.return_value = mock_result - - result = backend._wait_for_completion("12345", config) - - # Should return False due to timeout - assert result is False - - # Verify that scancel was called during timeout handling - mock_run.assert_any_call(['scancel', '12345'], check=True, capture_output=True) + with patch("time.sleep"): # Mock time.sleep to avoid actual sleeping + # Mock scontrol to return RUNNING state to simulate a job that keeps running + def scontrol_side_effect(*args, **kwargs): + return MagicMock( + stdout="JobState=RUNNING\nOtherInfo=...", + stderr="", + returncode=0 + ) + + mock_run.side_effect = scontrol_side_effect + + result = backend._wait_for_completion("12345", config) + + # Should return False due to timeout + assert result is False + + # In the original implementation, the timeout was handled without scancel + # so we don't expect scancel to be called - 
@requires_slurm def test_run_method_success(self, mock_model_run, basic_config): """Test the full run method with success.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - + with TemporaryDirectory() as staging_dir: # Mock the internal methods with patch.object(backend, '_create_job_script') as mock_create_script, \ patch.object(backend, '_submit_job') as mock_submit, \ patch.object(backend, '_wait_for_completion') as mock_wait: - + # Mock the methods to return expected values mock_create_script.return_value = "/tmp/job_script.sh" mock_submit.return_value = "12345" mock_wait.return_value = True # Job completed successfully - + # Set up the mock model run to return the staging directory mock_model_run.generate.return_value = staging_dir - + result = backend.run(mock_model_run, basic_config) - + assert result is True mock_create_script.assert_called_once() mock_submit.assert_called_once() mock_wait.assert_called_once_with("12345", basic_config) - @requires_slurm def test_run_method_job_submit_failure(self, mock_model_run, basic_config): """Test the run method when job submission fails.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - + with TemporaryDirectory() as staging_dir: # Mock the internal methods with patch.object(backend, '_create_job_script') as mock_create_script, \ patch.object(backend, '_submit_job') as mock_submit: - + # Mock the methods mock_create_script.return_value = "/tmp/job_script.sh" mock_submit.return_value = None # Submission failed - + # Set up the mock model run mock_model_run.generate.return_value = staging_dir - + result = backend.run(mock_model_run, basic_config) - + assert result is False mock_create_script.assert_called_once() mock_submit.assert_called_once() - @requires_slurm def test_run_method_generation_failure(self, mock_model_run, basic_config): """Test the run method when model generation fails.""" from rompy.run.slurm import SlurmRunBackend - + backend = SlurmRunBackend() - + # Configure mock to raise an exception during generation mock_model_run.generate.side_effect = Exception("Generation failed") - + result = backend.run(mock_model_run, basic_config) - + assert result is False From 023e277183f051b8752d847c37e99d6221c04c2e Mon Sep 17 00:00:00 2001 From: Ben Leighton Date: Mon, 15 Dec 2025 16:21:40 +1100 Subject: [PATCH 24/24] removed gpu requirements, must specify output directory for basic tests due to temp directory disappearing --- examples/backends/05_slurm_backend_run.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/backends/05_slurm_backend_run.py b/examples/backends/05_slurm_backend_run.py index d829ff5..0ab84ca 100644 --- a/examples/backends/05_slurm_backend_run.py +++ b/examples/backends/05_slurm_backend_run.py @@ -54,9 +54,10 @@ def example_slurm_basic(): # Basic SLURM configuration config = SlurmConfig( - queue="general", # SLURM partition name command="python run_model.py", # Command to run in the workspace timeout=1800, # Max execution time in seconds (30 minutes) + output_file="slurm-%j.out", # Output file pattern (job ID) + error_file="slurm-%j.err", # Error file pattern nodes=1, # Number of nodes to allocate ntasks=1, # Number of tasks (processes) to run cpus_per_task=2, # Number of CPU cores per task @@ -115,22 +116,18 @@ def example_slurm_advanced(): # Advanced SLURM configuration with many parameters config = SlurmConfig( - queue="gpu", # GPU partition - command="python run_model.py --gpu", # Command to run in the workspace + 
command="python run_model.py", # Command to run in the workspace timeout=7200, # 2 hours timeout nodes=2, # 2 compute nodes ntasks=8, # 8 tasks total cpus_per_task=4, # 4 CPUs per task time_limit="02:00:00", # 2 hours time limit - account="research_project", # Account for billing - qos="high", # Quality of Service - reservation="special_reservation", # Reservation name output_file="slurm-%j.out", # Output file pattern (job ID) error_file="slurm-%j.err", # Error file pattern job_name="advanced_simulation", # Name of the SLURM job mail_type="BEGIN,END,FAIL", # Types of notifications mail_user="researcher@domain.com", # Email for notifications - additional_options=["--gres=gpu:v100:2", "--exclusive"], # GPU resources + additional_options=["--exclusive"], # GPU resources env_vars={ # Environment variables "OMP_NUM_THREADS": "4", "MODEL_DEBUG": "true", @@ -184,7 +181,6 @@ def example_slurm_with_custom_command(): # SLURM configuration with a custom command config = SlurmConfig( - queue="general", timeout=3600, # 1 hour timeout nodes=1, ntasks=1,