From 82e8b3b70e522a24977921f08ee2aa9b1c813be4 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 13:05:36 +0100 Subject: [PATCH 1/9] Extract base NIXL command args class --- src/cloudai/workloads/common/nixl.py | 9 +++++++++ src/cloudai/workloads/nixl_bench/nixl_bench.py | 12 ++++++------ src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py | 13 +++++-------- .../workloads/nixl_perftest/nixl_perftest.py | 9 +++------ 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index f9b33046e..dba85e48f 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -20,6 +20,7 @@ from pathlib import Path from typing import TYPE_CHECKING +from cloudai.models.workload import CmdArgs from cloudai.systems.slurm import SlurmCommandGenStrategy from cloudai.util.lazy_imports import lazy @@ -27,6 +28,14 @@ import pandas as pd +class NIXLBaseCmdArgs(CmdArgs): + """Command line arguments for a NIXL workloads.""" + + docker_image_url: str + etcd_path: str = "etcd" + wait_etcd_for: int = 60 + + class NIXLCmdGenBase(SlurmCommandGenStrategy): """Base command generation strategy for NIXL-based workloads.""" diff --git a/src/cloudai/workloads/nixl_bench/nixl_bench.py b/src/cloudai/workloads/nixl_bench/nixl_bench.py index 88db43ebb..1f6ecb293 100644 --- a/src/cloudai/workloads/nixl_bench/nixl_bench.py +++ b/src/cloudai/workloads/nixl_bench/nixl_bench.py @@ -17,16 +17,14 @@ from __future__ import annotations from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun -from cloudai.models.workload import CmdArgs, TestDefinition -from cloudai.workloads.common.nixl import extract_nixlbench_data +from cloudai.models.workload import TestDefinition +from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, extract_nixlbench_data -class NIXLBenchCmdArgs(CmdArgs): +class NIXLBenchCmdArgs(NIXLBaseCmdArgs): """Command line arguments for a NIXL 
Bench test.""" - docker_image_url: str path_to_benchmark: str - etcd_path: str = "etcd" etcd_endpoints: str = "http://$NIXL_ETCD_ENDPOINTS" @@ -48,7 +46,9 @@ def installables(self) -> list[Installable]: @property def cmd_args_dict(self) -> dict[str, str | list[str]]: - return self.cmd_args.model_dump(exclude={"docker_image_url", "path_to_benchmark", "cmd_args", "etcd_path"}) + return self.cmd_args.model_dump( + exclude={"docker_image_url", "path_to_benchmark", "cmd_args", "etcd_path", "wait_etcd_for"} + ) def was_run_successful(self, tr: TestRun) -> JobStatusResult: df = extract_nixlbench_data(tr.output_path / "stdout.txt") diff --git a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py index c2020576f..b2b119f8f 100644 --- a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py +++ b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py @@ -18,18 +18,15 @@ from typing import Literal -from cloudai.core import CmdArgs, DockerImage, Installable, JobStatusResult, TestDefinition, TestRun -from cloudai.workloads.common.nixl import extract_nixlbench_data +from cloudai.core import DockerImage, Installable, JobStatusResult, TestDefinition, TestRun +from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, extract_nixlbench_data -class NIXLKVBenchCmdArgs(CmdArgs): - """Command line arguments for NIXLKVBench.""" +class NIXLKVBenchCmdArgs(NIXLBaseCmdArgs): + """Command line arguments for NIXL KVBench.""" command: Literal["profile"] = "profile" - etcd_path: str = "etcd" - wait_etcd_for: int = 60 - docker_image_url: str kvbench_script: str = "/workspace/nixl/benchmark/kvbench/main.py" python_executable: str = "python" @@ -40,7 +37,7 @@ class NIXLKVBenchCmdArgs(CmdArgs): class NIXLKVBenchTestDefinition(TestDefinition): - """Test definition for NIXLKVBench.""" + """Test definition for NIXL KVBench.""" _docker_image: DockerImage | None = None cmd_args: NIXLKVBenchCmdArgs diff --git 
a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py index b96ccd520..b68045683 100644 --- a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py +++ b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py @@ -19,6 +19,7 @@ from pydantic import Field, model_validator from cloudai.core import CmdArgs, DockerImage, Installable, TestDefinition, TestRun +from cloudai.workloads.common.nixl import NIXLBaseCmdArgs class MatgenCmdArgs(CmdArgs): @@ -27,17 +28,13 @@ class MatgenCmdArgs(CmdArgs): ppn: int | None = None -class NixlPerftestCmdArgs(CmdArgs): - """CmdArgs for NixlPerftestTestDefinition.""" - - docker_image_url: str +class NixlPerftestCmdArgs(NIXLBaseCmdArgs): + """CmdArgs for NIXL Perftest.""" subtest: Literal["sequential-ct-perftest"] perftest_script: str = "/workspace/nixl/benchmark/kvbench/main.py" matgen_script: str = "/workspace/nixl/benchmark/kvbench/test/inference_workload_matgen.py" python_executable: str = "python" - etcd_path: str = "etcd" - wait_etcd_for: int = 60 num_user_requests: int | list[int] batch_size: int | list[int] From 17b42ee31cf2397ff7c0590f9ba28eae48e02517 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 13:17:40 +0100 Subject: [PATCH 2/9] Extract common tdef for NIXL workloads --- src/cloudai/workloads/common/nixl.py | 20 +++++++++++++++++- .../workloads/nixl_bench/nixl_bench.py | 20 ++++-------------- .../workloads/nixl_kvbench/nixl_kvbench.py | 19 ++++------------- .../workloads/nixl_perftest/nixl_perftest.py | 21 +++++-------------- 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index dba85e48f..5df3877b0 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -20,7 +20,8 @@ from pathlib import Path from typing import TYPE_CHECKING -from cloudai.models.workload import CmdArgs +from cloudai.core import DockerImage, 
Installable +from cloudai.models.workload import CmdArgs, TestDefinition from cloudai.systems.slurm import SlurmCommandGenStrategy from cloudai.util.lazy_imports import lazy @@ -36,6 +37,23 @@ class NIXLBaseCmdArgs(CmdArgs): wait_etcd_for: int = 60 +class NIXLBaseTestDefinition(TestDefinition): + """Test definition for a NIXL workloads.""" + + cmd_args: NIXLBaseCmdArgs + _nixl_image: DockerImage | None = None + + @property + def docker_image(self) -> DockerImage: + if not self._nixl_image: + self._nixl_image = DockerImage(url=self.cmd_args.docker_image_url) + return self._nixl_image + + @property + def installables(self) -> list[Installable]: + return [self.docker_image, *self.git_repos] + + class NIXLCmdGenBase(SlurmCommandGenStrategy): """Base command generation strategy for NIXL-based workloads.""" diff --git a/src/cloudai/workloads/nixl_bench/nixl_bench.py b/src/cloudai/workloads/nixl_bench/nixl_bench.py index 1f6ecb293..c91f426c8 100644 --- a/src/cloudai/workloads/nixl_bench/nixl_bench.py +++ b/src/cloudai/workloads/nixl_bench/nixl_bench.py @@ -16,9 +16,8 @@ from __future__ import annotations -from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun -from cloudai.models.workload import TestDefinition -from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, extract_nixlbench_data +from cloudai.core import JobStatusResult, TestRun +from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, NIXLBaseTestDefinition, extract_nixlbench_data class NIXLBenchCmdArgs(NIXLBaseCmdArgs): @@ -28,21 +27,10 @@ class NIXLBenchCmdArgs(NIXLBaseCmdArgs): etcd_endpoints: str = "http://$NIXL_ETCD_ENDPOINTS" -class NIXLBenchTestDefinition(TestDefinition): +class NIXLBenchTestDefinition(NIXLBaseTestDefinition): """Test definition for a NIXL Bench test.""" - cmd_args: NIXLBenchCmdArgs - _nixl_image: DockerImage | None = None - - @property - def docker_image(self) -> DockerImage: - if not self._nixl_image: - self._nixl_image = 
DockerImage(url=self.cmd_args.docker_image_url) - return self._nixl_image - - @property - def installables(self) -> list[Installable]: - return [self.docker_image, *self.git_repos] + cmd_args: NIXLBenchCmdArgs # type: ignore[override] @property def cmd_args_dict(self) -> dict[str, str | list[str]]: diff --git a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py index b2b119f8f..638a1ef5c 100644 --- a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py +++ b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py @@ -18,8 +18,8 @@ from typing import Literal -from cloudai.core import DockerImage, Installable, JobStatusResult, TestDefinition, TestRun -from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, extract_nixlbench_data +from cloudai.core import JobStatusResult, TestRun +from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, NIXLBaseTestDefinition, extract_nixlbench_data class NIXLKVBenchCmdArgs(NIXLBaseCmdArgs): @@ -36,21 +36,10 @@ class NIXLKVBenchCmdArgs(NIXLBaseCmdArgs): backend: str | list[str] | None = None -class NIXLKVBenchTestDefinition(TestDefinition): +class NIXLKVBenchTestDefinition(NIXLBaseTestDefinition): """Test definition for NIXL KVBench.""" - _docker_image: DockerImage | None = None - cmd_args: NIXLKVBenchCmdArgs - - @property - def docker_image(self) -> DockerImage: - if not self._docker_image: - self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) - return self._docker_image - - @property - def installables(self) -> list[Installable]: - return [*self.git_repos, self.docker_image] + cmd_args: NIXLKVBenchCmdArgs # type: ignore[override] @property def cmd_args_dict(self) -> dict[str, str | list[str]]: diff --git a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py index b68045683..3a8846a2a 100644 --- a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py +++ b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py @@ 
-14,12 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Literal, Optional +from typing import Literal from pydantic import Field, model_validator -from cloudai.core import CmdArgs, DockerImage, Installable, TestDefinition, TestRun -from cloudai.workloads.common.nixl import NIXLBaseCmdArgs +from cloudai.core import CmdArgs, TestRun +from cloudai.workloads.common.nixl import NIXLBaseCmdArgs, NIXLBaseTestDefinition class MatgenCmdArgs(CmdArgs): @@ -88,21 +88,10 @@ def model_vs_custom(self): return self -class NixlPerftestTestDefinition(TestDefinition): +class NixlPerftestTestDefinition(NIXLBaseTestDefinition): """TestDefinition for NixlPerftest.""" - _docker_image: Optional[DockerImage] = None - cmd_args: NixlPerftestCmdArgs - - @property - def docker_image(self) -> DockerImage: - if not self._docker_image: - self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) - return self._docker_image - - @property - def installables(self) -> list[Installable]: - return [*self.git_repos, self.docker_image] + cmd_args: NixlPerftestCmdArgs # type: ignore[override] def constraint_check(self, tr: TestRun) -> bool: decode_tp = int(tr.test.cmd_args.decode_tp) From 03af93bc5b380772c257a81021c1cceb177ae600 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 13:51:14 +0100 Subject: [PATCH 3/9] Support separate container for etcd --- src/cloudai/workloads/common/nixl.py | 32 +++++++++++++++++-- .../workloads/nixl_bench/nixl_bench.py | 9 +++++- .../nixl_bench/slurm_command_gen_strategy.py | 8 ----- .../test_command_gen_strategy_slurm.py | 10 ++++++ 4 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 5df3877b0..8eacbfe01 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -18,11 +18,12 @@ import logging from functools import cache from 
pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast -from cloudai.core import DockerImage, Installable +from cloudai.core import DockerImage, Installable, TestRun from cloudai.models.workload import CmdArgs, TestDefinition from cloudai.systems.slurm import SlurmCommandGenStrategy +from cloudai.systems.slurm.slurm_system import SlurmSystem from cloudai.util.lazy_imports import lazy if TYPE_CHECKING: @@ -35,6 +36,7 @@ class NIXLBaseCmdArgs(CmdArgs): docker_image_url: str etcd_path: str = "etcd" wait_etcd_for: int = 60 + etcd_image_url: str | None = None class NIXLBaseTestDefinition(TestDefinition): @@ -42,6 +44,7 @@ class NIXLBaseTestDefinition(TestDefinition): cmd_args: NIXLBaseCmdArgs _nixl_image: DockerImage | None = None + _etcd_image: DockerImage | None = None @property def docker_image(self) -> DockerImage: @@ -49,14 +52,32 @@ def docker_image(self) -> DockerImage: self._nixl_image = DockerImage(url=self.cmd_args.docker_image_url) return self._nixl_image + @property + def etcd_image(self) -> DockerImage | None: + if not self.cmd_args.etcd_image_url: + return None + if not self._etcd_image: + self._etcd_image = DockerImage(url=self.cmd_args.etcd_image_url) + return self._etcd_image + @property def installables(self) -> list[Installable]: - return [self.docker_image, *self.git_repos] + installables = [self.docker_image, *self.git_repos] + if self.etcd_image: + installables.append(self.etcd_image) + return installables class NIXLCmdGenBase(SlurmCommandGenStrategy): """Base command generation strategy for NIXL-based workloads.""" + def __init__(self, system: SlurmSystem, test_run: TestRun) -> None: + super().__init__(system, test_run) + self._current_image_url: str | None = None + + def image_path(self) -> str | None: + return self._current_image_url + @property def final_env_vars(self) -> dict[str, str | list[str]]: env_vars = super().final_env_vars @@ -79,6 +100,10 @@ def gen_etcd_srun_command(self, etcd_path: str) -> 
list[str]: '--initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380"', "--initial-cluster-state=new", ] + tdef = cast(NIXLBaseTestDefinition, self.test_run.test) + curr_image = self._current_image_url + if tdef.etcd_image: + self._current_image_url = str(tdef.etcd_image.installed_path) cmd = [ *self.gen_srun_prefix(with_num_nodes=False), f"--output={self.test_run.output_path.absolute() / 'etcd.log'}", @@ -90,6 +115,7 @@ def gen_etcd_srun_command(self, etcd_path: str) -> list[str]: *etcd_cmd, " &", ] + self._current_image_url = curr_image return cmd def gen_wait_for_etcd_command(self, timeout: int = 60) -> list[str]: diff --git a/src/cloudai/workloads/nixl_bench/nixl_bench.py b/src/cloudai/workloads/nixl_bench/nixl_bench.py index c91f426c8..529925dd0 100644 --- a/src/cloudai/workloads/nixl_bench/nixl_bench.py +++ b/src/cloudai/workloads/nixl_bench/nixl_bench.py @@ -35,7 +35,14 @@ class NIXLBenchTestDefinition(NIXLBaseTestDefinition): @property def cmd_args_dict(self) -> dict[str, str | list[str]]: return self.cmd_args.model_dump( - exclude={"docker_image_url", "path_to_benchmark", "cmd_args", "etcd_path", "wait_etcd_for"} + exclude={ + "docker_image_url", + "path_to_benchmark", + "cmd_args", + "etcd_path", + "wait_etcd_for", + "etcd_image_url", + } ) def was_run_successful(self, tr: TestRun) -> JobStatusResult: diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py index 3b97293ef..f9e78fea2 100644 --- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py @@ -26,14 +26,6 @@ class NIXLBenchSlurmCommandGenStrategy(NIXLCmdGenBase): """Command generation strategy for NIXL Bench tests.""" - def __init__(self, system: SlurmSystem, test_run: TestRun) -> None: - super().__init__(system, test_run) - - self._current_image_url: str | None = None - - def image_path(self) -> str | None: - return 
self._current_image_url - def _container_mounts(self) -> list[str]: return [] diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py index 898aa74d7..a806aa362 100644 --- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py @@ -91,6 +91,16 @@ def test_gen_etcd_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem assert "--container-mounts" in cmd +def test_get_etcd_srun_command_with_etcd_image(nixl_bench_tr: TestRun, slurm_system: SlurmSystem): + strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr) + tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, nixl_bench_tr.test) + tdef.cmd_args.etcd_image_url = "docker.io/library/etcd:latest" + + cmd = " ".join(strategy.gen_etcd_srun_command(tdef.cmd_args.etcd_path)) + assert tdef.etcd_image is not None + assert f"--container-image={tdef.etcd_image.installed_path}" in cmd + + @pytest.mark.parametrize( "backend,nnodes,exp_ntasks", [ From d356b49214106f18bc5a80e8d1bf74d712fd76a4 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 14:44:04 +0100 Subject: [PATCH 4/9] Use generics to avoid type override conflict --- src/cloudai/workloads/common/nixl.py | 11 +++++++---- src/cloudai/workloads/nixl_bench/nixl_bench.py | 4 +--- .../nixl_bench/slurm_command_gen_strategy.py | 2 -- src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py | 4 +--- src/cloudai/workloads/nixl_perftest/nixl_perftest.py | 4 +--- 5 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 8eacbfe01..9e7c26bef 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -18,7 +18,7 @@ import logging from functools import cache from pathlib import Path -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, 
Generic, TypeVar, cast from cloudai.core import DockerImage, Installable, TestRun from cloudai.models.workload import CmdArgs, TestDefinition @@ -39,10 +39,13 @@ class NIXLBaseCmdArgs(CmdArgs): etcd_image_url: str | None = None -class NIXLBaseTestDefinition(TestDefinition): +NIXLCmdArgsT = TypeVar("NIXLCmdArgsT", bound=NIXLBaseCmdArgs) + + +class NIXLBaseTestDefinition(TestDefinition, Generic[NIXLCmdArgsT]): """Test definition for a NIXL workloads.""" - cmd_args: NIXLBaseCmdArgs + cmd_args: NIXLCmdArgsT _nixl_image: DockerImage | None = None _etcd_image: DockerImage | None = None @@ -100,7 +103,7 @@ def gen_etcd_srun_command(self, etcd_path: str) -> list[str]: '--initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380"', "--initial-cluster-state=new", ] - tdef = cast(NIXLBaseTestDefinition, self.test_run.test) + tdef = cast(NIXLBaseTestDefinition[NIXLBaseCmdArgs], self.test_run.test) curr_image = self._current_image_url if tdef.etcd_image: self._current_image_url = str(tdef.etcd_image.installed_path) diff --git a/src/cloudai/workloads/nixl_bench/nixl_bench.py b/src/cloudai/workloads/nixl_bench/nixl_bench.py index 529925dd0..faa9b62e3 100644 --- a/src/cloudai/workloads/nixl_bench/nixl_bench.py +++ b/src/cloudai/workloads/nixl_bench/nixl_bench.py @@ -27,11 +27,9 @@ class NIXLBenchCmdArgs(NIXLBaseCmdArgs): etcd_endpoints: str = "http://$NIXL_ETCD_ENDPOINTS" -class NIXLBenchTestDefinition(NIXLBaseTestDefinition): +class NIXLBenchTestDefinition(NIXLBaseTestDefinition[NIXLBenchCmdArgs]): """Test definition for a NIXL Bench test.""" - cmd_args: NIXLBenchCmdArgs # type: ignore[override] - @property def cmd_args_dict(self) -> dict[str, str | list[str]]: return self.cmd_args.model_dump( diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py index f9e78fea2..061c03f7b 100644 --- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py +++ 
b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py @@ -16,8 +16,6 @@ from typing import cast -from cloudai.core import TestRun -from cloudai.systems.slurm import SlurmSystem from cloudai.workloads.common.nixl import NIXLCmdGenBase from .nixl_bench import NIXLBenchTestDefinition diff --git a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py index 638a1ef5c..3d68cbb45 100644 --- a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py +++ b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py @@ -36,11 +36,9 @@ class NIXLKVBenchCmdArgs(NIXLBaseCmdArgs): backend: str | list[str] | None = None -class NIXLKVBenchTestDefinition(NIXLBaseTestDefinition): +class NIXLKVBenchTestDefinition(NIXLBaseTestDefinition[NIXLKVBenchCmdArgs]): """Test definition for NIXL KVBench.""" - cmd_args: NIXLKVBenchCmdArgs # type: ignore[override] - @property def cmd_args_dict(self) -> dict[str, str | list[str]]: return self.cmd_args.model_dump( diff --git a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py index 3a8846a2a..fc77a07ba 100644 --- a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py +++ b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py @@ -88,11 +88,9 @@ def model_vs_custom(self): return self -class NixlPerftestTestDefinition(NIXLBaseTestDefinition): +class NixlPerftestTestDefinition(NIXLBaseTestDefinition[NixlPerftestCmdArgs]): """TestDefinition for NixlPerftest.""" - cmd_args: NixlPerftestCmdArgs # type: ignore[override] - def constraint_check(self, tr: TestRun) -> bool: decode_tp = int(tr.test.cmd_args.decode_tp) decode_nodes = int(tr.test.cmd_args.num_decode_nodes) From 1ffb860c1986c972260b96155b3b93a811dfcc98 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 14:53:53 +0100 Subject: [PATCH 5/9] Fix copyright years --- src/cloudai/workloads/common/nixl.py | 2 +- src/cloudai/workloads/nixl_bench/nixl_bench.py | 2 +- 
src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py | 2 +- src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py | 2 +- src/cloudai/workloads/nixl_perftest/nixl_perftest.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 9e7c26bef..499c90bfd 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/nixl_bench/nixl_bench.py b/src/cloudai/workloads/nixl_bench/nixl_bench.py index faa9b62e3..21352f77c 100644 --- a/src/cloudai/workloads/nixl_bench/nixl_bench.py +++ b/src/cloudai/workloads/nixl_bench/nixl_bench.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py index 061c03f7b..6d0fa51ed 100644 --- a/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/nixl_bench/slurm_command_gen_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py index 3d68cbb45..69640cb13 100644 --- a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py +++ b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py index fc77a07ba..740e0081c 100644 --- a/src/cloudai/workloads/nixl_perftest/nixl_perftest.py +++ b/src/cloudai/workloads/nixl_perftest/nixl_perftest.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 3d94694c8684b609071d037c23d3d7b6a59107e7 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 15:13:33 +0100 Subject: [PATCH 6/9] Update documentation for NIXL workloads --- doc/conf.py | 4 ++++ doc/workloads/nixl_bench.rst | 3 +-- doc/workloads/nixl_kvbench.rst | 3 +-- doc/workloads/nixl_perftest.rst | 3 +-- src/cloudai/workloads/common/nixl.py | 16 ++++++++++++---- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 54055e069..6f2714086 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -75,11 +75,15 @@ def setup(app): "members": True, "member-order": "bysource", "special-members": "__init__", + "inherited-members": "BaseModel", } autodoc_pydantic_model_show_json = False autodoc_pydantic_model_show_field_summary = False autodoc_pydantic_model_show_config_summary = False +autodoc_pydantic_field_list_validators = False +autodoc_pydantic_model_show_validator_summary = False +autodoc_pydantic_model_show_validator_members = False # Generate autosummary even if no references autosummary_generate = True diff --git a/doc/workloads/nixl_bench.rst b/doc/workloads/nixl_bench.rst index 5b7a68ce6..99ba6cb39 100644 --- a/doc/workloads/nixl_bench.rst +++ b/doc/workloads/nixl_bench.rst @@ -64,9 +64,8 @@ API Documentation Command Arguments ~~~~~~~~~~~~~~~~~ -.. autoclass:: cloudai.workloads.nixl_bench.nixl_bench.NIXLBenchCmdArgs +.. autopydantic_model:: cloudai.workloads.nixl_bench.nixl_bench.NIXLBenchCmdArgs :members: - :show-inheritance: Test Definition ~~~~~~~~~~~~~~~ diff --git a/doc/workloads/nixl_kvbench.rst b/doc/workloads/nixl_kvbench.rst index 268372b37..310211e8a 100644 --- a/doc/workloads/nixl_kvbench.rst +++ b/doc/workloads/nixl_kvbench.rst @@ -65,9 +65,8 @@ API Documentation Command Arguments ~~~~~~~~~~~~~~~~~ -.. autoclass:: cloudai.workloads.nixl_kvbench.nixl_kvbench.NIXLKVBenchCmdArgs +.. 
autopydantic_model:: cloudai.workloads.nixl_kvbench.nixl_kvbench.NIXLKVBenchCmdArgs :members: - :show-inheritance: Test Definition ~~~~~~~~~~~~~~~ diff --git a/doc/workloads/nixl_perftest.rst b/doc/workloads/nixl_perftest.rst index 7ba608b2d..b8e158a98 100644 --- a/doc/workloads/nixl_perftest.rst +++ b/doc/workloads/nixl_perftest.rst @@ -75,9 +75,8 @@ API Documentation Command Arguments ~~~~~~~~~~~~~~~~~ -.. autoclass:: cloudai.workloads.nixl_perftest.nixl_perftest.NixlPerftestCmdArgs +.. autopydantic_model:: cloudai.workloads.nixl_perftest.nixl_perftest.NixlPerftestCmdArgs :members: - :show-inheritance: Test Definition ~~~~~~~~~~~~~~~ diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 499c90bfd..b8342d583 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -20,6 +20,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Generic, TypeVar, cast +from pydantic import Field + from cloudai.core import DockerImage, Installable, TestRun from cloudai.models.workload import CmdArgs, TestDefinition from cloudai.systems.slurm import SlurmCommandGenStrategy @@ -33,10 +35,16 @@ class NIXLBaseCmdArgs(CmdArgs): """Command line arguments for a NIXL workloads.""" - docker_image_url: str - etcd_path: str = "etcd" - wait_etcd_for: int = 60 - etcd_image_url: str | None = None + docker_image_url: str = Field(description="URL of the Docker image to use for the benchmark.") + etcd_path: str = Field(default="etcd", description="Path to the etcd executable.") + wait_etcd_for: int = Field(default=60, description="Number of seconds to wait for etcd to become healthy.") + etcd_image_url: str | None = Field( + default=None, + description=( + "Optional URL of the Docker image to use for etcd, by default etcd will be run from the same image " + "as the benchmark." 
+ ), + ) NIXLCmdArgsT = TypeVar("NIXLCmdArgsT", bound=NIXLBaseCmdArgs) From 0d2ff070121127cf92c1b92df8c33b8ef50ce6a4 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 16:11:23 +0100 Subject: [PATCH 7/9] Fix issue with etcd container for kvbench and perftest --- .../nixl_kvbench/slurm_command_gen_strategy.py | 2 ++ .../nixl_perftest/slurm_command_gen_strategy.py | 2 ++ tests/workloads/nixl_kvbench/test_command_gen_slurm.py | 10 ++++++++++ .../nixl_perftest/test_command_gen_strategy_slurm.py | 10 ++++++++++ 4 files changed, 24 insertions(+) diff --git a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py index 9c94956bd..0e2cf747a 100644 --- a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py @@ -38,6 +38,8 @@ def tdef(self) -> NIXLKVBenchTestDefinition: return cast(NIXLKVBenchTestDefinition, self.test_run.test) def image_path(self) -> str | None: + if self._current_image_url is not None: + return self._current_image_url return str(self.tdef.docker_image.installed_path) def _gen_srun_command(self) -> str: diff --git a/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py index 323c98c3c..0c22305b0 100644 --- a/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py @@ -39,6 +39,8 @@ def tdef(self) -> NixlPerftestTestDefinition: return cast(NixlPerftestTestDefinition, self.test_run.test) def image_path(self) -> str | None: + if self._current_image_url is not None: + return self._current_image_url return str(self.tdef.docker_image.installed_path) def _container_mounts(self) -> list[str]: diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py index 
02638b978..ea178854d 100644 --- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py +++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py @@ -99,3 +99,13 @@ def test_gen_kvbench_omits_none_values(kvbench_tr: TestRun, slurm_system: SlurmS # Ensure None-valued args are omitted entirely assert not any(arg.startswith("--op_type ") for arg in cmd) assert not any(arg.startswith("--source ") for arg in cmd) + + +def test_get_etcd_srun_command_with_etcd_image(kvbench_tr: TestRun, slurm_system: SlurmSystem): + strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr) + tdef: NIXLKVBenchTestDefinition = cast(NIXLKVBenchTestDefinition, kvbench_tr.test) + tdef.cmd_args.etcd_image_url = "docker.io/library/etcd:latest" + + cmd = " ".join(strategy.gen_etcd_srun_command(tdef.cmd_args.etcd_path)) + assert tdef.etcd_image is not None + assert f"--container-image={tdef.etcd_image.installed_path}" in cmd diff --git a/tests/workloads/nixl_perftest/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_perftest/test_command_gen_strategy_slurm.py index e4cf21819..859a91663 100644 --- a/tests/workloads/nixl_perftest/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_perftest/test_command_gen_strategy_slurm.py @@ -198,3 +198,13 @@ def test_constraint_check( nixl_perftest.cmd_args.prefill_tp = prefill_tp nixl_perftest.cmd_args.num_prefill_nodes = prefill_nodes assert nixl_perftest.constraint_check(test_run) is res + + +def test_get_etcd_srun_command_with_etcd_image(test_run: TestRun, slurm_system: SlurmSystem): + strategy = NixlPerftestSlurmCommandGenStrategy(slurm_system, test_run) + tdef: NixlPerftestTestDefinition = cast(NixlPerftestTestDefinition, test_run.test) + tdef.cmd_args.etcd_image_url = "docker.io/library/etcd:latest" + + cmd = " ".join(strategy.gen_etcd_srun_command(tdef.cmd_args.etcd_path)) + assert tdef.etcd_image is not None + assert f"--container-image={tdef.etcd_image.installed_path}" in cmd From 
1e26fc4fa3ebb31d75be2898b160df9d56d75dbd Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 16:21:02 +0100 Subject: [PATCH 8/9] Fix copyright years --- .../workloads/nixl_kvbench/slurm_command_gen_strategy.py | 2 +- .../workloads/nixl_perftest/slurm_command_gen_strategy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py index 0e2cf747a..74f9573e2 100644 --- a/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/nixl_kvbench/slurm_command_gen_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py b/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py index 0c22305b0..062878fda 100644 --- a/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From b7b32d73a7a03bff7bd636f2963af0ae28338759 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 19 Feb 2026 16:32:46 +0100 Subject: [PATCH 9/9] Exclude etcd images spec from kvbench cli --- .../workloads/nixl_kvbench/nixl_kvbench.py | 1 + .../nixl_kvbench/test_command_gen_slurm.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py index 69640cb13..c90ed1ce3 100644 --- a/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py +++ b/src/cloudai/workloads/nixl_kvbench/nixl_kvbench.py @@ -49,6 +49,7 @@ def cmd_args_dict(self) -> dict[str, str | list[str]]: "wait_etcd_for", "docker_image_url", "command", + "etcd_image_url", }, ) diff --git a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py index ea178854d..e9c595828 100644 --- a/tests/workloads/nixl_kvbench/test_command_gen_slurm.py +++ b/tests/workloads/nixl_kvbench/test_command_gen_slurm.py @@ -101,6 +101,21 @@ def test_gen_kvbench_omits_none_values(kvbench_tr: TestRun, slurm_system: SlurmS assert not any(arg.startswith("--source ") for arg in cmd) +def test_gen_kvbench_command_includes_etcd_endpoints(kvbench_tr: TestRun, slurm_system: SlurmSystem): + kvbench_tr.test.cmd_args = NIXLKVBenchCmdArgs.model_validate( + { + "docker_image_url": "docker://image/url", + "etcd_image_url": "docker://etcd/url", + "model": "./model.yaml", + } + ) + cmd_gen = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr) + cmd = cmd_gen.gen_kvbench_command() + + assert "etcd_image_url" not in " ".join(cmd) + assert "docker://etcd/url" not in " ".join(cmd) + + def test_get_etcd_srun_command_with_etcd_image(kvbench_tr: TestRun, slurm_system: SlurmSystem): strategy = NIXLKVBenchSlurmCommandGenStrategy(slurm_system, kvbench_tr) tdef: 
NIXLKVBenchTestDefinition = cast(NIXLKVBenchTestDefinition, kvbench_tr.test)