From eb318e0603e3104591edb982513cf81a53f32a9c Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 19 Feb 2026 05:34:26 -0800 Subject: [PATCH 1/5] gracefully killing etcd --- src/cloudai/workloads/common/nixl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index f9b33046e..a2bee73c4 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -80,7 +80,7 @@ def gen_wait_for_etcd_command(self, timeout: int = 60) -> list[str]: def gen_kill_and_wait_cmd(self, pid_var: str, timeout: int = 60) -> list[str]: cmd = [ - f"kill -9 ${pid_var}\n", + f"kill -TERM ${pid_var}\n", "timeout", str(timeout), "bash", From 263ebe4227075e4dd370a89ceb90b673d3d37a17 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 19 Feb 2026 14:40:26 +0100 Subject: [PATCH 2/5] adjusted tests --- tests/ref_data/nixl-kvbench.sbatch | 2 +- tests/ref_data/nixl-perftest.sbatch | 2 +- tests/ref_data/nixl_bench.sbatch | 2 +- tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ref_data/nixl-kvbench.sbatch b/tests/ref_data/nixl-kvbench.sbatch index 1d0245789..88edc23c6 100644 --- a/tests/ref_data/nixl-kvbench.sbatch +++ b/tests/ref_data/nixl-kvbench.sbatch @@ -25,7 +25,7 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" & sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" -kill -9 $etcd_pid +kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 diff --git a/tests/ref_data/nixl-perftest.sbatch b/tests/ref_data/nixl-perftest.sbatch index 659cc6e35..6b4d2ba88 100644 --- a/tests/ref_data/nixl-perftest.sbatch +++ b/tests/ref_data/nixl-perftest.sbatch @@ -24,7 +24,7 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; exit 1 } srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python /workspace/nixl/benchmark/kvbench/main.py sequential-ct-perftest __OUTPUT_DIR__/output/matrices/metadata.yaml --json-output-path=__OUTPUT_DIR__/output/results.json " -kill -9 $etcd_pid +kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 diff --git a/tests/ref_data/nixl_bench.sbatch b/tests/ref_data/nixl_bench.sbatch index 4ef1f0c12..ff7c44f5d 100644 --- a/tests/ref_data/nixl_bench.sbatch +++ b/tests/ref_data/nixl_bench.sbatch @@ -25,7 +25,7 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" & sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" -kill -9 $etcd_pid +kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py index 898aa74d7..3c6d3e766 100644 --- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py @@ -143,7 +143,7 @@ def test_gen_kill_and_wait_cmd(nixl_bench_tr: TestRun, slurm_system: SlurmSystem strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr) cmd = strategy.gen_kill_and_wait_cmd("PID", timeout=120) assert cmd == [ - "kill -9 $PID\n", + "kill -TERM $PID\n", "timeout", "120", "bash", From 5d378c0dc6dae0ff514df57a41ea9621e98afea6 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 19 Feb 2026 14:50:11 +0100 Subject: [PATCH 3/5] update copyright --- src/cloudai/workloads/common/nixl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index a2bee73c4..859c8e919 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 842449c0aa1b4ac1528af9ff11faf00a647fd793 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Thu, 19 Feb 2026 18:31:59 +0100 Subject: [PATCH 4/5] redefine kill-wait flow --- src/cloudai/workloads/common/nixl.py | 11 +++-------- tests/ref_data/nixl-kvbench.sbatch | 4 ++-- tests/ref_data/nixl-perftest.sbatch | 2 +- tests/ref_data/nixl_bench.sbatch | 4 ++-- .../test_command_gen_strategy_slurm.py | 16 ---------------- 5 files changed, 8 insertions(+), 29 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 859c8e919..2ad7c17c2 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -79,18 +79,13 @@ def gen_wait_for_etcd_command(self, timeout: int = 60) -> list[str]: return cmd def gen_kill_and_wait_cmd(self, pid_var: str, timeout: int = 60) -> list[str]: - cmd = [ + return [ f"kill -TERM ${pid_var}\n", - "timeout", - str(timeout), - "bash", - "-c", - f'"while kill -0 ${pid_var} 2>/dev/null; do sleep 1; done" || {{\n', + f"timeout {timeout} wait ${pid_var} || {{\n", f' echo "Failed to kill ETCD (pid=${pid_var}) within {timeout} seconds";\n', " exit 1\n", - "}", + "}\n", ] - return cmd def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list[list[str]]: prefix_part = self.gen_srun_prefix(with_num_nodes=False) diff --git a/tests/ref_data/nixl-kvbench.sbatch b/tests/ref_data/nixl-kvbench.sbatch index 88edc23c6..5cdaa1e90 100644 --- a/tests/ref_data/nixl-kvbench.sbatch +++ b/tests/ref_data/nixl-kvbench.sbatch @@ -26,7 +26,7 @@ srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mo sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" kill -TERM $etcd_pid - timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { + timeout 60 wait $etcd_pid || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 - } \ No newline at end of file + } diff --git a/tests/ref_data/nixl-perftest.sbatch b/tests/ref_data/nixl-perftest.sbatch index 6b4d2ba88..962733d51 100644 --- a/tests/ref_data/nixl-perftest.sbatch +++ b/tests/ref_data/nixl-perftest.sbatch @@ -25,7 +25,7 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; } srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python /workspace/nixl/benchmark/kvbench/main.py sequential-ct-perftest __OUTPUT_DIR__/output/matrices/metadata.yaml --json-output-path=__OUTPUT_DIR__/output/results.json " kill -TERM $etcd_pid - timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { + timeout 60 wait $etcd_pid || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 } \ No newline at end of file diff --git a/tests/ref_data/nixl_bench.sbatch b/tests/ref_data/nixl_bench.sbatch index ff7c44f5d..c0d088881 100644 --- a/tests/ref_data/nixl_bench.sbatch +++ b/tests/ref_data/nixl_bench.sbatch @@ -26,7 +26,7 @@ srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-moun sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" kill -TERM $etcd_pid - timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { + timeout 60 wait $etcd_pid || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 - } \ No newline at end of file + } diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py index 3c6d3e766..e9c6e93d3 100644 --- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py @@ -137,19 +137,3 @@ def test_gen_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem): " exit 1\n", "}", ] - - -def test_gen_kill_and_wait_cmd(nixl_bench_tr: TestRun, slurm_system: SlurmSystem) -> None: - strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr) - cmd = strategy.gen_kill_and_wait_cmd("PID", timeout=120) - assert cmd == [ - "kill -TERM $PID\n", - "timeout", - "120", - "bash", - "-c", - '"while kill -0 $PID 2>/dev/null; do sleep 1; done" || {\n', - ' echo "Failed to kill ETCD (pid=$PID) within 120 seconds";\n', - " exit 1\n", - "}", - ] From 256ff5047f0db9d05b54cf89c621b64fb406f4c5 Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Fri, 20 Feb 2026 16:54:38 +0100 Subject: [PATCH 5/5] revert wait solution --- src/cloudai/workloads/common/nixl.py | 11 ++++++++--- tests/ref_data/nixl-kvbench.sbatch | 2 +- tests/ref_data/nixl-perftest.sbatch | 2 +- tests/ref_data/nixl_bench.sbatch | 2 +- .../test_command_gen_strategy_slurm.py | 16 ++++++++++++++++ 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index 2ad7c17c2..859c8e919 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -79,13 +79,18 @@ def gen_wait_for_etcd_command(self, timeout: int = 60) -> list[str]: return cmd def gen_kill_and_wait_cmd(self, pid_var: str, timeout: int = 60) -> list[str]: - return [ + cmd = [ f"kill -TERM ${pid_var}\n", - f"timeout {timeout} wait ${pid_var} || {{\n", + "timeout", + str(timeout), + "bash", + "-c", + f'"while kill -0 ${pid_var} 2>/dev/null; do sleep 1; done" || {{\n', f' echo "Failed to kill ETCD (pid=${pid_var}) within {timeout} seconds";\n', " exit 1\n", - "}\n", + "}", ] + return cmd def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list[list[str]]: prefix_part = self.gen_srun_prefix(with_num_nodes=False) diff --git a/tests/ref_data/nixl-kvbench.sbatch b/tests/ref_data/nixl-kvbench.sbatch index 5cdaa1e90..817b0eacb 100644 --- a/tests/ref_data/nixl-kvbench.sbatch +++ b/tests/ref_data/nixl-kvbench.sbatch @@ -26,7 +26,7 @@ srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mo sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" kill -TERM $etcd_pid - timeout 60 wait $etcd_pid || { + timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 } diff --git a/tests/ref_data/nixl-perftest.sbatch b/tests/ref_data/nixl-perftest.sbatch index 962733d51..6b4d2ba88 100644 --- a/tests/ref_data/nixl-perftest.sbatch +++ b/tests/ref_data/nixl-perftest.sbatch @@ -25,7 +25,7 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; } srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python /workspace/nixl/benchmark/kvbench/main.py sequential-ct-perftest __OUTPUT_DIR__/output/matrices/metadata.yaml --json-output-path=__OUTPUT_DIR__/output/results.json " kill -TERM $etcd_pid - timeout 60 wait $etcd_pid || { + timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 } \ No newline at end of file diff --git a/tests/ref_data/nixl_bench.sbatch b/tests/ref_data/nixl_bench.sbatch index c0d088881..9bde93c75 100644 --- a/tests/ref_data/nixl_bench.sbatch +++ b/tests/ref_data/nixl_bench.sbatch @@ -26,7 +26,7 @@ srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-moun sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" kill -TERM $etcd_pid - timeout 60 wait $etcd_pid || { + timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 } diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py index e9c6e93d3..3c6d3e766 100644 --- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py @@ -137,3 +137,19 @@ def test_gen_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem): " exit 1\n", "}", ] + + +def test_gen_kill_and_wait_cmd(nixl_bench_tr: TestRun, slurm_system: SlurmSystem) -> None: + strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr) + cmd = strategy.gen_kill_and_wait_cmd("PID", timeout=120) + assert cmd == [ + "kill -TERM $PID\n", + "timeout", + "120", + "bash", + "-c", + '"while kill -0 $PID 2>/dev/null; do sleep 1; done" || {\n', + ' echo "Failed to kill ETCD (pid=$PID) within 120 seconds";\n', + " exit 1\n", + "}", + ]