diff --git a/src/cloudai/workloads/common/nixl.py b/src/cloudai/workloads/common/nixl.py index f9b33046e..859c8e919 100644 --- a/src/cloudai/workloads/common/nixl.py +++ b/src/cloudai/workloads/common/nixl.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -80,7 +80,7 @@ def gen_wait_for_etcd_command(self, timeout: int = 60) -> list[str]: def gen_kill_and_wait_cmd(self, pid_var: str, timeout: int = 60) -> list[str]: cmd = [ - f"kill -9 ${pid_var}\n", + f"kill -TERM ${pid_var}\n", "timeout", str(timeout), "bash", diff --git a/tests/ref_data/nixl-kvbench.sbatch b/tests/ref_data/nixl-kvbench.sbatch index 1d0245789..817b0eacb 100644 --- a/tests/ref_data/nixl-kvbench.sbatch +++ b/tests/ref_data/nixl-kvbench.sbatch @@ -25,8 +25,8 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" & sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" -kill -9 $etcd_pid +kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 - } \ No newline at end of file + } diff --git a/tests/ref_data/nixl-perftest.sbatch b/tests/ref_data/nixl-perftest.sbatch index 659cc6e35..6b4d2ba88 100644 --- a/tests/ref_data/nixl-perftest.sbatch +++ b/tests/ref_data/nixl-perftest.sbatch @@ -24,7 +24,7 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; exit 1 } srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python /workspace/nixl/benchmark/kvbench/main.py sequential-ct-perftest __OUTPUT_DIR__/output/matrices/metadata.yaml --json-output-path=__OUTPUT_DIR__/output/results.json " -kill -9 $etcd_pid +kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 diff --git a/tests/ref_data/nixl_bench.sbatch b/tests/ref_data/nixl_bench.sbatch index 4ef1f0c12..9bde93c75 100644 --- a/tests/ref_data/nixl_bench.sbatch +++ b/tests/ref_data/nixl_bench.sbatch @@ -25,8 +25,8 @@ timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" & sleep 15 srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" -kill -9 $etcd_pid +kill -TERM $etcd_pid timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || { echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds"; exit 1 - } \ No newline at end of file + } diff --git a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py index 898aa74d7..3c6d3e766 100644 --- a/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py +++ b/tests/workloads/nixl_bench/test_command_gen_strategy_slurm.py @@ -143,7 +143,7 @@ def test_gen_kill_and_wait_cmd(nixl_bench_tr: TestRun, slurm_system: SlurmSystem strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, nixl_bench_tr) cmd = strategy.gen_kill_and_wait_cmd("PID", timeout=120) assert cmd == [ - "kill -9 $PID\n", + "kill -TERM $PID\n", "timeout", "120", "bash",