From 7b912e8f4aa63ec061cca389e45b8bb3e43eb104 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Thu, 26 Feb 2026 05:40:34 +0000 Subject: [PATCH 01/15] Update NVIDIA GPT-OSS vLLM image from v0.15.1 to v0.16.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump vllm/vllm-openai image tag for all 3 NVIDIA GPT-OSS configs (B200, H100, H200). All existing BKC flags preserved — no config changes beyond the image tag. v0.16.0 notable changes for GPT-OSS/MXFP4: - Async scheduling + pipeline parallelism (30.8% throughput improvement) - New MXFP4 backends: SM90 FlashInfer BF16, SM100 CUTLASS - MoE cold start optimization - Triton backend now default non-FlashInfer fallback on SM90/SM100 Closes #798 Co-authored-by: Cameron Quilici --- .github/configs/nvidia-master.yaml | 6 +++--- perf-changelog.yaml | 12 +++++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6885f36cb..539455009 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3076,7 +3076,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4} gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.16.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -3107,7 +3107,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.16.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -3386,7 +3386,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.15.1 + image: vllm/vllm-openai:v0.16.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dbb3abc88..afd370e4c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -723,7 +723,17 @@ - "Gains: CUTLASS MoE optimizations (~8% throughput), FP4 kernel improvements (~4% E2E on B200), torch.compile cold-start fix" - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 - + +- config-keys: + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm + description: + - "Update vLLM image from v0.15.1 to v0.16.0 for NVIDIA GPT-OSS configs" + - "All existing BKC flags preserved (VLLM_MXFP4_USE_MARLIN=1 on H100/H200, VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 on B200)" + - "v0.16.0 key changes: async scheduling + PP (30.8% throughput), new MXFP4 backends (SM90 FlashInfer BF16, SM100 CUTLASS), MoE cold start optimization" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp From d3e5c265be3f588a7adf715bca7d021fcc4399cc Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 25 Feb 2026 23:44:46 -0600 Subject: [PATCH 02/15] Update perf-changelog.yaml with new vLLM details Removed outdated configuration entries and added new vLLM image update details for NVIDIA GPT-OSS. Updated pull request links for changes. --- perf-changelog.yaml | 62 ++++++--------------------------------------- 1 file changed, 8 insertions(+), 54 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index afd370e4c..41ac32f5f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -724,16 +724,6 @@ - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 -- config-keys: - - gptoss-fp4-b200-vllm - - gptoss-fp4-h100-vllm - - gptoss-fp4-h200-vllm - description: - - "Update vLLM image from v0.15.1 to v0.16.0 for NVIDIA GPT-OSS configs" - - "All existing BKC flags preserved (VLLM_MXFP4_USE_MARLIN=1 on H100/H200, VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 on B200)" - - "v0.16.0 key changes: async scheduling + PP (30.8% throughput), new MXFP4 backends (SM90 FlashInfer BF16, SM100 CUTLASS), MoE cold start optimization" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX - - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp @@ -846,48 +836,12 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 - config-keys: - - dsr1-fp8-mi355x-sglang-disagg - - dsr1-fp8-mi355x-sglang-disagg-mtp - - dsr1-fp4-mi355x-sglang-disagg - - dsr1-fp4-mi355x-sglang-disagg-mtp - description: - - "Add more sweep configs for MI355X FP8/FP4 Disagg" - - "Add TP/DP/EP size < 8 support " - - "Support DSR1-0528 MTP Disagg" - - "Bump SGL mori image to Feb 27" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 - -- config-keys: - - minimaxm2.5-fp8-h100-vllm - description: - - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" - - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" - - "Image: vllm/vllm-openai:v0.16.0" - - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" - - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 - -- config-keys: - - qwen3.5-fp8-b200-sglang - description: - - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" - - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" - - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" - - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 - -- config-keys: - - qwen3.5-fp8-h200-sglang - description: - - "Add Qwen 3.5 FP8 H200 SGLang configuration" - - "Model: Qwen/Qwen3.5-397B-A17B-FP8, runner: h200, image: lmsysorg/sglang:v0.5.8-cu130-amd64" - - "Benchmark script: benchmarks/single_node/qwen3.5_fp8_h200.sh" - - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 - -- config-keys: - - dsr1-fp8-mi355x-sglang + - gptoss-fp4-b200-vllm + - gptoss-fp4-h100-vllm + - gptoss-fp4-h200-vllm description: - - "Expanding TP search space" - - "Adding kv-cache-fp8" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 + - "Update vLLM image from v0.15.1 to v0.16.0 for NVIDIA GPT-OSS configs" + - "All existing BKC flags preserved (VLLM_MXFP4_USE_MARLIN=1 on H100/H200, VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 on B200)" + - "v0.16.0 key changes: async scheduling + PP (30.8% throughput), new MXFP4 backends (SM90 FlashInfer BF16, SM100 CUTLASS), MoE cold start optimization" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/800 + From e4863aded04880208a0bce1335e9b393216b524e Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Fri, 6 Mar 2026 12:42:52 -0800 Subject: [PATCH 03/15] update container image --- .github/configs/nvidia-master.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 539455009..443c2e312 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3076,7 +3076,7 @@ gptoss-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 4} gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.16.0-cu130 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -3107,7 +3107,7 @@ gptoss-fp4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.16.0-cu130 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -3386,7 +3386,7 @@ gptoss-fp4-h200-trt: - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.16.0 + image: vllm/vllm-openai:v0.16.0-cu130 model: openai/gpt-oss-120b model-prefix: gptoss runner: h200 From 83612e73b284e9c12c069ee030b01dc3662089a6 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Fri, 6 Mar 2026 12:47:31 -0800 Subject: [PATCH 04/15] add stream-interval --- benchmarks/single_node/gptoss_fp4_b200.sh | 1 + benchmarks/single_node/gptoss_fp4_h100.sh | 1 + benchmarks/single_node/gptoss_fp4_h200.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index 1d9e727ce..d21114105 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -51,6 +51,7 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ +--stream-interval 20 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 7d9cec06f..eeeda0ecd 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -35,6 +35,7 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ +--stream-interval 20 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index 3d945df42..cfbba7618 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -48,6 +48,7 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs $CONC \ + --stream-interval 20 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! From 8d682607a63e70843e0519b0bff50c14ce3db1ce Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Fri, 6 Mar 2026 13:00:33 -0800 Subject: [PATCH 05/15] update perf-changelog --- perf-changelog.yaml | 51 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 41ac32f5f..4687d289a 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -723,7 +723,7 @@ - "Gains: CUTLASS MoE optimizations (~8% throughput), FP4 kernel improvements (~4% E2E on B200), torch.compile cold-start fix" - "v0.15.1 includes fix for prefix cache hit rate of 0% on GPT-OSS hybrid attention models" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/789 - + - config-keys: - dsr1-fp4-mi355x-atom - dsr1-fp4-mi355x-atom-mtp @@ -835,12 +835,59 @@ - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support " + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 + +- config-keys: + - minimaxm2.5-fp8-h100-vllm + description: + - "Add MiniMax-M2.5 FP8 vLLM benchmark for H100" + - "Model: MiniMaxAI/MiniMax-M2.5 with --trust-remote-code" + - "Image: vllm/vllm-openai:v0.16.0" + - "Switch from TP=8/EP=8 to TP=4/EP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k" + - "Script uses conditional --enable-expert-parallel based on EP_SIZE env var" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/832 + +- config-keys: + - qwen3.5-fp8-b200-sglang + description: + - "Add Qwen3.5-397B-A17B-FP8 SGLang benchmark configuration for B200" + - "Image: lmsysorg/sglang:v0.5.9-cu129-amd64" + - "Uses trtllm_mha attention backend and flashinfer_trtllm MOE runner" + - "Enable SGLANG_ENABLE_FLASHINFER_GEMM=true, NCCL_NVLS_ENABLE=1" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/804 + +- config-keys: + - qwen3.5-fp8-h200-sglang + description: + - "Add Qwen 3.5 FP8 H200 SGLang configuration" + - "Model: Qwen/Qwen3.5-397B-A17B-FP8, runner: h200, image: lmsysorg/sglang:v0.5.8-cu130-amd64" + - "Benchmark script: benchmarks/single_node/qwen3.5_fp8_h200.sh" + - "Server: reasoning-parser qwen3, tool-call-parser qwen3_coder, enable-flashinfer-allreduce-fusion, mem-fraction-static 0.8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/855 + +- config-keys: + - dsr1-fp8-mi355x-sglang + description: + - "Expanding TP search space" + - "Adding kv-cache-fp8" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 + - config-keys: - gptoss-fp4-b200-vllm - gptoss-fp4-h100-vllm - gptoss-fp4-h200-vllm description: - - "Update vLLM image from v0.15.1 to v0.16.0 for NVIDIA GPT-OSS configs" + - "Update vLLM image from v0.15.1 to v0.16.0-cu130 for NVIDIA GPT-OSS configs" - "All existing BKC flags preserved (VLLM_MXFP4_USE_MARLIN=1 on H100/H200, VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 on B200)" - "v0.16.0 key changes: async scheduling + PP (30.8% throughput), new MXFP4 backends (SM90 FlashInfer BF16, SM100 CUTLASS), MoE cold start optimization" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/800 From 8c2010701e3e5695a97359955f1b635d9a5e98f1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 15:19:57 -0600 Subject: [PATCH 06/15] add flock for h100 --- runners/launch_h100-dgxc-slurm.sh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index bb0335955..85552a0d4 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -230,16 +230,22 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + LOCK_FILE="${SQUASH_FILE}.lock" salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then - echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." - srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - fi + # Use flock to serialize concurrent imports to the same squash file + srun --jobid=$JOB_ID bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ From d7c0a471e626897791d537964d36c57ec00642a7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 15:21:04 -0600 Subject: [PATCH 07/15] add flock for cw runners as well --- runners/launch_h100-cw.sh | 15 ++++++--------- runners/launch_h200-cw.sh | 9 +++++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 49a42e981..31f808e87 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -2,20 +2,17 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" -SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" -set -x - -JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') +salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell +JOB_ID=$(squeue -u $USER -h -o %A | head -n1) -if [ -z "$JOB_ID" ]; then - echo "ERROR: salloc failed to allocate a job" - exit 1 -fi +SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) +set -x # Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " +srun --jobid=$JOB_ID bash -c " exec 9>\"$LOCK_FILE\" flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 657f84792..e458200cc 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -8,9 +8,14 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" -SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" +SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) + +salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell +JOB_ID=$(squeue -u $USER -h -o %A | head -n1) + set -x JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') @@ -25,7 +30,7 @@ if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then CONTAINER_IMAGE=$IMAGE else # Use flock to serialize concurrent imports to the same squash file - srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + srun --jobid=$JOB_ID bash -c " exec 9>\"$LOCK_FILE\" flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then From 320bb8ff19a112aead0336728c827ad9d1453f2e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 15:31:20 -0600 Subject: [PATCH 08/15] add flock for cw runners as well pt 2 --- runners/launch_h100-cw.sh | 3 ++- runners/launch_h100-dgxc-slurm.sh | 18 ++++++------------ runners/launch_h200-cw.sh | 3 ++- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 31f808e87..d5625e9c9 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -3,7 +3,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" +LOCK_FILE="/mnt/vast/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh.lock" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) @@ -13,6 +13,7 @@ SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) set -x # Use flock to serialize concurrent imports to the same squash file srun --jobid=$JOB_ID bash -c " + (umask 0000 && touch \"$LOCK_FILE\" && chmod 666 \"$LOCK_FILE\") 2>/dev/null || true exec 9>\"$LOCK_FILE\" flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index 85552a0d4..bb0335955 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -230,22 +230,16 @@ else HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - LOCK_FILE="${SQUASH_FILE}.lock" salloc --exclude="$SLURM_EXCLUDED_NODELIST" --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - # Use flock to serialize concurrent imports to the same squash file - srun --jobid=$JOB_ID bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE - fi - " + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then + echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." + srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + fi srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index e458200cc..27f8035ed 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -9,7 +9,7 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="${SQUASH_FILE}.lock" +LOCK_FILE="/mnt/vast/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh.lock" SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) @@ -31,6 +31,7 @@ if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then else # Use flock to serialize concurrent imports to the same squash file srun --jobid=$JOB_ID bash -c " + (umask 0000 && touch \"$LOCK_FILE\" && chmod 666 \"$LOCK_FILE\") 2>/dev/null || true exec 9>\"$LOCK_FILE\" flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then From b13abb5eb48d6900a0b39c7a17086d3392a0e9e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 15:57:14 -0600 Subject: [PATCH 09/15] cw h100 and h200 require separate squash for users --- runners/launch_h100-cw.sh | 21 +++++++-------------- runners/launch_h200-cw.sh | 21 +++++++-------------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index d5625e9c9..3bbafcaab 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -2,8 +2,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="/mnt/vast/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh.lock" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1}.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) @@ -11,18 +10,12 @@ JOB_ID=$(squeue -u $USER -h -o %A | head -n1) SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) set -x -# Use flock to serialize concurrent imports to the same squash file -srun --jobid=$JOB_ID bash -c " - (umask 0000 && touch \"$LOCK_FILE\" && chmod 666 \"$LOCK_FILE\") 2>/dev/null || true - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE - fi -" +srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" +if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then + echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." + srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" +fi srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 27f8035ed..876c4bde6 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -8,8 +8,7 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -LOCK_FILE="/mnt/vast/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh.lock" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1}.sqsh" SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) @@ -29,18 +28,12 @@ fi if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then CONTAINER_IMAGE=$IMAGE else - # Use flock to serialize concurrent imports to the same squash file - srun --jobid=$JOB_ID bash -c " - (umask 0000 && touch \"$LOCK_FILE\" && chmod 666 \"$LOCK_FILE\") 2>/dev/null || true - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE - fi - " + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then + echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." + srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + fi CONTAINER_IMAGE=$(realpath $SQUASH_FILE) fi From 37381b73cb2773bd0a6a2f2f5a3c09b4b828e372 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 16:12:06 -0600 Subject: [PATCH 10/15] cw update --- runners/launch_h100-cw.sh | 2 +- runners/launch_h200-cw.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 3bbafcaab..4c2a318f0 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -2,7 +2,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1}.sqsh" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER}.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 876c4bde6..8653c4f2c 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -8,7 +8,7 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER: -1}.sqsh" +SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER}.sqsh" SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) From 4ca13fcc9e4d42fa492a457456dc370651649646 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 16:23:03 -0600 Subject: [PATCH 11/15] cw update pt 2 --- benchmarks/single_node/gptoss_fp4_b200.sh | 1 + benchmarks/single_node/gptoss_fp4_b200_trt.sh | 1 + benchmarks/single_node/gptoss_fp4_h100.sh | 1 + 3 files changed, 3 insertions(+) diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index d21114105..5f7a20785 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -16,6 +16,7 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi +curl -LsSf https://hf.co/cli/install.sh | bash hf download "$MODEL" nvidia-smi diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/gptoss_fp4_b200_trt.sh index 30547a090..7162c3d00 100644 --- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh +++ b/benchmarks/single_node/gptoss_fp4_b200_trt.sh @@ -24,6 +24,7 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" +curl -LsSf https://hf.co/cli/install.sh | bash hf download $MODEL SERVER_LOG=/workspace/server.log diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index eeeda0ecd..bec8cebd2 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -15,6 +15,7 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi +curl -LsSf https://hf.co/cli/install.sh | bash hf download "$MODEL" cat > config.yaml << EOF From 7e64f05977a50dc10fa42d07e6c3f2a84b28beef Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 17:01:02 -0600 Subject: [PATCH 12/15] reverting hf download --- benchmarks/single_node/gptoss_fp4_b200.sh | 2 -- benchmarks/single_node/gptoss_fp4_b200_trt.sh | 1 - benchmarks/single_node/gptoss_fp4_h100.sh | 2 -- benchmarks/single_node/gptoss_fp4_h200.sh | 1 - 4 files changed, 6 deletions(-) diff --git a/benchmarks/single_node/gptoss_fp4_b200.sh b/benchmarks/single_node/gptoss_fp4_b200.sh index 5f7a20785..1d9e727ce 100644 --- a/benchmarks/single_node/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/gptoss_fp4_b200.sh @@ -16,7 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -curl -LsSf https://hf.co/cli/install.sh | bash hf download "$MODEL" nvidia-smi @@ -52,7 +51,6 @@ vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs 512 \ ---stream-interval 20 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_b200_trt.sh b/benchmarks/single_node/gptoss_fp4_b200_trt.sh index 7162c3d00..30547a090 100644 --- a/benchmarks/single_node/gptoss_fp4_b200_trt.sh +++ b/benchmarks/single_node/gptoss_fp4_b200_trt.sh @@ -24,7 +24,6 @@ fi echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" -curl -LsSf https://hf.co/cli/install.sh | bash hf download $MODEL SERVER_LOG=/workspace/server.log diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index bec8cebd2..7d9cec06f 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -curl -LsSf https://hf.co/cli/install.sh | bash hf download "$MODEL" cat > config.yaml << EOF @@ -36,7 +35,6 @@ vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ --gpu-memory-utilization=0.9 \ --tensor-parallel-size=$TP \ --max-num-seqs=$CONC \ ---stream-interval 20 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/gptoss_fp4_h200.sh b/benchmarks/single_node/gptoss_fp4_h200.sh index cfbba7618..3d945df42 100644 --- a/benchmarks/single_node/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/gptoss_fp4_h200.sh @@ -48,7 +48,6 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.9 \ --tensor-parallel-size $TP \ --max-num-seqs $CONC \ - --stream-interval 20 \ --disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! From a0afe2f1471806395a1ddc8cec3c0b0969788a76 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 17:01:52 -0600 Subject: [PATCH 13/15] reverting cw changes --- runners/launch_h100-cw.sh | 29 +++++++++++++++++++---------- runners/launch_h200-cw.sh | 25 +++++++++++++------------ 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh index 4c2a318f0..49a42e981 100644 --- a/runners/launch_h100-cw.sh +++ b/runners/launch_h100-cw.sh @@ -2,20 +2,29 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunner/hf-hub-cache" PARTITION="h100" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER}.sqsh" +SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" -salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A | head -n1) +set -x -SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') -set -x -srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" -if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then - echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." - srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" +if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 fi + +# Use flock to serialize concurrent imports to the same squash file +srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi +" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh index 8653c4f2c..657f84792 100644 --- a/runners/launch_h200-cw.sh +++ b/runners/launch_h200-cw.sh @@ -8,12 +8,8 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="h200" -SQUASH_FILE="/mnt/vast/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER}.sqsh" - -SAGEMAKER_SHM_PATH=$(mktemp -d /mnt/vast/shm-XXXXXX) - -salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell -JOB_ID=$(squeue -u $USER -h -o %A | head -n1) +SQUASH_FILE="/mnt/vast/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" set -x @@ -28,12 +24,17 @@ fi if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then CONTAINER_IMAGE=$IMAGE else - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then - echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..." - srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE" - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - fi + # Use flock to serialize concurrent imports to the same squash file + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " CONTAINER_IMAGE=$(realpath $SQUASH_FILE) fi From ebe7cb35ce6b6463c9e07eab063c02df234d8a89 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 19:02:38 -0600 Subject: [PATCH 14/15] trigger test check From 390816f72ba65186e503fe2f76722478e80d5228 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 19:10:28 -0600 Subject: [PATCH 15/15] b200 rm locks revert --- runners/launch_b200-nb.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 8862b0120..c321ee0f9 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -7,9 +7,6 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') UCX_NET_DEVICES=eth0 -# Cleanup any stale enroot locks from previous runs -find /var/cache/enroot-container-images/$UID -type f -name "*.lock" | xargs rm - set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \