Error while serving MiniMax M2.5 with vLLM #30

@F-Michelon

Description

Hi, I am trying to serve MiniMax M2.5 with vLLM on 4 H100 nodes (4 GPUs per node). Here is my job script:

#!/bin/bash

#SBATCH --job-name=VLLM-Ray
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=96
#SBATCH --hint=nomultithread
#SBATCH --time=00:15:00
#SBATCH --output=ray_vllm_%j.out
#SBATCH --error=ray_vllm_%j.err
#SBATCH --exclusive

############################################
# Global configuration
############################################

export CONTAINER="vllm_ray.sif"
export MODEL_NAME="/models/MiniMax-M2.5"

export TENSOR_PARALLEL_SIZE=4
export PIPELINE_PARALLEL_SIZE=4
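# Note: TENSOR_PARALLEL_SIZE x PIPELINE_PARALLEL_SIZE = 16, i.e. the full 4 nodes x 4 GPUs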
export RAY_PORT=6379
export VLLM_PORT=45678
export OMP_NUM_THREADS=24
export VLLM_USE_V1=0

DELAY=15
MAX_ATTEMPTS=120

############################################
# Node partitioning
############################################

NODELIST=($(scontrol show hostnames "$SLURM_NODELIST"))

VLLM_NODES=("${NODELIST[@]:0:4}")
VLLM_HEAD="${VLLM_NODES[0]}"
export RAY_ADDRESS="${VLLM_HEAD}:${RAY_PORT}"

# Resolve IPs of nodes
VLLM_IPS=()
for node in "${VLLM_NODES[@]}"; do
    ip=$(getent hosts "$node" | awk '{print $1}')
    VLLM_IPS+=("$ip")
done
export OPENAI_BASE_URL="http://${VLLM_IPS[0]}:${VLLM_PORT}"

# Display the resolved configuration
echo "vLLM nodes: ${VLLM_NODES[*]}"
echo "Ray address: ${RAY_ADDRESS}"
echo "OPENAI_BASE_URL: ${OPENAI_BASE_URL}"

############################################
# vLLM + Ray launcher
############################################

start_vllm_node() {
    local node_rank=$1

    # Get the IP of the current node
    local NODE_IP=$(hostname -I | awk '{print $1}')
    export VLLM_HOST_IP=${NODE_IP}

    echo "Node ${node_rank}: IP=${NODE_IP}, VLLM_HOST_IP=${VLLM_HOST_IP}"

    if [ "${node_rank}" -eq 0 ]; then
        # Master node: start the Ray head
        echo "Starting Ray head on ${NODE_IP}:${RAY_PORT}"

        singularity exec --nv \
          --bind ${MODEL_NAME}:${MODEL_NAME} \
          --env XDG_CACHE_HOME=cache \
          --env VLLM_HOST_IP="${NODE_IP}" \
          --env VLLM_USE_V1="${VLLM_USE_V1}" \
          "${CONTAINER}" \
          ray start --head \
            --node-ip-address="${NODE_IP}" \
            --port="${RAY_PORT}" \
            --num-gpus=4 \
            --block &

        echo "Waiting for all 16 GPUs to join Ray..."
        while true; do
            gpu_count=$(singularity exec "${CONTAINER}" python3 -c "import ray; ray.init(address='auto'); print(int(ray.cluster_resources().get('GPU', 0)))")
            if [ "$gpu_count" -ge 16 ]; then
                echo "All GPUs detected!"
                break
            fi
            echo "Current GPUs in Ray: $gpu_count/16..."
            sleep 5
        done

        echo "Ray head initialized"

        singularity exec --nv \
          --bind ${MODEL_NAME}:${MODEL_NAME} \
          --env XDG_CACHE_HOME=cache \
          --bind .cache:/.cache \
          --env VLLM_HOST_IP="${NODE_IP}" \
          --env VLLM_USE_V1="${VLLM_USE_V1}" \
          "${CONTAINER}" \
          vllm serve "${MODEL_NAME}" \
            --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" \
            --pipeline-parallel-size "${PIPELINE_PARALLEL_SIZE}" \
            --distributed-executor-backend ray \
            --enable-auto-tool-choice \
            --tool-call-parser minimax_m2 \
            --reasoning-parser minimax_m2_append_think \
            --gpu-memory-utilization 0.9 \
            --max-num-seqs 128 \
            --max-num-batched-tokens 65536 \
            --trust-remote-code \
            --max-model-len 8192 \
            --compilation-config '{"cudagraph_mode": "PIECEWISE"}' \
            --host 0.0.0.0 \
            --port "${VLLM_PORT}"

    else
        # Worker nodes: connect to the Ray head
        echo "Starting Ray worker on ${NODE_IP}, connecting to ${RAY_ADDRESS}"

        singularity exec --nv \
          --bind ${MODEL_NAME}:${MODEL_NAME} \
          --env XDG_CACHE_HOME=cache \
          --env VLLM_HOST_IP="${NODE_IP}" \
          --env VLLM_USE_V1="${VLLM_USE_V1}" \
          "${CONTAINER}" \
          ray start \
            --address="${RAY_ADDRESS}" \
            --node-ip-address="${NODE_IP}" \
            --num-gpus=4 \
            --block

        echo "Ray worker on ${NODE_IP} started and connected to head"
    fi
}

export -f start_vllm_node

############################################
# Start vLLM on 4 nodes
############################################

srun -N4 \
     -w "$(IFS=,; echo "${VLLM_NODES[*]}")" \
     --ntasks=4 \
     --ntasks-per-node=1 \
     bash -c 'start_vllm_node ${SLURM_PROCID}' &

############################################
# Wait for vLLM readiness
############################################

ATTEMPT=0
until curl -s "${OPENAI_BASE_URL}/v1/models" | grep -q "id"; do
    ATTEMPT=$((ATTEMPT + 1))
    echo "Waiting for vLLM... (${ATTEMPT}/${MAX_ATTEMPTS})"
    if [ "$ATTEMPT" -ge "$MAX_ATTEMPTS" ]; then
        echo "ERROR: vLLM did not start"
        exit 1
    fi
    sleep "${DELAY}"
done

echo "vLLM is ready at ${OPENAI_BASE_URL}"

wait
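
For context, once the readiness check passes I query the server through its OpenAI-compatible API; a minimal smoke test looks like this (a sketch; the served model name defaults to the model path, i.e. the value of MODEL_NAME):

curl -s "${OPENAI_BASE_URL}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "/models/MiniMax-M2.5", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}'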

But I get these errors:

vLLM nodes: jzxh200 jzxh201 jzxh202 jzxh203
Ray address: jzxh200:6379
OPENAI_BASE_URL: http://172.20.4.200:45678
Waiting for vLLM... (1/120)
Node 0: IP=172.20.4.200, VLLM_HOST_IP=172.20.4.200
Starting Ray head on 172.20.4.200:6379
Waiting for all 16 GPUs to join Ray...
Node 2: IP=172.20.4.202, VLLM_HOST_IP=172.20.4.202
Starting Ray worker on 172.20.4.202, connecting to jzxh200:6379
Node 3: IP=172.20.4.203, VLLM_HOST_IP=172.20.4.203
Starting Ray worker on 172.20.4.203, connecting to jzxh200:6379
Node 1: IP=172.20.4.201, VLLM_HOST_IP=172.20.4.201
Starting Ray worker on 172.20.4.201, connecting to jzxh200:6379
Current GPUs in Ray: /16...
2026-04-01 11:21:29,971 INFO scripts.py:1124 -- Local node IP: 172.20.4.203
2026-04-01 11:21:35,690 SUCC scripts.py:1140 -- --------------------
2026-04-01 11:21:35,690 SUCC scripts.py:1141 -- Ray runtime started.
2026-04-01 11:21:35,691 SUCC scripts.py:1142 -- --------------------
2026-04-01 11:21:35,691 INFO scripts.py:1144 -- To terminate the Ray runtime, run
2026-04-01 11:21:35,691 INFO scripts.py:1145 --   ray stop
2026-04-01 11:21:35,691 INFO scripts.py:1155 -- --block
2026-04-01 11:21:35,691 INFO scripts.py:1156 -- This command will now block forever until terminated by a signal.
2026-04-01 11:21:35,691 INFO scripts.py:1159 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported.
2026-04-01 11:21:35,691 INFO scripts.py:1164 -- Process exit logs will be saved to: /tmp/ray/session_2026-04-01_11-21-29_754115_76083/logs/ray_process_exit.log
2026-04-01 11:21:29,959 INFO scripts.py:1124 -- Local node IP: 172.20.4.201
2026-04-01 11:21:35,695 SUCC scripts.py:1140 -- --------------------
2026-04-01 11:21:35,696 SUCC scripts.py:1141 -- Ray runtime started.
2026-04-01 11:21:35,696 SUCC scripts.py:1142 -- --------------------
2026-04-01 11:21:35,696 INFO scripts.py:1144 -- To terminate the Ray runtime, run
2026-04-01 11:21:35,696 INFO scripts.py:1145 --   ray stop
2026-04-01 11:21:35,696 INFO scripts.py:1155 -- --block
2026-04-01 11:21:35,696 INFO scripts.py:1156 -- This command will now block forever until terminated by a signal.
2026-04-01 11:21:35,696 INFO scripts.py:1159 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported.
2026-04-01 11:21:35,696 INFO scripts.py:1164 -- Process exit logs will be saved to: /tmp/ray/session_2026-04-01_11-21-29_754115_76083/logs/ray_process_exit.log
2026-04-01 11:21:29,730 INFO scripts.py:1124 -- Local node IP: 172.20.4.202
2026-04-01 11:21:35,701 SUCC scripts.py:1140 -- --------------------
2026-04-01 11:21:35,701 SUCC scripts.py:1141 -- Ray runtime started.
2026-04-01 11:21:35,701 SUCC scripts.py:1142 -- --------------------
2026-04-01 11:21:35,701 INFO scripts.py:1144 -- To terminate the Ray runtime, run
2026-04-01 11:21:35,701 INFO scripts.py:1145 --   ray stop
2026-04-01 11:21:35,702 INFO scripts.py:1155 -- --block
2026-04-01 11:21:35,702 INFO scripts.py:1156 -- This command will now block forever until terminated by a signal.
2026-04-01 11:21:35,702 INFO scripts.py:1159 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported.
2026-04-01 11:21:35,702 INFO scripts.py:1164 -- Process exit logs will be saved to: /tmp/ray/session_2026-04-01_11-21-29_754115_76083/logs/ray_process_exit.log
2026-04-01 11:21:29,737	INFO usage_lib.py:473 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
2026-04-01 11:21:29,753 INFO scripts.py:936 -- Local node IP: 172.20.4.200
2026-04-01 11:21:35,778 SUCC scripts.py:975 -- --------------------
2026-04-01 11:21:35,779 SUCC scripts.py:976 -- Ray runtime started.
2026-04-01 11:21:35,779 SUCC scripts.py:977 -- --------------------
2026-04-01 11:21:35,779 INFO scripts.py:979 -- Next steps
2026-04-01 11:21:35,779 INFO scripts.py:982 -- To add another node to this Ray cluster, run
2026-04-01 11:21:35,779 INFO scripts.py:985 --   ray start --address='172.20.4.200:6379'
2026-04-01 11:21:35,779 INFO scripts.py:996 -- To connect to this Ray cluster:
2026-04-01 11:21:35,779 INFO scripts.py:998 -- import ray
2026-04-01 11:21:35,779 INFO scripts.py:999 -- ray.init(_node_ip_address='172.20.4.200')
2026-04-01 11:21:35,779 INFO scripts.py:1013 -- To submit a Ray job using the Ray Jobs CLI:
2026-04-01 11:21:35,779 INFO scripts.py:1014 --   RAY_API_SERVER_ADDRESS='http://127.0.0.1:8265' ray job submit --working-dir . -- python my_script.py
2026-04-01 11:21:35,779 INFO scripts.py:1023 -- See https://docs.ray.io/en/latest/cluster/running-applications/job-submission/index.html
2026-04-01 11:21:35,779 INFO scripts.py:1027 -- for more information on submitting Ray jobs to the Ray cluster.
2026-04-01 11:21:35,779 INFO scripts.py:1032 -- To terminate the Ray runtime, run
2026-04-01 11:21:35,779 INFO scripts.py:1033 --   ray stop
2026-04-01 11:21:35,779 INFO scripts.py:1036 -- To view the status of the cluster, use
2026-04-01 11:21:35,779 INFO scripts.py:1037 --   ray status
2026-04-01 11:21:35,779 INFO scripts.py:1041 -- To monitor and debug Ray, view the dashboard at
2026-04-01 11:21:35,779 INFO scripts.py:1042 --   127.0.0.1:8265
2026-04-01 11:21:35,779 INFO scripts.py:1049 -- If connection to the dashboard fails, check your firewall settings and network configuration.
2026-04-01 11:21:35,779 INFO scripts.py:1155 -- --block
2026-04-01 11:21:35,779 INFO scripts.py:1156 -- This command will now block forever until terminated by a signal.
2026-04-01 11:21:35,779 INFO scripts.py:1159 -- Running subprocesses are monitored and a message will be printed if any of them terminate unexpectedly. Subprocesses exit with SIGTERM will be treated as graceful, thus NOT reported.
2026-04-01 11:21:35,779 INFO scripts.py:1164 -- Process exit logs will be saved to: /tmp/ray/session_2026-04-01_11-21-29_754115_76083/logs/ray_process_exit.log
Waiting for vLLM... (2/120)
All GPUs detected!
Ray head initialized
Waiting for vLLM... (3/120)
ERROR 04-01 11:21:57 [config.py:29] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.matmul_ogs'
ERROR 04-01 11:21:57 [gpt_oss_triton_kernels_moe.py:61] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.swiglu'
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:297] 
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:297]        █     █     █▄   ▄█
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:297]  ▄▄ ▄█ █     █     █ ▀▄▀ █  version 0.18.0
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:297]   █▄█▀ █     █     █     █  model   /lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:297]    ▀▀  ▀▀▀▀▀ ▀▀▀▀▀ ▀     ▀
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:297] 
(APIServer pid=88185) INFO 04-01 11:21:59 [utils.py:233] non-default args: {'model_tag': '/lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5', 'enable_auto_tool_choice': True, 'tool_call_parser': 'minimax_m2', 'host': '0.0.0.0', 'port': 45678, 'model': '/lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5', 'trust_remote_code': True, 'max_model_len': 8192, 'reasoning_parser': 'minimax_m2_append_think', 'distributed_executor_backend': 'ray', 'pipeline_parallel_size': 4, 'tensor_parallel_size': 4, 'max_num_batched_tokens': 65536, 'max_num_seqs': 128, 'compilation_config': {'mode': None, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': [], 'splitting_ops': None, 'compile_mm_encoder': False, 'compile_sizes': None, 'compile_ranges_endpoints': None, 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.PIECEWISE: 1>, 'cudagraph_num_of_warmups': 0, 'cudagraph_capture_sizes': None, 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': None, 'pass_config': {}, 'max_cudagraph_capture_size': None, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': None, 'static_all_moe_layers': []}}
(APIServer pid=88185) WARNING 04-01 11:21:59 [envs.py:1717] Unknown vLLM environment variable detected: VLLM_USE_V1
(APIServer pid=88185) INFO 04-01 11:22:00 [model.py:533] Resolved architecture: MiniMaxM2ForCausalLM
(APIServer pid=88185) INFO 04-01 11:22:01 [model.py:1917] Downcasting torch.float32 to torch.bfloat16.
(APIServer pid=88185) INFO 04-01 11:22:01 [model.py:1582] Using max model len 8192
(APIServer pid=88185) INFO 04-01 11:22:01 [scheduler.py:231] Chunked prefill is enabled with max_num_batched_tokens=65536.
(APIServer pid=88185) WARNING 04-01 11:22:01 [vllm.py:743] Async scheduling will be disabled because it is not supported with the `ray` distributed executor backend (only `mp`, `uni`, and `external_launcher` are supported).
(APIServer pid=88185) INFO 04-01 11:22:01 [vllm.py:754] Asynchronous scheduling is disabled.
(APIServer pid=88185) INFO 04-01 11:22:01 [compilation.py:289] Enabled custom fusions: norm_quant, act_quant
Waiting for vLLM... (4/120)
ERROR 04-01 11:22:12 [config.py:29] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.matmul_ogs'
ERROR 04-01 11:22:12 [gpt_oss_triton_kernels_moe.py:61] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.swiglu'
(EngineCore pid=89247) INFO 04-01 11:22:14 [core.py:103] Initializing a V1 LLM engine (v0.18.0) with config: model='/lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5', speculative_config=None, tokenizer='/lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=4, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='minimax_m2_append_think', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['+quant_fp8', 'none', '+quant_fp8'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_endpoints': [65536], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.PIECEWISE: 1>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': True, 'fuse_act_quant': True, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 256, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore pid=89247) WARNING 04-01 11:22:14 [ray_utils.py:376] Tensor parallel size (16) exceeds available GPUs (4). This may result in Ray placement group allocation failures. Consider reducing tensor_parallel_size to 4 or less, or ensure your Ray cluster has 16 GPUs available.
(EngineCore pid=89247) INFO 04-01 11:22:14 [ray_utils.py:441] No current placement group found. Creating a new placement group.
Waiting for vLLM... (5/120)
(EngineCore pid=89247) INFO 04-01 11:22:30 [ray_env.py:100] Env var prefixes to copy: ['HF_', 'HUGGING_FACE_', 'LMCACHE_', 'NCCL_', 'UCX_', 'VLLM_']
(EngineCore pid=89247) INFO 04-01 11:22:30 [ray_env.py:101] Copying the following environment variables to workers: ['LD_LIBRARY_PATH', 'VLLM_ENABLE_CUDA_COMPATIBILITY', 'VLLM_PORT', 'VLLM_USAGE_SOURCE', 'VLLM_USE_V1', 'VLLM_WORKER_MULTIPROC_METHOD']
(EngineCore pid=89247) INFO 04-01 11:22:30 [ray_env.py:111] To exclude env vars from copying, add them to /linkhome/rech/gennlj01/uls42ep/.config/vllm/ray_non_carry_over_env_vars.json
(EngineCore pid=89247) INFO 04-01 11:22:30 [network_utils.py:205] Port 45678 is already in use, trying port 45679
Waiting for vLLM... (6/120)
Waiting for vLLM... (7/120)
Waiting for vLLM... (8/120)
Waiting for vLLM... (9/120)
Waiting for vLLM... (10/120)
Waiting for vLLM... (11/120)
Waiting for vLLM... (12/120)
Waiting for vLLM... (13/120)
Waiting for vLLM... (14/120)
Waiting for vLLM... (15/120)
Waiting for vLLM... (16/120)
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) WARNING 04-01 11:22:30 [system_utils.py:38] Overwriting environment variable LD_LIBRARY_PATH from '/usr/local/lib/python3.12/dist-packages/cv2/../../lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64:/.singularity.d/libs' to '/usr/local/lib/python3.12/dist-packages/cv2/../../lib64:/usr/local/lib/python3.12/dist-packages/cv2/../../lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64:/.singularity.d/libs'
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) ERROR 04-01 11:22:30 [config.py:29] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.matmul_ogs'
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) ERROR 04-01 11:22:32 [gpt_oss_triton_kernels_moe.py:61] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.swiglu'
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) WARNING 04-01 11:22:33 [worker_base.py:287] Missing `shared_worker_lock` argument from executor. This argument is needed for mm_processor_cache_type='shm'.
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:22:38 [parallel_state.py:1395] world_size=16 rank=12 local_rank=0 distributed_init_method=tcp://172.20.4.200:45679 backend=nccl
(EngineCore pid=89247) (RayWorkerWrapper pid=165188, ip=172.20.4.201) WARNING 04-01 11:22:30 [system_utils.py:38] Overwriting environment variable LD_LIBRARY_PATH from '/usr/local/lib/python3.12/dist-packages/cv2/../../lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64:/.singularity.d/libs' to '/usr/local/lib/python3.12/dist-packages/cv2/../../lib64:/usr/local/lib/python3.12/dist-packages/cv2/../../lib64:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64:/.singularity.d/libs' [repeated 15x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(EngineCore pid=89247) (RayWorkerWrapper pid=89564) ERROR 04-01 11:22:30 [config.py:29] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.matmul_ogs' [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89564) ERROR 04-01 11:22:33 [gpt_oss_triton_kernels_moe.py:61] Failed to import Triton kernels. Please make sure your triton version is compatible. Error: No module named 'triton_kernels.swiglu' [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89564) WARNING 04-01 11:22:34 [worker_base.py:287] Missing `shared_worker_lock` argument from executor. This argument is needed for mm_processor_cache_type='shm'. [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=165186, ip=172.20.4.201) INFO 04-01 11:22:40 [pynccl.py:111] vLLM is using nccl==2.27.5
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) WARNING 04-01 11:22:43 [symm_mem.py:107] SymmMemCommunicator: symmetric memory multicast operations are not supported.
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:22:44 [parallel_state.py:1717] rank 12 in world size 16 is assigned as DP rank 0, PP rank 3, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A
(EngineCore pid=89247) (RayWorkerWrapper pid=165188, ip=172.20.4.201) INFO 04-01 11:22:39 [parallel_state.py:1395] world_size=16 rank=6 local_rank=2 distributed_init_method=tcp://172.20.4.200:45679 backend=nccl [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:22:45 [gpu_model_runner.py:4481] Starting to load model /lustre/fsn1/projects/rech/ari/uls42ep/models/MiniMax-M2.5...
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:22:40 [pynccl.py:111] vLLM is using nccl==2.27.5 [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75388, ip=172.20.4.202) INFO 04-01 11:22:45 [utils.py:129] Hidden layers were unevenly partitioned: [15,16,16,15]. This can be manually overridden using the VLLM_PP_LAYER_PARTITION environment variable
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:22:45 [deep_gemm.py:100] DeepGEMM E8M0 enabled on current platform.
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:22:45 [cuda.py:317] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:22:45 [flash_attn.py:598] Using FlashAttention version 3
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:22:45 [fp8.py:396] Using TRITON Fp8 MoE backend out of potential backends: ['TRITON', 'AITER', 'FLASHINFER_TRTLLM', 'FLASHINFER_CUTLASS', 'DEEPGEMM', 'MARLIN', 'BATCHED_DEEPGEMM', 'BATCHED_TRITON', 'XPU'].
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:24:04 [default_loader.py:384] Loading weights took 78.19 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) WARNING 04-01 11:22:43 [symm_mem.py:107] SymmMemCommunicator: symmetric memory multicast operations are not supported. [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89564) INFO 04-01 11:22:44 [parallel_state.py:1717] rank 3 in world size 16 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 3, EP rank 3, EPLB rank N/A [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:22:45 [utils.py:129] Hidden layers were unevenly partitioned: [15,16,16,15]. This can be manually overridden using the VLLM_PP_LAYER_PARTITION environment variable [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:22:45 [deep_gemm.py:100] DeepGEMM E8M0 enabled on current platform. [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:22:46 [cuda.py:317] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:22:46 [flash_attn.py:598] Using FlashAttention version 3 [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:22:46 [fp8.py:396] Using TRITON Fp8 MoE backend out of potential backends: ['TRITON', 'AITER', 'FLASHINFER_TRTLLM', 'FLASHINFER_CUTLASS', 'DEEPGEMM', 'MARLIN', 'BATCHED_DEEPGEMM', 'BATCHED_TRITON', 'XPU']. [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:24:08 [fp8.py:545] Using MoEPrepareAndFinalizeNoDPEPModular
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:24:08 [gpu_model_runner.py:4566] Model loading took 13.2 GiB memory and 82.628035 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:09 [default_loader.py:384] Loading weights took 142.85 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:14 [fp8.py:545] Using MoEPrepareAndFinalizeNoDPEPModular
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:14 [gpu_model_runner.py:4566] Model loading took 13.2 GiB memory and 148.493862 seconds
Waiting for vLLM... (17/120)
(EngineCore pid=89247) (RayWorkerWrapper pid=165186, ip=172.20.4.201) INFO 04-01 11:25:18 [default_loader.py:384] Loading weights took 151.79 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=165186, ip=172.20.4.201) INFO 04-01 11:25:23 [fp8.py:545] Using MoEPrepareAndFinalizeNoDPEPModular
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:20 [default_loader.py:384] Loading weights took 154.14 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=165186, ip=172.20.4.201) INFO 04-01 11:25:24 [gpu_model_runner.py:4566] Model loading took 13.77 GiB memory and 157.565025 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:28 [backends.py:988] Using cache directory: cache/vllm/torch_compile_cache/eae65b7bee/rank_12_0/backbone for vLLM's torch.compile
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:28 [backends.py:1048] Dynamo bytecode transform time: 0.78 s
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:26 [fp8.py:545] Using MoEPrepareAndFinalizeNoDPEPModular
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:30 [backends.py:284] Directly load the compiled graph(s) for compile range (1, 65536) from the cache, took 1.559 s
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:30 [monitor.py:48] torch.compile took 3.10 s in total
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:30 [decorators.py:296] Directly load AOT compilation from path cache/vllm/torch_compile_cache/torch_aot_compile/4f02743c4cc9556a834904da0e70ba080fbfab610b72876060fbe4b478e821e7/rank_12_0/model
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:27 [gpu_model_runner.py:4566] Model loading took 13.77 GiB memory and 160.627155 seconds
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:25:31 [fused_moe.py:1080] Using configuration from /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=256,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for MoE layer.
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:32 [monitor.py:76] Initial profiling/warmup run took 1.88 s
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) INFO 04-01 11:25:33 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=256
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) INFO 04-01 11:25:33 [gpu_model_runner.py:5607] Profiling CUDA graph memory: PIECEWISE=35 (largest=256)
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:28 [backends.py:988] Using cache directory: cache/vllm/torch_compile_cache/9df9e558f0/rank_8_0/backbone for vLLM's torch.compile [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=165186, ip=172.20.4.201) INFO 04-01 11:25:28 [backends.py:1048] Dynamo bytecode transform time: 0.88 s [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) INFO 04-01 11:25:34 [custom_all_reduce.py:216] Registering 96 cuda graph addresses
(EngineCore pid=89247) (RayWorkerWrapper pid=165188, ip=172.20.4.201) INFO 04-01 11:25:34 [gpu_model_runner.py:5686] Estimated CUDA graph memory: 0.19 GiB total
(EngineCore pid=89247) (RayWorkerWrapper pid=165188, ip=172.20.4.201) INFO 04-01 11:25:35 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9000 to 0.9024 to maintain the same effective KV cache size.
(EngineCore pid=89247) (RayWorkerWrapper pid=165186, ip=172.20.4.201) INFO 04-01 11:25:35 [gpu_worker.py:456] Available KV cache memory: 46.97 GiB
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:31 [backends.py:284] Directly load the compiled graph(s) for compile range (1, 65536) from the cache, took 1.842 s [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:31 [monitor.py:48] torch.compile took 3.64 s in total [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89562) INFO 04-01 11:25:31 [decorators.py:296] Directly load AOT compilation from path cache/vllm/torch_compile_cache/torch_aot_compile/b4d45bca7d85fd5aa17ccc983abfee33717ec75f8ae5315d835b07a6669a3c81/rank_2_0/model [repeated 15x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=89561) INFO 04-01 11:25:33 [monitor.py:76] Initial profiling/warmup run took 2.09 s [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:38 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=256 [repeated 12x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:38 [gpu_model_runner.py:5607] Profiling CUDA graph memory: PIECEWISE=35 (largest=256) [repeated 12x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:39 [custom_all_reduce.py:216] Registering 90 cuda graph addresses [repeated 12x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:40 [gpu_model_runner.py:5686] Estimated CUDA graph memory: 0.12 GiB total [repeated 12x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=75390, ip=172.20.4.202) INFO 04-01 11:25:36 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9000 to 0.9024 to maintain the same effective KV cache size. [repeated 11x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164889, ip=172.20.4.203) INFO 04-01 11:25:40 [gpu_worker.py:456] Available KV cache memory: 47.07 GiB [repeated 3x across cluster]
(EngineCore pid=89247) INFO 04-01 11:25:40 [kv_cache_utils.py:1316] GPU KV cache size: 3,078,384 tokens
(EngineCore pid=89247) INFO 04-01 11:25:40 [kv_cache_utils.py:1321] Maximum concurrency for 8,192 tokens per request: 375.78x
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099] EngineCore failed to start.
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099] Traceback (most recent call last):
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1073, in run_engine_core
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return func(*args, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 839, in __init__
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     super().__init__(
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 122, in __init__
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     kv_cache_config = self._initialize_kv_caches(vllm_config)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return func(*args, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 278, in _initialize_kv_caches
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     self.model_executor.initialize_from_config(kv_cache_configs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 117, in initialize_from_config
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     self.collective_rpc("initialize_from_config", args=(kv_cache_configs,))
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_executor.py", line 515, in collective_rpc
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return ray.get(ray_worker_outputs, timeout=timeout)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return fn(*args, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return func(*args, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2981, in get
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     values, debugger_breakpoint = worker.get_objects(
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]                                   ^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1012, in get_objects
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     raise value.as_instanceof_cause()
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099] ray.exceptions.RayTaskError(KeyError): ray::RayWorkerWrapper.execute_method() (pid=165186, ip=172.20.4.201, actor_id=db1ac0b96fcffa0902ee4bcc02000000, repr=<vllm.v1.executor.ray_utils.RayWorkerWrapper object at 0x14b43c5bae40>)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_utils.py", line 75, in execute_method
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     raise e
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_utils.py", line 65, in execute_method
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return run_method(self, method, args, kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 459, in run_method
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return func(*args, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     self.worker.initialize_from_config(kv_cache_config)  # type: ignore
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     return func(*args, **kwargs)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 556, in initialize_from_config
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     self.model_runner.initialize_kv_cache(kv_cache_config)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 6481, in initialize_kv_cache
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     self.initialize_attn_backend(kv_cache_config)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5904, in initialize_attn_backend
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     attn_backends = get_attn_backends_for_group(kv_cache_group_spec)
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5863, in get_attn_backends_for_group
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]     attn_backend = layers[layer_name].get_attn_backend()
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099]                    ~~~~~~^^^^^^^^^^^^
(EngineCore pid=89247) ERROR 04-01 11:25:40 [core.py:1099] KeyError: 'model.layers.47.self_attn.attn'
(EngineCore pid=89247) INFO 04-01 11:25:40 [ray_executor.py:119] Shutting down Ray distributed executor. If you see error log from logging.cc regarding SIGTERM received, please ignore because this is the expected termination process in Ray.
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74] Error executing method 'initialize_from_config'. This might cause deadlock in distributed execution.
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74] Traceback (most recent call last):
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_utils.py", line 65, in execute_method
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     return run_method(self, method, args, kwargs)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 459, in run_method
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     return func(*args, **kwargs)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/ray/util/tracing/tracing_helper.py", line 461, in _resume_span
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     return method(self, *_args, **_kwargs)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 306, in initialize_from_config
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     self.worker.initialize_from_config(kv_cache_config)  # type: ignore
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     return func(*args, **kwargs)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 556, in initialize_from_config
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     self.model_runner.initialize_kv_cache(kv_cache_config)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 6481, in initialize_kv_cache
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     self.initialize_attn_backend(kv_cache_config)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5904, in initialize_attn_backend
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     attn_backends = get_attn_backends_for_group(kv_cache_group_spec)
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5863, in get_attn_backends_for_group
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]     attn_backend = layers[layer_name].get_attn_backend()
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74]                    ~~~~~~^^^^^^^^^^^^
(EngineCore pid=89247) (RayWorkerWrapper pid=165187, ip=172.20.4.201) ERROR 04-01 11:25:40 [ray_utils.py:74] KeyError: 'model.layers.47.self_attn.attn'
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) INFO 04-01 11:25:38 [kv_cache_utils.py:826] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=256 [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) INFO 04-01 11:25:38 [gpu_model_runner.py:5607] Profiling CUDA graph memory: PIECEWISE=35 (largest=256) [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) INFO 04-01 11:25:39 [custom_all_reduce.py:216] Registering 90 cuda graph addresses [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) INFO 04-01 11:25:40 [gpu_model_runner.py:5686] Estimated CUDA graph memory: 0.12 GiB total [repeated 3x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) INFO 04-01 11:25:40 [gpu_worker.py:490] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9000 to 0.9016 to maintain the same effective KV cache size. [repeated 4x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74] Error executing method 'initialize_from_config'. This might cause deadlock in distributed execution. [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74] Traceback (most recent call last): [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_utils.py", line 65, in execute_method [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     return run_method(self, method, args, kwargs) [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 459, in run_method [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     return func(*args, **kwargs) [repeated 14x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^ [repeated 14x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/ray/util/tracing/tracing_helper.py", line 461, in _resume_span [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     return method(self, *_args, **_kwargs) [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 556, in initialize_from_config [repeated 14x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     self.worker.initialize_from_config(kv_cache_config)  # type: ignore [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     self.model_runner.initialize_kv_cache(kv_cache_config) [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 6481, in initialize_kv_cache [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     self.initialize_attn_backend(kv_cache_config) [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5904, in initialize_attn_backend [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     attn_backends = get_attn_backends_for_group(kv_cache_group_spec) [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 5863, in get_attn_backends_for_group [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]     attn_backend = layers[layer_name].get_attn_backend() [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74]                    ~~~~~~^^^^^^^^^^^^ [repeated 7x across cluster]
(EngineCore pid=89247) (RayWorkerWrapper pid=164891, ip=172.20.4.203) ERROR 04-01 11:25:40 [ray_utils.py:74] KeyError: 'model.layers.15.self_attn.attn' [repeated 7x across cluster]
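
So the Ray cluster itself comes up fine (all 16 GPUs join and the model loads on every rank), but KV cache initialization then fails on the pipeline-parallel workers with KeyError: 'model.layers.<N>.self_attn.attn'. For reference, the cluster state can be inspected from the head node like this (a sketch using the same container):

singularity exec vllm_ray.sif ray status
singularity exec vllm_ray.sif python3 -c "import ray; ray.init(address='auto'); print(ray.cluster_resources())"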

My Singularity definition file is this:

Bootstrap: docker
From: vllm/vllm-openai:latest

%post
    # 1) Install Ray for multi-node serving
    pip install --no-cache-dir "ray[default]"

    # 2) (Optional but recommended) install extra dependencies vLLM may need
    pip install --no-cache-dir \
        aiohttp \
        uvloop \
        "triton-kernels>=2.0.0"  # quoted so the shell does not treat >= as a redirection

    # 3) CUDA-aware tools
    # (optional) NVIDIA tools useful for serving & NCCL / Ray
    pip install --no-cache-dir \
        psutil \
        setproctitle

    # 4) Ensure vLLM latest
    pip install --upgrade --no-cache-dir vllm

%labels
    Author YourName
    Version vLLM-MultiNode

%runscript
    # Default entrypoint
    exec /bin/bash "$@"
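
For completeness, the image is built with singularity build (assuming the definition above is saved as vllm_ray.def):

singularity build vllm_ray.sif vllm_ray.def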

And the vLLM version is 0.18.0.
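
One way to double-check the version from inside the container (the startup banner in the logs above also reports 0.18.0):

singularity exec vllm_ray.sif python3 -c "import vllm; print(vllm.__version__)"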

Could anyone help me with that?
