From ed66ccfe3c836e4afc2a6653a7254e50963c5b32 Mon Sep 17 00:00:00 2001
From: hyukjlee
Date: Wed, 28 Jan 2026 03:31:40 +0000
Subject: [PATCH 1/4] Llama4-Scout update for AMD GPU

Signed-off-by: hyukjlee
---
 Llama/Llama4-Scout_AMD.md | 85 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 Llama/Llama4-Scout_AMD.md

diff --git a/Llama/Llama4-Scout_AMD.md b/Llama/Llama4-Scout_AMD.md
new file mode 100644
index 00000000..7812b3a3
--- /dev/null
+++ b/Llama/Llama4-Scout_AMD.md
@@ -0,0 +1,85 @@
+# Llama 4 Maverick & Scout on vLLM - AMD Hardware

## Introduction

This quick-start recipe explains how to run the Llama 4 Scout (16 experts) and Llama 4 Maverick (128 experts) models on AMD Instinct MI300X, MI325X, and MI355X GPUs.

## Key benefits of AMD GPUs for large models and developers

AMD Instinct GPU accelerators are purpose-built to handle the demands of next-generation models like Llama 4:
- Massive HBM capacity supports extended context lengths, delivering smooth and efficient performance.
- Optimized Triton and AITER kernels provide best-in-class performance and TCO for production deployments.

## Access & Licensing

### License and Model parameters

To use the Llama 4 Scout and Maverick models, you must first request access to the model repositories on Hugging Face.
- [Llama4 Scout 16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
- [Llama4 Maverick 128E](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct)

## Prerequisites

- OS: Linux
- Drivers: ROCm 7.0 or above
- GPU: AMD Instinct MI300X, MI325X, or MI355X

## Deployment Steps

### 1. Using vLLM docker image (For AMD users)

```bash
alias drun='sudo docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size 32G -v /data:/data -v $HOME:/myhome -w /myhome --entrypoint /bin/bash'
drun vllm/vllm-openai-rocm:v0.14.1

### 2. 
Start vLLM online server (run in background)

```bash
export TP=8
#export MODEL="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL="meta-llama/Llama-4-Scout-17B-16E-Instruct"
export VLLM_ROCM_USE_AITER=1
vllm serve $MODEL \
  --disable-log-requests \
  -tp $TP \
  --max-num-seqs 64 \
  --no-enable-prefix-caching \
  --max-num-batched-tokens=16384 \
  --max-model-len 32000 &
```

### 3. Running inference with a sample request

Let the Llama 4 Scout model describe the following two images.
![first image](./images/rabbit.jpg)
![second image](./images/cat.png)

```bash
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
    "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "prompt": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png Can you describe how these two images are similar, and how they differ?",
    "max_tokens": 256,
    "temperature": 0
  }'
```

### 4. 
Performance benchmark

```bash
#export MODEL="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL="meta-llama/Llama-4-Scout-17B-16E-Instruct"
export ISL=1024
export OSL=1024
export REQ=10
export CONC=10
vllm bench serve \
  --backend vllm \
  --model $MODEL \
  --dataset-name random \
  --random-input-len $ISL \
  --random-output-len $OSL \
  --num-prompts $REQ \
  --ignore-eos \
  --max-concurrency $CONC \
  --percentile-metrics ttft,tpot,itl,e2el
```

From f5e338433541bbee3d5a7aa350b9e624c63fb65a Mon Sep 17 00:00:00 2001
From: Hyukjoon Lee
Date: Wed, 28 Jan 2026 12:43:35 +0900
Subject: [PATCH 2/4] Update Llama4-Scout_AMD.md

Signed-off-by: Hyukjoon Lee
---
 Llama/Llama4-Scout_AMD.md | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/Llama/Llama4-Scout_AMD.md b/Llama/Llama4-Scout_AMD.md
index 7812b3a3..82981ce7 100644
--- a/Llama/Llama4-Scout_AMD.md
+++ b/Llama/Llama4-Scout_AMD.md
@@ -14,9 +14,9 @@ AMD Instinct GPU accelerators are purpose-built to handle the demands of next-ge

### License and Model parameters

-To use the Llama 4 Scout and Maverick models, you must first request access to the model repositories on Hugging Face.
+To use the Llama 4 Scout model, you must first request access to the model repository on Hugging Face.
- [Llama4 Scout 16E](https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct)
-- [Llama4 Maverick 128E](https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct)
+

## Prerequisites

@@ -48,22 +48,7 @@ vllm serve $MODEL \
 --max-model-len 32000
```

-### 3. Running inference with a sample request
-
-Let the Llama 4 Scout model describe the following two images. 
-![first image](./images/rabbit.jpg)
-![second image](./images/cat.png)
-
-```bash
-curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
-    "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "prompt": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png Can you describe how these two images are similar, and how they differ?",
-    "max_tokens": 256,
-    "temperature": 0
-  }'
-```
-
-### 4. Performance benchmark
+### 3. Performance benchmark

From 19e4a8060bb858102b57fe3ad81a9caf6ba5faa Mon Sep 17 00:00:00 2001
From: Hyukjoon Lee
Date: Mon, 9 Feb 2026 16:32:18 +0900
Subject: [PATCH 3/4] Update Llama4-Scout_AMD.md

Signed-off-by: Hyukjoon Lee
---
 Llama/Llama4-Scout_AMD.md | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/Llama/Llama4-Scout_AMD.md b/Llama/Llama4-Scout_AMD.md
index 82981ce7..faf9bd46 100644
--- a/Llama/Llama4-Scout_AMD.md
+++ b/Llama/Llama4-Scout_AMD.md
@@ -29,14 +29,26 @@ To use the Llama 4 Scout model, you must first request access to the model repos

### 1. Using vLLM docker image (For AMD users)

```bash
-alias drun='sudo docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size 32G -v /data:/data -v $HOME:/myhome -w /myhome --entrypoint /bin/bash'
-drun vllm/vllm-openai-rocm:v0.14.1
+docker run -it \
+  --network=host \
+  --device=/dev/kfd \
+  --device=/dev/dri \
+  --group-add=video \
+  --ipc=host \
+  --cap-add=SYS_PTRACE \
+  --security-opt seccomp=unconfined \
+  --shm-size 32G \
+  -v /data:/data \
+  -v $HOME:/myhome \
+  -w /myhome \
+  --entrypoint /bin/bash \
+  vllm/vllm-openai-rocm:latest
+```

### 2. 
Start vLLM online server (run in background)

```bash
export TP=8
-#export MODEL="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL="meta-llama/Llama-4-Scout-17B-16E-Instruct"
export VLLM_ROCM_USE_AITER=1
vllm serve $MODEL \
  --disable-log-requests \
@@ -51,7 +63,6 @@ vllm serve $MODEL \
### 3. Performance benchmark

```bash
-#export MODEL="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL="meta-llama/Llama-4-Scout-17B-16E-Instruct"
export ISL=1024

From cbeb19fe4777a5be1d69a13b8609ae571de87ce9 Mon Sep 17 00:00:00 2001
From: Hyukjoon Lee
Date: Mon, 9 Feb 2026 17:08:45 +0900
Subject: [PATCH 4/4] Update Llama4-Scout_AMD.md

Signed-off-by: Hyukjoon Lee
---
 Llama/Llama4-Scout_AMD.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Llama/Llama4-Scout_AMD.md b/Llama/Llama4-Scout_AMD.md
index faf9bd46..e54a1a5f 100644
--- a/Llama/Llama4-Scout_AMD.md
+++ b/Llama/Llama4-Scout_AMD.md
@@ -44,7 +44,13 @@ docker run -it \
 --entrypoint /bin/bash \
 vllm/vllm-openai-rocm:latest
```
-
+Alternatively, you can use a uv environment.
+ > Note: The vLLM wheel for ROCm requires Python 3.12, ROCm 7.0, and glibc >= 2.35. If your environment does not meet these requirements, please use the Docker-based setup as described in the [documentation](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/#pre-built-images).
+ ```bash
+ uv venv
+ source .venv/bin/activate
+ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/
+ ```

### 2. Start vLLM online server (run in background)

```bash
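# Editor's annotation (a sketch of what the recipe's chosen values do; these
# describe the flags used below, not vLLM's defaults):
# TP=8 shards the model across 8 GPUs via tensor parallelism (passed as -tp).
# VLLM_ROCM_USE_AITER=1 enables AMD's AITER kernels on ROCm for better performance.
# --max-num-seqs 64 caps the sequences scheduled per engine step,
# --max-num-batched-tokens=16384 bounds the tokens batched per step, and
# --max-model-len 32000 sets the longest context the server will accept.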