From 114e222811ea5090d56bc88a77112bede19d5d55 Mon Sep 17 00:00:00 2001
From: Matthew Leon
Date: Thu, 4 Sep 2025 15:22:15 -0700
Subject: [PATCH] Add build, test, and profile scripts

Signed-off-by: Matthew Leon
---
 build.sh         | 42 ++++++++++++++++++++++++++++++++++++++++++
 profile.sh       | 31 +++++++++++++++++++++++++++++++
 run_inference.py | 25 +++++++++++++++++++++++++
 run_test.sh      | 10 ++++++++++
 4 files changed, 108 insertions(+)
 create mode 100755 build.sh
 create mode 100755 profile.sh
 create mode 100644 run_inference.py
 create mode 100755 run_test.sh

diff --git a/build.sh b/build.sh
new file mode 100755
index 000000000000..b1e5ae18f722
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -euo pipefail
+
+# Detect if already inside a virtual environment
+if [ -n "${VIRTUAL_ENV:-}" ]; then
+    echo "Already inside a virtual environment: $VIRTUAL_ENV"
+    venv_created=false
+else
+    venv_created=false
+    if [ ! -d ".venv" ]; then
+        echo "Creating virtual environment..."
+        uv venv --python 3.12 --seed
+        venv_created=true
+    else
+        echo "Virtual environment .venv already exists, skipping creation."
+    fi
+
+    echo "Activating .venv..."
+    source .venv/bin/activate
+fi
+
+# Install build requirements if we just created the venv
+if [ "$venv_created" = true ]; then
+    echo "Installing build requirements..."
+    uv pip install -r requirements/build.txt -v
+else
+    echo "Skipping build requirements installation (venv already in use)."
+fi
+
+# Use all available CPUs for parallel jobs
+if command -v nproc &>/dev/null; then
+    MAX_JOBS=$(nproc)
+elif [[ "$(uname)" == "Darwin" ]]; then
+    MAX_JOBS=$(sysctl -n hw.ncpu)
+else
+    MAX_JOBS=4 # fallback
+fi
+export MAX_JOBS
+
+echo "Using $MAX_JOBS parallel jobs."
+
+CCACHE_NOHASHDIR="true" pip install -v --no-build-isolation -e .
diff --git a/profile.sh b/profile.sh
new file mode 100755
index 000000000000..350e6e38d60b
--- /dev/null
+++ b/profile.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+set -euo pipefail
+
+OUT=/tmp/vllm_profile
+
+# Collect report
+ncu --section LaunchStats \
+    --target-processes all \
+    --profile-from-start on \
+    -o "$OUT" -f \
+    python run_inference.py
+
+echo "Nsight Compute report is at: ${OUT}.ncu-rep"
+
+echo
+echo "Top kernels (mangled):"
+ncu --import "${OUT}.ncu-rep" \
+    --csv \
+    --print-kernel-base mangled \
+| tail -n +2 | cut -d',' -f5 | tr -d '"' | sort -u
+
+echo
+echo "Top kernels (demangled):"
+ncu --import "${OUT}.ncu-rep" \
+    --csv \
+    --print-kernel-base demangled \
+| tail -n +2 | cut -d',' -f5 | tr -d '"' | sort -u
+
+echo
+echo "Summary:"
+ncu --import "${OUT}.ncu-rep" --page summary | head -80 || true
diff --git a/run_inference.py b/run_inference.py
new file mode 100644
index 000000000000..181aa8e60e70
--- /dev/null
+++ b/run_inference.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+
+if os.getcwd() in sys.path:
+    sys.path.remove(os.getcwd())
+
+os.environ.setdefault("VLLM_USE_V1", "0")
+os.environ.setdefault("VLLM_USE_CUDAGRAPH", "0")  # no cuda graphs
+
+from vllm import LLM, SamplingParams
+
+print("Loading model...")
+llm = LLM(
+    "llama-3.2-3b-instruct-hf",
+    gpu_memory_utilization=0.5,
+    max_model_len=512,
+    enforce_eager=True,
+)
+
+print("Running inference...")
+outs = llm.generate(["What is software?"], SamplingParams(max_tokens=10, temperature=0.7))
+print("Generated:", outs[0].outputs[0].text)
+print("Done.")
diff --git a/run_test.sh b/run_test.sh
new file mode 100755
index 000000000000..0fbc35f47b67
--- /dev/null
+++ b/run_test.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -euo pipefail
+
+export VLLM_ATTENTION_BACKEND=FLASH_ATTN
+export VLLM_USE_FLASHINFER=0
+export VLLM_USE_TRITON_FLASH_ATTN=0
+# If you only instrumented FA2 or FA3, force it:
+# export VLLM_FLASH_ATTN_VERSION=2 # or 3
+
+python run_inference.py