From 114e222811ea5090d56bc88a77112bede19d5d55 Mon Sep 17 00:00:00 2001
From: Matthew Leon
Date: Thu, 4 Sep 2025 15:22:15 -0700
Subject: [PATCH] Add build, test, and profile scripts

Signed-off-by: Matthew Leon
---
 build.sh         | 42 ++++++++++++++++++++++++++++++++++++++++++
 profile.sh       | 31 +++++++++++++++++++++++++++++++
 run_inference.py | 25 +++++++++++++++++++++++++
 run_test.sh      | 10 ++++++++++
 4 files changed, 108 insertions(+)
 create mode 100755 build.sh
 create mode 100755 profile.sh
 create mode 100644 run_inference.py
 create mode 100755 run_test.sh

diff --git a/build.sh b/build.sh
new file mode 100755
index 000000000000..b1e5ae18f722
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -euo pipefail
+
+# Detect if already inside a virtual environment
+if [ -n "${VIRTUAL_ENV:-}" ]; then
+    echo "Already inside a virtual environment: $VIRTUAL_ENV"
+    venv_created=false
+else
+    venv_created=false
+    if [ ! -d ".venv" ]; then
+        echo "Creating virtual environment..."
+        uv venv --python 3.12 --seed
+        venv_created=true
+    else
+        echo "Virtual environment .venv already exists, skipping creation."
+    fi
+
+    echo "Activating .venv..."
+    source .venv/bin/activate
+fi
+
+# Install build requirements if we just created the venv
+if [ "$venv_created" = true ]; then
+    echo "Installing build requirements..."
+    uv pip install -r requirements/build.txt -v
+else
+    echo "Skipping build requirements installation (venv already in use)."
+fi
+
+# Use all available CPUs for parallel jobs
+if command -v nproc &>/dev/null; then
+    MAX_JOBS=$(nproc)
+elif [[ "$(uname)" == "Darwin" ]]; then
+    MAX_JOBS=$(sysctl -n hw.ncpu)
+else
+    MAX_JOBS=4 # fallback
+fi
+export MAX_JOBS
+
+echo "Using $MAX_JOBS parallel jobs."
+
+CCACHE_NOHASHDIR="true" pip install -v --no-build-isolation -e .
diff --git a/profile.sh b/profile.sh
new file mode 100755
index 000000000000..350e6e38d60b
--- /dev/null
+++ b/profile.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+set -euo pipefail
+
+OUT=/tmp/vllm_profile
+
+# Collect report
+ncu --section LaunchStats \
+    --target-processes all \
+    --profile-from-start on \
+    -o "$OUT" -f \
+    python run_inference.py
+
+echo "Nsight Compute report is at: ${OUT}.ncu-rep"
+
+echo
+echo "Top kernels (mangled):"
+ncu --import "${OUT}.ncu-rep" \
+    --csv \
+    --print-kernel-base mangled \
+| tail -n +2 | cut -d',' -f5 | tr -d '"' | sort -u
+
+echo
+echo "Top kernels (demangled):"
+ncu --import "${OUT}.ncu-rep" \
+    --csv \
+    --print-kernel-base demangled \
+| tail -n +2 | cut -d',' -f5 | tr -d '"' | sort -u
+
+echo
+echo "Summary:"
+ncu --import "${OUT}.ncu-rep" --page summary | head -80 || true
diff --git a/run_inference.py b/run_inference.py
new file mode 100644
index 000000000000..181aa8e60e70
--- /dev/null
+++ b/run_inference.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+
+if os.getcwd() in sys.path:
+    sys.path.remove(os.getcwd())
+
+os.environ.setdefault("VLLM_USE_V1", "0")
+os.environ.setdefault("VLLM_USE_CUDAGRAPH", "0")  # no cuda graphs
+
+from vllm import LLM, SamplingParams
+
+print("Loading model...")
+llm = LLM(
+    "llama-3.2-3b-instruct-hf",
+    gpu_memory_utilization=0.5,
+    max_model_len=512,
+    enforce_eager=True,
+)
+
+print("Running inference...")
+outs = llm.generate(["What is software?"], SamplingParams(max_tokens=10, temperature=0.7))
+print("Generated:", outs[0].outputs[0].text)
+print("Done.")
diff --git a/run_test.sh b/run_test.sh
new file mode 100755
index 000000000000..0fbc35f47b67
--- /dev/null
+++ b/run_test.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+set -euo pipefail
+
+export VLLM_ATTENTION_BACKEND=FLASH_ATTN
+export VLLM_USE_FLASHINFER=0
+export VLLM_USE_TRITON_FLASH_ATTN=0
+# If you only instrumented FA2 or FA3, force it:
+# export VLLM_FLASH_ATTN_VERSION=2 # or 3
+
+python run_inference.py