Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# Build script: create/activate a uv-managed virtualenv, install build
# requirements (only for a freshly created venv), then do an editable
# install using all available CPUs for parallel compile jobs.
set -euo pipefail

# uv is required both for venv creation and for dependency installation;
# fail early with a clear message instead of mid-script.
if ! command -v uv &>/dev/null; then
  echo "Error: 'uv' is not installed or not on PATH." >&2
  exit 1
fi

# Detect if already inside a virtual environment.
venv_created=false
if [ -n "${VIRTUAL_ENV:-}" ]; then
  echo "Already inside a virtual environment: $VIRTUAL_ENV"
else
  if [ ! -d ".venv" ]; then
    echo "Creating virtual environment..."
    uv venv --python 3.12 --seed
    venv_created=true
  else
    echo "Virtual environment .venv already exists, skipping creation."
  fi

  echo "Activating .venv..."
  # shellcheck disable=SC1091 — .venv is created at runtime
  source .venv/bin/activate
fi

# Install build requirements only for a freshly created venv; an existing
# or already-active venv is assumed to have them installed.
if [ "$venv_created" = true ]; then
  echo "Installing build requirements..."
  uv pip install -r requirements/build.txt -v
else
  echo "Skipping build requirements installation (venv already in use)."
fi

# Use all available CPUs for parallel jobs (nproc on Linux, sysctl on macOS).
if command -v nproc &>/dev/null; then
  MAX_JOBS=$(nproc)
elif [[ "$(uname)" == "Darwin" ]]; then
  MAX_JOBS=$(sysctl -n hw.ncpu)
else
  MAX_JOBS=4 # fallback when the CPU count cannot be detected
fi
export MAX_JOBS

echo "Using $MAX_JOBS parallel jobs."

# CCACHE_NOHASHDIR lets ccache share hits across differing build directories.
CCACHE_NOHASHDIR="true" pip install -v --no-build-isolation -e .

31 changes: 31 additions & 0 deletions profile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# Profile run_inference.py with Nsight Compute and print the unique kernel
# names (mangled and demangled) plus a summary page from the report.
set -euo pipefail

OUT=/tmp/vllm_profile

# Nsight Compute CLI is required; fail early with a clear message.
if ! command -v ncu &>/dev/null; then
  echo "Error: 'ncu' (Nsight Compute CLI) not found on PATH." >&2
  exit 1
fi

# Collect report
ncu --section LaunchStats \
  --target-processes all \
  --profile-from-start on \
  -o "$OUT" -f \
  python run_inference.py

echo "Nsight Compute report is at: ${OUT}.ncu-rep"

#######################################
# Print the sorted, unique kernel names from the collected report.
# Arguments: $1 - kernel-base mode for ncu ("mangled" or "demangled")
# Outputs:   one kernel name per line on stdout
# NOTE(review): 'cut -d, -f5' assumes CSV field 5 never contains embedded
# commas — demangled template names may violate this; verify on real output.
#######################################
list_kernels() {
  ncu --import "${OUT}.ncu-rep" \
    --csv \
    --print-kernel-base "$1" \
    | tail -n +2 | cut -d',' -f5 | tr -d '"' | sort -u
}

echo
echo "Top kernels (mangled):"
list_kernels mangled

echo
echo "Top kernels (demangled):"
list_kernels demangled

echo
echo "Summary:"
ncu --import "${OUT}.ncu-rep" --page summary | head -80 || true
25 changes: 25 additions & 0 deletions run_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env python3
"""Run one short vLLM generation in eager mode (no CUDA graphs).

Intended as a minimal, profilable inference target (see profile.sh).
"""

import os
import sys

# Drop the CWD from sys.path so a local source checkout cannot shadow the
# installed vllm package.
if os.getcwd() in sys.path:
    sys.path.remove(os.getcwd())

os.environ.setdefault("VLLM_USE_V1", "0")
os.environ.setdefault("VLLM_USE_CUDAGRAPH", "0")  # no cuda graphs

from vllm import LLM, SamplingParams

print("Loading model...")
llm = LLM(
    "llama-3.2-3b-instruct-hf",
    gpu_memory_utilization=0.5,
    max_model_len=512,
    enforce_eager=True,  # skip CUDA graph capture so kernels stay profilable
)

print("Running inference...")
# Split across lines to satisfy Ruff E501 (line was 90 > 88 chars in CI).
sampling = SamplingParams(max_tokens=10, temperature=0.7)
outs = llm.generate(["What is software?"], sampling)
print("Generated:", outs[0].outputs[0].text)
print("Done.")
8 changes: 8 additions & 0 deletions run_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Run run_inference.py with the FlashAttention backend forced
# (FlashInfer and Triton flash-attn disabled).
# Shebang + strict mode added for consistency with build.sh / profile.sh.
set -euo pipefail

export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export VLLM_USE_FLASHINFER=0
export VLLM_USE_TRITON_FLASH_ATTN=0
# If you only instrumented FA2 or FA3, force it:
# export VLLM_FLASH_ATTN_VERSION=2 # or 3

python run_inference.py

Loading