diff --git a/.github/actions/launch-gpu-runner/action.yml b/.github/actions/launch-gpu-runner/action.yml
new file mode 100644
index 0000000000..3009cba332
--- /dev/null
+++ b/.github/actions/launch-gpu-runner/action.yml
@@ -0,0 +1,74 @@
+name: 'Launch GPU EC2 Runner'
+description: 'Launch GPU-enabled EC2 instance as GitHub Actions self-hosted runner (wrapper for machulav/ec2-github-runner)'
+
+inputs:
+  mode:
+    description: 'Mode: start or stop'
+    required: true
+  github-token:
+    description: 'GitHub Personal Access Token with repo scope for runner registration'
+    required: true
+  instance-type:
+    description: 'EC2 instance type (e.g., g6.2xlarge, g5.2xlarge, g6.8xlarge)'
+    required: false
+    default: 'g6.2xlarge'
+  aws-region:
+    description: 'AWS region'
+    required: true
+  availability-zones-config:
+    description: 'JSON array of AZ configs with imageId, subnetId, securityGroupId for fallback'
+    required: false
+    default: ''
+  # For stop mode
+  label:
+    description: 'Runner label (for stop mode)'
+    required: false
+  ec2-instance-id:
+    description: 'EC2 instance ID (for stop mode)'
+    required: false
+  # Optional
+  ec2-instance-tags:
+    description: 'JSON array of tags to apply to EC2 instance'
+    required: false
+    default: '[]'
+  runner-home-dir:
+    description: 'Home directory for the runner'
+    required: false
+    default: '/home/ec2-user/actions-runner'
+  iam-role-name:
+    description: 'IAM role name to attach to the instance (optional, for enhanced security)'
+    required: false
+    default: ''
+
+outputs:
+  label:
+    description: 'Unique label for the launched runner'
+    value: ${{ steps.ec2-runner.outputs.label }}
+  ec2-instance-id:
+    description: 'EC2 instance ID'
+    value: ${{ steps.ec2-runner.outputs.ec2-instance-id }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Start EC2 runner
+      if: inputs.mode == 'start'
+      id: ec2-runner
+      uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+      with:
+        mode: start
+        github-token: ${{ inputs.github-token }}
+        ec2-instance-type: ${{ inputs.instance-type }}
+        aws-resource-tags: ${{ inputs.ec2-instance-tags }}
+        runner-home-dir: ${{ inputs.runner-home-dir }}
+        iam-role-name: ${{ inputs.iam-role-name }}
+        availability-zones-config: ${{ inputs.availability-zones-config }}
+
+    - name: Stop EC2 runner
+      if: inputs.mode == 'stop'
+      uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+      with:
+        mode: stop
+        github-token: ${{ inputs.github-token }}
+        label: ${{ inputs.label }}
+        ec2-instance-id: ${{ inputs.ec2-instance-id }}
diff --git a/.github/actions/setup-vllm-gpu/action.yml b/.github/actions/setup-vllm-gpu/action.yml
new file mode 100644
index 0000000000..9b4ea40d6e
--- /dev/null
+++ b/.github/actions/setup-vllm-gpu/action.yml
@@ -0,0 +1,217 @@
+name: 'Setup vLLM GPU'
+description: 'Install vLLM with GPU support and start vLLM server with gpt-oss:20b'
+
+inputs:
+  model:
+    description: 'Model to serve (e.g., gpt-oss:20b, Qwen/Qwen3-0.6B)'
+    required: false
+    default: 'gpt-oss:20b'
+  port:
+    description: 'Port for vLLM server'
+    required: false
+    default: '8000'
+  gpu-memory-utilization:
+    description: 'GPU memory utilization (0.0-1.0)'
+    required: false
+    default: '0.85'
+  max-model-len:
+    description: 'Maximum model context length'
+    required: false
+    default: '8192'
+  quantization:
+    description: 'Quantization method (awq, gptq, or none)'
+    required: false
+    default: 'awq'
+
+outputs:
+  vllm-url:
+    description: 'URL of the vLLM server'
+    value: 'http://0.0.0.0:${{ inputs.port }}/v1'
+  model-name:
+    description: 'Name of the model being served'
+    value: ${{ inputs.model }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Verify GPU availability
+      shell: bash
+      run: |
+        echo "=== GPU Information ==="
+        nvidia-smi
+        echo ""
+        echo "=== CUDA Version ==="
+        nvcc --version || echo "nvcc not found in PATH"
+        echo ""
+        echo "=== Environment ==="
+        echo "CUDA_HOME: ${CUDA_HOME:-not set}"
+        echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH:-not set}"
+
+    - name: Configure CUDA environment
+      shell: bash
+      run: |
+        # Set CUDA environment variables if not already set
+        if [ -z "$CUDA_HOME" ]; then
+          export CUDA_HOME=/usr/local/cuda-12.4
+          echo "CUDA_HOME=/usr/local/cuda-12.4" >> $GITHUB_ENV
+        fi
+
+        # Configure LD_LIBRARY_PATH with "sandwich" pattern
+        # (system libs first, then NVIDIA libs, then CUDA)
+        export LD_LIBRARY_PATH="/usr/lib64:${LD_LIBRARY_PATH:-}:/usr/local/cuda-12.4/lib64"
+        echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV
+
+        export PATH="${CUDA_HOME}/bin:${PATH}"
+        echo "PATH=${PATH}" >> $GITHUB_ENV
+
+    - name: Install Python dependencies
+      shell: bash
+      run: |
+        # Install uv if not present
+        if ! command -v uv &> /dev/null; then
+          echo "Installing uv..."
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          source $HOME/.cargo/env
+        fi
+
+        # Create virtual environment
+        python3.12 -m venv /tmp/vllm-env
+        source /tmp/vllm-env/bin/activate
+
+        echo "VIRTUAL_ENV=/tmp/vllm-env" >> $GITHUB_ENV
+        echo "/tmp/vllm-env/bin" >> $GITHUB_PATH
+
+    - name: Install vLLM with GPU support
+      shell: bash
+      run: |
+        source /tmp/vllm-env/bin/activate
+
+        echo "=== Installing PyTorch with CUDA support ==="
+        pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124
+
+        echo "=== Installing vLLM ==="
+        # Install vLLM with CUDA support
+        pip install vllm
+
+        echo "=== Verifying installation ==="
+        python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
+        python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
+        python -c "import torch; print(f'CUDA device count: {torch.cuda.device_count()}')"
+        vllm --version
+
+    - name: Pull model (if Ollama-style)
+      shell: bash
+      env:
+        MODEL: ${{ inputs.model }}
+      run: |
+        source /tmp/vllm-env/bin/activate
+
+        # Check if model is in Ollama format (contains ":")
+        if [[ "$MODEL" == *":"* ]]; then
+          echo "Detected Ollama-style model: $MODEL"
+          echo "Note: vLLM will attempt to load from Ollama cache"
+          echo "Ensure Ollama has pulled this model or it exists in ~/.ollama/models"
+
+          # Optionally install ollama-python for model management
+          pip install ollama-python || true
+        else
+          echo "Detected HuggingFace-style model: $MODEL"
+          echo "vLLM will download from HuggingFace Hub if not cached"
+        fi
+
+    - name: Start vLLM server
+      shell: bash
+      env:
+        MODEL: ${{ inputs.model }}
+        PORT: ${{ inputs.port }}
+        GPU_MEM: ${{ inputs.gpu-memory-utilization }}
+        MAX_LEN: ${{ inputs.max-model-len }}
+        QUANT: ${{ inputs.quantization }}
+      run: |
+        source /tmp/vllm-env/bin/activate
+
+        echo "=== Starting vLLM server ==="
+        echo "Model: $MODEL"
+        echo "Port: $PORT"
+        echo "GPU Memory Utilization: $GPU_MEM"
+        echo "Max Model Length: $MAX_LEN"
+        echo "Quantization: $QUANT"
+        echo ""
+
+        # Build vLLM command
+        VLLM_CMD="vllm serve $MODEL \
+          --host 0.0.0.0 \
+          --port $PORT \
+          --tensor-parallel-size 1 \
+          --gpu-memory-utilization $GPU_MEM \
+          --max-model-len $MAX_LEN \
+          --enable-auto-tool-choice \
+          --tool-call-parser hermes \
+          --dtype auto"
+
+        # Add quantization if specified
+        if [ "$QUANT" != "none" ]; then
+          VLLM_CMD="$VLLM_CMD --quantization $QUANT"
+        fi
+
+        echo "Command: $VLLM_CMD"
+        echo ""
+
+        # Start vLLM in background
+        $VLLM_CMD > /tmp/vllm-server.log 2>&1 &
+        VLLM_PID=$!
+        echo "vLLM server started with PID: $VLLM_PID"
+        echo "VLLM_PID=$VLLM_PID" >> $GITHUB_ENV
+
+        # Save PID for cleanup
+        echo $VLLM_PID > /tmp/vllm.pid
+
+    - name: Wait for vLLM server to be ready
+      shell: bash
+      env:
+        PORT: ${{ inputs.port }}
+      run: |
+        echo "=== Waiting for vLLM server to be ready ==="
+        echo "Health check URL: http://localhost:$PORT/health"
+        echo ""
+
+        # Wait up to 10 minutes for server to be ready
+        timeout 600 bash -c "
+          until curl -f http://localhost:$PORT/health > /dev/null 2>&1; do
+            echo \"Waiting for vLLM server... (checking http://localhost:$PORT/health)\"
+
+            # Check if process is still running
+            if ! kill -0 \$VLLM_PID 2>/dev/null; then
+              echo \"ERROR: vLLM process died!\"
+              echo \"Last 50 lines of vLLM log:\"
+              tail -n 50 /tmp/vllm-server.log
+              exit 1
+            fi
+
+            sleep 5
+          done
+        " || {
+          echo "ERROR: vLLM server failed to start within 10 minutes"
+          echo "=== vLLM Server Log ==="
+          cat /tmp/vllm-server.log
+          exit 1
+        }
+
+        echo "✓ vLLM server is ready!"
+        echo ""
+        echo "=== Testing API endpoint ==="
+        curl -s http://localhost:$PORT/v1/models | python3 -m json.tool || echo "Warning: Could not query models endpoint"
+
+    - name: Display server information
+      shell: bash
+      env:
+        PORT: ${{ inputs.port }}
+        MODEL: ${{ inputs.model }}
+      run: |
+        echo "=== vLLM Server Information ==="
+        echo "URL: http://0.0.0.0:$PORT/v1"
+        echo "Model: $MODEL"
+        echo "Health: http://0.0.0.0:$PORT/health"
+        echo "Models: http://0.0.0.0:$PORT/v1/models"
+        echo ""
+        echo "Server is ready for testing!"
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index a0d8b23a40..b268000578 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -25,6 +25,7 @@ Llama Stack uses GitHub Actions for Continuous Integration (CI). Below is a tabl
 | Test llama stack list-deps | [providers-list-deps.yml](providers-list-deps.yml) | Test llama stack list-deps |
 | Build, test, and publish packages | [pypi.yml](pypi.yml) | Build, test, and publish packages |
 | Integration Tests (Record) | [record-integration-tests.yml](record-integration-tests.yml) | Auto-record missing test recordings for PR |
+| vLLM GPU Recording | [record-vllm-gpu-tests.yml](record-vllm-gpu-tests.yml) | GPU recording for gpt-oss:20b (${{ inputs.suite }} suite) |
 | Release Branch Scheduled CI | [release-branch-scheduled-ci.yml](release-branch-scheduled-ci.yml) | Scheduled CI checks for active release branches |
 | Check semantic PR titles | [semantic-pr.yml](semantic-pr.yml) | Ensure that PR titles follow the conventional commit spec |
 | Stainless SDK Builds | [stainless-builds.yml](stainless-builds.yml) | Build Stainless SDK from OpenAPI spec changes |
diff --git a/.github/workflows/record-vllm-gpu-tests.yml b/.github/workflows/record-vllm-gpu-tests.yml
new file mode 100644
index 0000000000..d24e5a9d7a
--- /dev/null
+++ b/.github/workflows/record-vllm-gpu-tests.yml
@@ -0,0 +1,224 @@
+name: vLLM GPU Recording
+
+run-name: GPU recording for gpt-oss:20b (${{ inputs.suite }} suite)
+
+on:
+  workflow_dispatch:
+    inputs:
+      suite:
+        description: 'Test suite to run'
+        required: false
+        type: choice
+        default: 'base'
+        options:
+          - base
+          - responses
+          - vllm-reasoning
+      pr_number:
+        description: 'PR number to commit recordings to (optional)'
+        required: false
+        type: number
+
+concurrency:
+  group: gpu-vllm-record-${{ github.run_id }}
+  cancel-in-progress: false  # Don't cancel - EC2 cleanup is critical
+
+# OIDC authentication for AWS - no long-lived credentials!
+permissions:
+  contents: read
+  id-token: write  # Required for OIDC authentication to AWS
+
+jobs:
+  # Job 1: Launch GPU EC2 instance with multi-AZ fallback
+  start-gpu-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Configure AWS credentials via OIDC
+        uses: aws-actions/configure-aws-credentials@e3dd6b9db4c5f1d5e55a52ae244b09d44a2e2d5a # v4.0.2
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+          aws-region: us-east-2
+          role-session-name: GitHubActions-vLLM-GPU-${{ github.run_id }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: ./.github/actions/launch-gpu-runner
+        with:
+          mode: start
+          github-token: ${{ secrets.RELEASE_PAT }}
+          instance-type: g6.2xlarge
+          aws-region: us-east-2
+          availability-zones-config: |
+            [
+              {"imageId": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", "subnetId": "${{ vars.SUBNET_US_EAST_2A }}", "securityGroupId": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"},
+              {"imageId": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", "subnetId": "${{ vars.SUBNET_US_EAST_2B }}", "securityGroupId": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"},
+              {"imageId": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", "subnetId": "${{ vars.SUBNET_US_EAST_2C }}", "securityGroupId": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"}
+            ]
+          ec2-instance-tags: |
+            [
+              {"Key": "Name", "Value": "llamastack-vllm-gpu-runner"},
+              {"Key": "Project", "Value": "llama-stack"},
+              {"Key": "Purpose", "Value": "vllm-gpu-recording"},
+              {"Key": "Model", "Value": "gpt-oss:20b"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubRunId", "Value": "${{ github.run_id }}"},
+              {"Key": "ManagedBy", "Value": "GitHub-Actions"}
+            ]
+
+      - name: Runner launch summary
+        run: |
+          echo "GPU runner launched successfully"
+          echo "  Instance ID: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}"
+          echo "  Runner Label: ${{ steps.start-ec2-runner.outputs.label }}"
+          echo "  Model: gpt-oss:20b"
+          echo "  Instance Type: g6.2xlarge"
+
+  # Job 2: Run vLLM tests on GPU runner
+  record-vllm-tests:
+    needs: start-gpu-runner
+    runs-on: ${{ needs.start-gpu-runner.outputs.label }}
+    permissions: {}  # CRITICAL: No permissions - prevents secret theft from untrusted code
+    env:
+      TMPDIR: /home/tmp
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup environment
+        run: |
+          mkdir -p /home/tmp
+          echo "=== System Information ==="
+          cat /etc/os-release
+          echo ""
+          echo "=== Disk Space ==="
+          df -h
+          echo ""
+          echo "=== Memory ==="
+          free -h
+          echo ""
+          echo "=== GPU Information ==="
+          nvidia-smi
+
+      - name: Setup vLLM GPU
+        uses: ./.github/actions/setup-vllm-gpu
+        with:
+          model: 'gpt-oss:20b'
+          port: '8000'
+          gpu-memory-utilization: '0.85'
+          max-model-len: '8192'
+          quantization: 'awq'
+
+      - name: Setup test environment
+        uses: ./.github/actions/setup-test-environment
+        with:
+          python-version: '3.12'
+          client-version: 'latest'
+          setup: 'vllm-gpu-gpt-oss'
+          suite: ${{ inputs.suite }}
+          inference-mode: 'record'
+
+      - name: Run integration tests (record mode)
+        uses: ./.github/actions/run-and-record-tests
+        with:
+          stack-config: 'server:ci-tests'
+          setup: 'vllm-gpu-gpt-oss'
+          inference-mode: 'record'
+          suite: ${{ inputs.suite }}
+          skip-commit: 'true'  # Don't commit here - upload as artifacts
+
+      - name: Upload recordings as artifacts
+        if: always()
+        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
+        with:
+          name: vllm-gpu-recordings-${{ github.run_id }}
+          path: |
+            tests/integration/recordings/
+            tests/integration/*/recordings/
+          retention-days: 7
+          if-no-files-found: warn
+
+      - name: Upload vLLM logs
+        if: always()
+        run: |
+          if [ -f /tmp/vllm-server.log ]; then
+            cat /tmp/vllm-server.log
+          fi
+
+      - name: Disk space after tests
+        if: always()
+        run: |
+          echo "=== Disk Space After Tests ==="
+          df -h
+
+  # Job 3: Stop GPU EC2 instance (ALWAYS runs for cleanup)
+  stop-gpu-runner:
+    needs: [start-gpu-runner, record-vllm-tests]
+    runs-on: ubuntu-latest
+    if: ${{ always() }}  # CRITICAL: Always cleanup, even on failure or cancellation
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Configure AWS credentials via OIDC
+        uses: aws-actions/configure-aws-credentials@e3dd6b9db4c5f1d5e55a52ae244b09d44a2e2d5a # v4.0.2
+        with:
+          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
+          aws-region: us-east-2
+          role-session-name: GitHubActions-vLLM-GPU-Cleanup-${{ github.run_id }}
+
+      - name: Stop EC2 runner
+        uses: ./.github/actions/launch-gpu-runner
+        with:
+          mode: stop
+          github-token: ${{ secrets.RELEASE_PAT }}
+          aws-region: us-east-2
+          label: ${{ needs.start-gpu-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-gpu-runner.outputs.instance-id }}
+
+      - name: Cleanup summary
+        run: |
+          echo "GPU runner terminated successfully"
+          echo "  Instance ID: ${{ needs.start-gpu-runner.outputs.instance-id }}"
+
+  # Job 4: Summary and next steps
+  summary:
+    needs: [start-gpu-runner, record-vllm-tests, stop-gpu-runner]
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Workflow summary
+        run: |
+          {
+            echo "## vLLM GPU Recording Summary"
+            echo ""
+            echo "**Model**: gpt-oss:20b"
+            echo "**Instance Type**: g6.2xlarge"
+            echo "**Test Suite**: ${{ inputs.suite }}"
+            echo ""
+
+            if [ "${{ needs.record-vllm-tests.result }}" == "success" ]; then
+              echo "**Test Status**: Successful"
+              echo ""
+              echo "Recordings have been uploaded as artifacts. Download them from the workflow run and commit manually."
+            else
+              echo "**Test Status**: Failed"
+              echo ""
+              echo "Check the test logs for errors."
+            fi
+
+            echo ""
+            echo "**Cleanup Status**: ${{ needs.stop-gpu-runner.result == 'success' && 'Instance terminated' || 'Check manually' }}"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Check for cleanup issues
+        if: needs.stop-gpu-runner.result != 'success'
+        run: |
+          echo "::warning::EC2 instance cleanup may have failed! Check AWS console for orphaned instances."
+          echo "Instance ID: ${{ needs.start-gpu-runner.outputs.instance-id }}"
diff --git a/AWS_SETUP_GUIDE.md b/AWS_SETUP_GUIDE.md
new file mode 100644
index 0000000000..2974ef8ffb
--- /dev/null
+++ b/AWS_SETUP_GUIDE.md
@@ -0,0 +1,602 @@
+# AWS Setup Guide for GPU Runners
+
+This guide walks through setting up the AWS infrastructure required for GPU-enabled self-hosted runners.
+
+## Prerequisites
+
+- AWS account with appropriate permissions
+- AWS CLI installed and configured
+- Access to create IAM roles and OIDC providers
+
+## Step 1: Set up OIDC Provider for GitHub Actions
+
+GitHub Actions can authenticate to AWS using OpenID Connect (OIDC) instead of long-lived access keys.
+
+### 1.1 Create OIDC Provider in IAM
+
+```bash
+# Using AWS CLI
+aws iam create-open-id-connect-provider \
+  --url https://token.actions.githubusercontent.com \
+  --client-id-list sts.amazonaws.com \
+  --thumbprint-list 6938fd4d98bab03faadb97b34396831e3780aea1
+```
+
+**Via AWS Console**:
+
+1. Go to IAM > Identity providers
+2. Click **Add provider**
+3. Provider type: **OpenID Connect**
+4. Provider URL: `https://token.actions.githubusercontent.com`
+5. Audience: `sts.amazonaws.com`
+6. Click **Add provider**
+
+### 1.2 Create IAM Role for GitHub Actions
+
+Create a file `trust-policy.json`:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "arn:aws:iam::YOUR_ACCOUNT_ID:oidc-provider/token.actions.githubusercontent.com"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "token.actions.githubusercontent.com:aud": "sts.amazonaws.com"
+        },
+        "StringLike": {
+          "token.actions.githubusercontent.com:sub": "repo:YOUR_ORG/llama-stack:*"
+        }
+      }
+    }
+  ]
+}
+```
+
+Replace:
+
+- `YOUR_ACCOUNT_ID`: Your AWS account ID (e.g., `123456789012`)
+- `YOUR_ORG/llama-stack`: Your GitHub repository (e.g., `meta-llama/llama-stack`)
+
+Create the role:
+
+```bash
+aws iam create-role \
+  --role-name GitHubActionsLlamaStackGPU \
+  --assume-role-policy-document file://trust-policy.json
+```
+
+### 1.3 Attach Permissions Policy
+
+Create a file `permissions-policy.json`:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Sid": "EC2Management",
+      "Effect": "Allow",
+      "Action": [
+        "ec2:RunInstances",
+        "ec2:TerminateInstances",
+        "ec2:DescribeInstances",
+        "ec2:DescribeInstanceStatus",
+        "ec2:DescribeInstanceTypes",
+        "ec2:CreateTags",
+        "ec2:DescribeImages",
+        "ec2:DescribeSubnets",
+        "ec2:DescribeSecurityGroups",
+        "ec2:DescribeKeyPairs",
+        "ec2:DescribeVolumes"
+      ],
+      "Resource": "*",
+      "Condition": {
+        "StringEquals": {
+          "aws:RequestedRegion": ["us-east-1", "us-east-2"]
+        }
+      }
+    },
+    {
+      "Sid": "IAMPassRole",
+      "Effect": "Allow",
+      "Action": "iam:PassRole",
+      "Resource": "arn:aws:iam::YOUR_ACCOUNT_ID:role/GitHubActionsLlamaStackGPU",
+      "Condition": {
+        "StringEquals": {
+          "iam:PassedToService": "ec2.amazonaws.com"
+        }
+      }
+    }
+  ]
+}
+```
+
+Replace `YOUR_ACCOUNT_ID` with your AWS account ID.
+
+Attach the policy:
+
+```bash
+aws iam put-role-policy \
+  --role-name GitHubActionsLlamaStackGPU \
+  --policy-name EC2GPURunnerPermissions \
+  --policy-document file://permissions-policy.json
+```
+
+### 1.4 Save Role ARN
+
+Get the role ARN:
+
+```bash
+aws iam get-role --role-name GitHubActionsLlamaStackGPU --query 'Role.Arn' --output text
+```
+
+Save this ARN - you'll need it for GitHub secrets:
+
+```text
+arn:aws:iam::123456789012:role/GitHubActionsLlamaStackGPU
+```
+
+## Step 2: Set up VPC and Networking
+
+### 2.1 Option A: Use Existing VPC
+
+If you already have a VPC with internet access:
+
+```bash
+# List VPCs
+aws ec2 describe-vpcs --region us-east-2
+
+# List subnets in VPC
+aws ec2 describe-subnets --region us-east-2 --filters "Name=vpc-id,Values=vpc-xxxxx"
+```
+
+### 2.2 Option B: Create New VPC
+
+```bash
+# Create VPC in us-east-2
+aws ec2 create-vpc \
+  --region us-east-2 \
+  --cidr-block 10.0.0.0/16 \
+  --tag-specifications 'ResourceType=vpc,Tags=[{Key=Name,Value=llama-stack-gpu}]'
+
+# Enable DNS hostnames
+aws ec2 modify-vpc-attribute \
+  --region us-east-2 \
+  --vpc-id vpc-xxxxx \
+  --enable-dns-hostnames
+
+# Create Internet Gateway
+aws ec2 create-internet-gateway \
+  --region us-east-2 \
+  --tag-specifications 'ResourceType=internet-gateway,Tags=[{Key=Name,Value=llama-stack-gpu-igw}]'
+
+# Attach to VPC
+aws ec2 attach-internet-gateway \
+  --region us-east-2 \
+  --vpc-id vpc-xxxxx \
+  --internet-gateway-id igw-xxxxx
+```
+
+### 2.3 Create Subnets
+
+Create 3 subnets in us-east-2 (one per AZ):
+
+```bash
+# us-east-2a
+aws ec2 create-subnet \
+  --region us-east-2 \
+  --vpc-id vpc-xxxxx \
+  --cidr-block 10.0.1.0/24 \
+  --availability-zone us-east-2a \
+  --tag-specifications 'ResourceType=subnet,Tags=[{Key=Name,Value=llama-stack-gpu-2a}]'
+
+# us-east-2b
+aws ec2 create-subnet \
+  --region us-east-2 \
+  --vpc-id vpc-xxxxx \
+  --cidr-block 10.0.2.0/24 \
+  --availability-zone us-east-2b \
+  --tag-specifications 'ResourceType=subnet,Tags=[{Key=Name,Value=llama-stack-gpu-2b}]'
+
+# us-east-2c
+aws ec2 create-subnet \
+  --region us-east-2 \
+  --vpc-id vpc-xxxxx \
+  --cidr-block 10.0.3.0/24 \
+  --availability-zone us-east-2c \
+  --tag-specifications 'ResourceType=subnet,Tags=[{Key=Name,Value=llama-stack-gpu-2c}]'
+```
+
+Repeat for us-east-1 with appropriate CIDR blocks.
+
+### 2.4 Configure Route Table
+
+```bash
+# Create route table
+aws ec2 create-route-table \
+  --region us-east-2 \
+  --vpc-id vpc-xxxxx \
+  --tag-specifications 'ResourceType=route-table,Tags=[{Key=Name,Value=llama-stack-gpu-rt}]'
+
+# Add route to internet gateway
+aws ec2 create-route \
+  --region us-east-2 \
+  --route-table-id rtb-xxxxx \
+  --destination-cidr-block 0.0.0.0/0 \
+  --gateway-id igw-xxxxx
+
+# Associate subnets with route table
+aws ec2 associate-route-table \
+  --region us-east-2 \
+  --route-table-id rtb-xxxxx \
+  --subnet-id subnet-xxxxx
+```
+
+### 2.5 Create Security Group
+
+```bash
+aws ec2 create-security-group \
+  --region us-east-2 \
+  --group-name llama-stack-gpu-runners \
+  --description "Security group for llama-stack GPU runners" \
+  --vpc-id vpc-xxxxx \
+  --tag-specifications 'ResourceType=security-group,Tags=[{Key=Name,Value=llama-stack-gpu-sg}]'
+
+# Add outbound rules (allow all - default)
+# No inbound rules needed (runners connect outbound only)
+```
+
+## Step 3: Create GPU-Enabled AMI
+
+### 3.1 Launch Base Instance
+
+```bash
+# Launch Ubuntu 22.04 instance with GPU
+aws ec2 run-instances \
+  --region us-east-2 \
+  --image-id ami-0c55b159cbfafe1f0 \
+  --instance-type g6.2xlarge \
+  --key-name your-key-pair \
+  --subnet-id subnet-xxxxx \
+  --security-group-ids sg-xxxxx \
+  --block-device-mappings 'DeviceName=/dev/sda1,Ebs={VolumeSize=100,VolumeType=gp3}' \
+  --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value=llama-stack-gpu-ami-builder}]'
+```
+
+### 3.2 SSH and Configure Instance
+
+```bash
+ssh -i your-key.pem ubuntu@<instance-public-ip>
+```
+
+Run the setup script:
+
+```bash
+#!/bin/bash
+set -e
+
+# Update system
+sudo apt-get update
+sudo apt-get upgrade -y
+
+# Install system packages
+sudo apt-get install -y \
+  build-essential \
+  gcc \
+  g++ \
+  make \
+  git \
+  curl \
+  wget \
+  ca-certificates \
+  gnupg \
+  lsb-release
+
+# Install NVIDIA drivers
+sudo apt-get install -y ubuntu-drivers-common
+sudo ubuntu-drivers autoinstall
+
+# Install CUDA 12.4
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install cuda-12-4
+
+# Configure CUDA environment
+echo 'export CUDA_HOME=/usr/local/cuda-12.4' | sudo tee -a /etc/profile.d/cuda.sh
+echo 'export PATH=$PATH:$CUDA_HOME/bin' | sudo tee -a /etc/profile.d/cuda.sh
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64' | sudo tee -a /etc/profile.d/cuda.sh
+
+# Install Docker
+curl -fsSL https://get.docker.com -o get-docker.sh
+sudo sh get-docker.sh
+
+# Install NVIDIA Container Toolkit
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+
+# Install Python 3.12
+sudo add-apt-repository ppa:deadsnakes/ppa -y
+sudo apt-get update
+sudo apt-get install -y python3.12 python3.12-venv python3.12-dev
+
+# Verify installation
+nvidia-smi
+nvcc --version
+docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
+
+echo "Setup complete! Ready to create AMI."
+```
+
+### 3.3 Create AMI
+
+After setup completes:
+
+```bash
+# From your local machine
+aws ec2 create-image \
+  --region us-east-2 \
+  --instance-id i-xxxxx \
+  --name "llama-stack-gpu-ubuntu-2204-cuda-12.4-$(date +%Y%m%d)" \
+  --description "Ubuntu 22.04 with NVIDIA drivers, CUDA 12.4, Docker, Python 3.12" \
+  --tag-specifications 'ResourceType=image,Tags=[{Key=Name,Value=llama-stack-gpu-ami}]'
+
+# Copy AMI to us-east-1
+aws ec2 copy-image \
+  --region us-east-1 \
+  --source-region us-east-2 \
+  --source-image-id ami-xxxxx \
+  --name "llama-stack-gpu-ubuntu-2204-cuda-12.4-$(date +%Y%m%d)"
+```
+
+### 3.4 Test AMI
+
+Launch a test instance:
+
+```bash
+aws ec2 run-instances \
+  --region us-east-2 \
+  --image-id ami-xxxxx \
+  --instance-type g6.2xlarge \
+  --subnet-id subnet-xxxxx \
+  --security-group-ids sg-xxxxx
+
+# SSH and verify
+ssh ubuntu@<ip>
+nvidia-smi
+nvcc --version
+docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
+```
+
+## Step 4: Create GitHub Personal Access Token
+
+1. Go to GitHub Settings > Developer settings > Personal access tokens > Tokens (classic)
+2. Click **Generate new token (classic)**
+3. Name: `llama-stack-gpu-runners`
+4. Scopes: Select `repo` (full control of private repositories)
+5. Click **Generate token**
+6. **Save the token** - you won't see it again!
+
+## Step 5: Configure GitHub Secrets and Variables
+
+### 5.1 Add Secrets
+
+Go to your GitHub repository > Settings > Secrets and variables > Actions > Secrets
+
+Click **New repository secret** for each:
+
+- **Name**: `AWS_ROLE_ARN`
+  - **Value**: `arn:aws:iam::123456789012:role/GitHubActionsLlamaStackGPU`
+
+- **Name**: `RELEASE_PAT`
+  - **Value**: `ghp_xxxxxxxxxxxxx` (from Step 4)
+
+### 5.2 Add Variables
+
+Go to your GitHub repository > Settings > Secrets and variables > Actions > Variables
+
+Click **New repository variable** for each:
+
+**us-east-2**:
+
+- `SUBNET_US_EAST_2A`: `subnet-xxxxx`
+- `SUBNET_US_EAST_2B`: `subnet-xxxxx`
+- `SUBNET_US_EAST_2C`: `subnet-xxxxx`
+- `AWS_EC2_AMI_US_EAST_2`: `ami-xxxxx`
+- `SECURITY_GROUP_ID_US_EAST_2`: `sg-xxxxx`
+
+**us-east-1**:
+
+- `SUBNET_US_EAST_1A`: `subnet-xxxxx`
+- `SUBNET_US_EAST_1B`: `subnet-xxxxx`
+- `SUBNET_US_EAST_1C`: `subnet-xxxxx`
+- `AWS_EC2_AMI_US_EAST_1`: `ami-xxxxx`
+- `SECURITY_GROUP_ID_US_EAST_1`: `sg-xxxxx`
+
+## Step 6: Test the Setup
+
+### 6.1 Trigger Test Workflow
+
+1. Go to Actions tab in GitHub
+2. Select **vLLM GPU Recording** workflow
+3. Click **Run workflow**
+4. Use default values
+5. Click **Run workflow**
+
+### 6.2 Verify Success
+
+Check that:
+
+- [ ] EC2 instance launches in us-east-2
+- [ ] Runner registers and picks up job
+- [ ] GPU is detected (`nvidia-smi` output)
+- [ ] vLLM server starts successfully
+- [ ] Tests run and complete
+- [ ] Recordings uploaded as artifacts
+- [ ] EC2 instance terminates
+
+### 6.3 Check AWS Console
+
+1. Go to EC2 > Instances
+2. Verify no instances with tag `Purpose: vllm-gpu-recording` are still running
+3. Check terminated instances - should see your test instance
+
+## Troubleshooting
+
+### OIDC Authentication Fails
+
+**Error**: "Not authorized to perform sts:AssumeRoleWithWebIdentity"
+
+**Solutions**:
+
+1. Verify OIDC provider is created correctly
+2. Check trust policy allows your repository
+3. Verify `token.actions.githubusercontent.com:sub` matches your repo
+
+### EC2 Launch Fails
+
+**Error**: "InsufficientInstanceCapacity"
+
+**Solutions**:
+
+1. Try different AZ (workflow does this automatically)
+2. Try different instance type
+3. Check service quotas in AWS console
+
+### AMI Not Found
+
+**Error**: "Invalid AMI ID"
+
+**Solutions**:
+
+1. Verify AMI exists in the region you're trying to use
+2. Check AMI ID is correct in GitHub variables
+3. Ensure AMI is not deregistered
+
+### Security Group Issues
+
+**Error**: "UnauthorizedOperation"
+
+**Solutions**:
+
+1. Verify security group exists in same VPC as subnet
+2. Check security group allows outbound HTTPS (443)
+3. Ensure IAM role has `ec2:DescribeSecurityGroups` permission
+
+## Cost Monitoring
+
+### Enable Cost Allocation Tags
+
+1. Go to AWS Billing > Cost Allocation Tags
+2. Activate these tags:
+   - `Project`
+   - `Purpose`
+   - `GitHubRepository`
+   - `GitHubRunId`
+3. Wait 24 hours for tags to appear in Cost Explorer
+
+### Create Budget Alert
+
+```bash
+aws budgets create-budget \
+  --account-id 123456789012 \
+  --budget file://budget.json \
+  --notifications-with-subscribers file://notifications.json
+```
+
+`budget.json`:
+
+```json
+{
+  "BudgetName": "llama-stack-gpu-runners",
+  "BudgetLimit": {
+    "Amount": "50",
+    "Unit": "USD"
+  },
+  "TimeUnit": "MONTHLY",
+  "BudgetType": "COST",
+  "CostFilters": {
+    "TagKeyValue": ["user:Purpose$vllm-gpu-recording"]
+  }
+}
+```
+
+`notifications.json`:
+
+```json
+[
+  {
+    "Notification": {
+      "NotificationType": "ACTUAL",
+      "ComparisonOperator": "GREATER_THAN",
+      "Threshold": 80
+    },
+    "Subscribers": [
+      {
+        "SubscriptionType": "EMAIL",
+        "Address": "your-email@example.com"
+      }
+    ]
+  }
+]
+```
+
+## Security Best Practices
+
+### 1. Principle of Least Privilege
+
+The IAM role only has permissions to:
+
+- Launch/terminate EC2 instances
+- Only in us-east-1 and us-east-2 regions
+- Only for llama-stack repository
+
+### 2. No Long-Lived Credentials
+
+Using OIDC means:
+
+- No AWS access keys stored in GitHub
+- Tokens expire after use
+- Better audit trail in CloudTrail
+
+### 3. Regular Audits
+
+Monthly:
+
+- [ ] Review EC2 instances for orphaned runners
+- [ ] Check AWS costs vs budget
+- [ ] Review CloudTrail logs for unusual activity
+- [ ] Rotate GitHub PAT if needed
+
+## Next Steps
+
+After completing this setup:
+
+1. ✅ Test workflow runs successfully
+2. ✅ No orphaned EC2 instances
+3. ✅ Costs are as expected (~$0.43 per run)
+4. Read `docs/gpu-runners.md` for usage guide
+5. Consider implementing Phase 2 optimizations (spot instances, model caching)
+
+## Support
+
+For issues during setup:
+
+- Check AWS CloudTrail for API errors
+- Review GitHub Actions logs for OIDC errors
+- Verify all ARNs and IDs are correct
+- Contact: Charles Doern (@cdoern)
diff --git a/IMPLEMENTATION_PLAN.md b/IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000000..fe4907f71c
--- /dev/null
+++ b/IMPLEMENTATION_PLAN.md
@@ -0,0 +1,557 @@
+# GPU Runners Implementation Plan
+
+This document outlines the step-by-step implementation plan for adding GPU-enabled self-hosted runners for vLLM re-recording with gpt-oss:20b.
+
+## Overview
+
+**Goal**: Enable re-recording of vLLM integration tests with gpt-oss:20b on GPU-enabled EC2 instances via GitHub Actions.
+
+**Key Benefits**:
+
+- Test larger models (20B parameters) that don't fit on CPU runners
+- Faster inference times with GPU acceleration
+- More realistic production-like test environment
+- On-demand re-recording via workflow_dispatch
+
+**Estimated Cost**: ~$0.43 per run (30 min on g6.2xlarge), ~$1.72/month for weekly runs
+
+---
+
+## Phase 1: Core Infrastructure (Week 1-2)
+
+**Goal**: Set up basic GPU runner capability and test end-to-end.
+
+### Tasks
+
+#### 1. Set up AWS infrastructure (Task #11) 🔧
+
+**Priority**: Critical - blocking all other work
+**Owner**: DevOps/Charles
+
+**Actions**:
+
+- [ ] Create or identify VPC in us-east-2 and us-east-1
+- [ ] Create subnets (3 AZs per region = 6 total)
+- [ ] Configure security groups (SSH, HTTPS, HTTP)
+- [ ] Set up IAM role for OIDC authentication
+- [ ] Document all IDs and add to GitHub repo variables
+
+**Repository Variables** (add via Settings > Secrets and variables > Actions):
+
+```text
+SUBNET_US_EAST_2A=subnet-xxxxx
+SUBNET_US_EAST_2B=subnet-xxxxx
+SUBNET_US_EAST_2C=subnet-xxxxx
+SUBNET_US_EAST_1A=subnet-xxxxx
+SUBNET_US_EAST_1B=subnet-xxxxx
+SUBNET_US_EAST_1C=subnet-xxxxx
+AWS_EC2_AMI_US_EAST_2=ami-xxxxx
+AWS_EC2_AMI_US_EAST_1=ami-xxxxx
+SECURITY_GROUP_ID_US_EAST_2=sg-xxxxx
+SECURITY_GROUP_ID_US_EAST_1=sg-xxxxx
+```
+
+**Repository Secrets**:
+
+```text
+AWS_ROLE_ARN=arn:aws:iam::123456789012:role/GitHubActionsRole
+RELEASE_PAT=ghp_xxxxx (GitHub PAT with 'repo' scope)
+```
+
+**Dependencies**: None
+**Estimated Time**: 2-4 hours
+
+---
+
+#### 2. Create GPU-enabled AMI (Task #10) 🖼️
+
+**Priority**: Critical - needed for runner launch
+**Depends On**: Task #11 (AWS infrastructure)
+
+**Actions**:
+
+- [ ] Launch base EC2 instance (g6.2xlarge with Amazon Linux 2023 or Ubuntu 22.04)
+- [ ] Install NVIDIA drivers and CUDA 12.4
+- [ ] Install Docker with NVIDIA Container Toolkit
+- [ ] Install system packages (gcc, g++, make, git, python3.12, python3.12-devel)
+- [ ] Configure CUDA environment variables
+- [ ] Verify with `nvidia-smi`
+- [ ] Create AMI in both us-east-2 and us-east-1
+- [ ] Document AMI IDs
+
+**Alternative**: Use AWS Deep Learning AMI and customize
+
+**Validation**:
+
+```bash
+nvidia-smi  # Should show GPU
+nvcc --version  # Should show CUDA 12.4
+docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi
+```
+
+**Dependencies**: Task #11
+**Estimated Time**: 3-5 hours (includes testing)
+
+---
+
+#### 3. Create launch-gpu-runner action (Task #12) ⚙️
+
+**Priority**: Critical - core functionality
+**Depends On**: Tasks #10, #11
+
+**File**: `.github/actions/launch-gpu-runner/action.yml`
+
+**Key Features**:
+
+- Multi-region fallback (us-east-2 → us-east-1)
+- Multi-AZ fallback (3 AZs per region)
+- Dynamic runner label generation
+- Resource tagging for cost tracking
+- Error handling and retries
+
+**Implementation Options**:
+
+1. Use `machulav/ec2-github-runner@v2.3.6` with wrapper logic
+2. Fork and customize `instructlab/ci-actions` (if available)
+3. Build custom JavaScript action
+
+**Recommended**: Option 1 (machulav with wrapper)
+
+**Dependencies**: Tasks #10, #11
+**Estimated Time**: 4-6 hours
+
+---
+
+#### 4. Create setup-vllm-gpu action (Task #6) 🚀
+
+**Priority**: Critical - needed for test execution
+**Depends On**: Task #10 (AMI with CUDA)
+
+**File**: `.github/actions/setup-vllm-gpu/action.yml`
+
+**Key Features**:
+
+- Install vLLM with GPU support
+- Pull gpt-oss:20b model (or specified model)
+- Start vLLM server with optimal settings:
+  - AWQ quantization for 24GB GPUs
+  - GPU memory utilization: 0.85
+  - Tool calling support (hermes parser)
+- Health check with timeout
+- Support both Ollama and HuggingFace models
+
+**vLLM Server Command**:
+
+```bash
+vllm serve gpt-oss:20b \
+  --host 0.0.0.0 \
+  --port 8000 \
+  --tensor-parallel-size 1 \
+  --gpu-memory-utilization 0.85 \
+  --max-model-len 8192 \
+  --enable-auto-tool-choice \
+  --tool-call-parser hermes \
+  --dtype auto \
+  --quantization awq
+```
+
+**Dependencies**: Task #10
+**Estimated Time**: 3-4 hours
+
+---
+
+#### 5. Create record-vllm-gpu-tests workflow (Task #1) 📋
+
+**Priority**: Critical - ties everything together
+**Depends On**: Tasks #6, #12
+
+**File**: `.github/workflows/record-vllm-gpu-tests.yml`
+
+**Structure**:
+
+```yaml
+name: vLLM GPU Recording
+
+on:
+  workflow_dispatch:
+    inputs:
+      model: [gpt-oss:20b, gpt-oss:latest, Qwen/Qwen3-0.6B]
+      instance_type: [g6.2xlarge, g5.2xlarge, g6.8xlarge, g6e.12xlarge]
+      suite: [base, responses, vllm-reasoning]
+      pr_number: (optional)
+
+jobs:
+  start-gpu-runner:
+    # Launch EC2 with launch-gpu-runner action
+
+  record-vllm-tests:
+    runs-on: ${{ needs.start-gpu-runner.outputs.label }}
+    permissions: {}  # CRITICAL: No permissions
+    # Setup vLLM GPU, run tests, upload artifacts
+
+  stop-gpu-runner:
+    if: always()  # CRITICAL: Always cleanup
+    # Terminate EC2 instance
+```
+
+**Security Highlights**:
+
+- Test job has `permissions: {}` (prevents secret theft)
+- OIDC authentication (no long-lived AWS credentials)
+- `if: always()` on cleanup job (prevents orphaned instances)
+
+**Dependencies**: Tasks #6, #12
+**Estimated Time**: 4-6 hours
+
+---
+
+#### 6. Update test configuration files (Tasks #7, #9) 📝
+
+**Priority**: High - needed for test execution
+
+#### 6a. Update tests/integration/suites.py (Task #7)
+
+Add new setup to SETUP_DEFINITIONS dict:
+
+```python
+SETUP_DEFINITIONS = {
+    # ... existing setups ...
+    "vllm-gpu-gpt-oss": Setup(
+        name="vllm-gpu",
+        description="vLLM GPU provider with gpt-oss:20b model",
+        env={
+            "VLLM_URL": "http://0.0.0.0:8000/v1",
+        },
+        defaults={
+            "text_model": "vllm/gpt-oss:20b",
+        },
+    ),
+}
+```
+
+#### 6b. Update tests/integration/ci_matrix.json (Task #9)
+
+Add GPU matrix:
+
+```json
+"gpu-vllm": [
+  {"suite": "base", "setup": "vllm-gpu-gpt-oss", "model": "gpt-oss:20b"},
+  {"suite": "responses", "setup": "vllm-gpu-gpt-oss", "model": "gpt-oss:20b"},
+  {"suite": "vllm-reasoning", "setup": "vllm-gpu-gpt-oss", "model": "gpt-oss:20b"}
+]
+```
+
+**Dependencies**: None (can be done in parallel)
+**Estimated Time**: 1-2 hours total
+
+---
+
+#### 7. End-to-end testing (Task #8) ✅
+
+**Priority**: Critical - validates entire system
+**Depends On**: Tasks #1, #6, #7, #9, #10, #11, #12
+
+**Test Plan**:
+
+1. **Happy Path Test**:
+   - [ ] Trigger workflow via workflow_dispatch
+   - [ ] Verify EC2 launches in us-east-2a
+   - [ ] Verify runner registration
+   - [ ] Verify vLLM starts with gpt-oss:20b
+   - [ ] Verify tests execute successfully
+   - [ ] Verify recordings uploaded
+   - [ ] Verify EC2 cleanup
+   - [ ] Check execution time (< 30 min)
+   - [ ] Check AWS cost (~$0.43)
+
+2. **Failure Scenarios**:
+   - [ ] Capacity issue → verify fallback to us-east-2b
+   - [ ] Region capacity issue → verify fallback to us-east-1
+   - [ ] Test failure → verify cleanup still happens
+   - [ ] Manual cancellation → verify cleanup
+
+3. **Performance Validation**:
+   - [ ] Runner startup: < 5 min
+   - [ ] vLLM startup: < 5 min
+   - [ ] Test execution: ~20 min
+   - [ ] Total: < 30 min
+
+**Success Criteria**:
+
+- 3 consecutive successful runs
+- 95%+ success rate over 10 runs
+- Average cost < $0.50 per run
+- Zero orphaned instances
+
+**Dependencies**: All Phase 1 tasks
+**Estimated Time**: 4-8 hours (includes iteration)
+
+---
+
+#### 8. Documentation (Task #2) 📚
+
+**Priority**: Medium - needed for team adoption
+**Depends On**: Task #8 (successful testing)
+
+**Actions**:
+
+- [ ] Create `docs/gpu-runners.md` with usage guide
+- [ ] Update README.md with GPU runner section
+- [ ] Document troubleshooting steps
+- [ ] Add inline comments to workflow files
+- [ ] Document cost estimates and monitoring
+
+**Content**:
+
+- How to trigger manual re-recording
+- Expected costs and runtime
+- AWS prerequisites
+- Troubleshooting guide
+- Architecture diagram (reference GPU_RUNNERS_DESIGN.md)
+
+**Dependencies**: Task #8
+**Estimated Time**: 2-3 hours
+
+---
+
+## Phase 1 Summary
+
+**Total Estimated Time**: 2-3 weeks (part-time)
+**Key Deliverables**:
+
+- ✅ Working GPU runner infrastructure
+- ✅ Manual workflow_dispatch for re-recording
+- ✅ End-to-end tested with gpt-oss:20b
+- ✅ Documentation for team
+
+**Phase 1 Completion Criteria**:
+
+- [ ] All Phase 1 tasks completed
+- [ ] 10+ successful GPU recording runs
+- [ ] Zero orphaned EC2 instances
+- [ ] Documentation reviewed and approved
+
+---
+
+## Phase 2: Optimization (Week 3-4)
+
+**Goal**: Reduce costs and improve performance.
+
+### Phase 2 Tasks
+
+#### 9. Set up cost monitoring (Task #4) 💰
+
+**Priority**: High - prevents cost overruns
+
+**Actions**:
+
+- [ ] Create AWS Budget ($50/month alert)
+- [ ] CloudWatch alarm for long-running instances (> 2 hours)
+- [ ] Lambda for auto-cleanup of orphaned instances
+- [ ] Enable cost allocation tags
+- [ ] Create CloudWatch dashboard
+
+**Metrics to Track**:
+
+- Total monthly GPU costs
+- Average run duration
+- Success rate
+- Spot vs on-demand usage
+
+**Dependencies**: Task #8 (completed Phase 1)
+**Estimated Time**: 3-4 hours
+
+---
+
+#### 10. Implement spot instances (Task #3) 💵
+
+**Priority**: High - 70-80% cost savings
+
+**Actions**:
+
+- [ ] Update launch-gpu-runner action for spot support
+- [ ] Add spot/on-demand fallback logic
+- [ ] Update workflow to use spot by default
+- [ ] Test spot reliability (10+ runs)
+- [ ] Update cost documentation
+
+**Expected Savings**: $0.43 → $0.09-$0.17 per run
+
+**Dependencies**: Task #8 (completed Phase 1)
+**Estimated Time**: 4-6 hours
+
+---
+
+#### 11. Add model caching (Task #5) ⚡
+
+**Priority**: Medium - performance optimization
+
+**Actions**:
+
+- [ ] Pre-cache gpt-oss:20b in AMI
+- [ ] Test with cached model
+- [ ] Measure time savings
+- [ ] Update documentation
+
+**Expected Improvement**: 30 min → 20 min total runtime
+
+**Dependencies**: Task #10 (AMI creation)
+**Estimated Time**: 2-3 hours
+
+---
+
+## Phase 2 Summary
+
+**Total Estimated Time**: 1-2 weeks (part-time)
+**Key Deliverables**:
+
+- ✅ Cost monitoring and alerts
+- ✅ Spot instance support (70-80% cost reduction)
+- ✅ Model caching (33% faster runs)
+
+**Expected Outcomes**:
+
+- Monthly cost: $1.72 → $0.34-$0.69 (with spot instances)
+- Run time: 30 min → 20 min (with caching)
+
+---
+
+## Phase 3: Automation (Future)
+
+**Goal**: Integrate GPU runners into existing CI/CD.
+
+### Potential Tasks
+
+1. **Add scheduled GPU runs**:
+   - Weekly full test suite on gpt-oss:20b
+   - Update ci_matrix.json schedules
+
+2. **Integrate with record-integration-tests.yml**:
+   - Add vllm-gpu to provider matrix
+   - Support manual trigger for GPU re-recording
+
+3. **PR comment integration**:
+   - Notify when GPU recordings complete
+   - Link to artifacts
+
+4. **Auto-scaling**:
+   - Queue-based runner provisioning
+   - Scale based on pending workflow runs
+
+---
+
+## Phase 4: Advanced Features (Future)
+
+**Goal**: Support multiple models and advanced use cases.
+
+### Phase 4 Tasks
+
+1. **Multi-model support**:
+   - Add more models to GPU matrix
+   - Model-specific optimizations
+
+2. **Distributed inference**:
+   - Multi-GPU support (g6e.12xlarge with 4x L40S)
+   - Tensor parallelism for 70B+ models
+
+3. **Custom metrics dashboard**:
+   - Real-time cost tracking
+   - Performance trends
+   - Success rate by model
+
+---
+
+## Risk Mitigation
+
+### Risk 1: EC2 Capacity Issues
+
+**Mitigation**: Multi-region, multi-AZ fallback (9 AZs total)
+**Probability**: Low (<5% with fallback)
+
+### Risk 2: Cost Overruns
+
+**Mitigation**: AWS Budgets, CloudWatch alarms, auto-cleanup Lambda
+**Probability**: Very Low (<1% with monitoring)
+
+### Risk 3: Orphaned Instances
+
+**Mitigation**: `if: always()` cleanup, CloudWatch alarms, auto-cleanup
+**Probability**: Very Low (<1%)
+
+### Risk 4: Security Issues
+
+**Mitigation**: OIDC auth, `permissions: {}`, read-only secrets
+**Probability**: Very Low (<1%)
+
+---
+
+## Success Metrics
+
+### Functional
+
+- [ ] Successfully record gpt-oss:20b tests within 30 minutes
+- [ ] 95%+ success rate for runner provisioning
+- [ ] Zero leaked AWS credentials
+
+### Performance
+
+- [ ] Runner startup time < 5 minutes
+- [ ] vLLM startup time < 5 minutes
+- [ ] Total execution time < 30 minutes (20 min with caching)
+
+### Cost
+
+- [ ] Monthly AWS costs < $20 for on-demand
+- [ ] Monthly AWS costs < $5 with spot instances
+- [ ] Spot instance utilization > 70%
+
+### Reliability
+
+- [ ] Multi-region fallback prevents <2% of failures
+- [ ] 100% runner cleanup rate
+- [ ] Zero orphaned instances over 30 days
+
+---
+
+## Next Steps
+
+### Immediate (This Week)
+
+1. **Task #11**: Set up AWS infrastructure (Charles + DevOps)
+2. **Task #10**: Create GPU AMI with CUDA (Charles)
+3. **Task #12**: Create launch-gpu-runner action (Charles)
+
+### Week 2
+
+1. **Task #6**: Create setup-vllm-gpu action (Charles)
+2. **Task #1**: Create record-vllm-gpu-tests workflow (Charles)
+3. **Tasks #7, #9**: Update test configuration (Charles)
+
+### Week 3
+
+1. **Task #8**: End-to-end testing (Charles + team)
+2. **Task #2**: Documentation (Charles)
+
+### Week 4
+
+1. **Task #4**: Set up cost monitoring (DevOps)
+2. **Task #3**: Implement spot instances (Charles)
+3. **Task #5**: Add model caching (Charles)
+
+---
+
+## Resources
+
+- **Design Document**: `GPU_RUNNERS_DESIGN.md`
+- **AWS Documentation**: [EC2 Instance Types](https://aws.amazon.com/ec2/instance-types/g6/)
+- **vLLM Documentation**: [docs.vllm.ai](https://docs.vllm.ai/)
+- **GitHub Actions Security**: [Security hardening](https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions)
+- **Reference Implementations**:
+  - [instructlab/instructlab](https://github.com/instructlab/instructlab/tree/main/.github/workflows)
+  - [opendatahub-io/data-processing](https://github.com/opendatahub-io/data-processing/blob/main/.github/workflows/execute-all-notebooks.yml)
+
+---
+
+## Questions or Issues?
+
+Contact: Charles Doern (@cdoern)
diff --git a/IMPLEMENTATION_STATUS.md b/IMPLEMENTATION_STATUS.md
new file mode 100644
index 0000000000..73689c6f35
--- /dev/null
+++ b/IMPLEMENTATION_STATUS.md
@@ -0,0 +1,415 @@
+# GPU Runners Implementation Status
+
+**Last Updated**: 2026-03-25
+**Status**: Phase 1 - Code Complete, AWS Setup Required
+
+## ✅ Completed Tasks
+
+### Code Implementation (Ready to Use)
+
+#### 1. GitHub Actions Workflow ✅
+
+**File**: `.github/workflows/record-vllm-gpu-tests.yml`
+
+- ✅ Manual workflow_dispatch trigger
+- ✅ OIDC authentication for AWS (no long-lived credentials!)
+- ✅ Multi-region/AZ fallback strategy
+- ✅ Three-job pattern (launch → test → cleanup)
+- ✅ Security hardening (`permissions: {}` on test job)
+- ✅ Always-cleanup guarantee (`if: always()`)
+- ✅ Comprehensive error handling and logging
+
+**Features**:
+
+- Select model: gpt-oss:20b, gpt-oss:latest, Qwen/Qwen3-0.6B
+- Select instance type: g6.2xlarge, g5.2xlarge, g6.8xlarge, g6e.12xlarge
+- Select test suite: base, responses, vllm-reasoning
+- Optional PR number for tracking
+
+#### 2. Setup vLLM GPU Action ✅
+
+**File**: `.github/actions/setup-vllm-gpu/action.yml`
+
+- ✅ GPU verification (nvidia-smi)
+- ✅ CUDA environment configuration
+- ✅ Python virtual environment setup
+- ✅ PyTorch with CUDA installation
+- ✅ vLLM installation with GPU support
+- ✅ Model pulling (Ollama and HuggingFace formats)
+- ✅ vLLM server startup with optimal settings
+- ✅ Health check with 10-minute timeout
+- ✅ AWQ quantization for 24GB GPUs
+
+#### 3. Launch GPU Runner Action ✅
+
+**File**: `.github/actions/launch-gpu-runner/action.yml`
+
+- ✅ Wrapper for machulav/ec2-github-runner
+- ✅ Support for both start and stop modes
+- ✅ Configurable instance type, region, subnet, AMI, security group
+- ✅ Resource tagging support
+- ✅ IAM role support for enhanced security
+
+**Note**: Multi-region fallback logic is handled in the workflow, not the action.
+
+#### 4. Test Configuration ✅
+
+**Files**: `tests/integration/suites.py`, `tests/integration/ci_matrix.json`
+
+- ✅ Added `vllm-gpu-gpt-oss` setup in suites.py
+- ✅ Added `gpu-vllm` matrix in ci_matrix.json
+- ✅ Configured for base, responses, and vllm-reasoning test suites
+
+#### 5. Documentation ✅
+
+**Files**: `docs/gpu-runners.md`, `AWS_SETUP_GUIDE.md`, `IMPLEMENTATION_PLAN.md`, `GPU_RUNNERS_DESIGN.md`
+
+- ✅ User guide for triggering GPU workflows
+- ✅ Step-by-step AWS setup guide with OIDC
+- ✅ Detailed implementation plan
+- ✅ Architecture design document
+- ✅ Troubleshooting guides
+- ✅ Cost estimates and monitoring guidance
+
+## 🔧 AWS Setup Required (Manual Steps)
+
+**Priority**: CRITICAL - Required before testing
+
+### Infrastructure Tasks
+
+#### 1. Set up OIDC Provider ⏳
+
+**Owner**: DevOps/Charles
+**Time**: 30 minutes
+
+- [ ] Create OIDC provider in AWS IAM
+- [ ] Create IAM role `GitHubActionsLlamaStackGPU`
+- [ ] Configure trust policy for GitHub
+- [ ] Attach EC2 permissions policy
+- [ ] Save role ARN for GitHub secrets
+
+**Guide**: `AWS_SETUP_GUIDE.md` Step 1
+
+#### 2. Set up VPC and Networking ⏳
+
+**Owner**: DevOps/Charles
+**Time**: 1-2 hours
+
+**us-east-2 (Primary)**:
+
+- [ ] Create or identify VPC
+- [ ] Create 3 subnets (us-east-2a, 2b, 2c)
+- [ ] Configure internet gateway and routing
+- [ ] Create security group
+
+**us-east-1 (Fallback)**:
+
+- [ ] Create or identify VPC
+- [ ] Create 3 subnets (us-east-1a, 1b, 1c)
+- [ ] Configure internet gateway and routing
+- [ ] Create security group
+
+**Guide**: `AWS_SETUP_GUIDE.md` Step 2
+
+#### 3. Create GPU-Enabled AMI ⏳
+
+**Owner**: DevOps/Charles
+**Time**: 3-4 hours (includes building time)
+
+**us-east-2**:
+
+- [ ] Launch g6.2xlarge instance
+- [ ] Install NVIDIA drivers
+- [ ] Install CUDA 12.4
+- [ ] Install Docker with NVIDIA Container Toolkit
+- [ ] Install Python 3.12
+- [ ] Create AMI
+
+**us-east-1**:
+
+- [ ] Copy AMI from us-east-2
+- [ ] Verify AMI works
+
+**Guide**: `AWS_SETUP_GUIDE.md` Step 3
+
+#### 4. Create GitHub Personal Access Token ⏳
+
+**Owner**: Charles
+**Time**: 5 minutes
+
+- [ ] Create PAT with `repo` scope
+- [ ] Save token securely
+
+**Guide**: `AWS_SETUP_GUIDE.md` Step 4
+
+#### 5. Configure GitHub Secrets and Variables ⏳
+
+**Owner**: Charles
+**Time**: 10 minutes
+
+**Secrets** (Settings > Secrets and variables > Actions > Secrets):
+
+- [ ] `AWS_ROLE_ARN`: IAM role ARN from step 1
+- [ ] `RELEASE_PAT`: GitHub PAT from step 4
+
+**Variables** (Settings > Secrets and variables > Actions > Variables):
+
+- [ ] `SUBNET_US_EAST_2A`, `SUBNET_US_EAST_2B`, `SUBNET_US_EAST_2C`
+- [ ] `SUBNET_US_EAST_1A`, `SUBNET_US_EAST_1B`, `SUBNET_US_EAST_1C`
+- [ ] `AWS_EC2_AMI_US_EAST_2`
+- [ ] `AWS_EC2_AMI_US_EAST_1`
+- [ ] `SECURITY_GROUP_ID_US_EAST_2`
+- [ ] `SECURITY_GROUP_ID_US_EAST_1`
+
+**Guide**: `AWS_SETUP_GUIDE.md` Step 5
+
+## 🧪 Testing Required
+
+**Priority**: HIGH - Required for validation
+
+### Test Plan
+
+#### 1. Initial Test Run ⏳
+
+**Owner**: Charles
+**Time**: 30 minutes
+
+- [ ] Trigger workflow via workflow_dispatch
+- [ ] Verify EC2 launches successfully
+- [ ] Verify runner registers
+- [ ] Verify GPU is detected
+- [ ] Verify vLLM starts
+- [ ] Verify tests run
+- [ ] Verify recordings uploaded
+- [ ] Verify EC2 cleanup
+
+**Guide**: `AWS_SETUP_GUIDE.md` Step 6
+
+#### 2. Failure Scenario Testing ⏳
+
+**Owner**: Charles
+**Time**: 1-2 hours
+
+- [ ] Test manual cancellation (verify cleanup)
+- [ ] Test job failure (verify cleanup)
+- [ ] Verify multi-region fallback (if capacity issue)
+
+#### 3. Performance Validation ⏳
+
+**Owner**: Charles
+**Time**: Ongoing (10+ runs)
+
+- [ ] Measure average execution time (target: < 30 min)
+- [ ] Measure success rate (target: > 95%)
+- [ ] Verify no orphaned instances
+- [ ] Track AWS costs (target: ~$0.43 per run)
+
+## 📋 Phase 2: Optimization (Future)
+
+**Priority**: MEDIUM - Cost and performance improvements
+
+### Tasks
+
+#### 1. Set up AWS Cost Monitoring ⏳
+
+**Time**: 3-4 hours
+
+- [ ] Create AWS Budget with $50/month alert
+- [ ] Create CloudWatch alarm for long-running instances
+- [ ] Create Lambda for auto-cleanup
+- [ ] Enable cost allocation tags
+- [ ] Create CloudWatch dashboard
+
+**ROI**: Prevents cost overruns, better visibility
+
+#### 2. Implement Spot Instances ⏳
+
+**Time**: 4-6 hours
+
+- [ ] Update launch-gpu-runner action for spot support
+- [ ] Add spot/on-demand fallback logic
+- [ ] Test spot reliability (10+ runs)
+- [ ] Update documentation
+
+**ROI**: 70-80% cost reduction ($0.43 → $0.09-$0.17 per run)
+
+#### 3. Add Model Caching ⏳
+
+**Time**: 2-3 hours
+
+- [ ] Pre-cache gpt-oss:20b in AMI
+- [ ] Test with cached model
+- [ ] Measure time savings
+- [ ] Update documentation
+
+**ROI**: 33% faster runs (30 min → 20 min)
+
+## 📊 Success Metrics
+
+### Functional
+
+- [ ] Successfully record gpt-oss:20b tests within 30 minutes
+- [ ] 95%+ success rate for runner provisioning
+- [ ] Zero leaked AWS credentials
+- [ ] Zero orphaned EC2 instances
+
+### Performance
+
+- [ ] Runner startup time < 5 minutes
+- [ ] vLLM startup time < 5 minutes
+- [ ] Total execution time < 30 minutes
+
+### Cost
+
+- [ ] Monthly AWS costs < $20 for on-demand usage
+- [ ] Average cost per run: $0.40-$0.50
+
+## 🚀 Quick Start Guide
+
+### For First-Time Setup
+
+1. **AWS Setup** (DevOps + Charles, ~8 hours total):
+
+   ```bash
+   # Follow AWS_SETUP_GUIDE.md steps 1-5
+   # Estimated time breakdown:
+   # - OIDC setup: 30 min
+   # - VPC/networking: 1-2 hours
+   # - AMI creation: 3-4 hours
+   # - GitHub config: 15 min
+   # - Testing: 30 min
+   ```
+
+2. **Test Workflow**:
+   - Go to Actions > vLLM GPU Recording
+   - Click Run workflow
+   - Use defaults
+   - Verify success
+
+3. **Monitor**:
+   - Check AWS EC2 console
+   - Verify instance cleanup
+   - Check costs in AWS Billing
+
+### For Regular Use
+
+1. Go to Actions > vLLM GPU Recording
+2. Click Run workflow
+3. Select model and test suite
+4. Download recordings artifact when done
+5. Commit recordings to PR
+
+## 📁 File Structure
+
+```text
+llama-stack/
+├── .github/
+│   ├── actions/
+│   │   ├── launch-gpu-runner/
+│   │   │   └── action.yml ✅
+│   │   └── setup-vllm-gpu/
+│   │       └── action.yml ✅
+│   └── workflows/
+│       └── record-vllm-gpu-tests.yml ✅
+├── docs/
+│   └── gpu-runners.md ✅
+├── tests/integration/
+│   ├── ci_matrix.json ✅ (updated)
+│   └── suites.py ✅ (updated)
+├── AWS_SETUP_GUIDE.md ✅
+├── GPU_RUNNERS_DESIGN.md ✅
+├── IMPLEMENTATION_PLAN.md ✅
+└── IMPLEMENTATION_STATUS.md ✅ (this file)
+```
+
+## 🔒 Security Highlights
+
+### OIDC Authentication
+
+- ✅ No long-lived AWS credentials in GitHub
+- ✅ Temporary tokens from AWS STS
+- ✅ Automatic rotation
+- ✅ Better audit trail
+
+### Test Job Isolation
+
+- ✅ `permissions: {}` on test job
+- ✅ Cannot access secrets
+- ✅ Cannot write to repository
+- ✅ Prevents credential theft
+
+### Cleanup Guarantees
+
+- ✅ `if: always()` on cleanup job
+- ✅ Runs even on failure/cancellation
+- ✅ Prevents orphaned instances
+- ✅ Cost protection
+
+### Resource Tagging
+
+- ✅ All instances tagged with:
+  - Project, Purpose, Model
+  - GitHub repository, run ID
+  - ManagedBy: GitHub-Actions
+
+## 💰 Cost Estimates
+
+### Current State (On-Demand)
+
+| Scenario | Frequency | Cost/Month |
+|----------|-----------|------------|
+| Weekly re-recording | 4x/month | **$1.72** |
+| Daily testing | 30x/month | **$12.90** |
+| On-demand (PRs) | 10x/month | **$4.30** |
+
+### Phase 2 (With Spot Instances)
+
+| Scenario | Frequency | Cost/Month |
+|----------|-----------|------------|
+| Weekly re-recording | 4x/month | **$0.36-$0.68** |
+| Daily testing | 30x/month | **$2.70-$5.10** |
+| On-demand (PRs) | 10x/month | **$0.90-$1.70** |
+
+**Savings**: 60-90% with spot instances
+
+## 📞 Support
+
+### For AWS Setup Issues
+
+- Consult `AWS_SETUP_GUIDE.md`
+- Check AWS CloudTrail for API errors
+- Verify IAM permissions
+
+### For Workflow Issues
+
+- Consult `docs/gpu-runners.md`
+- Check GitHub Actions logs
+- Verify all secrets/variables set correctly
+
+### For General Questions
+
+- Review `GPU_RUNNERS_DESIGN.md` for architecture
+- Review `IMPLEMENTATION_PLAN.md` for roadmap
+- Contact: Charles Doern (@cdoern)
+
+## 🎯 Next Actions
+
+**Immediate** (This Week):
+
+1. [ ] Complete AWS infrastructure setup (Tasks 11, 10 from plan)
+2. [ ] Configure GitHub secrets and variables
+3. [ ] Run first test workflow (Task 8)
+
+**Short-term** (Next 2 Weeks):
+4. [ ] Iterate on any issues from testing
+5. [ ] Run 10+ workflows to validate reliability
+6. [ ] Measure and document actual costs
+
+**Medium-term** (Next Month):
+7. [ ] Implement cost monitoring (Task 4)
+8. [ ] Implement spot instances (Task 3)
+9. [ ] Add model caching (Task 5)
+
+---
+
+**Status**: Ready for AWS setup and testing! All code is complete and documented. 🚀
diff --git a/docs/gpu-runners.md b/docs/gpu-runners.md
new file mode 100644
index 0000000000..370749d362
--- /dev/null
+++ b/docs/gpu-runners.md
@@ -0,0 +1,418 @@
+# GPU Runners for vLLM Recording
+
+This guide explains how to use GPU-enabled self-hosted runners to re-record vLLM integration tests with larger models like `gpt-oss:20b`.
+
+## Overview
+
+GPU runners allow us to:
+
+- Test larger models (20B parameters) that don't fit on CPU runners
+- Faster inference with GPU acceleration
+- More realistic production-like test environment
+- On-demand re-recording via workflow_dispatch
+
+**Cost**: ~$0.43 per run (30 min on g6.2xlarge), ~$1.72/month for weekly runs
+
+## Quick Start
+
+### Trigger a GPU Recording Run
+
+1. Go to **Actions** tab in GitHub
+2. Select **vLLM GPU Recording** workflow
+3. Click **Run workflow**
+4. Configure:
+   - **Model**: `gpt-oss:20b` (default)
+   - **Instance Type**: `g6.2xlarge` (default)
+   - **Suite**: `base` (default)
+5. Click **Run workflow**
+
+The workflow will:
+
+1. Launch a GPU EC2 instance (5 min)
+2. Setup vLLM with the model (5 min)
+3. Run tests in record mode (~20 min)
+4. Upload recordings as artifacts
+5. Terminate the EC2 instance
+
+**Total time**: ~30 minutes
+
+### Download Recordings
+
+1. Wait for the workflow to complete
+2. Go to the workflow run summary
+3. Download the `vllm-gpu-recordings-*` artifact
+4. Extract and commit the recordings to your PR
+
+## Architecture
+
+```text
+┌─────────────────────────────────────────────────┐
+│  Workflow Trigger (manual)                      │
+│  - Select model and instance type               │
+└────────────────┬────────────────────────────────┘
+                 │
+                 ▼
+┌─────────────────────────────────────────────────┐
+│  Job 1: Start GPU EC2 Runner                    │
+│  - AWS OIDC authentication (no long-lived keys!)│
+│  - Multi-region/AZ fallback                     │
+│  - Launch g6.2xlarge with GPU AMI               │
+│  - Register as GitHub Actions runner            │
+└────────────────┬────────────────────────────────┘
+                 │
+                 ▼
+┌─────────────────────────────────────────────────┐
+│  Job 2: Run vLLM Recording Tests                │
+│  - Runs on GPU runner (permissions: {})         │
+│  - Install vLLM with CUDA support               │
+│  - Start vLLM server with AWQ quantization      │
+│  - Run integration tests in record mode         │
+│  - Upload recordings as artifacts               │
+└────────────────┬────────────────────────────────┘
+                 │
+                 ▼
+┌─────────────────────────────────────────────────┐
+│  Job 3: Stop GPU EC2 Runner                     │
+│  - Terminate instance (always runs!)            │
+└─────────────────────────────────────────────────┘
+```
+
+## AWS Prerequisites
+
+### Required AWS Resources
+
+You must set up the following in AWS before using GPU runners:
+
+#### 1. IAM Role for OIDC Authentication
+
+Create an IAM role that GitHub Actions can assume via OIDC:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Federated": "arn:aws:iam::YOUR_ACCOUNT_ID:oidc-provider/token.actions.githubusercontent.com"
+      },
+      "Action": "sts:AssumeRoleWithWebIdentity",
+      "Condition": {
+        "StringEquals": {
+          "token.actions.githubusercontent.com:aud": "sts.amazonaws.com"
+        },
+        "StringLike": {
+          "token.actions.githubusercontent.com:sub": "repo:YOUR_ORG/llama-stack:*"
+        }
+      }
+    }
+  ]
+}
+```
+
+Attach this policy to the role:
+
+```json
+{
+  "Version": "2012-10-17",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "ec2:RunInstances",
+        "ec2:TerminateInstances",
+        "ec2:DescribeInstances",
+        "ec2:DescribeInstanceStatus",
+        "ec2:CreateTags",
+        "ec2:DescribeImages",
+        "ec2:DescribeSubnets",
+        "ec2:DescribeSecurityGroups"
+      ],
+      "Resource": "*",
+      "Condition": {
+        "StringEquals": {
+          "aws:RequestedRegion": ["us-east-1", "us-east-2"]
+        }
+      }
+    }
+  ]
+}
+```
+
+#### 2. VPC and Subnets
+
+You need subnets in two regions for fallback:
+
+**us-east-2 (Primary)**:
+
+- us-east-2a: subnet-xxxxx
+- us-east-2b: subnet-xxxxx
+- us-east-2c: subnet-xxxxx
+
+**us-east-1 (Fallback)**:
+
+- us-east-1a: subnet-xxxxx
+- us-east-1b: subnet-xxxxx
+- us-east-1c: subnet-xxxxx
+
+#### 3. Security Groups
+
+Create security groups in both regions with:
+
+**Inbound Rules**:
+
+- None (runners connect outbound only)
+
+**Outbound Rules**:
+
+- Port 443 (HTTPS): `0.0.0.0/0` - GitHub API, HuggingFace, PyPI
+- Port 80 (HTTP): `0.0.0.0/0` - Package downloads
+
+#### 4. GPU-Enabled AMI
+
+Create AMIs in both regions with:
+
+- Base OS: Amazon Linux 2023 or Ubuntu 22.04
+- NVIDIA drivers
+- CUDA 12.4 runtime
+- Docker with NVIDIA Container Toolkit
+- Python 3.12
+
+See `GPU_RUNNERS_DESIGN.md` Appendix C for AMI build script.
+
+### GitHub Configuration
+
+#### Secrets
+
+Add these to **Settings > Secrets and variables > Actions > Secrets**:
+
+- `AWS_ROLE_ARN`: ARN of the IAM role for OIDC (e.g., `arn:aws:iam::123456789012:role/GitHubActionsRole`)
+- `RELEASE_PAT`: GitHub Personal Access Token with `repo` scope
+
+#### Variables
+
+Add these to **Settings > Secrets and variables > Actions > Variables**:
+
+**us-east-2**:
+
+- `SUBNET_US_EAST_2A`: subnet-xxxxx
+- `SUBNET_US_EAST_2B`: subnet-xxxxx
+- `SUBNET_US_EAST_2C`: subnet-xxxxx
+- `AWS_EC2_AMI_US_EAST_2`: ami-xxxxx
+- `SECURITY_GROUP_ID_US_EAST_2`: sg-xxxxx
+
+**us-east-1**:
+
+- `SUBNET_US_EAST_1A`: subnet-xxxxx
+- `SUBNET_US_EAST_1B`: subnet-xxxxx
+- `SUBNET_US_EAST_1C`: subnet-xxxxx
+- `AWS_EC2_AMI_US_EAST_1`: ami-xxxxx
+- `SECURITY_GROUP_ID_US_EAST_1`: sg-xxxxx
+
+## Security
+
+### OIDC Authentication
+
+We use **OpenID Connect (OIDC)** to authenticate with AWS instead of long-lived access keys:
+
+- ✅ No static AWS credentials stored in GitHub
+- ✅ Automatic token rotation
+- ✅ Fine-grained permissions per workflow
+- ✅ Better audit trail in AWS CloudTrail
+
+The workflow requests temporary credentials from AWS STS using OIDC tokens from GitHub.
+
+### Test Job Isolation
+
+The test job runs with **no permissions** (`permissions: {}`):
+
+- ✅ Cannot access GitHub secrets
+- ✅ Cannot write to repository
+- ✅ Prevents credential theft from untrusted code
+
+This is critical because the test job runs potentially untrusted code on PRs.
+
+### Cleanup Guarantees
+
+The cleanup job always runs (`if: always()`):
+
+- ✅ EC2 instance terminated even on failure
+- ✅ EC2 instance terminated even on manual cancellation
+- ✅ Prevents orphaned instances and cost overruns
+
+## Instance Types
+
+| Instance | GPU | Memory | vCPUs | Cost/hr | Best For |
+|----------|-----|--------|-------|---------|----------|
+| **g6.2xlarge** | 1x L4 (24GB) | 24 GB | 8 | $0.86 | **gpt-oss:20b (recommended)** |
+| g5.2xlarge | 1x A10G (24GB) | 24 GB | 8 | $1.21 | Alternative for gpt-oss:20b |
+| g6.8xlarge | 1x L4 (24GB) | 24 GB | 32 | $1.38 | More vCPUs if needed |
+| g6e.12xlarge | 4x L40S (192GB) | 192 GB | 48 | $5.44 | 70B+ models (future) |
+
+**Note**: gpt-oss:20b requires ~40GB in FP16, but we use AWQ quantization to fit in 24GB GPU memory.
+
+## Cost Estimates
+
+| Scenario | Frequency | Instance | Cost/Run | Monthly Cost |
+|----------|-----------|----------|----------|--------------|
+| Weekly re-recording | 1x/week | g6.2xlarge | $0.43 | **$1.72** |
+| Daily testing | 1x/day | g6.2xlarge | $0.43 | **$12.90** |
+| On-demand (PRs) | 10x/month | g6.2xlarge | $0.43 | **$4.30** |
+| With spot instances | 1x/week | g6.2xlarge (spot) | $0.09-$0.17 | **$0.36-$0.68** |
+
+**Recommendation**: Use on-demand workflow_dispatch only. Add scheduled runs later if needed.
+
+## Troubleshooting
+
+### Workflow fails to launch EC2 instance
+
+**Problem**: "InsufficientInstanceCapacity" error
+
+**Solution**: The workflow automatically tries fallback regions/AZs. If all fail:
+
+1. Check AWS Service Health Dashboard for capacity issues
+2. Try a different instance type (g5.2xlarge instead of g6.2xlarge)
+3. Try again during off-peak hours
+
+### vLLM server fails to start
+
+**Problem**: Server doesn't respond to health checks
+
+**Solutions**:
+
+1. Check vLLM logs in workflow output
+2. Verify GPU is detected: look for `nvidia-smi` output
+3. Check CUDA installation: `nvcc --version`
+4. Try different quantization: change `quantization: 'awq'` to `quantization: 'none'`
+
+### Tests fail but recordings not uploaded
+
+**Problem**: No artifacts in workflow run
+
+**Solutions**:
+
+1. Check if tests actually created recordings
+2. Verify `tests/integration/*/recordings/` directories exist
+3. Check workflow logs for artifact upload errors
+
+### EC2 instance not terminated
+
+**Problem**: Instance still running after workflow completes
+
+**Solutions**:
+
+1. Check stop-gpu-runner job logs for errors
+2. Manually terminate instance via AWS console
+3. Set up CloudWatch alarm for long-running instances (see Phase 2)
+
+### Cost overruns
+
+**Problem**: Unexpected AWS charges
+
+**Solutions**:
+
+1. Check for orphaned instances in AWS EC2 console (filter by tag: `Purpose: vllm-gpu-recording`)
+2. Set up AWS Budget alerts (see `IMPLEMENTATION_PLAN.md` Phase 2)
+3. Review CloudWatch metrics for runner usage
+
+## Performance Tuning
+
+### Reduce Model Load Time
+
+**Current**: ~5 minutes to download gpt-oss:20b
+
+**Options**:
+
+1. **Pre-cache in AMI**: Include model in GPU AMI (~0 min load time)
+2. **EBS snapshot**: Attach pre-loaded model volume (~1 min)
+3. **S3 cache**: Download from S3 instead of HuggingFace (~2 min)
+
+See `IMPLEMENTATION_PLAN.md` Task #5 for implementation.
+
+### Reduce Costs with Spot Instances
+
+**Current**: $0.43 per run (on-demand)
+**With spot**: $0.09-$0.17 per run (60-90% savings)
+
+Spot instances can be interrupted, but for test workloads this is acceptable.
+
+See `IMPLEMENTATION_PLAN.md` Task #3 for implementation.
+
+## Adding New Models
+
+To add a new model for GPU testing:
+
+1. **Update workflow input** (`.github/workflows/record-vllm-gpu-tests.yml`):
+
+   ```yaml
+   model:
+     options:
+       - gpt-oss:20b
+       - gpt-oss:latest
+       - your-new-model
+   ```
+
+2. **Add to test matrix** (`tests/integration/ci_matrix.json`):
+
+   ```json
+   "gpu-vllm": [
+     {"suite": "base", "setup": "vllm-gpu-gpt-oss"},
+     {"suite": "base", "setup": "vllm-gpu-your-model"}
+   ]
+   ```
+
+3. **Create setup** (`tests/integration/suites.py`):
+
+   ```python
+   "vllm-gpu-your-model": Setup(
+       name="vllm-gpu",
+       defaults={"text_model": "vllm/your-model"},
+   )
+   ```
+
+4. **Choose instance type**:
+   - < 20B params: `g6.2xlarge` (24GB)
+   - 20-70B params: `g6.8xlarge` or `g6e.12xlarge` (192GB)
+   - 70B+ params: `g6e.12xlarge` (192GB) or `g6e.48xlarge` (384GB)
+
+## Monitoring
+
+### CloudWatch Dashboards
+
+Create a dashboard to track:
+
+- Total GPU runner costs (daily/weekly/monthly)
+- Instance launch success rate
+- Average test duration
+- Failures by reason
+
+See `IMPLEMENTATION_PLAN.md` Task #4 for setup.
+
+### Cost Allocation Tags
+
+All EC2 instances are tagged with:
+
+- `Project`: llama-stack
+- `Purpose`: vllm-gpu-recording
+- `Model`: gpt-oss:20b
+- `GitHubRepository`: your-org/llama-stack
+- `GitHubRunId`: 12345
+
+Enable cost allocation in **AWS Billing > Cost Allocation Tags** to track costs by tag.
+
+## References
+
+- **Design Document**: `GPU_RUNNERS_DESIGN.md`
+- **Implementation Plan**: `IMPLEMENTATION_PLAN.md`
+- **AWS EC2 Instance Types**: <https://aws.amazon.com/ec2/instance-types/g6/>
+- **vLLM Documentation**: <https://docs.vllm.ai/>
+- **GitHub OIDC**: <https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/configuring-openid-connect-in-amazon-web-services>
+
+## Support
+
+For issues or questions:
+
+- Create an issue in the repository
+- Check existing issues for similar problems
+- Review troubleshooting section above
+- Contact: Charles Doern (@cdoern)
diff --git a/tests/integration/ci_matrix.json b/tests/integration/ci_matrix.json
index f0a6ab53d6..d460a79f4c 100644
--- a/tests/integration/ci_matrix.json
+++ b/tests/integration/ci_matrix.json
@@ -13,6 +13,11 @@
     {"suite": "vllm-reasoning", "setup": "vllm"},
     {"suite": "ollama-reasoning", "setup": "ollama-reasoning"}
   ],
+  "gpu-vllm": [
+    {"suite": "base", "setup": "vllm-gpu-gpt-oss"},
+    {"suite": "responses", "setup": "vllm-gpu-gpt-oss"},
+    {"suite": "vllm-reasoning", "setup": "vllm-gpu-gpt-oss"}
+  ],
   "stainless": [
     {"suite": "base", "setup": "ollama", "inference_mode": "record-if-missing"}
   ],
diff --git a/tests/integration/suites.py b/tests/integration/suites.py
index b80ec65034..6362b5ad94 100644
--- a/tests/integration/suites.py
+++ b/tests/integration/suites.py
@@ -103,6 +103,16 @@ class Setup(BaseModel):
             "rerank_model": "vllm/Qwen/Qwen3-Reranker-0.6B",
         },
     ),
+    "vllm-gpu-gpt-oss": Setup(
+        name="vllm-gpu",
+        description="vLLM GPU provider with gpt-oss:20b reasoning model (runs on GPU with quantization)",
+        env={
+            "VLLM_URL": "http://0.0.0.0:8000/v1",
+        },
+        defaults={
+            "text_model": "vllm/gpt-oss:20b",
+        },
+    ),
     "ollama-reasoning": Setup(
         name="ollama",
         description="Local Ollama provider with a reasoning-capable model (gpt-oss)",