lloyal-ai · lloyal-research · Feb 12, 2026 · Feb 10, 2026 · Feb 10, 2026 · Feb 11, 2026
diff --git a/.github/actions/provision-cuda/action.yaml b/.github/actions/provision-cuda/action.yaml
@@ -28,9 +28,36 @@ runs:
         method: 'network'
         sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "visual_studio_integration"]'
 
-    # Linux x64: Install from NVIDIA repos
-    - name: Install CUDA (Linux x64)
-      if: runner.os == 'Linux' && inputs.arch == 'x64'
+    # Linux: Derive the major.minor version (e.g. 12.2.2 -> 12.2) that names the toolkit directory
+    - name: Compute CUDA version info
+      id: cuda-info
+      if: runner.os == 'Linux'
+      shell: bash
+      env:
+        VERSION: ${{ inputs.version }}
+      run: |
+        echo "major-minor=$(echo "$VERSION" | cut -d. -f1,2)" >> "$GITHUB_OUTPUT"
+
+    # Linux: Restore/save the toolkit directory (~2-3 GB) so a cache hit skips the 3-5 min install
+    - name: Cache CUDA toolkit
+      id: cuda-cache
+      if: runner.os == 'Linux'
+      uses: actions/cache@v4
+      with:
+        key: cuda-toolkit-${{ inputs.version }}-${{ inputs.arch }}
+        path: /usr/local/cuda-${{ steps.cuda-info.outputs.major-minor }}
+
+    # Linux: Install build dependencies on every run (they live outside the cached CUDA directory)
+    - name: Install build tools
+      if: runner.os == 'Linux'
+      shell: bash
+      run: |
+        sudo apt-get update -qq
+        sudo apt-get install -y -qq build-essential cmake
+
+    # Linux x64: Install CUDA toolkit (cache miss only)
+    - name: Install CUDA toolkit (Linux x64)
+      if: runner.os == 'Linux' && inputs.arch == 'x64' && steps.cuda-cache.outputs.cache-hit != 'true'
       shell: bash
       env:
         VERSION: ${{ inputs.version }}
@@ -42,18 +69,11 @@ runs:
         wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update -qq
-        sudo apt-get install -y -qq cuda-toolkit-${version_slug} build-essential cmake
+        sudo apt-get install -y -qq cuda-toolkit-${version_slug}
 
-        cuda_path="/usr/local/cuda-${version_major_minor}"
-        echo "CUDA_PATH=${cuda_path}" >> $GITHUB_ENV
-        echo "${cuda_path}/bin" >> $GITHUB_PATH
-        echo "LD_LIBRARY_PATH=${cuda_path}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
-
-        echo "CUDA installed at: ${cuda_path}"
-
-    # Linux ARM64: Install from NVIDIA repos
-    - name: Install CUDA (Linux ARM64)
-      if: runner.os == 'Linux' && inputs.arch == 'arm64'
+    # Linux ARM64: Install CUDA toolkit (cache miss only)
+    - name: Install CUDA toolkit (Linux ARM64)
+      if: runner.os == 'Linux' && inputs.arch == 'arm64' && steps.cuda-cache.outputs.cache-hit != 'true'
       shell: bash
       env:
         VERSION: ${{ inputs.version }}
@@ -65,14 +85,25 @@ runs:
         wget -q https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update -qq
-        sudo apt-get install -y -qq cuda-toolkit-${version_slug} build-essential cmake
+        sudo apt-get install -y -qq cuda-toolkit-${version_slug}
 
-        cuda_path="/usr/local/cuda-${version_major_minor}"
+    # Linux: Set CUDA environment variables (always - cached or fresh install)
+    - name: Set CUDA environment
+      if: runner.os == 'Linux'
+      shell: bash
+      env:
+        # Handed to the script as an environment variable instead of being
+        # spliced into the script text, like the VERSION input in other steps.
+        CUDA_MAJOR_MINOR: ${{ steps.cuda-info.outputs.major-minor }}
+      run: |
+        cuda_path="/usr/local/cuda-${CUDA_MAJOR_MINOR}"
         echo "CUDA_PATH=${cuda_path}" >> $GITHUB_ENV
         echo "${cuda_path}/bin" >> $GITHUB_PATH
-        echo "LD_LIBRARY_PATH=${cuda_path}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
+        # Append the previous LD_LIBRARY_PATH only when it is non-empty; a trailing
+        # ":" would add an empty entry, which the loader reads as the current directory.
+        echo "LD_LIBRARY_PATH=${cuda_path}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV
 
-        echo "CUDA installed at: ${cuda_path}"
+        echo "CUDA ready at: ${cuda_path}"
 
     # Set output
     - name: Set CUDA path output

diff --git a/.github/workflows/gpu-test.yml b/.github/workflows/gpu-test.yml
@@ -1,6 +1,16 @@
 name: GPU Tests (CUDA)
 
 on:
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'liblloyal'
+      - 'llama.cpp'
+      - 'lib/**'
+      - 'src/**'
+      - 'test/**'
+      - 'ci/**'
+      - 'CMakeLists.txt'
   workflow_dispatch:
     inputs:
       skip_build:
@@ -38,25 +48,29 @@ jobs:
         run: node scripts/sync-llama-cpp.js --check
         shell: bash
 
-      - name: Install build tools
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential cmake
-
       # CUDA 12.2.2 required for Cloud Run L4 GPU (driver 535.x)
+      # provision-cuda also installs build-essential + cmake
       - name: Provision CUDA toolkit
         uses: ./.github/actions/provision-cuda
         with:
           version: '12.2.2'
           arch: x64
 
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: cuda-build-${{ runner.os }}
+
       - name: Install npm dependencies
-        run: npm install --ignore-scripts
+        run: npm ci --ignore-scripts
 
       - name: Build native module
         run: npm run build
         env:
           LLOYAL_GPU: cuda
+          CMAKE_C_COMPILER_LAUNCHER: ccache
+          CMAKE_CXX_COMPILER_LAUNCHER: ccache
+          CMAKE_CUDA_COMPILER_LAUNCHER: ccache
 
       - name: Create platform package
         run: node scripts/create-platform-package.js linux-x64-cuda ubuntu-22.04 x64
@@ -67,6 +81,7 @@ jobs:
           name: package-linux-x64-cuda
           path: packages/linux-x64-cuda/
           retention-days: 1
+          compression-level: 0
 
   # GPU Integration Tests via Cloud Run
   # Runs real GPU tests on NVIDIA L4
@@ -129,15 +144,15 @@ jobs:
               --image="${IMAGE}" \
               --service-account="${{ secrets.GCP_SA_EMAIL }}" \
               --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \
-              --task-timeout=10m \
+              --task-timeout=20m \
               --no-gpu-zonal-redundancy
           else
             gcloud run jobs create $JOB_NAME \
               --region=us-east4 \
               --image="${IMAGE}" \
               --service-account="${{ secrets.GCP_SA_EMAIL }}" \
               --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \
-              --task-timeout=10m \
+              --task-timeout=20m \
               --gpu=1 \
               --gpu-type=nvidia-l4 \
               --memory=16Gi \
@@ -149,20 +164,82 @@ jobs:
       - name: Run GPU tests
         run: |
           JOB_NAME="lloyal-gpu-test-cuda"
+          REGION="us-east4"
 
-          # Execute job
-          EXECUTION=$(gcloud run jobs execute $JOB_NAME \
-            --region=us-east4 \
-            --wait \
+          # Launch job asynchronously so we can stream logs
+          EXEC=$(gcloud run jobs execute "$JOB_NAME" \
+            --region="$REGION" \
+            --async \
             --format='value(metadata.name)')
 
-          echo "Execution: $EXECUTION"
-
-          # Wait for logs to flush to Cloud Logging
-          sleep 5
-
-          # Get logs
-          gcloud logging read \
-            "resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$JOB_NAME\" AND resource.labels.location=\"us-east4\"" \
-            --limit=200 \
-            --format='value(textPayload)'
+          echo "Execution: $EXEC"
+          echo "Streaming logs (container startup may take ~30s)..."
+          echo ""
+
+          # Filter for this specific execution's logs
+          LOG_FILTER="resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$JOB_NAME\" AND labels.\"run.googleapis.com/execution_name\"=\"$EXEC\""
+
+          # Print the log lines of this execution that have not been printed yet.
+          # Each call re-reads the whole log in chronological order and skips the
+          # first $SEEN lines, so SEEN is kept at script level between calls.
+          SEEN=0
+          print_new_logs() {
+            local logs total
+            logs=$(gcloud logging read "$LOG_FILTER" \
+              --limit=10000 \
+              --order=asc \
+              --format='value(textPayload)' 2>/dev/null || true)
+            [ -n "$logs" ] || return 0
+            total=$(echo "$logs" | wc -l | tr -d ' ')
+            if [ "$total" -gt "$SEEN" ]; then
+              echo "$logs" | tail -n +$((SEEN + 1))
+              SEEN=$total
+            fi
+          }
+
+          # Poll loop: stream new log lines + check for completion
+          DESCRIBE_FAILURES=0
+          MAX_DESCRIBE_FAILURES=10
+          while true; do
+            # An empty completion time means the execution is still running. A
+            # failing describe call is tolerated only a limited number of times in
+            # a row, so a persistent error cannot keep this loop alive forever.
+            if COMPLETION=$(gcloud run jobs executions describe "$EXEC" \
+              --region="$REGION" \
+              --format='value(status.completionTime)' 2>/dev/null); then
+              DESCRIBE_FAILURES=0
+            else
+              COMPLETION=""
+              DESCRIBE_FAILURES=$((DESCRIBE_FAILURES + 1))
+              if [ "$DESCRIBE_FAILURES" -ge "$MAX_DESCRIBE_FAILURES" ]; then
+                echo "❌ Could not query execution status ($DESCRIBE_FAILURES consecutive failures)"
+                exit 1
+              fi
+            fi
+
+            print_new_logs
+
+            # If done, give late log entries time to arrive, print them, then stop
+            if [ -n "$COMPLETION" ]; then
+              sleep 5
+              print_new_logs
+              break
+            fi
+
+            sleep 10
+          done
+
+          # Determine pass/fail from execution status
+          SUCCEEDED=$(gcloud run jobs executions describe "$EXEC" \
+            --region="$REGION" \
+            --format=json 2>/dev/null | \
+            jq -r '.status.conditions[] | select(.type == "Completed") | .status')
+
+          if [ "$SUCCEEDED" = "True" ]; then
+            echo ""
+            echo "✅ GPU Tests Passed"
+          else
+            echo ""
+            echo "❌ GPU Tests Failed"
+            exit 1
+          fi