# .github/workflows/gpu-tests-cuda.yml
# (Removed web-page copy/paste artifacts that preceded the workflow:
#  "Skip to content", the PR title "feat(chat): new chat api #23",
#  and the "Workflow file for this run" header — none are YAML.)

# GPU Tests (CUDA)
#
# Builds the linux-x64-cuda native package, then runs real GPU integration
# tests on an NVIDIA L4 via a Cloud Run Job. Reusable: callable from
# release.yml (workflow_call) with skip_build=true when packages are
# already built by the caller.
name: GPU Tests (CUDA)

on:
  pull_request:
    branches: [ main ]
    paths:
      - 'liblloyal'
      - 'llama.cpp'
      - 'lib/**'
      - 'src/**'
      - 'test/**'
      - 'ci/**'
      - 'CMakeLists.txt'
  workflow_dispatch:
    inputs:
      skip_build:
        description: 'Skip build step (use existing artifacts)'
        type: boolean
        default: false
  workflow_call:
    inputs:
      skip_build:
        description: 'Skip build step (packages already built by caller)'
        type: boolean
        default: true

jobs:
  # Build CUDA package for testing
  # Skipped when called from release.yml (packages already built)
  build-cuda-package:
    name: Build linux-x64-cuda
    # On pull_request, inputs.skip_build is empty (!= true), so the build runs.
    if: ${{ inputs.skip_build != true }}
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # llama.cpp (and liblloyal) are vendored as submodules
          submodules: recursive

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 24
          registry-url: 'https://registry.npmjs.org'

      - name: Validate llama.cpp version
        run: node scripts/sync-llama-cpp.js --check
        shell: bash

      # CUDA 12.2.2 required for Cloud Run L4 GPU (driver 535.x)
      # provision-cuda also installs build-essential + cmake
      - name: Provision CUDA toolkit
        uses: ./.github/actions/provision-cuda
        with:
          version: '12.2.2'
          arch: x64

      - name: Setup ccache
        uses: hendrikmuhs/ccache-action@v1.2
        with:
          key: cuda-build-${{ runner.os }}

      - name: Install npm dependencies
        run: npm ci --ignore-scripts

      - name: Build native module
        run: npm run build
        env:
          LLOYAL_GPU: cuda
          # Route C / C++ / CUDA compiles through ccache for warm rebuilds
          CMAKE_C_COMPILER_LAUNCHER: ccache
          CMAKE_CXX_COMPILER_LAUNCHER: ccache
          CMAKE_CUDA_COMPILER_LAUNCHER: ccache

      - name: Create platform package
        run: node scripts/create-platform-package.js linux-x64-cuda ubuntu-22.04 x64

      - name: Upload platform package artifact
        uses: actions/upload-artifact@v4
        with:
          name: package-linux-x64-cuda
          path: packages/linux-x64-cuda/
          # Short retention + no compression: consumed once by the next job
          retention-days: 1
          compression-level: 0

  # GPU Integration Tests via Cloud Run
  # Runs real GPU tests on NVIDIA L4
  #
  # L4 GPU Requirements (as of 2024):
  # - Driver: 535.216.03 (supports CUDA 12.2.2 max)
  # - Minimum: 4 CPU, 16 GiB memory
  # - Regions: us-central1, us-east4, europe-west1, europe-west4, asia-southeast1
  # - Quota: 3 L4 GPUs per region (default)
  gpu-integration:
    name: GPU Tests (L4)
    needs: build-cuda-package
    runs-on: ubuntu-latest
    # Run if build succeeded OR was skipped (packages from caller)
    if: ${{ !cancelled() && (needs.build-cuda-package.result == 'success' || needs.build-cuda-package.result == 'skipped') }}
    permissions:
      contents: read
      id-token: write  # Required for Workload Identity Federation
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Authenticate to GCP
        uses: google-github-actions/auth@v2
        with:
          workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }}
          service_account: ${{ secrets.GCP_SA_EMAIL }}

      - name: Set up Cloud SDK
        uses: google-github-actions/setup-gcloud@v2

      - name: Configure Docker for Artifact Registry
        run: gcloud auth configure-docker us-east4-docker.pkg.dev --quiet

      - name: Download package artifact
        uses: actions/download-artifact@v4
        with:
          name: package-linux-x64-cuda
          path: packages/package-linux-x64-cuda

      - name: Build GPU test image
        run: |
          IMAGE="us-east4-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/lloyal-ci/gpu-tests:${{ github.sha }}-cuda"
          docker build \
            -f ci/Dockerfile.gpu-tests \
            -t "$IMAGE" .
          docker push "$IMAGE"
          # Quoted to avoid word-splitting on the runner-provided path
          echo "IMAGE=$IMAGE" >> "$GITHUB_ENV"

      - name: Deploy Cloud Run Job
        run: |
          JOB_NAME="lloyal-gpu-test-cuda"
          # Check if job exists; update only mutable fields, otherwise create
          # with full GPU/resource spec (gpu/memory/cpu persist across updates).
          if gcloud run jobs describe "$JOB_NAME" --region=us-east4 2>/dev/null; then
            gcloud run jobs update "$JOB_NAME" \
              --region=us-east4 \
              --image="${IMAGE}" \
              --service-account="${{ secrets.GCP_SA_EMAIL }}" \
              --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \
              --task-timeout=20m \
              --no-gpu-zonal-redundancy
          else
            gcloud run jobs create "$JOB_NAME" \
              --region=us-east4 \
              --image="${IMAGE}" \
              --service-account="${{ secrets.GCP_SA_EMAIL }}" \
              --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \
              --task-timeout=20m \
              --gpu=1 \
              --gpu-type=nvidia-l4 \
              --memory=16Gi \
              --cpu=4 \
              --max-retries=0 \
              --no-gpu-zonal-redundancy
          fi

      - name: Run GPU tests
        run: |
          JOB_NAME="lloyal-gpu-test-cuda"
          REGION="us-east4"
          # Launch job asynchronously so we can stream logs
          EXEC=$(gcloud run jobs execute "$JOB_NAME" \
            --region="$REGION" \
            --async \
            --format='value(metadata.name)')
          echo "Execution: $EXEC"
          echo "Streaming logs (container startup may take ~30s)..."
          echo ""
          # Filter for this specific execution's logs
          LOG_FILTER="resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$JOB_NAME\" AND labels.\"run.googleapis.com/execution_name\"=\"$EXEC\""
          # Poll loop: stream new log lines + check for completion
          SEEN=0
          while true; do
            # Check if execution has completed
            COMPLETION=$(gcloud run jobs executions describe "$EXEC" \
              --region="$REGION" \
              --format='value(status.completionTime)' 2>/dev/null || true)
            # Fetch all logs for this execution in chronological order
            LOGS=$(gcloud logging read "$LOG_FILTER" \
              --limit=10000 \
              --order=asc \
              --format='value(textPayload)' 2>/dev/null || true)
            # Print only lines we haven't seen yet
            if [ -n "$LOGS" ]; then
              TOTAL=$(echo "$LOGS" | wc -l | tr -d ' ')
              if [ "$TOTAL" -gt "$SEEN" ]; then
                echo "$LOGS" | tail -n +$((SEEN + 1))
                SEEN=$TOTAL
              fi
            fi
            # If done, do one final fetch for stragglers then break
            if [ -n "$COMPLETION" ]; then
              sleep 5
              LOGS=$(gcloud logging read "$LOG_FILTER" \
                --limit=10000 \
                --order=asc \
                --format='value(textPayload)' 2>/dev/null || true)
              if [ -n "$LOGS" ]; then
                TOTAL=$(echo "$LOGS" | wc -l | tr -d ' ')
                if [ "$TOTAL" -gt "$SEEN" ]; then
                  echo "$LOGS" | tail -n +$((SEEN + 1))
                fi
              fi
              break
            fi
            sleep 10
          done
          # Determine pass/fail from execution status
          SUCCEEDED=$(gcloud run jobs executions describe "$EXEC" \
            --region="$REGION" \
            --format=json 2>/dev/null | \
            jq -r '.status.conditions[] | select(.type == "Completed") | .status')
          if [ "$SUCCEEDED" = "True" ]; then
            echo ""
            echo "✅ GPU Tests Passed"
          else
            echo ""
            echo "❌ GPU Tests Failed"
            exit 1
          fi