feat(chat): new chat api #23
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: GPU Tests (CUDA)

# Triggers:
#   - pull_request:      PRs targeting main that touch any of the listed paths.
#   - workflow_dispatch: manual run; builds the package unless skip_build is set.
#   - workflow_call:     reusable-workflow entry point; skip_build defaults to
#                        true because the caller is expected to have built the
#                        package already.
on:
  pull_request:
    branches: [main]
    paths:
      - 'liblloyal'
      - 'llama.cpp'
      - 'lib/**'
      - 'src/**'
      - 'test/**'
      - 'ci/**'
      - 'CMakeLists.txt'
  workflow_dispatch:
    inputs:
      skip_build:
        description: 'Skip build step (use existing artifacts)'
        type: boolean
        default: false
  workflow_call:
    inputs:
      skip_build:
        description: 'Skip build step (packages already built by caller)'
        type: boolean
        default: true
jobs:
  # Build CUDA package for testing
  # Skipped when called from release.yml (packages already built)
  build-cuda-package:
    name: Build linux-x64-cuda
    # `inputs` is empty on pull_request events, so the comparison is true and
    # the build runs; it is skipped only when skip_build is explicitly true.
    if: ${{ inputs.skip_build != true }}
    runs-on: ubuntu-22.04
    # Least privilege: this job only needs to read the repository.
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '24'
          registry-url: 'https://registry.npmjs.org'

      - name: Validate llama.cpp version
        run: node scripts/sync-llama-cpp.js --check
        shell: bash

      # CUDA 12.2.2 required for Cloud Run L4 GPU (driver 535.x)
      # provision-cuda also installs build-essential + cmake
      - name: Provision CUDA toolkit
        uses: ./.github/actions/provision-cuda
        with:
          version: '12.2.2'
          arch: x64

      - name: Setup ccache
        uses: hendrikmuhs/ccache-action@v1.2
        with:
          key: cuda-build-${{ runner.os }}

      # --ignore-scripts: install dependencies without running package
      # lifecycle scripts; the native build is the explicit next step.
      - name: Install npm dependencies
        run: npm ci --ignore-scripts

      # Route C, C++ and CUDA compiler invocations through ccache.
      - name: Build native module
        run: npm run build
        env:
          LLOYAL_GPU: cuda
          CMAKE_C_COMPILER_LAUNCHER: ccache
          CMAKE_CXX_COMPILER_LAUNCHER: ccache
          CMAKE_CUDA_COMPILER_LAUNCHER: ccache

      - name: Create platform package
        run: node scripts/create-platform-package.js linux-x64-cuda ubuntu-22.04 x64

      # Short-lived, uncompressed hand-off to the GPU test job.
      # Fail here if the package directory is missing instead of letting the
      # downstream download step fail with a less obvious error.
      - name: Upload platform package artifact
        uses: actions/upload-artifact@v4
        with:
          name: package-linux-x64-cuda
          path: packages/linux-x64-cuda/
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
| # GPU Integration Tests via Cloud Run | |
| # Runs real GPU tests on NVIDIA L4 | |
| # | |
| # L4 GPU Requirements (as of 2024): | |
| # - Driver: 535.216.03 (supports CUDA 12.2.2 max) | |
| # - Minimum: 4 CPU, 16 GiB memory | |
| # - Regions: us-central1, us-east4, europe-west1, europe-west4, asia-southeast1 | |
| # - Quota: 3 L4 GPUs per region (default) | |
| gpu-integration: | |
| name: GPU Tests (L4) | |
| needs: build-cuda-package | |
| runs-on: ubuntu-latest | |
| # Run if build succeeded OR was skipped (packages from caller) | |
| if: ${{ !cancelled() && (needs.build-cuda-package.result == 'success' || needs.build-cuda-package.result == 'skipped') }} | |
| permissions: | |
| contents: read | |
| id-token: write # Required for Workload Identity Federation | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Authenticate to GCP | |
| uses: google-github-actions/auth@v2 | |
| with: | |
| workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} | |
| service_account: ${{ secrets.GCP_SA_EMAIL }} | |
| - name: Set up Cloud SDK | |
| uses: google-github-actions/setup-gcloud@v2 | |
| - name: Configure Docker for Artifact Registry | |
| run: gcloud auth configure-docker us-east4-docker.pkg.dev --quiet | |
| - name: Download package artifact | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: package-linux-x64-cuda | |
| path: packages/package-linux-x64-cuda | |
| - name: Build GPU test image | |
| run: | | |
| IMAGE="us-east4-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/lloyal-ci/gpu-tests:${{ github.sha }}-cuda" | |
| docker build \ | |
| -f ci/Dockerfile.gpu-tests \ | |
| -t "$IMAGE" . | |
| docker push "$IMAGE" | |
| echo "IMAGE=$IMAGE" >> $GITHUB_ENV | |
| - name: Deploy Cloud Run Job | |
| run: | | |
| JOB_NAME="lloyal-gpu-test-cuda" | |
| # Check if job exists | |
| if gcloud run jobs describe $JOB_NAME --region=us-east4 2>/dev/null; then | |
| gcloud run jobs update $JOB_NAME \ | |
| --region=us-east4 \ | |
| --image="${IMAGE}" \ | |
| --service-account="${{ secrets.GCP_SA_EMAIL }}" \ | |
| --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \ | |
| --task-timeout=20m \ | |
| --no-gpu-zonal-redundancy | |
| else | |
| gcloud run jobs create $JOB_NAME \ | |
| --region=us-east4 \ | |
| --image="${IMAGE}" \ | |
| --service-account="${{ secrets.GCP_SA_EMAIL }}" \ | |
| --set-env-vars=LLOYAL_GPU=cuda,LLOYAL_NO_FALLBACK=1 \ | |
| --task-timeout=20m \ | |
| --gpu=1 \ | |
| --gpu-type=nvidia-l4 \ | |
| --memory=16Gi \ | |
| --cpu=4 \ | |
| --max-retries=0 \ | |
| --no-gpu-zonal-redundancy | |
| fi | |
| - name: Run GPU tests | |
| run: | | |
| JOB_NAME="lloyal-gpu-test-cuda" | |
| REGION="us-east4" | |
| # Launch job asynchronously so we can stream logs | |
| EXEC=$(gcloud run jobs execute $JOB_NAME \ | |
| --region=$REGION \ | |
| --async \ | |
| --format='value(metadata.name)') | |
| echo "Execution: $EXEC" | |
| echo "Streaming logs (container startup may take ~30s)..." | |
| echo "" | |
| # Filter for this specific execution's logs | |
| LOG_FILTER="resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"$JOB_NAME\" AND labels.\"run.googleapis.com/execution_name\"=\"$EXEC\"" | |
| # Poll loop: stream new log lines + check for completion | |
| SEEN=0 | |
| while true; do | |
| # Check if execution has completed | |
| COMPLETION=$(gcloud run jobs executions describe "$EXEC" \ | |
| --region="$REGION" \ | |
| --format='value(status.completionTime)' 2>/dev/null || true) | |
| # Fetch all logs for this execution in chronological order | |
| LOGS=$(gcloud logging read "$LOG_FILTER" \ | |
| --limit=10000 \ | |
| --order=asc \ | |
| --format='value(textPayload)' 2>/dev/null || true) | |
| # Print only lines we haven't seen yet | |
| if [ -n "$LOGS" ]; then | |
| TOTAL=$(echo "$LOGS" | wc -l | tr -d ' ') | |
| if [ "$TOTAL" -gt "$SEEN" ]; then | |
| echo "$LOGS" | tail -n +$((SEEN + 1)) | |
| SEEN=$TOTAL | |
| fi | |
| fi | |
| # If done, do one final fetch for stragglers then break | |
| if [ -n "$COMPLETION" ]; then | |
| sleep 5 | |
| LOGS=$(gcloud logging read "$LOG_FILTER" \ | |
| --limit=10000 \ | |
| --order=asc \ | |
| --format='value(textPayload)' 2>/dev/null || true) | |
| if [ -n "$LOGS" ]; then | |
| TOTAL=$(echo "$LOGS" | wc -l | tr -d ' ') | |
| if [ "$TOTAL" -gt "$SEEN" ]; then | |
| echo "$LOGS" | tail -n +$((SEEN + 1)) | |
| fi | |
| fi | |
| break | |
| fi | |
| sleep 10 | |
| done | |
| # Determine pass/fail from execution status | |
| SUCCEEDED=$(gcloud run jobs executions describe "$EXEC" \ | |
| --region="$REGION" \ | |
| --format=json 2>/dev/null | \ | |
| jq -r '.status.conditions[] | select(.type == "Completed") | .status') | |
| if [ "$SUCCEEDED" = "True" ]; then | |
| echo "" | |
| echo "✅ GPU Tests Passed" | |
| else | |
| echo "" | |
| echo "❌ GPU Tests Failed" | |
| exit 1 | |
| fi |