# Run OpenResponses conformance tests against llama-stack Responses API #1
# (GitHub viewer notice) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# (GitHub viewer notice) Learn more about bidirectional Unicode characters
name: OpenResponses Conformance Tests

run-name: Run OpenResponses conformance tests against llama-stack Responses API

# This job is OPTIONAL and informational — it tracks progress toward full
# conformance with the OpenResponses spec (https://openresponses.org).
# Failures are expected while gaps remain in the Responses API implementation.
# See: https://github.com/llamastack/llama-stack/issues/4818
#
# Inference calls are replayed from checked-in recordings under
# tests/integration/openresponses/recordings/. To add or update recordings,
# run the server locally with LLAMA_STACK_TEST_INFERENCE_MODE=record-if-missing
# pointed at that directory, run the compliance tests, and commit the results.

on:
  push:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/inline/agents/**'
      - 'src/llama_stack/apis/agents/**'
      - 'tests/integration/openresponses/**'
      - '.github/workflows/openresponses-conformance.yml'
  pull_request:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/inline/agents/**'
      - 'src/llama_stack/apis/agents/**'
      - 'tests/integration/openresponses/**'
      - '.github/workflows/openresponses-conformance.yml'
  merge_group:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  workflow_dispatch:

# On main every run gets its own group (keyed by run_id) so pushes never cancel
# each other; on any other ref a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  openresponses-conformance:
    name: OpenResponses Conformance
    runs-on: ubuntu-latest
    env:
      # Dummy key satisfies config parsing; actual inference is replayed from recordings
      OPENAI_API_KEY: "dummy-key-for-replay-mode"
      INFERENCE_MODEL: "openai/gpt-4o-mini"
      LLAMA_STACK_PORT: 8321
      LLAMA_STACK_TEST_INFERENCE_MODE: "replay"
      # NOTE(review): this is the parent of recordings/; presumably the recorder
      # appends "recordings/" itself — confirm against the recorder implementation.
      LLAMA_STACK_TEST_RECORDING_DIR: ${{ github.workspace }}/tests/integration/openresponses

    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # Counts checked-in *.json recordings and exposes the count as a step output.
      # A count of zero only emits warnings; it does not fail the job.
      - name: Check for recordings
        id: check-recordings
        run: |
          # Use the runner-provided variable rather than expanding an expression into the script.
          RECORDINGS_DIR="$GITHUB_WORKSPACE/tests/integration/openresponses/recordings"
          COUNT=$(find "$RECORDINGS_DIR" -name "*.json" 2>/dev/null | wc -l)
          echo "recordings_count=$COUNT" >> "$GITHUB_OUTPUT"
          if [ "$COUNT" -eq 0 ]; then
            echo "::warning::No recordings found in tests/integration/openresponses/recordings/."
            echo "::warning::Run the server locally with LLAMA_STACK_TEST_INFERENCE_MODE=record-if-missing"
            echo "::warning::and commit the resulting recordings to enable conformance testing in CI."
          else
            echo "Found $COUNT recording(s)"
          fi

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
        with:
          python-version: '3.12'

      - name: Install ci-tests distro provider dependencies
        run: uv run llama stack list-deps ci-tests --format uv | sh

      # The server is started in the background; its output goes to server.log so
      # later steps can print it when something goes wrong.
      - name: Start Llama Stack server
        run: |
          mkdir -p /tmp/llama-stack-conformance
          export LLAMA_STACK_LOG_WIDTH=200
          nohup uv run llama stack run ci-tests --port "$LLAMA_STACK_PORT" \
            > /tmp/llama-stack-conformance/server.log 2>&1 &
          echo "Server PID: $!"

      # Polls the health endpoint up to 60 times, 2 seconds apart (120 seconds total).
      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for _ in {1..60}; do
            if curl -s "http://localhost:$LLAMA_STACK_PORT/v1/health" | grep -q "OK"; then
              echo "Llama Stack server is ready!"
              exit 0
            fi
            sleep 2
          done
          echo "Llama Stack server failed to start in 120 seconds"
          cat /tmp/llama-stack-conformance/server.log
          exit 1

      - name: Setup Bun
        uses: oven-sh/setup-bun@ecf28ddc73e819eb6fa29df6b34ef8921c743461 # v2.0.0

      # NOTE(review): the upstream suite is cloned at its default-branch HEAD, so
      # upstream changes can alter results between runs — consider pinning a commit.
      - name: Clone OpenResponses repository
        run: |
          git clone --depth=1 https://github.com/openresponses/openresponses.git /tmp/openresponses

      - name: Install OpenResponses dependencies
        working-directory: /tmp/openresponses
        run: bun install

      # The suite runs twice: once with --json for the machine-readable report and
      # once with --verbose for the human-readable log. Both invocations end in
      # "|| true" so conformance failures never fail this informational job.
      - name: Run OpenResponses conformance tests
        id: run-tests
        run: |
          cd /tmp/openresponses
          bun run bin/compliance-test.ts \
            --base-url "http://localhost:$LLAMA_STACK_PORT/v1" \
            --api-key "llama-stack" \
            --model "$INFERENCE_MODEL" \
            --json > /tmp/openresponses-results.json 2>/dev/null || true
          echo "=== OpenResponses Conformance Test Output ==="
          bun run bin/compliance-test.ts \
            --base-url "http://localhost:$LLAMA_STACK_PORT/v1" \
            --api-key "llama-stack" \
            --model "$INFERENCE_MODEL" \
            --verbose 2>&1 | tee /tmp/openresponses-output.txt || true

      # Writes a Markdown report to the job summary. Three cases:
      #   1. no recordings      -> instructions for generating them
      #   2. parseable JSON     -> summary table, per-test table, failure details
      #   3. anything else      -> raw verbose output (if it exists)
      - name: Generate conformance report
        if: always()
        env:
          # Passed through the environment instead of being expanded into the script text.
          RECORDINGS_COUNT: ${{ steps.check-recordings.outputs.recordings_count }}
        run: |
          {
            echo "## OpenResponses Conformance Test Results"
            echo ""
            echo "> **Note:** These tests are **informational only** and track progress toward full"
            echo "> [OpenResponses](https://www.openresponses.org) API conformance."
            echo "> Failures are expected while gaps remain in the Responses API implementation."
            echo "> See [#4818](https://github.com/llamastack/llama-stack/issues/4818)."
            echo ""
            echo "**Model:** \`$INFERENCE_MODEL\`"
            echo "**Recordings:** $RECORDINGS_COUNT file(s) in \`tests/integration/openresponses/recordings/\`"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

          if [ "$RECORDINGS_COUNT" = "0" ]; then
            {
              echo "### No Recordings Found"
              echo ""
              echo "To enable conformance testing, generate recordings locally:"
              echo '```bash'
              echo "OPENAI_API_KEY=\$YOUR_KEY bash scripts/record-openresponses-conformance.sh"
              echo ""
              echo "# Then commit the recordings"
              echo "git add tests/integration/openresponses/recordings/"
              echo "git commit -m 'chore: add OpenResponses conformance recordings'"
              echo '```'
              echo "Commit the resulting \`tests/integration/openresponses/recordings/*.json\` files."
            } >> "$GITHUB_STEP_SUMMARY"
          elif [ -f /tmp/openresponses-results.json ] && jq -e '.summary' /tmp/openresponses-results.json > /dev/null 2>&1; then
            # "// 0" keeps the counts numeric when a key is missing; otherwise jq prints
            # "null" and the integer comparison below errors out.
            PASSED=$(jq -r '.summary.passed // 0' /tmp/openresponses-results.json)
            FAILED=$(jq -r '.summary.failed // 0' /tmp/openresponses-results.json)
            TOTAL=$(jq -r '.summary.total // 0' /tmp/openresponses-results.json)
            {
              echo "### Summary"
              echo ""
              echo "| Metric | Count |"
              echo "|--------|-------|"
              echo "| ✅ Passed | $PASSED |"
              echo "| ❌ Failed | $FAILED |"
              echo "| **Total** | **$TOTAL** |"
              echo ""
              echo "### Test Details"
              echo ""
              echo "| Test | Status | Duration |"
              echo "|------|--------|----------|"
              # Append the "ms" unit only when a duration is present.
              jq -r '.results[] | "| \(.name) | \(if .status == "passed" then "✅ Pass" else "❌ Fail" end) | \(if .duration != null then "\(.duration)ms" else "—" end) |"' \
                /tmp/openresponses-results.json
              echo ""
            } >> "$GITHUB_STEP_SUMMARY"

            if [ "$FAILED" -gt 0 ]; then
              {
                echo "### Failure Details"
                echo ""
                jq -r '.results[] | select(.status == "failed") | "**\(.name)**\n" + ((.errors // []) | map("- \(.)") | join("\n")) + "\n"' \
                  /tmp/openresponses-results.json
              } >> "$GITHUB_STEP_SUMMARY"
            fi
          else
            {
              echo "### Results"
              echo ""
              echo "Could not parse structured results."
              if [ -f /tmp/openresponses-output.txt ]; then
                echo ""
                echo "<details><summary>Raw output</summary>"
                echo ""
                echo '```'
                cat /tmp/openresponses-output.txt
                echo '```'
                echo ""
                echo "</details>"
              fi
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Upload conformance test results
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: openresponses-conformance-results-${{ github.run_id }}
          path: |
            /tmp/openresponses-results.json
            /tmp/openresponses-output.txt
          retention-days: 30
          if-no-files-found: warn

      # "|| true" keeps this step from failing when the log file was never created.
      - name: Print server log on failure
        if: failure()
        run: |
          echo "=== Llama Stack server log (last 200 lines) ==="
          tail -200 /tmp/llama-stack-conformance/server.log || true