# .github/workflows/openresponses-conformance.yml
name: OpenResponses Conformance Tests
run-name: Run OpenResponses conformance tests against llama-stack Responses API

# This job is OPTIONAL and informational — it tracks progress toward full
# conformance with the OpenResponses spec (https://openresponses.org).
# Failures are expected while gaps remain in the Responses API implementation.
# See: https://github.com/llamastack/llama-stack/issues/4818
#
# Inference calls are replayed from checked-in recordings under
# tests/integration/openresponses/recordings/. To add or update recordings,
# run the server locally with LLAMA_STACK_TEST_INFERENCE_MODE=record-if-missing
# pointed at that directory, run the compliance tests, and commit the results.

on:
  push:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/inline/agents/**'
      - 'src/llama_stack/apis/agents/**'
      - 'tests/integration/openresponses/**'
      - '.github/workflows/openresponses-conformance.yml'
  pull_request:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/inline/agents/**'
      - 'src/llama_stack/apis/agents/**'
      - 'tests/integration/openresponses/**'
      - '.github/workflows/openresponses-conformance.yml'
  merge_group:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  workflow_dispatch:

# One run per ref; pushes to main get a unique group (run_id) so they never
# cancel each other, while PR updates cancel the superseded run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  openresponses-conformance:
    name: OpenResponses Conformance
    runs-on: ubuntu-latest
    env:
      # Dummy key satisfies config parsing; actual inference is replayed from recordings
      OPENAI_API_KEY: "dummy-key-for-replay-mode"
      INFERENCE_MODEL: "openai/gpt-4o-mini"
      # Quoted: GH Actions env values are strings; avoids YAML int typing.
      LLAMA_STACK_PORT: "8321"
      LLAMA_STACK_TEST_INFERENCE_MODE: "replay"
      LLAMA_STACK_TEST_RECORDING_DIR: ${{ github.workspace }}/tests/integration/openresponses

    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # Surface (as a warning, not a failure) the case where no replay
      # recordings are checked in; the count is reused in the report step.
      - name: Check for recordings
        id: check-recordings
        run: |
          RECORDINGS_DIR="${{ github.workspace }}/tests/integration/openresponses/recordings"
          COUNT=$(find "$RECORDINGS_DIR" -name "*.json" 2>/dev/null | wc -l)
          echo "recordings_count=$COUNT" >> "$GITHUB_OUTPUT"
          if [ "$COUNT" -eq 0 ]; then
            echo "::warning::No recordings found in tests/integration/openresponses/recordings/."
            echo "::warning::Run the server locally with LLAMA_STACK_TEST_INFERENCE_MODE=record-if-missing"
            echo "::warning::and commit the resulting recordings to enable conformance testing in CI."
          else
            echo "Found $COUNT recording(s)"
          fi

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
        with:
          python-version: '3.12'

      - name: Install ci-tests distro provider dependencies
        run: uv run llama stack list-deps ci-tests --format uv | sh

      # Launch the server in the background; readiness is checked in the next step.
      - name: Start Llama Stack server
        run: |
          mkdir -p /tmp/llama-stack-conformance
          export LLAMA_STACK_LOG_WIDTH=200
          nohup uv run llama stack run ci-tests --port "$LLAMA_STACK_PORT" \
            > /tmp/llama-stack-conformance/server.log 2>&1 &
          echo "Server PID: $!"

      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for _ in {1..60}; do
            if curl -s "http://localhost:$LLAMA_STACK_PORT/v1/health" | grep -q "OK"; then
              echo "Llama Stack server is ready!"
              exit 0
            fi
            sleep 2
          done
          echo "Llama Stack server failed to start in 120 seconds"
          cat /tmp/llama-stack-conformance/server.log
          exit 1

      - name: Setup Bun
        uses: oven-sh/setup-bun@ecf28ddc73e819eb6fa29df6b34ef8921c743461 # v2.0.0

      - name: Clone OpenResponses repository
        run: |
          git clone --depth=1 https://github.com/openresponses/openresponses.git /tmp/openresponses

      - name: Install OpenResponses dependencies
        working-directory: /tmp/openresponses
        run: bun install

      # Two passes: a JSON pass for the structured report, then a verbose pass
      # for human-readable logs. `|| true` keeps this informational job green
      # even while conformance gaps remain.
      - name: Run OpenResponses conformance tests
        id: run-tests
        run: |
          cd /tmp/openresponses
          bun run bin/compliance-test.ts \
            --base-url "http://localhost:$LLAMA_STACK_PORT/v1" \
            --api-key "llama-stack" \
            --model "$INFERENCE_MODEL" \
            --json > /tmp/openresponses-results.json 2>/dev/null || true
          echo "=== OpenResponses Conformance Test Output ==="
          bun run bin/compliance-test.ts \
            --base-url "http://localhost:$LLAMA_STACK_PORT/v1" \
            --api-key "llama-stack" \
            --model "$INFERENCE_MODEL" \
            --verbose 2>&1 | tee /tmp/openresponses-output.txt || true

      # Render a Markdown summary: pass/fail table when JSON parsed, setup
      # instructions when no recordings exist, raw output otherwise.
      - name: Generate conformance report
        if: always()
        run: |
          {
            echo "## OpenResponses Conformance Test Results"
            echo ""
            echo "> **Note:** These tests are **informational only** and track progress toward full"
            echo "> [OpenResponses](https://www.openresponses.org) API conformance."
            echo "> Failures are expected while gaps remain in the Responses API implementation."
            echo "> See [#4818](https://github.com/llamastack/llama-stack/issues/4818)."
            echo ""
            echo "**Model:** \`$INFERENCE_MODEL\`"
            echo "**Recordings:** ${{ steps.check-recordings.outputs.recordings_count }} file(s) in \`tests/integration/openresponses/recordings/\`"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"
          if [ "${{ steps.check-recordings.outputs.recordings_count }}" = "0" ]; then
            {
              echo "### No Recordings Found"
              echo ""
              echo "To enable conformance testing, generate recordings locally:"
              echo '```bash'
              echo "OPENAI_API_KEY=\$YOUR_KEY bash scripts/record-openresponses-conformance.sh"
              echo ""
              echo "# Then commit the recordings"
              echo "git add tests/integration/openresponses/recordings/"
              echo "git commit -m 'chore: add OpenResponses conformance recordings'"
              echo '```'
              echo "Commit the resulting \`tests/integration/openresponses/recordings/*.json\` files."
            } >> "$GITHUB_STEP_SUMMARY"
          elif [ -f /tmp/openresponses-results.json ] && jq -e '.summary' /tmp/openresponses-results.json > /dev/null 2>&1; then
            PASSED=$(jq -r '.summary.passed' /tmp/openresponses-results.json)
            FAILED=$(jq -r '.summary.failed' /tmp/openresponses-results.json)
            TOTAL=$(jq -r '.summary.total' /tmp/openresponses-results.json)
            {
              echo "### Summary"
              echo ""
              echo "| Metric | Count |"
              echo "|--------|-------|"
              echo "| ✅ Passed | $PASSED |"
              echo "| ❌ Failed | $FAILED |"
              echo "| **Total** | **$TOTAL** |"
              echo ""
              echo "### Test Details"
              echo ""
              echo "| Test | Status | Duration |"
              echo "|------|--------|----------|"
              jq -r '.results[] | "| \(.name) | \(if .status == "passed" then "✅ Pass" else "❌ Fail" end) | \(.duration // "—")ms |"' \
                /tmp/openresponses-results.json
              echo ""
            } >> "$GITHUB_STEP_SUMMARY"
            if [ "$FAILED" -gt 0 ]; then
              {
                echo "### Failure Details"
                echo ""
                jq -r '.results[] | select(.status == "failed") | "**\(.name)**\n" + ((.errors // []) | map("- \(.)") | join("\n")) + "\n"' \
                  /tmp/openresponses-results.json
              } >> "$GITHUB_STEP_SUMMARY"
            fi
          else
            {
              echo "### Results"
              echo ""
              echo "Could not parse structured results."
              if [ -f /tmp/openresponses-output.txt ]; then
                echo ""
                echo "<details><summary>Raw output</summary>"
                echo ""
                echo '```'
                cat /tmp/openresponses-output.txt
                echo '```'
                echo ""
                echo "</details>"
              fi
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Upload conformance test results
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: openresponses-conformance-results-${{ github.run_id }}
          path: |
            /tmp/openresponses-results.json
            /tmp/openresponses-output.txt
          retention-days: 30
          if-no-files-found: warn

      - name: Print server log on failure
        if: failure()
        run: |
          echo "=== Llama Stack server log (last 200 lines) ==="
          tail -200 /tmp/llama-stack-conformance/server.log || true