# .github/workflows/openresponses-conformance.yml
name: OpenResponses Conformance Tests
run-name: Run OpenResponses conformance tests against llama-stack Responses API

# This job is OPTIONAL and informational — it tracks progress toward full
# conformance with the OpenResponses spec (https://openresponses.org).
# Failures are expected while gaps remain in the Responses API implementation.
# See: https://github.com/llamastack/llama-stack/issues/4818
#
# Inference calls are replayed from checked-in recordings under
# tests/integration/openresponses/recordings/. To add or update recordings,
# run the server locally with LLAMA_STACK_TEST_INFERENCE_MODE=record-if-missing
# pointed at that directory, run the compliance tests, and commit the results.

on:
  push:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/inline/agents/**'
      - 'src/llama_stack/apis/agents/**'
      - 'tests/integration/openresponses/**'
      - '.github/workflows/openresponses-conformance.yml'
  pull_request:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
    paths:
      - 'src/llama_stack/providers/inline/agents/**'
      - 'src/llama_stack/apis/agents/**'
      - 'tests/integration/openresponses/**'
      - '.github/workflows/openresponses-conformance.yml'
  merge_group:
    branches:
      - main
      - 'release-[0-9]+.[0-9]+.x'
  workflow_dispatch:

# One run per ref; pushes to main get a unique group (run_id) so they never
# cancel each other, while PR updates cancel the superseded run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  openresponses-conformance:
    name: OpenResponses Conformance
    runs-on: ubuntu-latest
    env:
      # Dummy key satisfies config parsing; actual inference is replayed from recordings
      OPENAI_API_KEY: "dummy-key-for-replay-mode"
      INFERENCE_MODEL: "openai/gpt-4o-mini"
      # Quoted: GH Actions env values are strings; avoids YAML int typing.
      LLAMA_STACK_PORT: "8321"
      LLAMA_STACK_TEST_INFERENCE_MODE: "replay"
      LLAMA_STACK_TEST_RECORDING_DIR: ${{ github.workspace }}/tests/integration/openresponses

    steps:
      - name: Checkout repository
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

      # Surface (as a warning, not a failure) the case where no replay
      # recordings are checked in; the count is reused in the report step.
      - name: Check for recordings
        id: check-recordings
        run: |
          RECORDINGS_DIR="${{ github.workspace }}/tests/integration/openresponses/recordings"
          COUNT=$(find "$RECORDINGS_DIR" -name "*.json" 2>/dev/null | wc -l)
          echo "recordings_count=$COUNT" >> "$GITHUB_OUTPUT"
          if [ "$COUNT" -eq 0 ]; then
            echo "::warning::No recordings found in tests/integration/openresponses/recordings/."
            echo "::warning::Run the server locally with LLAMA_STACK_TEST_INFERENCE_MODE=record-if-missing"
            echo "::warning::and commit the resulting recordings to enable conformance testing in CI."
          else
            echo "Found $COUNT recording(s)"
          fi

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
        with:
          python-version: '3.12'

      - name: Install ci-tests distro provider dependencies
        run: uv run llama stack list-deps ci-tests --format uv | sh

      # Launch the server in the background; readiness is checked in the next step.
      - name: Start Llama Stack server
        run: |
          mkdir -p /tmp/llama-stack-conformance
          export LLAMA_STACK_LOG_WIDTH=200
          nohup uv run llama stack run ci-tests --port "$LLAMA_STACK_PORT" \
            > /tmp/llama-stack-conformance/server.log 2>&1 &
          echo "Server PID: $!"

      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for _ in {1..60}; do
            if curl -s "http://localhost:$LLAMA_STACK_PORT/v1/health" | grep -q "OK"; then
              echo "Llama Stack server is ready!"
              exit 0
            fi
            sleep 2
          done
          echo "Llama Stack server failed to start in 120 seconds"
          cat /tmp/llama-stack-conformance/server.log
          exit 1

      - name: Setup Bun
        uses: oven-sh/setup-bun@ecf28ddc73e819eb6fa29df6b34ef8921c743461 # v2.0.0

      - name: Clone OpenResponses repository
        run: |
          git clone --depth=1 https://github.com/openresponses/openresponses.git /tmp/openresponses

      - name: Install OpenResponses dependencies
        working-directory: /tmp/openresponses
        run: bun install

      # Two passes: a JSON pass for the structured report, then a verbose pass
      # for human-readable logs. `|| true` keeps this informational job green
      # even while conformance gaps remain.
      - name: Run OpenResponses conformance tests
        id: run-tests
        run: |
          cd /tmp/openresponses
          bun run bin/compliance-test.ts \
            --base-url "http://localhost:$LLAMA_STACK_PORT/v1" \
            --api-key "llama-stack" \
            --model "$INFERENCE_MODEL" \
            --json > /tmp/openresponses-results.json 2>/dev/null || true
          echo "=== OpenResponses Conformance Test Output ==="
          bun run bin/compliance-test.ts \
            --base-url "http://localhost:$LLAMA_STACK_PORT/v1" \
            --api-key "llama-stack" \
            --model "$INFERENCE_MODEL" \
            --verbose 2>&1 | tee /tmp/openresponses-output.txt || true

      # Render a Markdown summary: pass/fail table when JSON parsed, setup
      # instructions when no recordings exist, raw output otherwise.
      - name: Generate conformance report
        if: always()
        run: |
          {
            echo "## OpenResponses Conformance Test Results"
            echo ""
            echo "> **Note:** These tests are **informational only** and track progress toward full"
            echo "> [OpenResponses](https://www.openresponses.org) API conformance."
            echo "> Failures are expected while gaps remain in the Responses API implementation."
            echo "> See [#4818](https://github.com/llamastack/llama-stack/issues/4818)."
            echo ""
            echo "**Model:** \`$INFERENCE_MODEL\`"
            echo "**Recordings:** ${{ steps.check-recordings.outputs.recordings_count }} file(s) in \`tests/integration/openresponses/recordings/\`"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"
          if [ "${{ steps.check-recordings.outputs.recordings_count }}" = "0" ]; then
            {
              echo "### No Recordings Found"
              echo ""
              echo "To enable conformance testing, generate recordings locally:"
              echo '```bash'
              echo "OPENAI_API_KEY=\$YOUR_KEY bash scripts/record-openresponses-conformance.sh"
              echo ""
              echo "# Then commit the recordings"
              echo "git add tests/integration/openresponses/recordings/"
              echo "git commit -m 'chore: add OpenResponses conformance recordings'"
              echo '```'
              echo "Commit the resulting \`tests/integration/openresponses/recordings/*.json\` files."
            } >> "$GITHUB_STEP_SUMMARY"
          elif [ -f /tmp/openresponses-results.json ] && jq -e '.summary' /tmp/openresponses-results.json > /dev/null 2>&1; then
            PASSED=$(jq -r '.summary.passed' /tmp/openresponses-results.json)
            FAILED=$(jq -r '.summary.failed' /tmp/openresponses-results.json)
            TOTAL=$(jq -r '.summary.total' /tmp/openresponses-results.json)
            {
              echo "### Summary"
              echo ""
              echo "| Metric | Count |"
              echo "|--------|-------|"
              echo "| ✅ Passed | $PASSED |"
              echo "| ❌ Failed | $FAILED |"
              echo "| **Total** | **$TOTAL** |"
              echo ""
              echo "### Test Details"
              echo ""
              echo "| Test | Status | Duration |"
              echo "|------|--------|----------|"
              jq -r '.results[] | "| \(.name) | \(if .status == "passed" then "✅ Pass" else "❌ Fail" end) | \(.duration // "—")ms |"' \
                /tmp/openresponses-results.json
              echo ""
            } >> "$GITHUB_STEP_SUMMARY"
            if [ "$FAILED" -gt 0 ]; then
              {
                echo "### Failure Details"
                echo ""
                jq -r '.results[] | select(.status == "failed") | "**\(.name)**\n" + ((.errors // []) | map("- \(.)") | join("\n")) + "\n"' \
                  /tmp/openresponses-results.json
              } >> "$GITHUB_STEP_SUMMARY"
            fi
          else
            {
              echo "### Results"
              echo ""
              echo "Could not parse structured results."
              if [ -f /tmp/openresponses-output.txt ]; then
                echo ""
                echo "<details><summary>Raw output</summary>"
                echo ""
                echo '```'
                cat /tmp/openresponses-output.txt
                echo '```'
                echo ""
                echo "</details>"
              fi
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Upload conformance test results
        if: always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: openresponses-conformance-results-${{ github.run_id }}
          path: |
            /tmp/openresponses-results.json
            /tmp/openresponses-output.txt
          retention-days: 30
          if-no-files-found: warn

      - name: Print server log on failure
        if: failure()
        run: |
          echo "=== Llama Stack server log (last 200 lines) ==="
          tail -200 /tmp/llama-stack-conformance/server.log || true