METTLE Red Council Adversarial Testing #71

Workflow file for this run

.github/workflows/red-council.yml at 2fc26cb

	name: METTLE Red Council Adversarial Testing

	on:
	push:
	branches: [main, develop]
	paths:
	- 'mettle/**'
	- 'main.py'
	- 'scripts/**'
	- 'tests/redteam/**'
	pull_request:
	branches: [main]
	paths:
	- 'mettle/**'
	- 'main.py'
	- 'scripts/**'
	- 'tests/redteam/**'
	schedule:
	- cron: '0 4 * * *' # Daily at 4 AM UTC
	workflow_dispatch:
	inputs:
	severity_threshold:
	description: 'Minimum severity for critical failures (1-10)'
	required: false
	default: '7'

	permissions:
	contents: read
	security-events: write
	actions: read
	issues: write

	env:
	RED_COUNCIL_TIMEOUT: 300
	OWASP_SEVERITY_THRESHOLD: ${{ github.event.inputs.severity_threshold \|\| '7' }}
	SKIP_LIVE_TESTS: 'true'
	PYTHONPATH: ${{ github.workspace }}

	jobs:
	mettle-red-council-tests:
	name: OWASP Agentic Attack Testing
	runs-on: ubuntu-latest

	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Setup Python
	uses: actions/setup-python@v5
	with:
	python-version: '3.11'

	- name: Cache pip packages
	uses: actions/cache@v4
	with:
	path: ~/.cache/pip
	key: ${{ runner.os }}-pip-mettle-${{ hashFiles('requirements.txt') }}
	restore-keys: \|
	${{ runner.os }}-pip-mettle-
	${{ runner.os }}-pip-

	- name: Install dependencies
	run: \|
	python -m pip install --upgrade pip
	pip install -r requirements.txt
	pip install -r requirements-dev.txt
	pip install pyyaml httpx

	- name: Validate scenarios file
	run: \|
	echo "Validating METTLE attack scenarios..."
	python3 - <<'EOF'
	import yaml
	import sys

	with open('tests/redteam/scenarios/mettle_red_council_attacks.yaml') as f:
	data = yaml.safe_load(f)

	scenarios = data.get('scenarios', [])
	print(f"Found {len(scenarios)} scenarios")

	# Validate required fields
	required = ['id', 'name', 'suite', 'challenge', 'prompt', 'should_pass_mettle']
	errors = []

	for s in scenarios:
	missing = [f for f in required if f not in s]
	if missing:
	errors.append(f"{s.get('id', 'UNKNOWN')}: missing {missing}")

	if errors:
	print("Validation errors:")
	for e in errors:
	print(f" - {e}")
	sys.exit(1)

	# Check OWASP coverage
	owasp = {}
	for s in scenarios:
	cat = s.get('owasp_category', 'UNKNOWN')
	owasp[cat] = owasp.get(cat, 0) + 1

	print("OWASP Coverage:")
	for cat in sorted(owasp.keys()):
	print(f" {cat}: {owasp[cat]} scenarios")

	print("\nScenarios validated successfully")
	EOF

	- name: Run METTLE Red Council attacks
	id: run_attacks
	run: \|
	echo "Running OWASP Agentic attacks against METTLE..."
	python3 scripts/testing/run_mettle_red_council.py \
	--scenarios tests/redteam/scenarios/mettle_red_council_attacks.yaml \
	--output reports/mettle_red_council_results.json \
	--severity-threshold "$OWASP_SEVERITY_THRESHOLD" \
	--timeout 30 \|\| echo "ATTACKS_FAILED=true" >> "$GITHUB_ENV"

	- name: Generate summary
	if: always()
	run: \|
	python3 - <<'EOF'
	import json
	from pathlib import Path

	report_path = Path('reports/mettle_red_council_results.json')
	if not report_path.exists():
	print("No report generated")
	exit(0)

	with open(report_path) as f:
	report = json.load(f)

	print("## METTLE Red Council Results")
	print("")
	print(f"Session: {report['session_id']}")
	print(f"Timestamp: {report['timestamp']}")
	print("")
	print(f"\| Metric \| Value \|")
	print(f"\|--------\|-------\|")
	print(f"\| Total Scenarios \| {report['total_scenarios']} \|")
	print(f"\| Passed \| {report['passed']} \|")
	print(f"\| Failed \| {report['failed']} \|")
	print(f"\| Critical Failures \| {report['critical_failures']} \|")
	print(f"\| Pass Rate \| {report['pass_rate']:.1%} \|")
	print("")

	# OWASP coverage
	print("### OWASP Agentic Coverage")
	print("")
	print("\| Category \| Passed \| Failed \|")
	print("\|----------\|--------\|--------\|")
	for cat, stats in sorted(report.get('owasp_coverage', {}).items()):
	status = "pass" if stats['failed'] == 0 else "fail"
	print(f"\| {status} {cat} \| {stats['passed']} \| {stats['failed']} \|")
	print("")

	# List failures
	failures = [s for s in report['scenarios'] if not s['passed']]
	if failures:
	print("### Failures")
	print("")
	for f in failures:
	sev = "HIGH" if f['severity'] >= 8 else "MED" if f['severity'] >= 6 else "LOW"
	print(f"- {sev} {f['id']}: {f['name']}")
	print(f" - Suite: {f['suite']}, Attack: {f['attack_type']}")
	print(f" - Expected METTLE={f['should_pass_mettle']}, Got={f['mettle_passed']}")
	EOF

	- name: Upload reports
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: mettle-red-council-reports
	path: \|
	reports/mettle_red_council_results.json
	retention-days: 30

	- name: Check for critical failures
	if: always()
	run: \|
	if [ "$ATTACKS_FAILED" = "true" ]; then
	echo "::error::Critical failures detected in METTLE Red Council testing"
	exit 1
	fi

	create-hardening-issue:
	name: Create Hardening Issue
	needs: mettle-red-council-tests
	runs-on: ubuntu-latest
	if: failure() && github.event_name != 'pull_request'

	steps:
	- name: Checkout code
	uses: actions/checkout@v4

	- name: Download reports
	uses: actions/download-artifact@v4
	with:
	name: mettle-red-council-reports
	path: reports/

	- name: Create GitHub Issue
	uses: actions/github-script@v7
	with:
	script: \|
	const fs = require('fs');

	let report;
	try {
	report = JSON.parse(fs.readFileSync('reports/mettle_red_council_results.json'));
	} catch (e) {
	console.log('No report to process');
	return;
	}

	const failures = report.scenarios.filter(s => !s.passed);
	if (failures.length === 0) return;

	const critical = failures.filter(s => s.severity >= 7);

	// Build issue body from trusted report data only (no user input)
	let body = `## METTLE Red Council Test Failures\n\n`;
	body += `Run ID: ${context.runId}\n`;
	body += `Timestamp: ${report.timestamp}\n`;
	body += `Critical Failures: ${critical.length}\n\n`;
	body += `### Failed Scenarios\n\n`;

	for (const f of failures) {
	body += `#### ${f.id}: ${f.name}\n`;
	body += `- OWASP Category: ${f.owasp_category}\n`;
	body += `- Severity: ${f.severity}/10\n`;
	body += `- Suite: \`${f.suite}\`\n`;
	body += `- Attack Type: \`${f.attack_type}\`\n`;
	body += `- Expected METTLE: ${f.should_pass_mettle}\n`;
	body += `- Actual METTLE: ${f.mettle_passed}\n\n`;
	}

	body += `### Recommended Actions\n\n`;
	body += `1. Review the attack scenarios that failed\n`;
	body += `2. Determine if METTLE detection logic needs hardening\n`;
	body += `3. Add regression tests for fixed scenarios\n\n`;
	body += `---\nGenerated by METTLE Red Council CI\n`;

	await github.rest.issues.create({
	owner: context.repo.owner,
	repo: context.repo.repo,
	title: `[Red Council] ${failures.length} METTLE hardening recommendations`,
	body: body,
	labels: ['security', 'red-team', 'mettle', 'automated']
	});

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

METTLE Red Council Adversarial Testing #71

Workflow file

METTLE Red Council Adversarial Testing #71

Uh oh!

Workflow file for this run