This repository was archived by the owner on Mar 20, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproject.yaml
More file actions
129 lines (118 loc) · 5.21 KB
/
project.yaml
File metadata and controls
129 lines (118 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# project.yaml — Agent Security Red-Team Framework (FP-02)
#
# Self-directed research: red-teaming autonomous AI agents.
# Profile: security-ml (govML v2.4)
# Workload type: I/O-bound (API calls to LLM providers, not local model training)
---
# Core project identity: govML profile, pinned environment, and the
# document-authority hierarchy that governs project decisions.
# NOTE(review): tier ordering (lower tier = higher authority) assumed from
# naming — confirm against the govML v2.4 profile definition.
project:
  name: "Agent Security Red-Team Framework"
  profile: security-ml
  python_version: "3.11"  # quoted so it is a string, not the float 3.11
  conda_env: "agent-redteam"
  authority:
    tier1: "docs/PROJECT_BRIEF.md"
    tier2: null  # no tier-2 document defined for this project
    tier3: "docs/ADVERSARIAL_EVALUATION.md"
# Publication target for the research output (primary venue plus CFP reach).
publication:
  title: "I Red-Teamed AI Agents: Here's How They Break (and How to Fix Them)"
  pillar: "AI Security Architecture"
  venue: "blog"
  conference_cfp: "BSides / DEF CON AI Village"  # CFP targets beyond the blog post
  target_date: "2026-05-01"  # quoted — a bare ISO date would parse as a date object
# Skill-growth objectives per cluster: current level -> target level, with
# the concrete work ("how") that drives the progression. Block style used
# instead of inline flow mappings — these are structured multi-key records.
# NOTE(review): cluster letters (L/S/P/D/V) assumed to follow the govML
# skill-cluster taxonomy — confirm against the profile definition.
skill_clusters:
  L:
    current: "L3+"
    target: "L4"
    how: "Agent orchestration, multi-framework tool-use patterns"
  S:
    current: "S2+"
    target: "S3"
    how: "Novel attack taxonomy + 3rd domain adversarial control analysis"
  P:
    current: "P2++"
    target: "P3"
    how: "CLI tool (pip install agent-redteam)"
  D:
    current: "D3+"
    target: "D4"
    how: "Defense tradeoff documentation, controllability matrix"
  V:
    current: "V1"
    target: "V2"
    how: "Blog post + conference CFP (hottest topic)"
# Infrastructure assessment (ISS-039) — minimum provisioning for the
# experiment matrix; sized for an I/O-bound workload, not local training.
infrastructure:
  workload_type: "io_bound"  # API calls to Claude/OpenAI — not CPU-bound
  min_disk_gb: 5  # Env (~1.2GB) + outputs (~500MB) + headroom
  min_ram_gb: 4  # No large in-memory datasets
  min_cores: 2  # I/O-bound, not CPU-bound
  api_budget_usd: 50  # Claude + OpenAI token costs
  data_disk: "/data"  # Azure data disk mounted for outputs
# Experiment matrix — attacks × agent targets × seeds. Each part names the
# script that runs it, the method axis it sweeps, and where outputs land.
experiments:
  seeds: [42, 123, 456]  # fixed seeds for reproducibility across runs
  parts:
    # Unattacked behavior captured per agent framework first.
    - name: baseline_behavior
      methods: [langchain_react, crewai_multi, autogen_chat]
      datasets: [agent_tasks]
      script: "scripts/run_baselines.py"
      flags: "--task simple_tool_call"
      output_dir: "outputs/baselines"
    # Attack classes executed against a fixed agent target.
    - name: attacks
      methods: [prompt_injection, tool_misuse, privilege_escalation, memory_poisoning, output_manipulation]
      datasets: [agent_tasks]
      script: "scripts/run_attacks.py"
      flags: "--agent langchain_react"
      output_dir: "outputs/attacks"
    # Defenses evaluated against a fixed agent/attack pair.
    - name: defenses
      methods: [input_sanitizer, tool_permission_boundary, memory_integrity_check, output_validator]
      datasets: [agent_tasks]
      script: "scripts/run_defenses.py"
      flags: "--agent langchain_react --attack prompt_injection"
      output_dir: "outputs/defenses"
# "Dataset" = self-generated attack scenarios against controlled agents
# (nothing is downloaded; scenarios are authored locally under local_path).
datasets:
  - name: agent_tasks
    source: "Self-generated task scenarios for agent evaluation"
    download_method: "none"  # quoted string by design, not YAML null
    local_path: "data/tasks/"
    known_issues:
      - "Agent framework APIs change frequently — pin versions"
      - "LLM responses are non-deterministic — use temperature=0 + seeds where supported"
      - "API rate limits may throttle parallel execution"
# Layered configuration: a base file plus glob-matched overlay layers,
# resolved into a single snapshot file per run.
# NOTE(review): layer precedence (later patterns override earlier) assumed —
# confirm against the config loader.
config:
  base: "config/base.yaml"
  layers:
    - pattern: "config/agents/*.yaml"  # globs kept quoted — * is a YAML alias sigil
    - pattern: "config/attacks/*.yaml"
    - pattern: "config/defenses/*.yaml"
  resolved_output: "config_resolved.yaml"
# Phase definitions — each phase lists the shell checks that gate it.
phases:
  - name: "Phase 0 — Environment & Agent Setup"
    checks:
      - command: "bash scripts/verify_env.sh"
        description: "Environment + framework imports verified"
      - command: "python scripts/smoke_test_agents.py"
        description: "Each agent target completes a simple tool-calling task"
      - command: "git remote -v"
        description: "Git remote configured"
  - name: "Phase 1 — Attack Taxonomy & Baselines"
    checks:
      - command: "python scripts/run_baselines.py --agent langchain_react --task simple_tool_call --seed 42"
        description: "Baseline behavior captured for at least 1 agent"
      - command: "test -f docs/attack_taxonomy.md"
        description: "Attack taxonomy document exists"
  - name: "Phase 2 — Attack Execution"
    checks:
      - command: "python scripts/run_attacks.py --agent langchain_react --attack prompt_injection --seed 42"
        description: "At least 1 attack runs successfully"
  - name: "Phase 3 — Defense Evaluation"
    checks:
      - command: "python scripts/run_defenses.py --agent langchain_react --attack prompt_injection --defense input_sanitizer --seed 42"
        description: "At least 1 defense runs successfully"
  - name: "Phase 4 — Findings & Publication"
    checks:
      - command: "python scripts/verify_manifests.py"
        description: "All artifacts verified"
      - command: "test -f FINDINGS.md"
        description: "FINDINGS.md exists"
# Per-run artifact and provenance requirements.
artifacts:
  # Kept quoted — an unquoted leading { would be parsed as a flow mapping.
  run_id_pattern: "{part}_{method}_{agent}_seed{seed}"
  per_run_files:
    - "summary.json"
    - "config_resolved.yaml"
    - "attack_log.jsonl"
  hash_algorithm: "sha256"
  manifest_file: "outputs/manifest.json"
# AI-tooling governance: what the coding copilot may and may not do.
ai_governance:
  tools:
    - name: "Claude Code"
      role: "Coding copilot, test generation, script execution, governance audit"
      # Folded scalar (>-): lines join with a single space, no trailing newline,
      # so the value is identical to the original single-line string.
      prohibited: >-
        Must not interpret WHY attacks succeed (human practitioner judgment).
        Must not attack production systems.
  anti_ghostwriting: true
  test_set_barrier: false  # No train/test split — all scenarios are evaluation