This repository was archived by the owner on Mar 20, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproject.yaml
More file actions
129 lines (118 loc) · 5.21 KB
/
project.yaml
File metadata and controls
129 lines (118 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# project.yaml — Agent Security Red-Team Framework (FP-02)
#
# Self-directed research: red-teaming autonomous AI agents.
# Profile: security-ml (govML v2.4)
# Workload type: I/O-bound (API calls to LLM providers, not local model training)
---
# Core project identity: govML profile, pinned environment, and the
# document-authority hierarchy that governs project decisions.
# NOTE(review): tier ordering (lower tier = higher authority) assumed from
# naming — confirm against the govML v2.4 profile definition.
project:
  name: "Agent Security Red-Team Framework"
  profile: security-ml
  python_version: "3.11"  # quoted so it is a string, not the float 3.11
  conda_env: "agent-redteam"
  authority:
    tier1: "docs/PROJECT_BRIEF.md"
    tier2: null  # no tier-2 document defined for this project
    tier3: "docs/ADVERSARIAL_EVALUATION.md"
# Publication target for the research output (primary venue plus CFP reach).
publication:
  title: "I Red-Teamed AI Agents: Here's How They Break (and How to Fix Them)"
  pillar: "AI Security Architecture"
  venue: "blog"
  conference_cfp: "BSides / DEF CON AI Village"  # CFP targets beyond the blog post
  target_date: "2026-05-01"  # quoted — a bare ISO date would parse as a date object
# Skill-growth objectives per cluster: current level -> target level, with
# the concrete work ("how") that drives the progression. Block style used
# instead of inline flow mappings — these are structured multi-key records.
# NOTE(review): cluster letters (L/S/P/D/V) assumed to follow the govML
# skill-cluster taxonomy — confirm against the profile definition.
skill_clusters:
  L:
    current: "L3+"
    target: "L4"
    how: "Agent orchestration, multi-framework tool-use patterns"
  S:
    current: "S2+"
    target: "S3"
    how: "Novel attack taxonomy + 3rd domain adversarial control analysis"
  P:
    current: "P2++"
    target: "P3"
    how: "CLI tool (pip install agent-redteam)"
  D:
    current: "D3+"
    target: "D4"
    how: "Defense tradeoff documentation, controllability matrix"
  V:
    current: "V1"
    target: "V2"
    how: "Blog post + conference CFP (hottest topic)"
# Infrastructure assessment (ISS-039) — minimum provisioning for the
# experiment matrix; sized for an I/O-bound workload, not local training.
infrastructure:
  workload_type: "io_bound"  # API calls to Claude/OpenAI — not CPU-bound
  min_disk_gb: 5  # Env (~1.2GB) + outputs (~500MB) + headroom
  min_ram_gb: 4  # No large in-memory datasets
  min_cores: 2  # I/O-bound, not CPU-bound
  api_budget_usd: 50  # Claude + OpenAI token costs
  data_disk: "/data"  # Azure data disk mounted for outputs
# Experiment matrix — attacks × agent targets × seeds. Each part names the
# script that runs it, the method axis it sweeps, and where outputs land.
experiments:
  seeds: [42, 123, 456]  # fixed seeds for reproducibility across runs
  parts:
    # Unattacked behavior captured per agent framework first.
    - name: baseline_behavior
      methods: [langchain_react, crewai_multi, autogen_chat]
      datasets: [agent_tasks]
      script: "scripts/run_baselines.py"
      flags: "--task simple_tool_call"
      output_dir: "outputs/baselines"
    # Attack classes executed against a fixed agent target.
    - name: attacks
      methods: [prompt_injection, tool_misuse, privilege_escalation, memory_poisoning, output_manipulation]
      datasets: [agent_tasks]
      script: "scripts/run_attacks.py"
      flags: "--agent langchain_react"
      output_dir: "outputs/attacks"
    # Defenses evaluated against a fixed agent/attack pair.
    - name: defenses
      methods: [input_sanitizer, tool_permission_boundary, memory_integrity_check, output_validator]
      datasets: [agent_tasks]
      script: "scripts/run_defenses.py"
      flags: "--agent langchain_react --attack prompt_injection"
      output_dir: "outputs/defenses"
# "Dataset" = self-generated attack scenarios against controlled agents
# (nothing is downloaded; scenarios are authored locally under local_path).
datasets:
  - name: agent_tasks
    source: "Self-generated task scenarios for agent evaluation"
    download_method: "none"  # quoted string by design, not YAML null
    local_path: "data/tasks/"
    known_issues:
      - "Agent framework APIs change frequently — pin versions"
      - "LLM responses are non-deterministic — use temperature=0 + seeds where supported"
      - "API rate limits may throttle parallel execution"
# Layered configuration: a base file plus glob-matched overlay layers,
# resolved into a single snapshot file per run.
# NOTE(review): layer precedence (later patterns override earlier) assumed —
# confirm against the config loader.
config:
  base: "config/base.yaml"
  layers:
    - pattern: "config/agents/*.yaml"  # globs kept quoted — * is a YAML alias sigil
    - pattern: "config/attacks/*.yaml"
    - pattern: "config/defenses/*.yaml"
  resolved_output: "config_resolved.yaml"
# Phase definitions — each phase lists the shell checks that gate it.
phases:
  - name: "Phase 0 — Environment & Agent Setup"
    checks:
      - command: "bash scripts/verify_env.sh"
        description: "Environment + framework imports verified"
      - command: "python scripts/smoke_test_agents.py"
        description: "Each agent target completes a simple tool-calling task"
      - command: "git remote -v"
        description: "Git remote configured"
  - name: "Phase 1 — Attack Taxonomy & Baselines"
    checks:
      - command: "python scripts/run_baselines.py --agent langchain_react --task simple_tool_call --seed 42"
        description: "Baseline behavior captured for at least 1 agent"
      - command: "test -f docs/attack_taxonomy.md"
        description: "Attack taxonomy document exists"
  - name: "Phase 2 — Attack Execution"
    checks:
      - command: "python scripts/run_attacks.py --agent langchain_react --attack prompt_injection --seed 42"
        description: "At least 1 attack runs successfully"
  - name: "Phase 3 — Defense Evaluation"
    checks:
      - command: "python scripts/run_defenses.py --agent langchain_react --attack prompt_injection --defense input_sanitizer --seed 42"
        description: "At least 1 defense runs successfully"
  - name: "Phase 4 — Findings & Publication"
    checks:
      - command: "python scripts/verify_manifests.py"
        description: "All artifacts verified"
      - command: "test -f FINDINGS.md"
        description: "FINDINGS.md exists"
# Per-run artifact and provenance requirements.
artifacts:
  # Kept quoted — an unquoted leading { would be parsed as a flow mapping.
  run_id_pattern: "{part}_{method}_{agent}_seed{seed}"
  per_run_files:
    - "summary.json"
    - "config_resolved.yaml"
    - "attack_log.jsonl"
  hash_algorithm: "sha256"
  manifest_file: "outputs/manifest.json"
# AI-tooling governance: what the coding copilot may and may not do.
ai_governance:
  tools:
    - name: "Claude Code"
      role: "Coding copilot, test generation, script execution, governance audit"
      # Folded scalar (>-): lines join with a single space, no trailing newline,
      # so the value is identical to the original single-line string.
      prohibited: >-
        Must not interpret WHY attacks succeed (human practitioner judgment).
        Must not attack production systems.
  anti_ghostwriting: true
  test_set_barrier: false  # No train/test split — all scenarios are evaluation