-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreasoning_leak_canary.py
More file actions
39 lines (32 loc) · 1.15 KB
/
reasoning_leak_canary.py
File metadata and controls
39 lines (32 loc) · 1.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/env python3
"""PostToolUse hook: after editing prompt/outreach files, warn about reasoning leak risk."""
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from hook_base import run_hook
# Substrings that mark a path as containing system prompts or outreach logic.
# Matching is a plain, case-sensitive substring test against the edited path.
PROMPT_PATTERNS = [
    "system_prompt",
    "outreach/",
    "auto_reply",
    "few_shot",
    "prompt_template",
    "persona_prompt",
]


def check(tool_name, tool_input, input_data):
    """Return True when an Edit/Write call touched a prompt or outreach file.

    Args:
        tool_name: Name of the tool that ran; only "Edit" and "Write" qualify.
        tool_input: Mapping of the tool's arguments; its "file_path" entry is
            the path that gets tested. A missing mapping, a missing key, or a
            non-string value all yield False.
        input_data: Unused here; accepted because the same three-argument
            signature is shared with ``action`` (presumably the raw hook
            payload -- confirm against hook_base).

    Returns:
        True if the path contains any entry of PROMPT_PATTERNS, else False.
    """
    if tool_name not in ("Edit", "Write"):
        return False

    # A payload may carry `"file_path": null` (or no tool_input at all); the
    # substring test below would raise TypeError on anything but a string.
    file_path = (tool_input or {}).get("file_path")
    if not isinstance(file_path, str):
        return False

    # Normalise Windows separators so the "outreach/" pattern also matches
    # paths such as "app\\outreach\\mail.py".
    normalized = file_path.replace("\\", "/")
    return any(pattern in normalized for pattern in PROMPT_PATTERNS)
def action(tool_name, tool_input, input_data):
    """Build the reasoning-leak warning shown after a prompt/outreach edit.

    None of the three arguments is consulted; the same fixed, newline-separated
    Markdown message is returned on every call.

    Returns:
        The warning text as a single string without a trailing newline.
    """
    warning_lines = (
        "⚠️ **Prompt/outreach file edited.** Reasoning leak risk.",
        "Before deploying, test with a canary message and verify:",
        "1. No `<think>` tags in output",
        "2. No `let me follow the rules` or similar CoT leaks",
        "3. No meta-commentary about the prompt itself",
        "Run the red team subset: `python red_team_chain.py --quick`",
    )
    return "\n".join(warning_lines)
# Script entry point: when executed directly (not imported), pass the
# predicate, the message builder and this hook's identifier string to
# run_hook, which is imported from hook_base.
if __name__ == "__main__":
    run_hook(check, action, "reasoning_leak_canary")