Skip to content

Commit 2f00edf

Browse files
dgerogclaude
and committed
Add adapters, --import flag, firewall docs update (v0.9.1)
- Move PromptFoo/PyRIT adapters from fw lib to CLI - Add --import flag with auto-format detection - Document importing in firewall integration docs - Bump to 0.9.1 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6e78f30 commit 2f00edf

6 files changed

Lines changed: 204 additions & 6 deletions

File tree

docs/docs/integrations/firewall.md

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -387,14 +387,40 @@ hb firewall train --model detectors/setfit_classifier.py
387387
| `--until DATE` | Filter experiments until this date. |
388388
| `--min-samples N` | Minimum conversations required (default: 30). |
389389
| `--output PATH` | Output .hbfw file path (default: `firewall_<project>.hbfw`). |
390+
| `--import FILE` | Import external logs (repeatable). Auto-detects format. |
390391

391392
The command:
392393

393394
1. Fetches your adversarial and QA experiment logs
394-
2. Curates attack data (failed adversarial turns, stratified by fail category)
395-
3. Curates benign data (passed QA turns, stratified by user persona)
396-
4. Trains your AgentClassifier
397-
5. Saves the model as a `.hbfw` file
395+
2. Imports external logs if `--import` provided (PromptFoo, PyRIT)
396+
3. Curates attack data (failed adversarial turns, stratified by fail category)
397+
4. Curates benign data (passed QA turns, stratified by user persona)
398+
5. Trains your AgentClassifier
399+
6. Saves the model as a `.hbfw` file
400+
401+
### Importing External Logs
402+
403+
Combine data from other red-teaming frameworks with your Humanbound test data:
404+
405+
```bash
406+
# Auto-detect format from file structure
407+
hb firewall train --import pyrit_results.json
408+
409+
# Explicit format
410+
hb firewall train --import results.json:promptfoo
411+
412+
# Multiple sources
413+
hb firewall train --import pyrit.json --import promptfoo.json
414+
```
415+
416+
Supported frameworks:
417+
418+
| Framework | Format | Auto-detected by |
419+
|-----------|--------|-----------------|
420+
| [PyRIT](https://github.com/Azure/PyRIT) (Microsoft) | JSON scan output | `redteaming_data` key |
421+
| [PromptFoo](https://github.com/promptfoo/promptfoo) | JSON eval export | `evalId` + `results` keys |
422+
423+
Imported logs are merged with Humanbound logs before training. More data sources → better Tier 2 coverage.
398424

399425
### Show
400426

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""Log adapters — convert external framework results to hb-firewall format.
2+
3+
Auto-detects format from file structure. Add new adapters by creating a module
4+
with SIGNATURES (list of keys to match) and convert(data) → list[dict].
5+
"""
6+
7+
import json
8+
from pathlib import Path
9+
10+
from . import promptfoo, pyrit
11+
12+
_ADAPTERS = {
13+
"promptfoo": promptfoo,
14+
"pyrit": pyrit,
15+
}
16+
17+
18+
def detect_format(data: dict) -> str:
    """Return the name of the first adapter whose signature matches *data*.

    An adapter matches when it declares a non-empty SIGNATURES list and every
    listed key is present at the top level of the parsed file. Returns ""
    when no registered adapter matches.
    """
    return next(
        (
            tag
            for tag, module in _ADAPTERS.items()
            if (keys := getattr(module, "SIGNATURES", []))
            and all(key in data for key in keys)
        ),
        "",
    )
25+
26+
27+
def convert_file(file_path: str, format_tag: str = "") -> list[dict]:
    """Convert an external log file to hb-firewall standard format.

    Args:
        file_path: path to a JSON or JSONL file
        format_tag: explicit format (e.g. "promptfoo", "pyrit"). Auto-detects
            from the file's top-level keys if empty.

    Returns:
        list of logs in standard format:
        [{"conversation": [...], "result": "pass"|"fail", "test_category": "...",
          "fail_category": "...", "severity": float, "confidence": float}, ...]

    Raises:
        ValueError: if the format cannot be auto-detected, or the explicit
            format_tag is not a registered adapter.
    """
    path = Path(file_path)

    # Explicit UTF-8: JSON is UTF-8 by spec, and relying on the locale's
    # preferred encoding breaks on non-UTF-8 systems (see PEP 597).
    if path.suffix == ".jsonl":
        with open(path, encoding="utf-8") as f:
            lines = [json.loads(line) for line in f if line.strip()]
        # JSONL: treat as list of individual results. NOTE: auto-detection
        # inspects top-level keys, and no adapter signature matches
        # "_jsonl_entries", so JSONL input needs an explicit format_tag.
        data = {"_jsonl_entries": lines}
    else:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

    tag = format_tag or detect_format(data)
    available = ", ".join(_ADAPTERS)
    if not tag:
        raise ValueError(
            f"Unrecognized format in '{file_path}'. "
            f"Specify format: --import {file_path}:<format> "
            f"(available: {available})")

    if tag not in _ADAPTERS:
        raise ValueError(f"Unknown format '{tag}'. Available: {available}")

    return _ADAPTERS[tag].convert(data)
63+
64+
65+
def list_formats() -> list[str]:
    """Return the registered adapter format tags, in registration order."""
    return [tag for tag in _ADAPTERS]
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""PromptFoo adapter — converts evaluation results to hb-firewall format.
2+
3+
Handles the export format from: promptfoo export eval <id> -o results.json
4+
"""
5+
6+
SIGNATURES = ["results", "evalId"]
7+
8+
9+
def convert(data: dict) -> list[dict]:
    """Convert a PromptFoo eval export to hb-firewall standard log format.

    Args:
        data: parsed JSON of a PromptFoo export — expects
            data["results"]["results"] to be a list of test-case entries.

    Returns:
        list of standard-format logs; entries with no prompt text are skipped.
    """
    logs = []
    results = data.get("results") or {}

    for entry in results.get("results", []):
        # Extract prompt text. The prompt may be a dict ({"raw": ..., "label": ...})
        # or a bare string depending on promptfoo version; an explicit JSON null
        # must not become the literal string "None".
        prompt_obj = entry.get("prompt") or {}
        if isinstance(prompt_obj, dict):
            prompt = prompt_obj.get("raw", prompt_obj.get("label", ""))
        else:
            prompt = str(prompt_obj)

        # Extract response text, guarding against an explicit null response
        # or a null "output" field.
        response_obj = entry.get("response") or {}
        if isinstance(response_obj, dict):
            response = response_obj.get("output", "") or ""
        else:
            response = str(response_obj)

        if not prompt:
            continue

        success = entry.get("success", True)
        score = entry.get("score", 1.0 if success else 0.0)

        # Extract fail reason from grading; "reason" may be null in exports,
        # so coalesce before slicing to the 80-char category limit.
        fail_category = ""
        grading = entry.get("gradingResult") or {}
        if grading and not success:
            fail_category = (grading.get("reason") or "")[:80]

        # Map score to severity (0 = benign, 100 = severe), clamped so
        # out-of-range scores (e.g. negative) stay within the scale.
        if isinstance(score, (int, float)) and score <= 1.0:
            severity = min(100.0, max(0.0, float((1.0 - score) * 100)))
        else:
            severity = 50.0

        logs.append({
            "conversation": [{"u": prompt, "a": response}],
            "result": "pass" if success else "fail",
            "test_category": "adversarial",
            "fail_category": fail_category,
            "severity": severity,
            "confidence": 80.0,
            "gen_category": "promptfoo",
        })

    return logs

humanbound_cli/adapters/pyrit.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""PyRIT (Microsoft) adapter — converts red teaming scan results to hb-firewall format."""
2+
3+
SIGNATURES = ["redteaming_data"]
4+
5+
SEVERITY_MAP = {
6+
"very low": 10,
7+
"low": 30,
8+
"medium": 50,
9+
"high": 75,
10+
"very high": 95,
11+
}
12+
13+
14+
def convert(data: dict) -> list[dict]:
15+
logs = []
16+
for entry in data.get("redteaming_data", []):
17+
conv = []
18+
for msg in entry.get("conversation", []):
19+
role = msg.get("role", "")
20+
content = msg.get("content", "")
21+
if role == "user":
22+
conv.append({"u": content, "a": ""})
23+
elif role == "assistant" and conv:
24+
conv[-1]["a"] = content
25+
26+
if not conv:
27+
continue
28+
29+
attack_success = entry.get("attack_success", False)
30+
risk_cat = entry.get("risk_category", "")
31+
32+
# Extract max severity from risk_assessment
33+
severity = 0
34+
risk_assessment = entry.get("risk_assessment", {})
35+
for cat_info in risk_assessment.values():
36+
if isinstance(cat_info, dict):
37+
label = cat_info.get("severity_label", "").lower()
38+
severity = max(severity, SEVERITY_MAP.get(label, 0))
39+
40+
logs.append({
41+
"conversation": conv,
42+
"result": "fail" if attack_success else "pass",
43+
"test_category": "adversarial",
44+
"fail_category": risk_cat if attack_success else "",
45+
"severity": float(severity),
46+
"confidence": 90.0,
47+
"gen_category": entry.get("attack_technique", ""),
48+
})
49+
50+
return logs

humanbound_cli/commands/firewall.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def train_command(model_path, last_n, from_date, until_date, min_samples,
101101

102102
# Import external logs if provided
103103
if import_files:
104-
from hb_firewall.adapters import convert_file
104+
from humanbound_cli.adapters import convert_file
105105
for import_arg in import_files:
106106
# Parse file:format syntax
107107
if ":" in import_arg and not import_arg.startswith("/"):

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "humanbound-cli"
7-
version = "0.9.0"
7+
version = "0.9.1"
88
authors = [
99
{ name="Kostas Siabanis", email="hello@humanbound.ai" },
1010
{ name="Demetris Gerogiannis", email="hello@humanbound.ai" },

0 commit comments

Comments
 (0)