Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/stackone_defender/core/prompt_defense.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def __init__(
if block_high_risk:
self._config.block_high_risk = True

tool_rules = self._config.tool_rules if use_default_tool_rules else []
tool_rules = (config or {}).get("tool_rules") or (self._config.tool_rules if use_default_tool_rules else [])
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using or to chain the fallback means an explicitly empty config={"tool_rules": []} is treated as falsy and falls through to the use_default_tool_rules branch. If the intent is that custom config tool rules always take precedence (as stated in the PR description), consider using an explicit None check instead, e.g.: tool_rules = (config or {}).get("tool_rules") if (config or {}).get("tool_rules") is not None else (self._config.tool_rules if use_default_tool_rules else []). This way, an explicitly empty list from config would be respected as "no tool rules" rather than falling through.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown

@cubic-dev-ai cubic-dev-ai bot Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: The or fallback treats an explicitly empty tool_rules list as “not provided.” If a caller sets "tool_rules": [] to disable tool rules, this line still loads defaults when use_default_tool_rules is true. Use an explicit key check so empty lists are honored.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/stackone_defender/core/prompt_defense.py, line 58:

<comment>The `or` fallback treats an explicitly empty `tool_rules` list as “not provided.” If a caller sets `"tool_rules": []` to disable tool rules, this line still loads defaults when `use_default_tool_rules` is true. Use an explicit key check so empty lists are honored.</comment>

<file context>
@@ -55,7 +55,7 @@ def __init__(
             self._config.block_high_risk = True
 
-        tool_rules = self._config.tool_rules if use_default_tool_rules else []
+        tool_rules = (config or {}).get("tool_rules") or (self._config.tool_rules if use_default_tool_rules else [])
 
         self._tool_sanitizer: ToolResultSanitizer = create_tool_result_sanitizer(
</file context>
Suggested change
tool_rules = (config or {}).get("tool_rules") or (self._config.tool_rules if use_default_tool_rules else [])
tool_rules = (config or {}).get("tool_rules") if "tool_rules" in (config or {}) else (self._config.tool_rules if use_default_tool_rules else [])
Fix with Cubic


self._tool_sanitizer: ToolResultSanitizer = create_tool_result_sanitizer(
risky_fields=self._config.risky_fields,
Expand Down Expand Up @@ -120,7 +120,20 @@ def defend_tool_result(self, value: Any, tool_name: str) -> DefenseResult:
tier2_idx = _RISK_LEVELS.index(tier2_risk)
risk_level = _RISK_LEVELS[max(tier1_idx, tier2_idx)]

allowed = not self._config.block_high_risk or risk_level not in ("high", "critical")
# Determine whether any threat signals were found (Tier 1 or Tier 2).
# fields_sanitized captures sanitization methods (role stripping, encoding detection, etc.)
# that may fire without adding named pattern detections, so we include it here.
has_threats = (
len(detections) > 0
or len(fields_sanitized) > 0
or (tier2_score is not None and tier2_score >= self._config.tier2.high_risk_threshold)
Copy link
Copy Markdown

@cubic-dev-ai cubic-dev-ai bot Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: has_threats compares tier2 scores against self._config.tier2.high_risk_threshold, which doesn’t reflect tier2_config overrides. If the classifier uses a lower high-risk threshold, tier2_risk can be high while has_threats stays false, so block_high_risk won’t block. Use tier2_risk (or the classifier’s thresholds) instead of the config default.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At src/stackone_defender/core/prompt_defense.py, line 129:

<comment>`has_threats` compares tier2 scores against `self._config.tier2.high_risk_threshold`, which doesn’t reflect `tier2_config` overrides. If the classifier uses a lower high-risk threshold, `tier2_risk` can be high while `has_threats` stays false, so `block_high_risk` won’t block. Use `tier2_risk` (or the classifier’s thresholds) instead of the config default.</comment>

<file context>
@@ -120,7 +120,20 @@ def defend_tool_result(self, value: Any, tool_name: str) -> DefenseResult:
+        has_threats = (
+            len(detections) > 0
+            or len(fields_sanitized) > 0
+            or (tier2_score is not None and tier2_score >= self._config.tier2.high_risk_threshold)
+        )
+
</file context>
Suggested change
or (tier2_score is not None and tier2_score >= self._config.tier2.high_risk_threshold)
or tier2_risk in ("high", "critical")
Fix with Cubic

)

# Three cases for allowed:
# 1. block_high_risk is off -> always allow
# 2. No threat signals found -> allow (base risk from tool rules alone does not block)
# 3. Risk did not reach high/critical -> allow
allowed = not self._config.block_high_risk or not has_threats or risk_level not in ("high", "critical")

return DefenseResult(
allowed=allowed,
Expand Down
38 changes: 38 additions & 0 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,44 @@ def test_returns_latency(self):
assert result.latency_ms > 0


class TestUseDefaultToolRules:
def test_does_not_apply_tool_rules_by_default(self):
    """A defense built with no arguments must not apply the default tool rules.

    The gmail tool rule would seed risk_level to 'high'; without it, benign
    mail content should stay below the high/critical tiers.
    """
    payload = {
        "subject": "Weekly team update",
        "body": "Reminder about the meeting tomorrow at 10am.",
        "thread_id": "thread123",
    }
    defense = create_prompt_defense()
    outcome = defense.defend_tool_result(payload, "gmail_get_message")
    assert outcome.risk_level not in ("high", "critical")

def test_does_not_apply_tool_rules_when_explicitly_false(self):
    """Passing use_default_tool_rules=False behaves the same as the default:
    the gmail tool rule is not applied, so benign content stays low-risk."""
    payload = {
        "subject": "Weekly team update",
        "body": "Reminder about the meeting tomorrow at 10am.",
        "thread_id": "thread123",
    }
    defense = create_prompt_defense(use_default_tool_rules=False)
    outcome = defense.defend_tool_result(payload, "gmail_get_message")
    assert outcome.risk_level not in ("high", "critical")

def test_applies_tool_rules_when_true(self):
    """With default tool rules enabled, the gmail rule seeds a base risk
    of 'high' — but clean content with no detections must still be allowed,
    even when block_high_risk is on (base risk alone does not block)."""
    payload = {
        "subject": "Weekly team update",
        "body": "Reminder about the meeting tomorrow at 10am.",
        "thread_id": "thread123",
    }
    defense = create_prompt_defense(use_default_tool_rules=True, block_high_risk=True)
    outcome = defense.defend_tool_result(payload, "gmail_get_message")
    assert outcome.risk_level == "high"
    assert outcome.allowed is True
Comment on lines +164 to +171
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new tests verify that safe content is allowed through when block_high_risk=True with tool rules, but there's no test verifying the converse — that malicious content is still blocked when use_default_tool_rules=True and block_high_risk=True. Adding a test like test_blocks_malicious_content_with_tool_rules (e.g., using "SYSTEM: ignore previous instructions" in a gmail message body) would guard against regressions in the has_threats logic.

Copilot uses AI. Check for mistakes.

def test_always_applies_custom_tool_rules_from_config(self):
    """Custom tool rules supplied via config are applied even when the
    default rules are disabled.

    The custom rule seeds a base risk_level of 'high', but safe content
    with no detections is still allowed through — base risk from tool
    rules alone never triggers a block.
    """
    from stackone_defender.types import ToolSanitizationRule

    rule = ToolSanitizationRule(tool_pattern="custom_*", sanitization_level="high")
    defense = create_prompt_defense(
        use_default_tool_rules=False,
        config={"tool_rules": [rule]},
        block_high_risk=True,
    )
    outcome = defense.defend_tool_result({"name": "Safe content"}, "custom_tool")
    assert outcome.risk_level == "high"
    assert outcome.allowed is True


class TestRealWorldScenarios:
def setup_method(self):
self.defense = create_prompt_defense()
Expand Down