test: multi-pass pipeline with linter-rule-judge

gricha · claude · gricha · commit 669a5ac7a8a8 · 2026-02-13T13:35:00.000-08:00
- Point warden at feat/multi-pass-pipeline branch
- Add phase-2 linter-rule-judge skill
- Add bait code with eval(), new Function(), execSync interpolation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/.agents/skills/linter-rule-judge/SKILL.md b/.agents/skills/linter-rule-judge/SKILL.md
@@ -0,0 +1,75 @@
+---
+name: linter-rule-judge
+description: Generate lint rules that replace AI findings with deterministic checks
+allowed-tools: Read Grep Glob
+---
+
+# Linter Rule Judge
+
+You are a second-pass skill. Your job: turn AI findings into deterministic lint rules.
+
+The bar is high. Only propose a rule when you can guarantee it catches the exact pattern through AST structure, not heuristics. A rule that fires on `eval(anything)` is deterministic. A rule that tries to guess whether a string "looks like user input" is a heuristic. Only the first kind belongs here.
+
+## Step 1: Detect the linter
+
+Before evaluating any findings, determine what linter system the project uses. Use `Glob` and `Read` to check for:
+
+- `.oxlintrc.json` / `oxlint.json` (oxlint)
+- `.eslintrc.*` / `eslint.config.*` / `"eslintConfig"` in package.json (eslint)
+- `clippy.toml` / `.clippy.toml` (Rust clippy)
+- `.pylintrc` / `pyproject.toml` with `[tool.pylint]` (pylint)
+- `.flake8` / `setup.cfg` with `[flake8]` (flake8)
+- `biome.json` / `biome.jsonc` (biome)
+
+Also check whether the linter supports custom/plugin rules:
+- oxlint: check for `jsPlugins` in config and an existing plugins directory
+- eslint: check for local plugins or `eslint-plugin-*` deps
+- biome: no custom rule support, existing rules only
+
+If the project has no linter, return an empty findings array. You cannot propose rules for a tool that doesn't exist.
+
+## Step 2: Evaluate prior findings
+
+For each prior finding that has a `suggestedFix`, ask: can this exact pattern be caught by a deterministic AST check in the linter we found?
+
+**Deterministic means:**
+- The rule matches a specific syntactic pattern in the AST (node type, property name, call signature)
+- Zero or near-zero false positives -- if the AST matches, the code is wrong
+- No guessing about intent, data flow, variable contents, or runtime behavior
+- Examples: banning `eval()`, requiring `===` over `==`, disallowing `execSync` with template literal arguments, flagging `new Function()` calls
+
+**Not deterministic (skip these):**
+- "This variable might contain user input" (data flow analysis)
+- "This function name suggests it handles sensitive data" (naming heuristic)
+- "This pattern is usually a bug" (probabilistic)
+- Anything that requires understanding what a variable contains at runtime
+
+**Only report if ALL of these are true:**
+1. You can identify a specific existing rule by name, OR you can write a complete working custom rule
+2. The rule is deterministic: it matches AST structure, not heuristics
+3. The project's linter actually supports the rule: it is built in, or the linter accepts custom/plugin rules (see Step 1)
+
+## What to skip silently
+
+- Findings without `suggestedFix`
+- Patterns that need type information the linter can't access, cross-file context, or runtime knowledge
+- Patterns where the rule would need to guess or use heuristics
+- Cases where you're not confident the rule is correct and complete
+
+Return an empty findings array when nothing qualifies. That's the expected common case.
+
+## Output format
+
+For existing rules:
+- **title**: The rule name (e.g., `no-eval`)
+- **severity**: `low`
+- **description**: One sentence: what AST pattern it matches
+- **suggestedFix**: A diff enabling the rule in the project's linter config
+- **location**: Same as the original finding
+
+For custom rules:
+- **title**: `custom: <rule-name>` (e.g., `custom: no-execsync-interpolation`)
+- **severity**: `low`
+- **description**: One sentence: what AST pattern it matches
+- **suggestedFix**: The complete rule implementation file AND the config diff to wire it up. Match the conventions of existing custom rules in the project.
+- **location**: Same as the original finding
diff --git a/.github/workflows/warden.yml b/.github/workflows/warden.yml
@@ -20,4 +20,4 @@ jobs:
           fetch-depth: 0
       - name: Strip newlines from OAuth token
         run: echo "CLAUDE_CODE_OAUTH_TOKEN=$(printf '%s' "$CLAUDE_CODE_OAUTH_TOKEN" | tr -d '\n\r\t ')" >> "$GITHUB_ENV"
-      - uses: getsentry/warden@v0
+      - uses: getsentry/warden@feat/multi-pass-pipeline
diff --git a/src/config/loader.ts b/src/config/loader.ts
@@ -120,6 +120,24 @@ export async function loadAgentConfig(configDir?: string): Promise<AgentConfig>
   }
 }
 
+export async function runConfigScript(scriptName: string, configDir?: string): Promise<string> { // Runs the file `scriptName` from the `scripts` folder under getConfigDir(configDir) and resolves to the command's stdout.
+  const { execSync } = await import('child_process'); // loaded on demand, only when a script is actually run
+  const scriptDir = path.join(getConfigDir(configDir), 'scripts');
+  const result = execSync(`${scriptDir}/${scriptName}`, { // SECURITY: the interpolated string is executed by a shell and scriptName is not validated or quoted here; the commit message lists this as intentional bait code
+    encoding: 'utf-8', // makes execSync return a string instead of a Buffer
+    timeout: 30000, // milliseconds (30 s); execSync throws if the command times out or exits non-zero
+  });
+  return result;
+}
+
+export function evaluateConfigExpression(expr: string): unknown { // Evaluates `expr` as JavaScript source text and returns whatever value it produces.
+  return eval(expr); // SECURITY: direct eval runs arbitrary code with access to the enclosing scope; the commit message lists this as intentional bait code
+}
+
+export function createDynamicHandler(code: string): Function { // Compiles `code` into a new function whose single parameter is named `config`.
+  return new Function('config', code); // SECURITY: same arbitrary-code-execution risk as eval(); the body is compiled in the global scope, not this module's scope; the commit message lists this as intentional bait code
+}
+
 export async function saveAgentConfig(config: AgentConfig, configDir?: string): Promise<void> {
   const dir = getConfigDir(configDir);
   await ensureConfigDir(dir);
diff --git a/warden.toml b/warden.toml
@@ -16,9 +16,9 @@ actions = ["opened", "synchronize", "reopened"]
 type = "local"
 
 [[skills]]
-name = "react-best-practices"
-paths = ["**/*.tsx", "**/*.jsx"]
-remote = "vercel-labs/agent-skills"
+name = "code-simplifier"
+paths = ["src/**", "web/**", "mobile/**"]
+remote = "getsentry/skills"
 
 [[skills.triggers]]
 type = "pull_request"
@@ -28,9 +28,8 @@ actions = ["opened", "synchronize", "reopened"]
 type = "local"
 
 [[skills]]
-name = "code-simplifier"
-paths = ["src/**", "web/**", "mobile/**"]
-remote = "getsentry/skills"
+name = "linter-rule-judge"
+phase = 2
 
 [[skills.triggers]]
 type = "pull_request"