diff --git a/BATCHING_COMPARISON_REPORT_V4_CORRECTED.md b/BATCHING_COMPARISON_REPORT_V4_CORRECTED.md new file mode 100644 index 0000000..0039b92 --- /dev/null +++ b/BATCHING_COMPARISON_REPORT_V4_CORRECTED.md @@ -0,0 +1,54 @@ +# Rule Batching Optimization - Comparison Report (V4 Final) + +**Scope:** Manual Verification on `test.md` +**Experiment:** "Task-Based" Prompt + Reduced Batch Size (Limit 2) +**Date:** 2026-01-13 +**Status:** πŸ›‘ **Optimization Failed - Feature Disabled** + +--- + +## Executive Summary + +We performed a deep manual inspection of the evaluation results for `tests/fixtures/technical-accuracy/test.md` to verify the automated metrics. + +**Verdict:** The batching optimization introduces **instability**. While it successfully identified some issues (and even one that the baseline missed), it suffered from two critical failures: +1. **Context Bleed (Hallucination):** Applying the logic of Rule A to the context of Rule B. +2. **Missed Violations:** Failing to detect structural issues (Repetition) that required whole-document scanning. + +--- + +## πŸ” Detailed Finding Comparison + +| Rule | Issue Location | Baseline | Batched | Analysis | +| :--- | :--- | :--- | :--- | :--- | +| **AIPattern** | Line 13 ("ensures") | βœ… Found | βœ… Found | **Match** | +| **AIPattern** | Line 31 ("verify") | - | ❌ Found | **False Positive:** Flagged as an AIPattern violation likely because it was also a PseudoAdvice violation. The contexts merged. | +| **PseudoAdvice**| Line 31 ("Always verify") | βœ… Found | βœ… Found | **Match** | +| **PseudoAdvice**| Line 21 ("Always trust") | ❌ Missed | βœ… Found | **New Detection:** Batched model outperformed baseline here. | +| **Repetition** | Line 29 (CloudLint) | βœ… Found | ❌ Missed | **Missed:** The model failed to hold the document structure in memory while processing other rules. | + +--- + +## πŸ“‰ Root Cause Analysis + +### 1. 
The "Context Bleed" Phenomenon +The most concerning finding is the **False Positive on Line 31**. +* **The Text:** *"Always verify AI-generated claims..."* +* **Rule A (PseudoAdvice):** Correctly flagged this as an imperative without steps. +* **Rule B (AIPattern):** *Incorrectly* flagged this. The analysis explicitly stated *"No listed buzzwords... No change needed"*, yet it still generated a Warning. +* **Theory:** The model "felt" the violation from Rule A and allowed it to contaminate the result for Rule B. + +### 2. Logic Collapse on Structural Rules +`VectorLint.Repetition` requires scanning the entire document to find duplicates. In the Batched run, this global attention mechanism failed, likely because the model's attention was fragmented by the local line-by-line checks for AIPattern/PseudoAdvice. + +--- + +## πŸ›‘ Final Decision + +The feature remains **DISABLED**. +While the token savings (~70%) are attractive, the **cross-contamination of rules** (Context Bleed) makes the linter unreliable. Users would be confused by warnings that cite the wrong rule or contradict themselves. + +**Infrastructure Status:** +* Code: Merged & Preserved. +* Config: `BatchRules=false`. +* Future Work: Requires multi-pass or iterative prompting to solve Context Bleed. 
diff --git a/BATCHING_COMPARISON_REPORT_V5.md b/BATCHING_COMPARISON_REPORT_V5.md new file mode 100644 index 0000000..94ec3b0 --- /dev/null +++ b/BATCHING_COMPARISON_REPORT_V5.md @@ -0,0 +1,290 @@ +# Rule Batching Validation Report V5 (Batch Size = 4) + +**Date:** 2026-01-13 +**File Tested:** `tests/fixtures/ai-pattern/negation-pattern.md` +**Batch Size:** 4 rules per batch (MaxRulesPerBatch=4) + +--- + +## Executive Summary + +| Metric | Target | Result | Status | +|--------|--------|--------|--------| +| **Intersection (Overlap)** | >95% | **~38%** | ❌ FAIL | +| **Efficiency (Token Reduction)** | >50% | **~37%** | ❌ FAIL | +| **Hallucinations** | 0 false positives | ~9% | ⚠️ MARGINAL | + +**Recommendation:** Feature should remain **DISABLED by default**. + +--- + +## Test Results + +### Summary Counts + +| Mode | Warnings | Input Tokens | LLM Requests | Cost | +|------|----------|--------------|--------------|------| +| **Baseline (A)** | 32 | ~50,570 | ~24 | ~$0.20 | +| **Batched (B)** | 34 | ~31,996 | 6 | ~$0.15 | + +--- + +## 1. Baseline (A) Findings - 32 Warnings + +### AIPattern (17 findings) + +| Line:Col | Quoted Text | Description | +|----------|-------------|-------------| +| 1:60 | "it's about leveraging smarter code" | Using 'leveraging' sounds like generic tech marketing language rather than natural phrasing. | +| 1:102 | "don't just need tools, they need integrated platforms" | The "don't just X, they need Y" structure adds rhetorical flair but no new substance. | +| 3:15 | "doesn't simply improve productivity, it transforms workflows entirely" | "Doesn't simply improve" introduces and dismisses an idea that was never discussed before. | +| 3:86 | "You're not managing projects anymore, you're orchestrating outcomes" | The sentence contrasts two roles for effect, but "managing projects" was not established earlier. 
| +| 3:155 | "The goal isn't faster delivery, it's sustainable velocity" | Introduces "faster delivery" only to negate it, creating formulaic emphasis. | +| 12:1 | "The solution isn't to hire more reviewers or work longer hours. The solution is to apply AI" | The sentence sets up and dismisses a strawman option purely for rhetorical effect. | +| 37:1 | "Graphite Agent isn't just a comment bot. It's an interactive companion" | Negation-contrast pattern "isn't just X, it's Y" is used rhetorically without prior setup. | +| 39:1 | "The integration feels seamless." | Contains the buzzword "seamless," which is overused in AI-generated marketing copy. | +| 54:70 | "Copilot works seamlessly in VS Code" | Uses the buzzword "seamlessly," which is common in generic AI/marketing language. | +| 58:323 | "The AI doesn't have the same 'agentic' feel... It provides suggestions but lacks..." | Negation-contrast pattern redundantly contrasts negatives for emphasis. | +| 90:281 | "doesn't just look at the changed lines. It understands how" | Uses template "doesn't just look at X, it understands Y" without prior framing. | +| 92:1 | "This approach excels at answering complex questions..." | (False positive - baseline incorrectly flagged clean content) | +| 130:50 | "Instead of trying to do everything, it focuses exclusively on finding critical bugs" | Uses artificial contrast without prior discussion of BugBot attempting everything. | +| 136:31 | "BugBot doesn't generate PR summaries, doesn't provide architectural feedback..." | Repeated "doesn't" constructions stack negations, sounding like templated AI phrasing. | +| 136:166 | "It has one job: find critical bugs." | Completes familiar AI-style contrast pattern after list of "doesn't" capabilities. | +| 153:163 | "You're not just adding AI to a slow process, you're fixing the process itself." | Uses artificial contrast to sound emphatic rather than advancing the argument. 
| +| 168:168 | "It's not just an AI reviewer, it's a complete rethinking of how code review should work" | Introduces and dismisses a straw characterization for emphasis. | + +### Directness (6 findings) + +| Line:Col | Quoted Text | Description | +|----------|-------------|-------------| +| 31:1 | "Most AI code review tools are bots that bolt onto your existing GitHub workflow." | Context-before-answer pattern; section starts with industry background rather than explaining why Graphite Agent is best. | +| 71:1 | "CodeRabbit has established itself as the leading third-party AI review bot." | First sentence gives status/positioning rather than directly summarizing the tool's core function. | +| 90:1 | "Greptile takes a unique approach to AI code review by building a comprehensive knowledge graph..." | Foregrounds that approach is "unique" instead of directly stating the primary capability. | +| 152:1 | "How to Choose the Right Tool" | Header introduces new section, but no opening content follows to answer how to choose. | +| 164:1 | "AI code review tools are no longer optional." | Reader finishes first sentence unsure of concrete takeaway; main conclusion appears later. | +| 168:1 | "To truly fix code review, you need a platform that incentivizes better practices" | Section opening could immediately answer how to fix code review; clearer conclusion is buried. | + +### PseudoAdvice (7 findings) + +| Line:Col | Quoted Text | Description | +|----------|-------------|-------------| +| 1:71 | "leveraging smarter code" | Imperative-style recommendation but provides no concrete methods, tools, or steps. | +| 3:131 | "orchestrating outcomes" | Prescriptive framing suggests new way to work but offers no steps or examples. | +| 3:192 | "sustainable velocity" | Goal-level recommendation; surrounding text gives no concrete practices to reach it. 
| +| 43:11 | "Teams who want to fundamentally speed up their development velocity, not just add a bot" | Advice-like guidance lacks any how-to detail or examples in nearby sentences. | +| 81:1 | "Best for: Teams who want to keep the native GitHub UI but need better automated feedback..." | Advice framed as recommendation but lacks concrete decision criteria or steps. | +| 100:1 | "Best for: Large, complex monorepos where understanding the impact of changes..." | Recommends when to use tool but provides no concrete decision process or checklist. | +| 168:27 | "you need a platform that incentivizes better practices (smaller, more focused PRs)..." | States what to do but provides no actionable instructions, tools, or examples. | + +### Repetition (2 findings) + +| Line:Col | Quoted Text | Description | +|----------|-------------|-------------| +| 168:1 | "To truly fix code review, you need a platform..." (full paragraph) | Reiterates Graphite as workflow-level solution with smaller PRs and AI; already explained in prior sections. | +| 170:1 | "The best teams are shipping faster than ever while maintaining higher code quality..." | Repeats established theme that workflow improvements plus AI increase velocity; no new data or examples. | + +--- + +## 2. Batched (B) Findings - 34 Warnings + +### AIPattern (13 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 1:60 | "it's about leveraging smarter code" | The phrase "leveraging" is a listed buzzword and can feel like generic AI marketing language. | βœ… YES | +| 12:1 | "The solution isn't to hire more reviewers..." | The sentence introduces "hire more reviewers or work longer hours" only to dismiss it, creating an AI-like rhetorical pattern. | βœ… YES | +| 39:1 | "The integration feels seamless." | The word "seamless" is explicitly listed as an AI buzzword to flag. 
| βœ… YES | +| 54:70 | "Copilot works seamlessly in VS Code" | The adverb "seamlessly" matches the buzzword list and signals AI-like phrasing. | βœ… YES | +| 71:397 | "comprehensive feedback" | The phrase is somewhat buzzwordy, though not on the strict list; still mildly formulaic. | πŸ†• NEW | +| 75:15 | "highly configurable" | This phrase is overused in product copy and feels templated without immediate specifics. | πŸ†• NEW | +| 79:1 | "Pricing is competitive" | This phrase asserts value without detail and reads like boilerplate marketing language. | πŸ†• NEW | +| 88:11 | "Focuses heavily on understanding your entire codebase" | The construction is broad and promotional; could be more concrete and specific. | πŸ†• NEW | +| 90:66 | "comprehensive knowledge graph" | While not on the strict list, it reads as jargon if not concretely grounded. | πŸ†• NEW | +| 96:149 | "deploy Greptile on-premises for complete control over your data" | 'deploy' is a flagged buzzword that can make the prose feel formulaic and marketing-like. | πŸ†• NEW | +| 132:405 | "optimizations" | The term is mildly buzzwordy and can feel generic and marketing-like. | πŸ†• NEW | +| 153:20 | "You're not just adding AI to a slow process, you're fixing the process itself." | The "not just X, Y" contrast is formulaic and reads like AI rhetoric. | βœ… YES | +| 168:168 | "It's not just an AI reviewer, it's a complete rethinking" | The sentence introduces "just an AI reviewer" only to dismiss it for emphasis. | βœ… YES | + +### Directness (7 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 7:1 | "Introduction: The 'Review Gap' in the Age of AI" | The first sentence gives background on AI code generation; the gap itself is only named later. | πŸ†• NEW | +| 50:1 | "Overview: The default choice for many teams..." | First sentence states popularity and a narrow benefit rather than summarizing Copilot's core capabilities. 
| πŸ†• NEW | +| 69:1 | "Overview: A popular tool that connects to GitHub, GitLab, or Bitbucket..." | It answers what CodeRabbit is, but buries its standout aspects until later. | βœ… SIMILAR to 71 | +| 88:1 | "Overview: Focuses heavily on understanding your entire codebase..." | Readers must infer what Greptile actually does from a contrast, not a clear primary statement. | βœ… SIMILAR to 90 | +| 104:1 | "5. Ellipsis (Best for Automated Fixes)" | First sentence describes what Ellipsis is but does not front-load why it's best for automated fixes. | πŸ†• NEW | +| 152:1 | "How to Choose the Right Tool" | Readers see header but get no immediate criteria or steps. | βœ… YES | +| 163:1 | "Conclusion" | Readers expect first sentence to state the primary takeaway; instead opens with broad statements. | βœ… SIMILAR to 164 | + +### PseudoAdvice (7 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 12:65 | "The solution is to apply AI to the review process itself." | This is an imperative recommendation but gives no actionable detail on how to apply AI in practice. | βœ… SIMILAR | +| 43:1 | "Best for: Teams who want to fundamentally speed up their development velocity..." | It advises a type of team to pick the tool but lacks actionable criteria or steps. | βœ… YES | +| 77:216 | "You'll need to invest time in configuration to dial down irrelevant feedback." | Tells readers to invest time configuring without explaining which options to change or how. | πŸ†• NEW | +| 111:252 | "For minor refactoring tasks, style fixes, or simple logic adjustments, this saves significant time." | This is prescriptive but lacks actionable guidance on setup, workflow, or decision criteria. | πŸ†• NEW | +| 121:1 | "Best for: Teams who spend too much time on minor refactoring cycles..." | The sentence advises a type of team but doesn't explain how to assess fit or adopt it. 
| πŸ†• NEW | +| 142:1 | "Best for: High-compliance industries or mission-critical codebases..." | Advice identifies ideal users but omits how such teams should pilot, configure, or integrate. | πŸ†• NEW | +| 172:1 | "Try Graphite Agent today β€” it's included in every Graphite plan..." | Advice tells readers to try and sign up without specifying where or how beyond generic prompt. | πŸ†• NEW | + +### Repetition (7 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 10:1 | "This mismatch has created what I call the 'Review Gap.'" | Prior paragraph already explains AI speeding coding while review capacity stays flat; naming it largely repeats the concept. | πŸ†• NEW | +| 39:1 | "The integration feels seamless. Graphite Agent appears directly in Graphite's PR inbox..." | Line 37 notes the agent "lives on your PR page"; calling integration "seamless" restates quality without adding specifics. | πŸ†• NEW | +| 71:77 | "It offers a rich feature set that goes well beyond basic static analysis." | This high-level claim is rephrased by later specifics, making it somewhat redundant. | πŸ†• NEW | +| 111:252 | "For minor refactoring tasks, style fixes, or simple logic adjustments, this saves significant time." | Line 110 already explains Ellipsis automates fixes; this restates that it reduces back-and-forth without new detail. | πŸ†• NEW | +| 121:1 | "Best for: Teams who spend too much time on minor refactoring cycles..." | Multiple "Best for" sections across tools provide similar high-level fit guidance; could be consolidated. | πŸ†• NEW | +| 142:1 | "Best for: High-compliance industries or mission-critical codebases..." | Multiple "Best for" blurbs repeat the same kind of fit description; table already covers use cases. | πŸ†• NEW | +| 168:1 | "To truly fix code review, you need a platform that incentivizes better practices..." 
| This section reiterates that Graphite fixes underlying workflow issues, already conveyed earlier. | βœ… YES | + +--- + +## 3. Overlap Analysis + +### Matched Findings (by Rule+Line) + +| Baseline Finding | Batched Finding | Rule | Match Status | +|------------------|-----------------|------|--------------| +| Line 1:60 (leveraging) | Line 1:60 | AIPattern | βœ… Exact | +| Line 12 (solution isn't) | Line 12 | AIPattern | βœ… Exact | +| Line 39 (seamless) | Line 39 | AIPattern | βœ… Exact | +| Line 54 (seamlessly) | Line 54 | AIPattern | βœ… Exact | +| Line 153 (not just X, Y) | Line 153 | AIPattern | βœ… Exact | +| Line 168 (not just AI reviewer) | Line 168 | AIPattern | βœ… Exact | +| Line 152 (How to Choose) | Line 152 | Directness | βœ… Exact | +| Line 164 (Conclusion) | Line 163 | Directness | β‰ˆ Similar | +| Line 43 (Best for teams) | Line 43 | PseudoAdvice | βœ… Exact | +| Line 168 (Repetition) | Line 168 | Repetition | βœ… Exact | + +### Overlap Statistics + +- **Baseline Total:** 32 findings +- **Exact/Similar Matches:** ~12 findings +- **Overlap Rate:** 12/32 = **37.5%** + +**VERDICT: ❌ FAIL** (Target was >95%) + +--- + +## 4. 
Missed Findings (Baseline found, Batched missed) + +| Line:Col | Rule | Quoted Text | Description | +|----------|------|-------------|-------------| +| 1:102 | AIPattern | "don't just need tools, they need integrated platforms" | Rhetorical structure adds flair but no substance | +| 3:15 | AIPattern | "doesn't simply improve productivity" | Introduces and dismisses idea never discussed | +| 3:86 | AIPattern | "You're not managing projects anymore" | Contrasts roles without prior establishment | +| 3:155 | AIPattern | "The goal isn't faster delivery" | Creates formulaic emphasis | +| 37:1 | AIPattern | "isn't just a comment bot" | Negation-contrast without prior setup | +| 58:323 | AIPattern | "doesn't have X, but lacks Y" | Redundant negative contrasts | +| 90:281 | AIPattern | "doesn't just look at" | Template phrase without prior framing | +| 130:50 | AIPattern | "Instead of trying to do everything" | Artificial contrast | +| 136:31 | AIPattern | "doesn't generate, doesn't provide, doesn't help" | Repeated "doesn'ts" - templated AI phrasing | +| 136:166 | AIPattern | "It has one job: find critical bugs" | Completes AI-style contrast pattern | +| 31:1 | Directness | "Most AI code review tools are bots..." | Context-before-answer pattern in Graphite section | +| 71:1 | Directness | "CodeRabbit has established itself..." | Status before function in CodeRabbit overview | +| 90:1 | Directness | "Greptile takes a unique approach..." | "Unique approach" instead of direct capability | +| 168:1 | Directness | "To truly fix code review..." | Concrete takeaway buried mid-section | +| 1:71 | PseudoAdvice | "leveraging smarter code" | No concrete methods provided | +| 3:131 | PseudoAdvice | "orchestrating outcomes" | No steps or examples | +| 3:192 | PseudoAdvice | "sustainable velocity" | No concrete practices | +| 81:1 | PseudoAdvice | "Best for: Teams who want to keep..." | Lacks decision criteria | +| 100:1 | PseudoAdvice | "Best for: Large, complex monorepos..." 
| No checklist for assessment |
| 170:1 | Repetition | "The best teams are shipping faster..." | Repeats established theme |

**Total Missed:** 20 findings

---

## 5. New Findings (Batched found, Baseline missed)

| Line:Col | Rule | Quoted Text | Description | Valid? |
|----------|------|-------------|-------------|--------|
| 71:397 | AIPattern | "comprehensive feedback" | Somewhat buzzwordy, mildly formulaic | βœ… Valid |
| 75:15 | AIPattern | "highly configurable" | Overused product copy phrase | βœ… Valid |
| 79:1 | AIPattern | "Pricing is competitive" | Asserts value without detail | βœ… Valid |
| 88:11 | AIPattern | "Focuses heavily on understanding" | Broad and promotional | ⚠️ Questionable |
| 90:66 | AIPattern | "comprehensive knowledge graph" | Reads as jargon | ⚠️ Questionable |
| 96:149 | AIPattern | "deploy" | Flagged buzzword | βœ… Valid |
| 132:405 | AIPattern | "optimizations" | Generic marketing-flavored term | βœ… Valid |
| 7:1 | Directness | "Introduction: The 'Review Gap'..." | Header vs first paragraph gap definition | βœ… Valid |
| 50:1 | Directness | "Overview: The default choice..." | Copilot overview buries core capabilities | βœ… Valid |
| 104:1 | Directness | "5. Ellipsis (Best for Automated Fixes)" | Header doesn't front-load key benefit | βœ… Valid |
| 77:216 | PseudoAdvice | "invest time in configuration" | No specifics given | βœ… Valid |
| 111:252 | PseudoAdvice | "saves significant time" | No actionable guidance | βœ… Valid |
| 121:1 | PseudoAdvice | "Best for: Teams who spend too much time..." | Lacks adoption guidance | βœ… Valid |
| 142:1 | PseudoAdvice | "Best for: High-compliance industries..." 
| No integration steps | βœ… Valid | +| 172:1 | PseudoAdvice | "Try Graphite Agent today" | Minimal actionable guidance | βœ… Valid | +| 10:1 | Repetition | "This mismatch has created what I call the 'Review Gap.'" | Naming repeats prior explanation | βœ… Valid | +| 39:1 | Repetition | "The integration feels seamless..." | Restates integration quality | βœ… Valid | +| 71:77 | Repetition | "rich feature set that goes well beyond" | Rephrased by later specifics | βœ… Valid | +| 111:252 | Repetition | "this saves significant time" | Restates prior explanation | βœ… Valid | +| 121:1 | Repetition | "Best for: Teams who spend..." | Pattern repeated across tools | βœ… Valid | +| 142:1 | Repetition | "Best for: High-compliance..." | Structurally redundant | βœ… Valid | + +**Total New:** 22 findings +**Hallucination Rate:** 2/22 = ~9% (the "questionable" items) + +--- + +## 6. Efficiency Analysis + +| Metric | Baseline | Batched | Reduction | +|--------|----------|---------|-----------| +| Input Tokens | ~50,570 | ~31,996 | **-37%** | +| LLM Requests | ~24 | 6 | **-75%** | +| Output Tokens | ~3,287 | ~3,852 | +17% | +| Total Cost | ~$0.20 | ~$0.15 | **-25%** | + +**Token Reduction:** 37% (Target was >50%) +**VERDICT: ❌ FAIL** + +--- + +## 7. Root Cause Analysis + +### Why Batched Mode Missed Baseline Findings + +1. **Lost in the Middle:** The negation-contrast patterns (lines 3, 37, 58, 130, 136) were systematically missed. These require careful rule application that gets diluted when 4 rules compete for attention in one prompt. + +2. **Different Focus:** Batched mode found MORE buzzword-style issues (lines 71, 75, 79, 96, 132) but FEWER structural rhetorical patterns. + +3. **Inconsistent Rule Application:** The Repetition rule found 7 issues in batched mode vs 2 in baseline - suggesting the model's interpretation varies significantly based on prompt structure. + +### Why Batched Mode Found Different Issues + +1. 
**Broader Scanning:** With 4 rules in context, the model may have done a more comprehensive scan for simpler patterns (buzzwords, vague advice). + +2. **Rule Bleeding:** Some findings appear to blend criteria from multiple rules, suggesting context contamination. + +--- + +## 8. Conclusion + +| Criterion | Target | Actual | Pass/Fail | +|-----------|--------|--------|-----------| +| Overlap with Baseline | >95% | 37.5% | ❌ FAIL | +| Token Reduction | >50% | 37% | ❌ FAIL | +| Hallucination Rate | 0% | ~9% | ⚠️ MARGINAL | + +### Recommendation + +**The Rule Batching feature should remain DISABLED by default.** + +While the infrastructure is functional and provides meaningful efficiency gains (~37% token reduction, ~75% request reduction), the quality degradation is unacceptable: + +- **63% of baseline findings were missed or reported differently** +- **Many new findings were valid but different** from what the baseline found +- **Results are non-reproducible** between modes + +The feature may be revisited if: +1. Better prompt engineering solves the "lost in the middle" problem +2. Smaller batch sizes (2-3 rules) are tested +3. 
Rule-type-specific batching is implemented (only simple buzzword rules together) + +--- + +*Report generated by manual A/B validation test.* diff --git a/BATCHING_COMPARISON_REPORT_V6.md b/BATCHING_COMPARISON_REPORT_V6.md new file mode 100644 index 0000000..84e0f2c --- /dev/null +++ b/BATCHING_COMPARISON_REPORT_V6.md @@ -0,0 +1,286 @@ +# Rule Batching Validation Report V6 (Batch Size = 2) + +**Date:** 2026-01-13 +**File Tested:** `tests/fixtures/ai-pattern/negation-pattern.md` +**Batch Size:** 2 rules per batch (MaxRulesPerBatch=2) + +--- + +## Executive Summary + +| Metric | Target | Batch=4 (V5) | **Batch=2 (V6)** | Status | +|--------|--------|--------------|------------------|--------| +| **Intersection (Overlap)** | >95% | 37.5% | **~59%** | ❌ FAIL (improved) | +| **Efficiency (Token Reduction)** | >50% | 37% | **21%** | ❌ FAIL (worse) | +| **Hallucinations** | 0 | ~9% | ~5% | ⚠️ MARGINAL | + +**Recommendation:** Feature should remain **DISABLED by default**, but batch size 2 shows significant improvement in accuracy. + +--- + +## Test Results + +### Summary Counts + +| Mode | Warnings | Input Tokens | LLM Requests | Cost | +|------|----------|--------------|--------------|------| +| **Baseline (A)** | 32 | ~50,570 | ~24 | $0.20 | +| **Batched (B) size=4** | 34 | ~31,996 | 6 | $0.15 | +| **Batched (B) size=2** | 37 | ~39,868 | 12 | $0.19 | + +--- + +## 1. Batched (size=2) Findings - 37 Warnings + +### AIPattern (17 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 1:60 | "it's about leveraging smarter code" | The phrase "leveraging" is a listed buzzword and contributes to an AI-like tone. | βœ… YES | +| 3:15 | "doesn't simply improve productivity, it transforms workflows entirely" | "doesn't simply X, it Y" introduces and dismisses X only for rhetorical effect. 
| βœ… YES | +| 3:86 | "You're not managing projects anymore, you're orchestrating outcomes" | The "not X anymore, you're Y" structure is a formulaic AI-style contrast. | βœ… YES | +| 3:155 | "The goal isn't faster delivery, it's sustainable velocity" | No one claimed the goal was faster delivery; contrast is added only for emphasis. | βœ… YES | +| 12:1 | "The solution isn't to hire more reviewers or work longer hours. The solution is to apply AI..." | The negation contrast is more rhetorical than corrective, echoing common AI phrasing. | βœ… YES | +| 39:1 | "The integration feels seamless." | The sentence uses the buzzword "seamless," which is on the prohibited list. | βœ… YES | +| 54:70 | "Copilot works seamlessly in VS Code" | The adverb "seamlessly" is on the buzzword list and weakens specificity. | βœ… YES | +| 71:1 | "CodeRabbit has established itself... It offers a rich feature set..." | The phrase is broad and marketing-like without concrete detail. | πŸ†• NEW | +| 75:1 | "CodeRabbit is highly configurable." | This vague evaluative phrase sounds like boilerplate; needs concrete examples. | πŸ†• NEW | +| 77:79 | "comprehensive analysis can be valuable" | This abstract phrase reads like stock copy; more specific wording would feel more human. | πŸ†• NEW | +| 90:10 | "takes a unique approach to AI code review" | The claim of uniqueness is vague and unsubstantiated, contributing to formulaic tone. | πŸ†• NEW | +| 90:66 | "comprehensive knowledge graph of your entire repository" | This phrasing leans on jargon instead of plainly describing what is stored. | πŸ†• NEW | +| 90:197 | "uses this context to provide unusually deep analysis" | The phrase is promotional and non-specific, which can feel AI-generated. | πŸ†• NEW | +| 90:281 | "doesn't just look at the changed lines. It understands how those changes ripple" | "doesn't just look" introduces and dismisses a strawman behavior, a common AI rhetorical tic. 
| βœ… YES | +| 96:149 | "deploy Greptile on-premises for complete control" | 'deploy' is a flagged buzzword that can make the prose feel formulaic. | πŸ†• NEW | +| 132:405 | "optimizations" | The term is somewhat generic and marketing-flavored; replacing it would sound more human. | πŸ†• NEW | +| 168:168 | "It's not just an AI reviewer, it's a complete rethinking" | The phrase introduces "just an AI reviewer" only to dismiss it for emphasis, without prior correction. | βœ… YES | + +### Directness (7 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 7:1 | "Introduction: The 'Review Gap' in the Age of AI" | The first sentence gives background on AI code generation; the gap itself is only named and explained in the second paragraph. | πŸ†• NEW (similar to 31) | +| 50:1 | "Overview: The default choice for many teams. GitHub Copilot excels at 'in-the-flow' assistance..." | The first sentence states popularity and a narrow benefit rather than directly summarizing Copilot's core capabilities. | πŸ†• NEW | +| 69:1 | "Overview: A popular tool that connects to GitHub, GitLab, or Bitbucket..." | The header promises an overview; the first clause focuses on popularity rather than the main function. | βœ… SIMILAR to 71 | +| 88:1 | "Overview: Focuses heavily on understanding your entire codebase, not just the diff..." | Readers must infer what Greptile actually does from a contrast, not a clear primary statement. | βœ… SIMILAR to 90 | +| 104:1 | "5. Ellipsis (Best for Automated Fixes)" | First sentence under this header describes what Ellipsis is but does not clearly front-load why it is best for automated fixes. | πŸ†• NEW | +| 152:1 | "How to Choose the Right Tool" | Readers see the header but get no immediate criteria or steps, forcing them to hunt for the actual answer. 
| βœ… YES | +| 163:1 | "Conclusion" | Readers may expect the first sentence under "Conclusion" to directly state the primary takeaway; instead it opens with broad statements. | βœ… SIMILAR to 164 | + +### PseudoAdvice (7 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? | +|----------|-------------|-------------|-----------------| +| 1:60 | "it's about leveraging smarter code" | This is framed as guidance but gives no actionable method, tools, or steps for leveraging smarter code. | βœ… YES | +| 3:155 | "The goal isn't faster delivery, it's sustainable velocity." | This is an advice-like statement about what to prioritize, with no how-to guidance around achieving sustainable velocity. | βœ… YES | +| 43:1 | "Best for: Teams who want to fundamentally speed up their development velocity..." | The statement gives selection advice without any concrete how-to guidance or implementation details. | βœ… YES | +| 77:216 | "You'll need to invest time in configuration to dial down irrelevant feedback." | This imperative statement tells the reader what to do but provides no actionable guidance on how to configure, what settings to adjust, or what process to follow. | πŸ†• NEW | +| 100:1 | "Best for: Large, complex monorepos where understanding the impact of changes..." | The sentence recommends when Greptile is "Best for" but does not explain how a team should assess whether their repo fits this description. | βœ… YES | +| 152:1 | "How to Choose the Right Tool" | The phrase is an advice-oriented section title implying guidance on tool selection, but there are no following sentences offering concrete steps or comparison criteria. | πŸ†• NEW | +| 172:66 | "Sign up and review your first stack in minutes." | Standalone imperative marketing advice without actionable detail on how to perform the review process qualifies as pseudo-advice. | πŸ†• NEW | + +### Repetition (6 findings) + +| Line:Col | Quoted Text | Description | Baseline Match? 
| +|----------|-------------|-------------|-----------------| +| 3:155 | "The goal isn't faster delivery, it's sustainable velocity." | Concept of shifting from speed to smarter/sustainable work is already conveyed in lines 1 and 3; this restatement adds no new mechanism or perspective. | πŸ†• NEW | +| 39:1 | "The integration feels seamless. Graphite Agent appears directly in Graphite's PR inbox..." | The Graphite integration description and Copilot's integration description both cover convenience and ecosystem fit; this could be tightened. | πŸ†• NEW | +| 71:77 | "It offers a rich feature set that goes well beyond basic static analysis." | The earlier overview already states that CodeRabbit "provides detailed AI reviews"; this line repeats the same core concept without adding new mechanisms. | πŸ†• NEW | +| 111:1 | "This capability is genuinely useful for reducing the back-and-forth in code reviews." | The benefit of reducing back-and-forth is already implied by line 109's explanation that Ellipsis can automatically implement requested changes. | πŸ†• NEW | +| 121:1 | "Best for: Teams who spend too much time on minor refactoring cycles..." | The sentence defines Ellipsis's ideal users; line 150 in the comparison table restates the same core idea, so one could be consolidated. | πŸ†• NEW | +| 168:1 | "To truly fix code review, you need a platform that incentivizes better practices..." | The section restates Graphite's value propositionβ€”workflow rethinking plus AIβ€”without introducing new information. | βœ… YES | + +--- + +## 2. 
Overlap Analysis + +### Matched Findings (Batch=2 vs Baseline) + +**AIPattern:** +| Baseline Line | Batch=2 Line | Status | +|---------------|--------------|--------| +| 1:60 | 1:60 | βœ… Exact | +| 3:15 | 3:15 | βœ… Exact | +| 3:86 | 3:86 | βœ… Exact | +| 3:155 | 3:155 | βœ… Exact | +| 12 | 12 | βœ… Exact | +| 39 | 39 | βœ… Exact | +| 54 | 54 | βœ… Exact | +| 90:281 | 90:281 | βœ… Exact | +| 153 | - | ❌ Missed | +| 168 | 168 | βœ… Exact | + +**AIPattern Match Rate:** 10/17 = **59%** + +**Directness:** +| Baseline Line | Batch=2 Line | Status | +|---------------|--------------|--------| +| 31 | 7 | β‰ˆ Similar | +| 71 | 69 | βœ… Similar | +| 90 | 88 | βœ… Similar | +| 152 | 152 | βœ… Exact | +| 164 | 163 | βœ… Similar | +| 168 | - | ❌ Missed | + +**Directness Match Rate:** 4/6 = **67%** + +**PseudoAdvice:** +| Baseline Line | Batch=2 Line | Status | +|---------------|--------------|--------| +| 1 | 1 | βœ… Exact | +| 3:131 | - | ❌ Missed | +| 3:192 | 3:155 | βœ… Similar | +| 43 | 43 | βœ… Exact | +| 81 | - | ❌ Missed | +| 100 | 100 | βœ… Exact | +| 168 | - | ❌ Missed | + +**PseudoAdvice Match Rate:** 4/7 = **57%** + +**Repetition:** +| Baseline Line | Batch=2 Line | Status | +|---------------|--------------|--------| +| 168 | 168 | βœ… Exact | +| 170 | - | ❌ Missed | + +**Repetition Match Rate:** 1/2 = **50%** + +### Overall Overlap + +| Rule | Baseline | Matched | Rate | +|------|----------|---------|------| +| AIPattern | 17 | 10 | 59% | +| Directness | 6 | 4 | 67% | +| PseudoAdvice | 7 | 4 | 57% | +| Repetition | 2 | 1 | 50% | +| **TOTAL** | **32** | **19** | **~59%** | + +**VERDICT: ❌ FAIL** (Target was >95%, but improved from 37.5% with batch=4) + +--- + +## 3. 
Missed Findings (Baseline found, Batch=2 missed) + +| Line:Col | Rule | Quoted Text | Description | +|----------|------|-------------|-------------| +| 1:102 | AIPattern | "don't just need tools, they need integrated platforms" | "don't just X, they need Y" structure adds rhetorical flair but no substance | +| 37:1 | AIPattern | "Graphite Agent isn't just a comment bot. It's an interactive companion" | Negation-contrast pattern "isn't just X, it's Y" used rhetorically | +| 58:323 | AIPattern | "The AI doesn't have the same 'agentic' feel... but lacks..." | Negation-contrast redundantly contrasts negatives | +| 92:1 | AIPattern | "This approach excels at answering complex questions..." | (False positive in baseline - clean content) | +| 130:50 | AIPattern | "Instead of trying to do everything, it focuses exclusively..." | Artificial contrast without prior discussion | +| 136:31 | AIPattern | "BugBot doesn't generate, doesn't provide, doesn't help..." | Repeated "doesn'ts" - templated AI phrasing | +| 136:166 | AIPattern | "It has one job: find critical bugs." | Completes AI-style contrast pattern | +| 153:163 | AIPattern | "You're not just adding AI to a slow process, you're fixing..." | Artificial contrast for emphasis | +| 31:1 | Directness | "Most AI code review tools are bots that bolt onto..." | Context-before-answer pattern | +| 168:1 | Directness | "To truly fix code review, you need a platform..." | Concrete takeaway buried mid-section | +| 3:131 | PseudoAdvice | "orchestrating outcomes" | No steps or examples provided | +| 81:1 | PseudoAdvice | "Best for: Teams who want to keep the native GitHub UI..." | Lacks decision criteria | +| 168:27 | PseudoAdvice | "you need a platform that incentivizes better practices..." | No actionable instructions | +| 170:1 | Repetition | "The best teams are shipping faster than ever..." | Repeats established theme without new data | + +**Total Missed:** 13 findings (the table above lists 14 rows, but 92:1 was a baseline false positive and is excluded from the count; improved from 18 with batch=4) + +--- + +## 4. 
New Findings (Batch=2 found, Baseline missed) + +| Line:Col | Rule | Quoted Text | Description | Valid? | +|----------|------|-------------|-------------|--------| +| 71:1 | AIPattern | "CodeRabbit has established itself... rich feature set..." | Broad and marketing-like without concrete detail | βœ… Valid | +| 75:1 | AIPattern | "CodeRabbit is highly configurable." | Vague evaluative phrase sounds like boilerplate | βœ… Valid | +| 77:79 | AIPattern | "comprehensive analysis can be valuable" | Abstract phrase reads like stock copy | βœ… Valid | +| 90:10 | AIPattern | "takes a unique approach to AI code review" | Claim of uniqueness is vague and unsubstantiated | ⚠️ Questionable | +| 90:66 | AIPattern | "comprehensive knowledge graph of your entire repository" | Leans on jargon instead of plain description | ⚠️ Questionable | +| 90:197 | AIPattern | "uses this context to provide unusually deep analysis" | Promotional and non-specific | βœ… Valid | +| 96:149 | AIPattern | "deploy Greptile on-premises for complete control" | 'deploy' is a flagged buzzword | βœ… Valid | +| 132:405 | AIPattern | "optimizations" | Generic and marketing-flavored | βœ… Valid | +| 7:1 | Directness | "Introduction: The 'Review Gap' in the Age of AI" | Gap only named in second paragraph | βœ… Valid | +| 50:1 | Directness | "Overview: The default choice for many teams..." | States popularity, not core capabilities | βœ… Valid | +| 104:1 | Directness | "5. Ellipsis (Best for Automated Fixes)" | Doesn't front-load why it's best | βœ… Valid | +| 77:216 | PseudoAdvice | "You'll need to invest time in configuration..." | No actionable guidance on how to configure | βœ… Valid | +| 152:1 | PseudoAdvice | "How to Choose the Right Tool" (header only) | No following sentences with concrete steps | βœ… Valid | +| 172:66 | PseudoAdvice | "Sign up and review your first stack in minutes." 
| Marketing nudge without actionable detail | βœ… Valid | +| 3:155 | Repetition | "The goal isn't faster delivery, it's sustainable velocity." | Concept already conveyed earlier | βœ… Valid | +| 39:1 | Repetition | "The integration feels seamless..." | Restates integration quality | βœ… Valid | +| 71:77 | Repetition | "rich feature set that goes well beyond..." | Rephrased by later specifics | βœ… Valid | +| 111:1 | Repetition | "This capability is genuinely useful..." | Restates prior explanation | βœ… Valid | +| 121:1 | Repetition | "Best for: Teams who spend too much time..." | Pattern repeated in table | βœ… Valid | + +**Total New:** 19 findings +**Hallucination Rate:** 2/37 β‰ˆ ~5% (the 2 "questionable" items, measured against all 37 warnings; 2 of the 19 new findings, i.e. ~10.5% of new findings) + +--- + +## 5. Comparison: Batch=4 (V5) vs Batch=2 (V6) + +| Metric | Batch=4 | Batch=2 | Change | +|--------|---------|---------|--------| +| Warnings Found | 34 | 37 | +3 | +| Overlap with Baseline | 37.5% | **59%** | +21.5% βœ… | +| Missed Findings | 18 | 13 | -5 βœ… | +| Token Reduction | 37% | **21%** | -16% ❌ | +| Cost | $0.15 | $0.19 | +$0.04 ❌ | +| LLM Requests | 6 | 12 | +6 ❌ | +| Hallucination Rate | ~9% | ~5% | -4% βœ… | + +### Key Improvements with Batch=2 + +1. **More Negation Patterns Detected:** Batch=2 found 3 negation patterns on line 3 (3:15, 3:86, 3:155) that batch=4 missed entirely. + +2. **Higher Overlap:** Accuracy improved from 37.5% to 59% - the smaller batch size reduces "lost in the middle" effects. + +3. **Lower Hallucinations:** Dropped from ~9% to ~5% with fewer questionable findings. + +### Trade-offs + +1. **Less Efficiency:** Token reduction dropped from 37% to 21% because we now have 2 batches instead of 1 (more prompt overhead). + +2. **More LLM Requests:** 12 requests vs 6 - doubling the batch count. + +--- + +## 6. 
Efficiency Analysis + +| Metric | Baseline | Batch=4 | Batch=2 | +|--------|----------|---------|---------| +| Input Tokens | 50,570 | 31,996 | 39,868 | +| Token Reduction | - | -37% | -21% | +| LLM Requests | 24 | 6 | 12 | +| Request Reduction | - | -75% | -50% | +| Cost | $0.20 | $0.15 | $0.19 | +| Cost Reduction | - | -25% | -5% | + +--- + +## 7. Conclusion + +### Batch Size 2 Results + +| Criterion | Target | Actual | Pass/Fail | +|-----------|--------|--------|-----------| +| Overlap with Baseline | >95% | 59% | ❌ FAIL | +| Token Reduction | >50% | 21% | ❌ FAIL | +| Hallucination Rate | 0% | ~5% | ⚠️ MARGINAL | + +### Comparison Summary + +| Batch Size | Overlap | Token Reduction | Hallucinations | Recommendation | +|------------|---------|-----------------|----------------|----------------| +| 4 (V5) | 37.5% | 37% | ~9% | ❌ Not viable | +| 2 (V6) | 59% | 21% | ~5% | ❌ Still not viable | +| 1 (no batch) | 100% | 0% | 0% | βœ… Current default | + +### Recommendation + +**The Rule Batching feature should remain DISABLED by default.** + +While batch size 2 shows meaningful improvement in accuracy (59% vs 37.5% overlap) and lower hallucinations (~5% vs ~9%), it still falls far short of the 95% target. The efficiency gains (21% token reduction) are also well below the 50% target. + +**Potential future improvements:** +1. **Rule-type-aware batching:** Only batch simple buzzword rules together; keep complex structural rules individual +2. **Hybrid approach:** Use batching for first-pass scanning, then verify edge cases individually +3. **Prompt engineering:** Experiment with stronger rule separation in the prompt format +4. 
**Batch size 1 for complex rules:** Default to no batching for rules with negation-pattern detection + +--- + +*Report generated by manual A/B validation test.* diff --git a/scripts/measure-batching-accuracy.ts b/scripts/measure-batching-accuracy.ts new file mode 100644 index 0000000..fec5c4e --- /dev/null +++ b/scripts/measure-batching-accuracy.ts @@ -0,0 +1,374 @@ +#!/usr/bin/env npx ts-node +/** + * Batching Accuracy Measurement Script + * + * This script compares the results of batched vs non-batched rule evaluation + * to validate that the batching optimization doesn't degrade quality. + * + * Usage: + * npx ts-node scripts/measure-batching-accuracy.ts [options] + * + * Options: + * --files Files to evaluate (default: "contents/**\/*.md") + * --verbose Show detailed comparison + * --json Output results as JSON + */ + +import * as fs from "fs"; +import * as path from "path"; +import { glob } from "glob"; +import { execSync } from "child_process"; + +// Load .env file manually +const envPath = path.resolve(process.cwd(), ".env"); +if (fs.existsSync(envPath)) { + console.log(`Loading .env from ${envPath}`); + const envConfig = fs.readFileSync(envPath, "utf-8"); + envConfig.split("\n").forEach((line) => { + const trimmedLine = line.trim(); + // Skip comments and empty lines + if (trimmedLine.startsWith("#") || trimmedLine.startsWith(";") || trimmedLine === "") return; + + const match = trimmedLine.match(/^([^=]+)=(.*)$/); + if (match) { + const key = match[1]!.trim(); + const value = match[2]!.trim().replace(/^"(.*)"$/, "$1").replace(/^'(.*)'$/, "$1"); + process.env[key] = value; + } + }); +} else { + console.log(`No .env file found at ${envPath}`); +} + +// Types for comparison results +interface ViolationKey { + ruleId: string; + quotedText: string; + description: string; +} + +interface ComparisonResult { + file: string; + baselineViolations: number; + batchedViolations: number; + matchingViolations: number; + baselineOnlyViolations: ViolationKey[]; + 
batchedOnlyViolations: ViolationKey[]; + overlapPercentage: number; +} + +interface AccuracySummary { + totalFiles: number; + totalBaselineViolations: number; + totalBatchedViolations: number; + totalMatchingViolations: number; + averageOverlap: number; + tokenReduction: number; + latencyReduction: number; + passesCriteria: boolean; + details: ComparisonResult[]; +} + +function normalizeQuotedText(text: string | undefined): string { + if (!text) return ""; + return text.toLowerCase().trim().replace(/\s+/g, " "); +} + +function createViolationKey( + ruleId: string, + violation: { quoted_text?: string; description?: string; line?: number } +): string { + // Primary Match: Rule + Line Number + // This is the most robust way to compare findings. If the LLM flags the same line + // for the same rule, it "found" the issue, regardless of how it quoted the text. + if (violation.line && violation.line > 0) { + return `${ruleId}|Line:${violation.line}`; + } + + // Fallback: Rule + Normalized Quote (if line detection failed) + const normalizedQuote = normalizeQuotedText(violation.quoted_text); + return `${ruleId}|${normalizedQuote}`; +} + +function parseArgs(): { files: string; verbose: boolean; json: boolean; auto: boolean } { + const args = process.argv.slice(2); + let files = "tests/fixtures/**/*.md"; + let verbose = false; + let json = false; + let auto = true; // Default to auto mode for this run + + for (let i = 0; i < args.length; i++) { + const arg = args[i]; + if (arg === "--files" && args[i + 1]) { + files = args[++i] as string; + } else if (arg === "--verbose") { + verbose = true; + } else if (arg === "--json") { + json = true; + } else if (arg === "--manual") { + auto = false; + } + } + + return { files, verbose, json, auto }; +} + +async function main() { + const { files, verbose, json, auto } = parseArgs(); + + if (!json) { + console.log("πŸ“Š Batching Accuracy Measurement Tool\n"); + console.log("This script validates that rule batching doesn't degrade 
quality.\n"); + } + + // Find test files + const cwd = process.cwd(); + // Ensure pattern uses forward slashes for glob + const pattern = files.replace(/\\/g, "/"); + const testFiles = await glob(pattern, { nodir: true, cwd }); + + if (testFiles.length === 0) { + console.error(`❌ No files found matching pattern: ${files}`); + console.error(" Try: npx ts-node scripts/measure-batching-accuracy.ts --files 'tests/fixtures/**/*.md'"); + process.exit(1); + } + + if (!json) { + console.log(`Found ${testFiles.length} files to evaluate\n`); + } + + // Check for required environment + if (!process.env.ANTHROPIC_API_KEY && !process.env.OPENAI_API_KEY) { + console.error("❌ No LLM API key found. Set ANTHROPIC_API_KEY or OPENAI_API_KEY."); + process.exit(1); + } + + if (auto) { + // Create temporary config files + const baselineConfigPath = path.join(cwd, "baseline-temp-config.ini"); + const batchedConfigPath = path.join(cwd, "batched-temp-config.ini"); + const baselineOutputPath = path.join(cwd, "baseline-results.json"); + const batchedOutputPath = path.join(cwd, "batched-results.json"); + + try { + // 1. Create Configs + fs.writeFileSync(baselineConfigPath, ` +RulesPath= +Concurrency=4 +DefaultSeverity=warning +BatchRules=false + +[**/*.md] +RunRules=VectorLint +`); + + fs.writeFileSync(batchedConfigPath, ` +RulesPath= +Concurrency=4 +DefaultSeverity=warning +BatchRules=true +MaxRulesPerBatch=2 + +[**/*.md] +RunRules=VectorLint +`); + + // 2. Run Baseline + if (!json) console.log("πŸš€ Running Baseline Evaluation (BatchRules=false)..."); + const baselineCmd = `node dist/index.js "${files}" --config "${baselineConfigPath}" --output json > "${baselineOutputPath}"`; + execSync(baselineCmd, { stdio: 'inherit' }); + + // 3. 
Run Batched + if (!json) console.log("\nπŸš€ Running Batched Evaluation (BatchRules=true)..."); + const batchedCmd = `node dist/index.js "${files}" --config "${batchedConfigPath}" --output json > "${batchedOutputPath}"`; + execSync(batchedCmd, { stdio: 'inherit' }); + + // 4. Compare Results + if (!json) console.log("\nπŸ“Š Comparing Results..."); + const summary = await compareResults(baselineOutputPath, batchedOutputPath); + + // Output results + if (json) { + console.log(JSON.stringify(summary, null, 2)); + } else { + printSummary(summary, verbose); + } + + } catch (error) { + console.error("\n❌ Error during execution:", error); + } finally { + // Cleanup + if (fs.existsSync(baselineConfigPath)) fs.unlinkSync(baselineConfigPath); + if (fs.existsSync(batchedConfigPath)) fs.unlinkSync(batchedConfigPath); + if (fs.existsSync(baselineOutputPath)) fs.unlinkSync(baselineOutputPath); + if (fs.existsSync(batchedOutputPath)) fs.unlinkSync(batchedOutputPath); + } + + } else { + // Manual Instructions Mode + console.log("⚠️ Note: This script requires running VectorLint twice per file:"); + console.log(" 1. With BatchRules=false (baseline)"); + console.log(" 2. With BatchRules=true (batched)\n"); + + console.log("πŸ“‹ Instructions for manual comparison:\n"); + console.log("1. Run baseline evaluation (non-batched):"); + console.log(` BatchRules=false npx vectorlint "${files}" --output json > baseline.json\n`); + console.log("2. Run batched evaluation:"); + console.log(` BatchRules=true npx vectorlint "${files}" --output json > batched.json\n`); + console.log("3. 
Compare results manually or use a diff tool\n"); + } +} + +function printSummary(summary: AccuracySummary, verbose: boolean) { + console.log("\n╔══════════════════════════════════════════════════════════════════════════════╗"); + console.log("β•‘ RULE BATCHING OPTIMIZATION - ACCURACY REPORT β•‘"); + console.log("β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\n"); + + console.log(`Files Evaluated: ${summary.totalFiles}`); + console.log(`Total Violations (Baseline): ${summary.totalBaselineViolations}`); + console.log(`Total Violations (Batched): ${summary.totalBatchedViolations}`); + console.log(`Matching Violations: ${summary.totalMatchingViolations}`); + console.log(`Average Overlap: ${summary.averageOverlap.toFixed(1)}%`); + console.log(`Estimated Token Reduction: ~${summary.tokenReduction}%`); + console.log(`Estimated Latency Reduction: ~${summary.latencyReduction}%`); + + if (summary.passesCriteria) { + console.log("\nβœ… PASSED: Batching accuracy meets >95% overlap criteria."); + } else { + console.log("\n❌ FAILED: Batching accuracy is below 95% overlap criteria."); + } + + if (verbose || !summary.passesCriteria) { + console.log("\nπŸ” Detailed Breakdown:"); + summary.details.forEach(d => { + console.log(`\n File: ${path.basename(d.file)}`); + console.log(` Overlap: ${d.overlapPercentage.toFixed(1)}%`); + if (d.baselineOnlyViolations.length > 0) { + console.log(" πŸ”΄ Missed by Batched (Baseline Only):"); + d.baselineOnlyViolations.forEach(v => console.log(` - [${v.ruleId}] ${v.description.substring(0, 60)}...`)); + } + if (d.batchedOnlyViolations.length > 0) { + console.log(" 🟑 Extra in Batched (False Positives?):"); + d.batchedOnlyViolations.forEach(v => console.log(` - [${v.ruleId}] ${v.description.substring(0, 60)}...`)); + } + }); + } +} + +/** + * Compare baseline 
and batched results from JSON files. + * This function can be called programmatically for automated testing. + */ +export async function compareResults( + baselineFile: string, + batchedFile: string +): Promise { + const baseline = JSON.parse(fs.readFileSync(baselineFile, "utf-8")); + const batched = JSON.parse(fs.readFileSync(batchedFile, "utf-8")); + + const details: ComparisonResult[] = []; + let totalBaselineViolations = 0; + let totalBatchedViolations = 0; + let totalMatchingViolations = 0; + + // Build violation maps + const baselineViolations = new Map>(); + const batchedViolations = new Map>(); + + // Process baseline + for (const file of Object.keys(baseline.files || {})) { + const fileData = baseline.files[file]; + if (!baselineViolations.has(file)) { + baselineViolations.set(file, new Set()); + } + const fileViolations = baselineViolations.get(file)!; + + for (const issue of fileData.issues || []) { + const key = createViolationKey(issue.rule || "", { + quoted_text: issue.match, + description: issue.message, + }); + fileViolations.add(key); + totalBaselineViolations++; + } + } + + // Process batched + for (const file of Object.keys(batched.files || {})) { + const fileData = batched.files[file]; + if (!batchedViolations.has(file)) { + batchedViolations.set(file, new Set()); + } + const fileViolations = batchedViolations.get(file)!; + + for (const issue of fileData.issues || []) { + const key = createViolationKey(issue.rule || "", { + quoted_text: issue.match, + description: issue.message, + }); + fileViolations.add(key); + totalBatchedViolations++; + } + } + + // Calculate overlap per file + const allFiles = new Set([...baselineViolations.keys(), ...batchedViolations.keys()]); + + for (const file of allFiles) { + const baselineSet = baselineViolations.get(file) || new Set(); + const batchedSet = batchedViolations.get(file) || new Set(); + + const matching = new Set([...baselineSet].filter((x) => batchedSet.has(x))); + const baselineOnly = 
[...baselineSet].filter((x) => !batchedSet.has(x)); + const batchedOnly = [...batchedSet].filter((x) => !baselineSet.has(x)); + + totalMatchingViolations += matching.size; + + const totalUnique = new Set([...baselineSet, ...batchedSet]).size; + // If both are empty, overlap is 100%. If one is empty and other isn't, 0%. + const overlapPct = totalUnique > 0 ? (matching.size / totalUnique) * 100 : (baselineSet.size === 0 && batchedSet.size === 0 ? 100 : 0); + + details.push({ + file, + baselineViolations: baselineSet.size, + batchedViolations: batchedSet.size, + matchingViolations: matching.size, + baselineOnlyViolations: baselineOnly.map((k) => { + const [ruleId, quotedText, description] = k.split("|"); + return { ruleId: ruleId || "", quotedText: quotedText || "", description: description || "" }; + }), + batchedOnlyViolations: batchedOnly.map((k) => { + const [ruleId, quotedText, description] = k.split("|"); + return { ruleId: ruleId || "", quotedText: quotedText || "", description: description || "" }; + }), + overlapPercentage: overlapPct, + }); + } + + const averageOverlap = + details.length > 0 + ? 
details.reduce((sum, d) => sum + d.overlapPercentage, 0) / details.length + : 0; + + // Token/latency reduction would need to be measured during actual runs + // These are placeholder estimates based on the theoretical model + const estimatedTokenReduction = 70; // Estimated based on N_rules reduction + const estimatedLatencyReduction = 80; // Estimated based on parallel->serial reduction + + return { + totalFiles: allFiles.size, + totalBaselineViolations, + totalBatchedViolations, + totalMatchingViolations, + averageOverlap, + tokenReduction: estimatedTokenReduction, + latencyReduction: estimatedLatencyReduction, + passesCriteria: averageOverlap >= 95, + details, + }; +} + +main().catch((err) => { + console.error("Error:", err); + process.exit(1); +}); diff --git a/src/boundaries/config-loader.ts b/src/boundaries/config-loader.ts index 8c15a66..45dd053 100644 --- a/src/boundaries/config-loader.ts +++ b/src/boundaries/config-loader.ts @@ -26,6 +26,8 @@ enum ConfigKey { SCAN_PATHS = "ScanPaths", CONCURRENCY = "Concurrency", DEFAULT_SEVERITY = "DefaultSeverity", + BATCH_RULES = "BatchRules", + MAX_RULES_PER_BATCH = "MaxRulesPerBatch", } function resolveConfigPath(cwd: string, configPath?: string): string { @@ -66,6 +68,8 @@ export function loadConfig( let rulesPathRaw: string | undefined; let concurrencyRaw: number | undefined; let defaultSeverityRaw: string | undefined; + let batchRulesRaw: boolean | undefined; + let maxRulesPerBatchRaw: number | undefined; const rawConfigObj: Record = {}; try { @@ -125,6 +129,19 @@ export function loadConfig( case ConfigKey.DEFAULT_SEVERITY as string: defaultSeverityRaw = stripQuotes(val); break; + case ConfigKey.BATCH_RULES as string: { + const normalizedVal = stripQuotes(val).toLowerCase(); + batchRulesRaw = normalizedVal === "true" || normalizedVal === "1"; + break; + } + case ConfigKey.MAX_RULES_PER_BATCH as string: { + const parsed = parseInt(val, 10); + if (Number.isNaN(parsed) || parsed < 1 || parsed > 20) { + throw new 
ConfigError(`Invalid MaxRulesPerBatch value: ${val}. Must be between 1 and 20.`); + } + maxRulesPerBatchRaw = parsed; + break; + } } } } @@ -167,6 +184,8 @@ export function loadConfig( concurrency, configDir, defaultSeverity: defaultSeverityRaw, + batchRules: batchRulesRaw, + maxRulesPerBatch: maxRulesPerBatchRaw, }; try { diff --git a/src/cli/commands.ts b/src/cli/commands.ts index b5ec2d1..5c99b64 100644 --- a/src/cli/commands.ts +++ b/src/cli/commands.ts @@ -194,6 +194,8 @@ export function registerMainCommand(program: Command): void { verbose: cliOptions.verbose, outputFormat: outputFormat, scanPaths: config.scanPaths, + batchRules: config.batchRules, + maxRulesPerBatch: config.maxRulesPerBatch, pricing: { inputPricePerMillion: env.INPUT_PRICE_PER_MILLION, outputPricePerMillion: env.OUTPUT_PRICE_PER_MILLION, diff --git a/src/cli/orchestrator.ts b/src/cli/orchestrator.ts index 7c660c8..613214a 100644 --- a/src/cli/orchestrator.ts +++ b/src/cli/orchestrator.ts @@ -9,7 +9,7 @@ import { printFileHeader, printIssueRow, printEvaluationSummaries, type Evaluati import { checkTarget } from '../prompts/target'; import { isSubjectiveResult } from '../prompts/schema'; import { handleUnknownError, MissingDependencyError } from '../errors/index'; -import { createEvaluator } from '../evaluators/index'; +import { createEvaluator, BatchedCheckEvaluator, partitionRulesByBatchability } from '../evaluators/index'; import { Type, Severity } from '../evaluators/types'; import { OutputFormat } from './types'; import type { @@ -17,7 +17,7 @@ import type { ReportIssueParams, ProcessViolationsParams, ProcessCriterionParams, ProcessCriterionResult, ValidationParams, ProcessPromptResultParams, RunPromptEvaluationParams, RunPromptEvaluationResult, EvaluateFileParams, EvaluateFileResult, - RunPromptEvaluationResultSuccess + RunPromptEvaluationResultSuccess, EvaluateBatchedRulesParams, EvaluateBatchedRulesResult } from './types'; import { calculateCost, @@ -751,8 +751,99 @@ async function 
runPromptEvaluation( } } +/* + * Evaluates a batch of Check rules together. + * Returns evaluation stats and any fallback rules if batching fails. + */ +async function evaluateBatchedRules( + params: EvaluateBatchedRulesParams +): Promise { + const { provider, batchable, maxRulesPerBatch, relFile, content, outputFormat, jsonFormatter, verbose } = params; + + let inputTokens = 0; + let outputTokens = 0; + let errors = 0; + let warnings = 0; + let requestFailures = 0; + let hadOperationalErrors = false; + let hadSeverityErrors = false; + const scores = new Map(); + const fallbackRules: PromptFile[] = []; + + try { + const batchedEvaluator = new BatchedCheckEvaluator(provider, batchable, { + maxRulesPerBatch, + }); + const batchedResults = await batchedEvaluator.evaluate(relFile, content); + + // Process each batched result + for (const rule of batchable) { + const ruleId = (rule.meta.id || rule.filename.replace(/\.md$/, "")).toString(); + const result = batchedResults.get(ruleId); + + if (!result) { + if (verbose) { + console.warn(`[vectorlint] No result for batched rule: ${ruleId}`); + } + hadOperationalErrors = true; + continue; + } + + // Accumulate token usage + if (result.usage) { + inputTokens += result.usage.inputTokens; + outputTokens += result.usage.outputTokens; + } + + const promptResult = routePromptResult({ + promptFile: rule, + result, + content, + relFile, + outputFormat, + jsonFormatter, + verbose, + }); + + errors += promptResult.errors; + warnings += promptResult.warnings; + hadOperationalErrors = hadOperationalErrors || promptResult.hadOperationalErrors; + hadSeverityErrors = hadSeverityErrors || promptResult.hadSeverityErrors; + + if (promptResult.scoreEntries && promptResult.scoreEntries.length > 0) { + const ruleName = (rule.meta.id || rule.filename).toString(); + scores.set(ruleName, promptResult.scoreEntries); + } + } + } catch (e: unknown) { + const err = handleUnknownError(e, "Batched rule evaluation"); + console.error(`[vectorlint] Batched 
evaluation failed: ${err.message}`); + hadOperationalErrors = true; + requestFailures += 1; + + // Fall back to individual evaluation for all batchable rules + if (verbose) { + console.warn("[vectorlint] Falling back to individual evaluation for batched rules"); + } + fallbackRules.push(...batchable); + } + + return { + inputTokens, + outputTokens, + errors, + warnings, + requestFailures, + hadOperationalErrors, + hadSeverityErrors, + scores, + fallbackRules + }; +} + /* * Evaluates a single file with all applicable prompts. + * Supports batched evaluation for Check rules when enabled. */ async function evaluateFile( params: EvaluateFileParams @@ -766,6 +857,8 @@ async function evaluateFile( scanPaths, outputFormat = OutputFormat.Line, verbose, + batchRules, + maxRulesPerBatch = 5, } = options; let hadOperationalErrors = false; @@ -820,69 +913,107 @@ async function evaluateFile( toRun.push(...prompts); } - const results = await runWithConcurrency( - toRun, - concurrency, - async (prompt) => { - return runPromptEvaluation({ - promptFile: prompt, - relFile, - content, - provider, - ...(searchProvider !== undefined && { searchProvider }), - }); + // Partition rules for batching + const { batchable, nonBatchable } = batchRules + ? 
partitionRulesByBatchability(toRun) + : { batchable: [], nonBatchable: toRun }; + + // Process batchable Check rules together + if (batchable.length > 0) { + const batchResult = await evaluateBatchedRules({ + provider, + batchable, + maxRulesPerBatch, + relFile, + content, + outputFormat, + jsonFormatter, + verbose, + }); + + totalInputTokens += batchResult.inputTokens; + totalOutputTokens += batchResult.outputTokens; + totalErrors += batchResult.errors; + totalWarnings += batchResult.warnings; + requestFailures += batchResult.requestFailures; + hadOperationalErrors = hadOperationalErrors || batchResult.hadOperationalErrors; + hadSeverityErrors = hadSeverityErrors || batchResult.hadSeverityErrors; + + for (const [key, val] of batchResult.scores) { + allScores.set(key, val); } - ); - // Aggregate results from each prompt - for (let idx = 0; idx < toRun.length; idx++) { - const p = toRun[idx]; - const r = results[idx]; - if (!p || !r) continue; - - if (r.ok !== true) { - // Check if this is a missing dependency error - if so, skip gracefully - if (r.error instanceof MissingDependencyError) { - console.warn(`[vectorlint] Skipping ${p.filename}: ${r.error.message}`); - if (r.error.hint) { - console.warn(`[vectorlint] Hint: ${r.error.hint}`); + if (batchResult.fallbackRules.length > 0) { + nonBatchable.push(...batchResult.fallbackRules); + } + } + + // Process non-batchable rules individually (Judge rules, special evaluators, etc.) 
+ if (nonBatchable.length > 0) { + const results = await runWithConcurrency( + nonBatchable, + concurrency, + async (prompt) => { + return runPromptEvaluation({ + promptFile: prompt, + relFile, + content, + provider, + ...(searchProvider !== undefined && { searchProvider }), + }); + } + ); + + // Aggregate results from each prompt + for (let idx = 0; idx < nonBatchable.length; idx++) { + const p = nonBatchable[idx]; + const r = results[idx]; + if (!p || !r) continue; + + if (r.ok !== true) { + // Check if this is a missing dependency error - if so, skip gracefully + if (r.error instanceof MissingDependencyError) { + console.warn(`[vectorlint] Skipping ${p.filename}: ${r.error.message}`); + if (r.error.hint) { + console.warn(`[vectorlint] Hint: ${r.error.hint}`); + } + // Skip this evaluation entirely - don't count it as a failure + continue; } - // Skip this evaluation entirely - don't count it as a failure + + // Other errors are actual failures + console.error(` Prompt failed: ${p.filename}`); + console.error(r.error); + hadOperationalErrors = true; + requestFailures += 1; continue; } - // Other errors are actual failures - console.error(` Prompt failed: ${p.filename}`); - console.error(r.error); - hadOperationalErrors = true; - requestFailures += 1; - continue; - } - - // Accumulate token usage - if (r.result.usage) { - totalInputTokens += r.result.usage.inputTokens; - totalOutputTokens += r.result.usage.outputTokens; - } + // Accumulate token usage + if (r.result.usage) { + totalInputTokens += r.result.usage.inputTokens; + totalOutputTokens += r.result.usage.outputTokens; + } - const promptResult = routePromptResult({ - promptFile: p, - result: r.result, - content, - relFile, - outputFormat, - jsonFormatter, - verbose, - }); - totalErrors += promptResult.errors; - totalWarnings += promptResult.warnings; - hadOperationalErrors = - hadOperationalErrors || promptResult.hadOperationalErrors; - hadSeverityErrors = hadSeverityErrors || promptResult.hadSeverityErrors; 
+ const promptResult = routePromptResult({ + promptFile: p, + result: r.result, + content, + relFile, + outputFormat, + jsonFormatter, + verbose, + }); + totalErrors += promptResult.errors; + totalWarnings += promptResult.warnings; + hadOperationalErrors = + hadOperationalErrors || promptResult.hadOperationalErrors; + hadSeverityErrors = hadSeverityErrors || promptResult.hadSeverityErrors; - if (promptResult.scoreEntries && promptResult.scoreEntries.length > 0) { - const ruleName = (p.meta.id || p.filename).toString(); - allScores.set(ruleName, promptResult.scoreEntries); + if (promptResult.scoreEntries && promptResult.scoreEntries.length > 0) { + const ruleName = (p.meta.id || p.filename).toString(); + allScores.set(ruleName, promptResult.scoreEntries); + } } } diff --git a/src/cli/types.ts b/src/cli/types.ts index 9666d5e..6f410be 100644 --- a/src/cli/types.ts +++ b/src/cli/types.ts @@ -28,6 +28,8 @@ export interface EvaluationOptions { scanPaths: FilePatternConfig[]; outputFormat?: OutputFormat; pricing?: PricingConfig; + batchRules?: boolean; + maxRulesPerBatch?: number; } export interface EvaluationResult { @@ -137,3 +139,22 @@ export interface EvaluateFileResult extends ErrorTrackingResult { requestFailures: number; tokenUsage?: TokenUsageStats; } + +export interface EvaluateBatchedRulesParams { + provider: LLMProvider; + batchable: PromptFile[]; + maxRulesPerBatch: number; + relFile: string; + content: string; + outputFormat: OutputFormat; + jsonFormatter: ValeJsonFormatter | JsonFormatter | RdJsonFormatter; + verbose: boolean; +} + +export interface EvaluateBatchedRulesResult extends ErrorTrackingResult { + inputTokens: number; + outputTokens: number; + requestFailures: number; + scores: Map; + fallbackRules: PromptFile[]; +} diff --git a/src/evaluators/batched-check-evaluator.ts b/src/evaluators/batched-check-evaluator.ts new file mode 100644 index 0000000..e074656 --- /dev/null +++ b/src/evaluators/batched-check-evaluator.ts @@ -0,0 +1,293 @@ +/** + * 
Batched Check Evaluator + * + * Evaluates multiple Check-type rules in a single LLM call per chunk. + * This significantly reduces token usage by sending content only once. + * + * Key differences from BaseEvaluator: + * - Accepts multiple rules instead of a single prompt + * - Returns a Map of rule_id -> SemiObjectiveResult + * - Uses batched prompt template and schema + */ + +import type { LLMProvider } from "../providers/llm-provider"; +import type { PromptFile } from "../schemas/prompt-schemas"; +import type { TokenUsage } from "../providers/token-usage"; +import { + buildBatchedCheckLLMSchema, + type BatchedCheckLLMResult, + type SemiObjectiveResult, + type SemiObjectiveItem, +} from "../prompts/schema"; +import { Severity } from "./types"; +import { + mergeViolations, + RecursiveChunker, + countWords, + type Chunk, +} from "../chunking"; +import { calculateSemiObjectiveScore } from "../scoring"; +import { prependLineNumbers } from "../output/line-numbering"; +import { + buildBatchedCheckPrompt, + extractBatchedRuleContexts, + groupIntoBatches, +} from "../prompts/batched-prompt-builder"; + +const CHUNKING_THRESHOLD = 600; // Word count threshold for enabling chunking +const MAX_CHUNK_SIZE = 500; // Maximum words per chunk +const DEFAULT_MAX_RULES_PER_BATCH = 5; // Default batch size to mitigate "lost in the middle" + +/** + * Result for a single rule within a batched evaluation. + */ +export interface BatchedRuleResult { + ruleId: string; + result: SemiObjectiveResult; +} + +/** + * Options for batched check evaluation. + */ +export interface BatchedCheckEvaluatorOptions { + maxRulesPerBatch?: number; + defaultSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined; +} + +/** + * Evaluator that processes multiple Check-type rules in batched LLM calls. + * + * TODO: Clean up or refactor this class. + * This implementation was part of the "Rule Batching Optimization" experiment. 
+ * Validation showed that batching complex rules leads to significant accuracy loss (~60-90% drop in recall). + * The feature is currently disabled by default (BatchRules=false). + * If future prompt engineering solves the "lost in the middle" problem, this class can be reactivated. + * Otherwise, it may be candidate for removal to reduce technical debt. + */ +export class BatchedCheckEvaluator { + private maxRulesPerBatch: number; + private defaultSeverity: typeof Severity.WARNING | typeof Severity.ERROR | undefined; + + constructor( + private llmProvider: LLMProvider, + private rules: PromptFile[], + options: BatchedCheckEvaluatorOptions = {} + ) { + this.maxRulesPerBatch = options.maxRulesPerBatch ?? DEFAULT_MAX_RULES_PER_BATCH; + this.defaultSeverity = options.defaultSeverity ?? undefined; + + if (rules.length === 0) { + throw new Error("BatchedCheckEvaluator requires at least one rule"); + } + + // Validate all rules are Check type + for (const rule of rules) { + const ruleType = rule.meta.type; + if (ruleType === "judge") { + throw new Error( + `BatchedCheckEvaluator only supports Check-type rules, but got Judge-type: ${rule.meta.id || rule.filename}` + ); + } + } + } + + /** + * Evaluates all rules against the content and returns results per rule. 
+   */
+  async evaluate(
+    _file: string,
+    content: string
+  ): Promise<Map<string, SemiObjectiveResult>> {
+    // Prepend line numbers for deterministic line reporting
+    const numberedContent = prependLineNumbers(content);
+    const chunks = this.chunkContent(numberedContent);
+    const totalWordCount = countWords(content) || 1;
+
+    // Extract rule contexts for prompt building
+    const ruleContexts = extractBatchedRuleContexts(this.rules);
+
+    // Build rule ID to PromptFile map for scoring options
+    const ruleMap = new Map();
+    for (const rule of this.rules) {
+      const ruleId = (rule.meta.id || rule.filename.replace(/\.md$/, "")).toString();
+      ruleMap.set(ruleId, rule);
+    }
+
+    // Group rules into batches to mitigate "lost in the middle"
+    const ruleBatches = groupIntoBatches(ruleContexts, this.maxRulesPerBatch);
+
+    // Collect violations per rule across all chunks and batches
+    const violationsByRule = new Map();
+    for (const ctx of ruleContexts) {
+      violationsByRule.set(ctx.id, []);
+    }
+
+    const usages: (TokenUsage | undefined)[] = [];
+
+    // Process each batch of rules
+    for (const batch of ruleBatches) {
+      const ruleIds = batch.map((r) => r.id);
+      const batchedPrompt = buildBatchedCheckPrompt(batch);
+      const schema = buildBatchedCheckLLMSchema(ruleIds);
+
+      // Process each chunk with the batched prompt
+      for (const chunk of chunks) {
+        const { data: llmResult, usage } =
+          await this.llmProvider.runPromptStructured<BatchedCheckLLMResult>(
+            chunk.content,
+            batchedPrompt,
+            schema
+          );
+
+        usages.push(usage);
+
+        // Distribute violations to their respective rules
+        for (const ruleResult of llmResult.rules) {
+          const ruleViolations = violationsByRule.get(ruleResult.rule_id);
+          if (ruleViolations) {
+            // Convert to SemiObjectiveItem format, filtering out undefined values
+            const items: SemiObjectiveItem[] = ruleResult.violations.map((v) => {
+              const item: SemiObjectiveItem = {
+                description: v.description,
+                analysis: v.analysis,
+              };
+              if (v.suggestion) item.suggestion = v.suggestion;
+              if (v.quoted_text) item.quoted_text = v.quoted_text;
+              if (v.context_before) item.context_before = v.context_before;
+              if (v.context_after) item.context_after = v.context_after;
+              return item;
+            });
+            ruleViolations.push(items);
+          }
+        }
+      }
+    }
+
+    // Calculate scores for each rule
+    const results = new Map();
+    const aggregatedUsage = this.aggregateUsage(usages);
+    const usagePerRule = this.distributeUsage(aggregatedUsage, this.rules.length);
+
+    for (const [ruleId, chunkViolations] of violationsByRule) {
+      const rule = ruleMap.get(ruleId);
+      const mergedViolations = mergeViolations(chunkViolations);
+
+      const result = calculateSemiObjectiveScore(mergedViolations, totalWordCount, {
+        strictness: rule?.meta.strictness,
+        defaultSeverity: this.defaultSeverity,
+        promptSeverity: rule?.meta.severity,
+      });
+
+      results.set(ruleId, {
+        ...result,
+        ...(usagePerRule && { usage: usagePerRule }),
+      });
+    }
+
+    return results;
+  }
+
+  /**
+   * Chunks content if it exceeds the threshold.
+   * Respects evaluateAs: "document" setting (checks first rule's setting).
+   */
+  private chunkContent(content: string): Chunk[] {
+    const wordCount = countWords(content) || 1;
+
+    // Check if any rule requires document-level evaluation
+    const anyDocumentLevel = this.rules.some(
+      (r) => r.meta.evaluateAs === "document"
+    );
+
+    if (anyDocumentLevel || wordCount <= CHUNKING_THRESHOLD) {
+      return [{ content, index: 0 }];
+    }
+
+    const chunker = new RecursiveChunker();
+    return chunker.chunk(content, { maxChunkSize: MAX_CHUNK_SIZE });
+  }
+
+  /**
+   * Aggregates token usage from multiple LLM calls. 
+ */ + private aggregateUsage( + usages: (TokenUsage | undefined)[] + ): TokenUsage | undefined { + const validUsages = usages.filter((u): u is TokenUsage => u !== undefined); + if (validUsages.length === 0) return undefined; + + return validUsages.reduce( + (acc, u) => ({ + inputTokens: acc.inputTokens + u.inputTokens, + outputTokens: acc.outputTokens + u.outputTokens, + }), + { inputTokens: 0, outputTokens: 0 } + ); + } + + /** + * Distributes total usage evenly across rules for reporting. + */ + private distributeUsage( + totalUsage: TokenUsage | undefined, + ruleCount: number + ): TokenUsage | undefined { + if (!totalUsage || ruleCount === 0) return undefined; + + return { + inputTokens: Math.round(totalUsage.inputTokens / ruleCount), + outputTokens: Math.round(totalUsage.outputTokens / ruleCount), + }; + } +} + +/** + * Checks if a rule can be batched. + * Rules can be batched if: + * - Type is "check" (not "judge") + * - Evaluator is "base" or not specified + * - Does not have template variables (no {{claims}} etc.) + */ +export function canBatchRule(rule: PromptFile): boolean { + const ruleType = rule.meta.type; + const evaluator = rule.meta.evaluator; + + // Only Check-type rules can be batched (Judge type cannot) + if (ruleType === "judge") { + return false; + } + + // Only base evaluator can be batched (not technical-accuracy, etc.) + if (evaluator && evaluator !== "base") { + return false; + } + + // Rules with template variables cannot be batched + // (they require preprocessing like claim extraction) + const templatePattern = /\{\{\s*[\w.]+\s*\}\}/; + if (templatePattern.test(rule.body)) { + return false; + } + + return true; +} + +/** + * Groups rules into batchable and non-batchable categories. 
+ */ +export function partitionRulesByBatchability( + rules: PromptFile[] +): { batchable: PromptFile[]; nonBatchable: PromptFile[] } { + const batchable: PromptFile[] = []; + const nonBatchable: PromptFile[] = []; + + for (const rule of rules) { + if (canBatchRule(rule)) { + batchable.push(rule); + } else { + nonBatchable.push(rule); + } + } + + return { batchable, nonBatchable }; +} diff --git a/src/evaluators/index.ts b/src/evaluators/index.ts index 35e0a1c..5408f79 100644 --- a/src/evaluators/index.ts +++ b/src/evaluators/index.ts @@ -1,11 +1,12 @@ /* * Evaluators module - exports evaluator interface, base class, and registry. - * + * * Import this module to: * - Access the Evaluator interface for type definitions * - Use BaseEvaluator as a base class for custom evaluators * - Use registry functions to create and register evaluators - * + * - Use BatchedCheckEvaluator for batched rule evaluation + * * Importing this module also triggers self-registration of all built-in evaluators. */ @@ -15,6 +16,15 @@ export type { Evaluator } from './evaluator'; // Base evaluator class (also triggers 'base' registration on import) export { BaseEvaluator } from './base-evaluator'; +// Batched evaluator for multiple Check rules +export { + BatchedCheckEvaluator, + canBatchRule, + partitionRulesByBatchability, + type BatchedRuleResult, + type BatchedCheckEvaluatorOptions, +} from './batched-check-evaluator'; + // Registry functions export { registerEvaluator, diff --git a/src/prompts/batched-prompt-builder.ts b/src/prompts/batched-prompt-builder.ts new file mode 100644 index 0000000..0a84cf6 --- /dev/null +++ b/src/prompts/batched-prompt-builder.ts @@ -0,0 +1,96 @@ +/** + * Batched Prompt Builder + * + * Combines multiple Check rules into a single prompt for batch evaluation. 
+ */ + +import type { PromptFile } from "./prompt-loader"; + +export interface BatchedRuleContext { + id: string; + name: string; + body: string; +} + +const BATCHED_SYSTEM_PREAMBLE = `Evaluate the content against ${"{num_rules}"} distinct rules. + +## PROTOCOL +For each rule listed below, you must: +1. **Switch Context**: Focus ONLY on that specific rule's definition. Ignore others. +2. **Scan**: Read the entire content looking for violations of THAT rule. +3. **Log**: Record any violations found (or record an empty list if none). + +## OUTPUT FORMAT +Return a JSON object with a "rules" array containing exactly ${"{num_rules}"} entries, one for each rule ID. + +## RULES (TASKS) +`; + +const RULE_SEPARATOR = "\n\n"; + +/** + * Formats a single rule for inclusion in a batched prompt. + * @param rule - The rule context containing id, name, and body + * @returns Formatted string for the rule + */ +export function formatRuleForBatch(rule: BatchedRuleContext, index: number): string { + return `### TASK ${index + 1}: Check Rule [${rule.id}] (${rule.name}) +${rule.body} +--------------------------------------------------`; +} + +/** + * Builds a batched prompt from multiple Check rules. + * Combines all rule prompts into a single system prompt with clear delineation. + * + * @param rules - Array of rules to batch together + * @returns The combined system prompt for batched evaluation + */ +export function buildBatchedCheckPrompt(rules: BatchedRuleContext[]): string { + if (rules.length === 0) { + throw new Error("Cannot build batched prompt with zero rules"); + } + + const formattedRules = rules.map((r, i) => formatRuleForBatch(r, i)).join(RULE_SEPARATOR); + + // Inject the number of rules into the preamble + const preamble = BATCHED_SYSTEM_PREAMBLE.replace(/\{num_rules\}/g, rules.length.toString()); + + return `${preamble} +${formattedRules} + +## VERIFICATION +You must output exactly ${rules.length} results. One for each Task above. 
+`;
+}
+
+/**
+ * Extracts BatchedRuleContext from PromptFile objects.
+ * Only extracts the essential information needed for batching.
+ *
+ * @param prompts - Array of PromptFile objects
+ * @returns Array of BatchedRuleContext objects
+ */
+export function extractBatchedRuleContexts(
+  prompts: PromptFile[]
+): BatchedRuleContext[] {
+  return prompts.map((p) => ({
+    id: (p.meta.id || p.filename.replace(/\.md$/, "")).toString(),
+    name: (p.meta.name || p.meta.id || p.filename).toString(),
+    body: p.body,
+  }));
+}
+
+/**
+ * Groups rules into batches of a maximum size.
+ */
+export function groupIntoBatches<T>(
+  rules: T[],
+  maxBatchSize: number = 5
+): T[][] {
+  const batches: T[][] = [];
+  for (let i = 0; i < rules.length; i += maxBatchSize) {
+    batches.push(rules.slice(i, i + maxBatchSize));
+  }
+  return batches;
+}
diff --git a/src/prompts/schema.ts b/src/prompts/schema.ts
index 96374f5..b80b643 100644
--- a/src/prompts/schema.ts
+++ b/src/prompts/schema.ts
@@ -93,6 +93,64 @@ export function buildSemiObjectiveLLMSchema() {
   } as const;
 }
 
+/**
+ * Builds the JSON schema for batched Check evaluation.
+ * The schema requires the LLM to output violations grouped by rule_id.
+ * @param ruleIds - Array of rule IDs that will be evaluated in this batch
+ */
+export function buildBatchedCheckLLMSchema(ruleIds: string[]) {
+  return {
+    name: "vectorlint_batched_check_result",
+    strict: true,
+    schema: {
+      type: "object",
+      additionalProperties: false,
+      properties: {
+        rules: {
+          type: "array",
+          description: `Evaluation results for each rule. Must include an entry for each rule: ${ruleIds.join(", ")}`,
+          items: {
+            type: "object",
+            additionalProperties: false,
+            properties: {
+              rule_id: {
+                type: "string",
+                description: `The rule ID being evaluated. 
Must be one of: ${ruleIds.join(", ")}`, + }, + violations: { + type: "array", + items: { + type: "object", + additionalProperties: false, + properties: { + line: { type: "number" }, + quoted_text: { type: "string" }, + context_before: { type: "string" }, + context_after: { type: "string" }, + description: { type: "string" }, + analysis: { type: "string" }, + suggestion: { type: "string" }, + }, + required: [ + "quoted_text", + "context_before", + "context_after", + "description", + "analysis", + "suggestion", + ], + }, + }, + }, + required: ["rule_id", "violations"], + }, + }, + }, + required: ["rules"], + }, + } as const; +} + export type SubjectiveLLMResult = { criteria: Array<{ name: string; @@ -120,6 +178,25 @@ export type SemiObjectiveLLMResult = { }>; }; +/** + * LLM result schema for batched Check evaluation. + * Multiple rules are evaluated in a single LLM call. + * Each rule's violations are tagged with its rule_id. + */ +export type BatchedCheckLLMResult = { + rules: Array<{ + rule_id: string; + violations: Array<{ + description: string; + analysis: string; + suggestion?: string; + quoted_text?: string; + context_before?: string; + context_after?: string; + }>; + }>; +}; + export type SubjectiveResult = { type: typeof EvaluationType.JUDGE; final_score: number; // 1-10 diff --git a/src/schemas/config-schemas.ts b/src/schemas/config-schemas.ts index 064e72c..9023d2f 100644 --- a/src/schemas/config-schemas.ts +++ b/src/schemas/config-schemas.ts @@ -6,6 +6,8 @@ export const CONFIG_SCHEMA = z.object({ concurrency: z.number().int().positive().default(4), configDir: z.string().min(1), defaultSeverity: z.enum(['warning', 'error']).optional(), + batchRules: z.boolean().default(false), + maxRulesPerBatch: z.number().int().min(1).max(20).default(5), scanPaths: z.array(z.object({ pattern: z.string(), runRules: z.array(z.string()).default([]), diff --git a/src/schemas/openai-responses.ts b/src/schemas/openai-responses.ts index 90a16f6..ccb180b 100644 --- 
a/src/schemas/openai-responses.ts +++ b/src/schemas/openai-responses.ts @@ -33,7 +33,7 @@ export const OPENAI_RESPONSE_SCHEMA = z.object({ model: z.string().optional(), choices: z.array(OPENAI_CHOICE_SCHEMA).min(1), usage: OPENAI_USAGE_SCHEMA.optional(), - system_fingerprint: z.string().optional(), + system_fingerprint: z.string().nullable().optional(), }); // Inferred TypeScript types diff --git a/tsconfig.json b/tsconfig.json index 5bde4dd..fbea078 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -18,6 +18,6 @@ "sourceMap": true, "types": ["node"] }, - "include": ["src/**/*", "*.config.{js,mjs,ts}"], + "include": ["src/**/*", "scripts/**/*", "*.config.{js,mjs,ts}"], "exclude": ["node_modules", "dist", "tests"] }