From 53fcdc5435cde06d0f4e7bb027c0ec4fc325ffa1 Mon Sep 17 00:00:00 2001
From: Osho Emmanuel <oshoklinsmann@gmail.com>
Date: Wed, 4 Mar 2026 14:16:32 +0100
Subject: [PATCH 1/3] refactor(scoring): score check rules from surfaced
 violations only

- Introduce RawCheckResult type: violations + word_count, no score fields
- Evaluators (base, accuracy) return RawCheckResult; scoring deferred
- Orchestrator calls calculateCheckScore after gate filtering, so only
  violations surfaced to the user contribute to the density penalty
- Add regression test: in a 100-word file with 2 violations, 1 of which
  is filtered out by the confidence gate, the score must be 9.0 (not
  the 8.0 produced by scoring both)
---
 src/cli/orchestrator.ts              | 21 ++++++--
 src/evaluators/accuracy-evaluator.ts | 17 +++---
 src/evaluators/base-evaluator.ts     | 20 ++------
 src/prompts/schema.ts                | 19 ++++++-
 tests/orchestrator-filtering.test.ts | 77 +++++++++++++++++-----------
 tests/scoring-types.test.ts          | 17 +++---
 6 files changed, 100 insertions(+), 71 deletions(-)

diff --git a/src/cli/orchestrator.ts b/src/cli/orchestrator.ts
index 4bc4828..da50d4a 100644
--- a/src/cli/orchestrator.ts
+++ b/src/cli/orchestrator.ts
@@ -25,6 +25,7 @@ import {
   calculateCost,
   TokenUsageStats
 } from '../providers/token-usage';
+import { calculateCheckScore } from '../scoring';
 import { locateQuotedText } from "../output/location";
 import {
   computeFilterDecision,
@@ -615,12 +616,22 @@ function routePromptResult(
 
   // Handle Check Result
   if (!isJudgeResult(result)) {
-    const severity = result.severity;
     const { decisions, surfacedViolations } = getViolationFilterResults(
       result.violations
     );
     const violationCount = surfacedViolations.length;
 
+    // Score calculated from surfaced violations only — matches what user sees
+    const scored = calculateCheckScore(
+      surfacedViolations,
+      result.word_count,
+      {
+        strictness: promptFile.meta.strictness,
+        promptSeverity: promptFile.meta.severity,
+      }
+    );
+    const severity = scored.severity;
+
     // Group violations by criterionName
     const violationsByCriterion = new Map<
       string | undefined,
@@ -671,14 +682,14 @@ function routePromptResult(
     }
 
     // If no violations but we have a message (JSON output), report it
-    if (violationCount === 0 && (outputFormat === OutputFormat.Json || outputFormat === OutputFormat.ValeJson) && result.message) {
+    if (violationCount === 0 && (outputFormat === OutputFormat.Json || outputFormat === OutputFormat.ValeJson) && scored.message) {
       const ruleName = buildRuleName(promptFile.pack, promptId, undefined);
       reportIssue({
         file: relFile,
         line: 1,
         column: 1,
         severity,
-        summary: result.message,
+        summary: scored.message,
         ruleName,
         outputFormat,
         jsonFormatter,
@@ -689,8 +700,8 @@ function routePromptResult(
     // Create scoreEntry for Quality Scores display
     const scoreEntry: EvaluationSummary = {
       id: buildRuleName(promptFile.pack, promptId, undefined),
-      scoreText: `${result.final_score.toFixed(1)}/10`,
-      score: result.final_score,
+      scoreText: `${scored.final_score.toFixed(1)}/10`,
+      score: scored.final_score,
     };
 
     if (debugJson) {
diff --git a/src/evaluators/accuracy-evaluator.ts b/src/evaluators/accuracy-evaluator.ts
index c20e517..9a27b9f 100644
--- a/src/evaluators/accuracy-evaluator.ts
+++ b/src/evaluators/accuracy-evaluator.ts
@@ -3,14 +3,13 @@ import { registerEvaluator } from "./evaluator-registry";
 import type { LLMProvider } from "../providers/llm-provider";
 import type { SearchProvider } from "../providers/search-provider";
 import type { PromptFile } from "../schemas/prompt-schemas";
-import type { PromptEvaluationResult } from "../prompts/schema";
+import type { PromptEvaluationResult, RawCheckResult } from "../prompts/schema";
 import type { TokenUsage } from "../providers/token-usage";
 import { renderTemplate } from "../prompts/template-renderer";
 import { getPrompt } from "./prompt-loader";
 import { z } from "zod";
-import { Type, type Severity } from "./types";
+import { Type, EvaluationType, type Severity } from "./types";
 import { MissingDependencyError } from "../errors/index";
-import { calculateCheckScore } from "../scoring/scorer";
 import { countWords } from "../chunking";
 
 // Schema for claim extraction response
@@ -65,15 +64,13 @@ export class TechnicalAccuracyEvaluator extends BaseEvaluator {
     // Use the scoring module to calculate result
     if (claims.length === 0) {
       const wordCount = countWords(content) || 1;
-      const result = calculateCheckScore([], wordCount, {
-        strictness: this.prompt.meta.strictness,
-        defaultSeverity: this.defaultSeverity,
-        promptSeverity: this.prompt.meta.severity,
-      });
-      return {
-        ...result,
+      const raw: RawCheckResult = {
+        type: EvaluationType.CHECK,
+        violations: [],
+        word_count: wordCount,
         ...(claimUsage && { usage: claimUsage }),
       };
+      return raw;
     }
 
     // Step 2: Search for evidence for each claim
diff --git a/src/evaluators/base-evaluator.ts b/src/evaluators/base-evaluator.ts
index 1142dd0..4b84c39 100644
--- a/src/evaluators/base-evaluator.ts
+++ b/src/evaluators/base-evaluator.ts
@@ -9,7 +9,7 @@ import {
   type JudgeLLMResult,
   type CheckLLMResult,
   type JudgeResult,
-  type CheckResult,
+  type RawCheckResult,
   type PromptEvaluationResult,
 } from "../prompts/schema";
 import { registerEvaluator } from "./evaluator-registry";
@@ -22,7 +22,6 @@ import {
   type Chunk,
 } from "../chunking";
 import {
-  calculateCheckScore,
   calculateJudgeScore,
   averageJudgeScores,
 } from "../scoring";
@@ -197,7 +196,7 @@ export class BaseEvaluator implements Evaluator {
   protected async runCheckEvaluation(
     content: string,
     context?: EvalContext
-  ): Promise<CheckResult> {
+  ): Promise<RawCheckResult> {
     const schema = buildCheckLLMSchema();
 
     // Prepend line numbers for deterministic line reporting
@@ -228,22 +227,13 @@ export class BaseEvaluator implements Evaluator {
     // Merge and deduplicate violations
     const mergedViolations = mergeViolations(allChunkViolations);
 
-    // Calculate score once from all violations
-    const result = calculateCheckScore(
-      mergedViolations,
-      totalWordCount,
-      {
-        strictness: this.prompt.meta.strictness,
-        defaultSeverity: this.defaultSeverity,
-        promptSeverity: this.prompt.meta.severity,
-      }
-    );
-
     const aggregatedUsage = this.aggregateUsage(usages);
     const reasoning = chunkReasonings.join(" ").trim() || undefined;
 
     return {
-      ...result,
+      type: EvaluationType.CHECK,
+      violations: mergedViolations,
+      word_count: totalWordCount,
       ...(reasoning && { reasoning }),
       raw_model_output: rawChunkOutputs.length === 1 ? rawChunkOutputs[0] : rawChunkOutputs,
       ...(aggregatedUsage && { usage: aggregatedUsage }),
diff --git a/src/prompts/schema.ts b/src/prompts/schema.ts
index daf65b3..1ccc0e6 100644
--- a/src/prompts/schema.ts
+++ b/src/prompts/schema.ts
@@ -350,7 +350,16 @@ export type CheckResult = {
   raw_model_output?: unknown;
 };
 
-export type PromptEvaluationResult = JudgeResult | CheckResult;
+export type RawCheckResult = {
+  type: typeof EvaluationType.CHECK;
+  violations: CheckResult["violations"];
+  word_count: number;
+  reasoning?: string;
+  usage?: TokenUsage;
+  raw_model_output?: unknown;
+};
+
+export type PromptEvaluationResult = JudgeResult | RawCheckResult;
 
 export function isJudgeResult(
   result: PromptEvaluationResult
@@ -360,6 +369,12 @@ export function isJudgeResult(
 
 export function isCheckResult(
   result: PromptEvaluationResult
-): result is CheckResult {
+): result is RawCheckResult {
+  return result.type === EvaluationType.CHECK;
+}
+
+export function isRawCheckResult(
+  result: PromptEvaluationResult
+): result is RawCheckResult {
   return result.type === EvaluationType.CHECK;
 }
diff --git a/tests/orchestrator-filtering.test.ts b/tests/orchestrator-filtering.test.ts
index 8a740b1..7b249c9 100644
--- a/tests/orchestrator-filtering.test.ts
+++ b/tests/orchestrator-filtering.test.ts
@@ -6,13 +6,13 @@ import { evaluateFiles } from "../src/cli/orchestrator";
 import { OutputFormat, type EvaluationOptions } from "../src/cli/types";
 import { EvaluationType, Severity } from "../src/evaluators/types";
 import type { PromptFile } from "../src/prompts/prompt-loader";
-import type { CheckResult, JudgeResult } from "../src/prompts/schema";
+import type { JudgeResult, RawCheckResult } from "../src/prompts/schema";
 
 const { EVALUATE_MOCK } = vi.hoisted(() => ({
   EVALUATE_MOCK: vi.fn(),
 }));
 
-type CheckViolation = CheckResult["violations"][number];
+type CheckViolation = RawCheckResult["violations"][number];
 type JudgeViolation = JudgeResult["criteria"][number]["violations"][number];
 
 vi.mock("../src/evaluators/index", () => ({
@@ -113,21 +113,13 @@ function makeJudgeViolation(
 }
 
 function makeCheckResult(params: {
-  severity: Severity;
-  finalScore: number;
-  percentage: number;
-  message: string;
   violations: CheckViolation[];
-}): CheckResult {
+  wordCount?: number;
+}): RawCheckResult {
   return {
     type: EvaluationType.CHECK,
-    final_score: params.finalScore,
-    percentage: params.percentage,
-    violation_count: params.violations.length,
-    items: [],
-    severity: params.severity,
-    message: params.message,
     violations: params.violations,
+    word_count: params.wordCount ?? 100,
   };
 }
 
@@ -181,10 +173,6 @@ describe("CLI violation filtering", () => {
 
     EVALUATE_MOCK.mockResolvedValue(
       makeCheckResult({
-        severity: Severity.WARNING,
-        finalScore: 8,
-        percentage: 80,
-        message: "Found issues",
         violations: [
           makeCheckViolation(),
           makeCheckViolation({
@@ -208,10 +196,6 @@ describe("CLI violation filtering", () => {
     process.env.CONFIDENCE_THRESHOLD = "0.0";
     EVALUATE_MOCK.mockResolvedValue(
       makeCheckResult({
-        severity: Severity.WARNING,
-        finalScore: 8,
-        percentage: 80,
-        message: "Found issues",
         violations: [
           makeCheckViolation(),
           makeCheckViolation({
@@ -244,10 +228,6 @@ describe("CLI violation filtering", () => {
 
     EVALUATE_MOCK.mockResolvedValue(
       makeCheckResult({
-        severity: Severity.ERROR,
-        finalScore: 2,
-        percentage: 20,
-        message: "Found issue",
         violations: [
           makeCheckViolation({
             confidence: 0.2,
@@ -266,10 +246,6 @@ describe("CLI violation filtering", () => {
     process.env.CONFIDENCE_THRESHOLD = "0.0";
     EVALUATE_MOCK.mockResolvedValue(
       makeCheckResult({
-        severity: Severity.ERROR,
-        finalScore: 2,
-        percentage: 20,
-        message: "Found issue",
         violations: [
           makeCheckViolation({
             confidence: 0.2,
@@ -286,6 +262,49 @@ describe("CLI violation filtering", () => {
     expect(zeroThresholdRun.hadSeverityErrors).toBe(true);
   });
 
+  it("score reflects only surfaced violations, not filtered-out ones", async () => {
+    // 100-word file: 2 violations from model, 1 fails confidence gate
+    // With default threshold, only 1 violation surfaces
+    // Density: 1/100 * 100 * 10 = 10 penalty → score = 9.0
+    // If bug were present (scoring all 2): 2/100 * 100 * 10 = 20 penalty → score = 8.0
+
+    const content = new Array(100).fill("word").join(" ") + "\n";
+    const targetFile = createTempFile(content);
+
+    const prompt = createPrompt({
+      id: "ScorePrompt",
+      name: "Score Prompt",
+      type: "check",
+      severity: Severity.WARNING,
+    });
+
+    EVALUATE_MOCK.mockResolvedValue(
+      makeCheckResult({
+        violations: [
+          makeCheckViolation({ quoted_text: content.split(" ")[0] ?? "word" }),
+          makeCheckViolation({
+            quoted_text: content.split(" ")[1] ?? "word",
+            confidence: 0.2,  // fails confidence gate — should NOT affect score
+          }),
+        ],
+        wordCount: 100,
+      })
+    );
+
+    const logCalls: string[] = [];
+    vi.spyOn(console, "log").mockImplementation((...args) => {
+      logCalls.push(args.map(String).join(" "));
+    });
+
+    await evaluateFiles([targetFile], createBaseOptions([prompt]));
+
+    // Score should reflect 1 surfaced violation, not 2
+    const scoreLine = logCalls.find(l => l.includes("/10"));
+    expect(scoreLine).toBeDefined();
+    expect(scoreLine).toContain("9.0/10");
+    expect(scoreLine).not.toContain("8.0/10");
+  });
+
   it("filters low-confidence judge violations from CLI counts by default", async () => {
     const targetFile = createTempFile("Alpha text\nBeta text\n");
     const prompt = createPrompt({
diff --git a/tests/scoring-types.test.ts b/tests/scoring-types.test.ts
index ce90055..7e9e018 100644
--- a/tests/scoring-types.test.ts
+++ b/tests/scoring-types.test.ts
@@ -128,10 +128,9 @@ describe("Scoring Types", () => {
       if (result.type !== EvaluationType.CHECK)
         throw new Error("Wrong result type");
 
-      // Calculation: 2 violations = score of 8 (10 - 2)
-      expect(result.final_score).toBe(8.0);
-      expect(result.percentage).toBe(80);
-      expect(result.violation_count).toBe(2);
+      // Evaluator now returns raw violations and word count — scoring deferred to orchestrator
+      expect(result.violations).toHaveLength(2);
+      expect(result.word_count).toBe(100);
     });
 
     it("should handle empty violations list (perfect score)", async () => {
@@ -152,10 +151,8 @@ describe("Scoring Types", () => {
       if (result.type !== EvaluationType.CHECK)
         throw new Error("Wrong result type");
 
-      // No violations = perfect score
-      expect(result.final_score).toBe(10);
-      expect(result.percentage).toBe(100);
-      expect(result.violation_count).toBe(0);
+      expect(result.violations).toHaveLength(0);
+      expect(result.word_count).toBeGreaterThan(0);
     });
   });
 
@@ -201,8 +198,8 @@ describe("Scoring Types", () => {
 
       if (result.type !== EvaluationType.CHECK)
         throw new Error("Wrong result type");
-      expect(result.final_score).toBe(10);
-      expect(result.items).toEqual([]);
+      expect(result.violations).toHaveLength(0);
+      expect(result.word_count).toBeGreaterThan(0);
     });
   });
 });

From 45e69053c9ee7777c48d26d5ea6c092c5803acc5 Mon Sep 17 00:00:00 2001
From: Osho Emmanuel <oshoklinsmann@gmail.com>
Date: Wed, 4 Mar 2026 14:41:55 +0100
Subject: [PATCH 2/3] refactor(scoring): respect explicit promptSeverity over
 defaultSeverity

- For scores below 10, promptSeverity !== undefined now always wins over
  defaultSeverity; previously only Severity.ERROR was treated as
  authoritative, so an explicit 'severity: warning' in rule frontmatter
  was silently overridden by DefaultSeverity=error from config
- Remove unused isRawCheckResult type guard (duplicate of isCheckResult)
---
 src/prompts/schema.ts | 6 +-----
 src/scoring/scorer.ts | 4 ++--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/prompts/schema.ts b/src/prompts/schema.ts
index 1ccc0e6..593bda5 100644
--- a/src/prompts/schema.ts
+++ b/src/prompts/schema.ts
@@ -373,8 +373,4 @@ export function isCheckResult(
   return result.type === EvaluationType.CHECK;
 }
 
-export function isRawCheckResult(
-  result: PromptEvaluationResult
-): result is RawCheckResult {
-  return result.type === EvaluationType.CHECK;
-}
+
diff --git a/src/scoring/scorer.ts b/src/scoring/scorer.ts
index b36bd1d..d5f9648 100644
--- a/src/scoring/scorer.ts
+++ b/src/scoring/scorer.ts
@@ -66,8 +66,8 @@ export function calculateCheckScore(
     Severity.WARNING;
 
   if (finalScore < 10) {
-    if (options.promptSeverity === Severity.ERROR) {
-      severity = Severity.ERROR;
+    if (options.promptSeverity !== undefined) {
+      severity = options.promptSeverity as typeof Severity.WARNING | typeof Severity.ERROR;
     } else if (options.defaultSeverity) {
       severity = options.defaultSeverity;
     }

From a287fb84a8fe30e1777c6c58de79245101324b12 Mon Sep 17 00:00:00 2001
From: Osho Emmanuel <oshoklinsmann@gmail.com>
Date: Wed, 4 Mar 2026 14:44:38 +0100
Subject: [PATCH 3/3] refactor(scoring): tighten promptSeverity type, remove
 unsafe cast

- Remove | string from CheckScoringOptions.promptSeverity; callers pass
  meta.severity which is Zod-validated as nativeEnum(Severity) at the
  config boundary, so the string widening was unnecessary
- Drop the as-cast that was required to compensate
---
 src/scoring/scorer.ts | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/scoring/scorer.ts b/src/scoring/scorer.ts
index d5f9648..ecea0db 100644
--- a/src/scoring/scorer.ts
+++ b/src/scoring/scorer.ts
@@ -10,11 +10,7 @@ export interface CheckScoringOptions {
   // Strictness factor. Higher = more penalty per violation.
   strictness?: number | "lenient" | "strict" | "standard" | undefined;
   defaultSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined;
-  promptSeverity?:
-  | typeof Severity.WARNING
-  | typeof Severity.ERROR
-  | string
-  | undefined;
+  promptSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined;
 }
 
 export interface JudgeScoringOptions {
@@ -67,7 +63,7 @@ export function calculateCheckScore(
 
   if (finalScore < 10) {
     if (options.promptSeverity !== undefined) {
-      severity = options.promptSeverity as typeof Severity.WARNING | typeof Severity.ERROR;
+      severity = options.promptSeverity;
     } else if (options.defaultSeverity) {
       severity = options.defaultSeverity;
     }