From 53fcdc5435cde06d0f4e7bb027c0ec4fc325ffa1 Mon Sep 17 00:00:00 2001 From: Osho Emmanuel Date: Wed, 4 Mar 2026 14:16:32 +0100 Subject: [PATCH 1/3] refactor(scoring): score check rules from surfaced violations only - Introduce RawCheckResult type: violations + word_count, no score fields - Evaluators (base, accuracy) return RawCheckResult; scoring deferred - Orchestrator calls calculateCheckScore after gate filtering, so only violations surfaced to the user contribute to the density penalty - Add regression test: 1 filtered violation must not move score from 9.0 to 8.0 --- src/cli/orchestrator.ts | 21 ++++++-- src/evaluators/accuracy-evaluator.ts | 17 +++--- src/evaluators/base-evaluator.ts | 20 ++------ src/prompts/schema.ts | 19 ++++++- tests/orchestrator-filtering.test.ts | 77 +++++++++++++++++----------- tests/scoring-types.test.ts | 17 +++--- 6 files changed, 100 insertions(+), 71 deletions(-) diff --git a/src/cli/orchestrator.ts b/src/cli/orchestrator.ts index 4bc4828..da50d4a 100644 --- a/src/cli/orchestrator.ts +++ b/src/cli/orchestrator.ts @@ -25,6 +25,7 @@ import { calculateCost, TokenUsageStats } from '../providers/token-usage'; +import { calculateCheckScore } from '../scoring'; import { locateQuotedText } from "../output/location"; import { computeFilterDecision, @@ -615,12 +616,22 @@ function routePromptResult( // Handle Check Result if (!isJudgeResult(result)) { - const severity = result.severity; const { decisions, surfacedViolations } = getViolationFilterResults( result.violations ); const violationCount = surfacedViolations.length; + // Score calculated from surfaced violations only — matches what user sees + const scored = calculateCheckScore( + surfacedViolations, + result.word_count, + { + strictness: promptFile.meta.strictness, + promptSeverity: promptFile.meta.severity, + } + ); + const severity = scored.severity; + // Group violations by criterionName const violationsByCriterion = new Map< string | undefined, @@ -671,14 +682,14 @@ function routePromptResult( } // If no violations but we have a message (JSON output), report it - if (violationCount === 0 && (outputFormat === OutputFormat.Json || outputFormat === OutputFormat.ValeJson) && result.message) { + if (violationCount === 0 && (outputFormat === OutputFormat.Json || outputFormat === OutputFormat.ValeJson) && scored.message) { const ruleName = buildRuleName(promptFile.pack, promptId, undefined); reportIssue({ file: relFile, line: 1, column: 1, severity, - summary: result.message, + summary: scored.message, ruleName, outputFormat, jsonFormatter, @@ -689,8 +700,8 @@ function routePromptResult( // Create scoreEntry for Quality Scores display const scoreEntry: EvaluationSummary = { id: buildRuleName(promptFile.pack, promptId, undefined), - scoreText: `${result.final_score.toFixed(1)}/10`, - score: result.final_score, + scoreText: `${scored.final_score.toFixed(1)}/10`, + score: scored.final_score, }; if (debugJson) { diff --git a/src/evaluators/accuracy-evaluator.ts b/src/evaluators/accuracy-evaluator.ts index c20e517..9a27b9f 100644 --- a/src/evaluators/accuracy-evaluator.ts +++ b/src/evaluators/accuracy-evaluator.ts @@ -3,14 +3,13 @@ import { registerEvaluator } from "./evaluator-registry"; import type { LLMProvider } from "../providers/llm-provider"; import type { SearchProvider } from "../providers/search-provider"; import type { PromptFile } from "../schemas/prompt-schemas"; -import type { PromptEvaluationResult } from "../prompts/schema"; +import type { PromptEvaluationResult, RawCheckResult } from "../prompts/schema"; import type { TokenUsage } from "../providers/token-usage"; import { renderTemplate } from "../prompts/template-renderer"; import { getPrompt } from "./prompt-loader"; import { z } from "zod"; -import { Type, type Severity } from "./types"; +import { Type, EvaluationType, type Severity } from "./types"; import { MissingDependencyError } from "../errors/index"; -import { calculateCheckScore } from "../scoring/scorer"; import { countWords } from "../chunking"; // Schema for claim extraction response @@ -65,15 +64,13 @@ export class TechnicalAccuracyEvaluator extends BaseEvaluator { // Use the scoring module to calculate result if (claims.length === 0) { const wordCount = countWords(content) || 1; - const result = calculateCheckScore([], wordCount, { - strictness: this.prompt.meta.strictness, - defaultSeverity: this.defaultSeverity, - promptSeverity: this.prompt.meta.severity, - }); - return { - ...result, + const raw: RawCheckResult = { + type: EvaluationType.CHECK, + violations: [], + word_count: wordCount, ...(claimUsage && { usage: claimUsage }), }; + return raw; } // Step 2: Search for evidence for each claim diff --git a/src/evaluators/base-evaluator.ts b/src/evaluators/base-evaluator.ts index 1142dd0..4b84c39 100644 --- a/src/evaluators/base-evaluator.ts +++ b/src/evaluators/base-evaluator.ts @@ -9,7 +9,7 @@ import { type JudgeLLMResult, type CheckLLMResult, type JudgeResult, - type CheckResult, + type RawCheckResult, type PromptEvaluationResult, } from "../prompts/schema"; import { registerEvaluator } from "./evaluator-registry"; @@ -22,7 +22,6 @@ import { type Chunk, } from "../chunking"; import { - calculateCheckScore, calculateJudgeScore, averageJudgeScores, } from "../scoring"; @@ -197,7 +196,7 @@ export class BaseEvaluator implements Evaluator { protected async runCheckEvaluation( content: string, context?: EvalContext - ): Promise { + ): Promise { const schema = buildCheckLLMSchema(); // Prepend line numbers for deterministic line reporting @@ -228,22 +227,13 @@ export class BaseEvaluator implements Evaluator { // Merge and deduplicate violations const mergedViolations = mergeViolations(allChunkViolations); - // Calculate score once from all violations - const result = calculateCheckScore( - mergedViolations, - totalWordCount, - { - strictness: this.prompt.meta.strictness, - defaultSeverity: this.defaultSeverity, - promptSeverity: this.prompt.meta.severity, - } - ); - const aggregatedUsage = this.aggregateUsage(usages); const reasoning = chunkReasonings.join(" ").trim() || undefined; return { - ...result, + type: EvaluationType.CHECK, + violations: mergedViolations, + word_count: totalWordCount, ...(reasoning && { reasoning }), raw_model_output: rawChunkOutputs.length === 1 ? rawChunkOutputs[0] : rawChunkOutputs, ...(aggregatedUsage && { usage: aggregatedUsage }), diff --git a/src/prompts/schema.ts b/src/prompts/schema.ts index daf65b3..1ccc0e6 100644 --- a/src/prompts/schema.ts +++ b/src/prompts/schema.ts @@ -350,7 +350,16 @@ export type CheckResult = { raw_model_output?: unknown; }; -export type PromptEvaluationResult = JudgeResult | CheckResult; +export type RawCheckResult = { + type: typeof EvaluationType.CHECK; + violations: CheckResult["violations"]; + word_count: number; + reasoning?: string; + usage?: TokenUsage; + raw_model_output?: unknown; +}; + +export type PromptEvaluationResult = JudgeResult | RawCheckResult; export function isJudgeResult( result: PromptEvaluationResult @@ -360,6 +369,12 @@ export function isJudgeResult( export function isCheckResult( result: PromptEvaluationResult -): result is CheckResult { +): result is RawCheckResult { + return result.type === EvaluationType.CHECK; +} + +export function isRawCheckResult( + result: PromptEvaluationResult +): result is RawCheckResult { return result.type === EvaluationType.CHECK; } diff --git a/tests/orchestrator-filtering.test.ts b/tests/orchestrator-filtering.test.ts index 8a740b1..7b249c9 100644 --- a/tests/orchestrator-filtering.test.ts +++ b/tests/orchestrator-filtering.test.ts @@ -6,13 +6,13 @@ import { evaluateFiles } from "../src/cli/orchestrator"; import { OutputFormat, type EvaluationOptions } from "../src/cli/types"; import { EvaluationType, Severity } from "../src/evaluators/types"; import type { PromptFile } from "../src/prompts/prompt-loader"; -import type { CheckResult, JudgeResult } from "../src/prompts/schema"; +import type { JudgeResult, RawCheckResult } from "../src/prompts/schema"; const { EVALUATE_MOCK } = vi.hoisted(() => ({ EVALUATE_MOCK: vi.fn(), })); -type CheckViolation = CheckResult["violations"][number]; +type CheckViolation = RawCheckResult["violations"][number]; type JudgeViolation = JudgeResult["criteria"][number]["violations"][number]; vi.mock("../src/evaluators/index", () => ({ @@ -113,21 +113,13 @@ function makeJudgeViolation( } function makeCheckResult(params: { - severity: Severity; - finalScore: number; - percentage: number; - message: string; violations: CheckViolation[]; -}): CheckResult { + wordCount?: number; +}): RawCheckResult { return { type: EvaluationType.CHECK, - final_score: params.finalScore, - percentage: params.percentage, - violation_count: params.violations.length, - items: [], - severity: params.severity, - message: params.message, violations: params.violations, + word_count: params.wordCount ?? 100, }; } @@ -181,10 +173,6 @@ describe("CLI violation filtering", () => { EVALUATE_MOCK.mockResolvedValue( makeCheckResult({ - severity: Severity.WARNING, - finalScore: 8, - percentage: 80, - message: "Found issues", violations: [ makeCheckViolation(), makeCheckViolation({ @@ -208,10 +196,6 @@ describe("CLI violation filtering", () => { process.env.CONFIDENCE_THRESHOLD = "0.0"; EVALUATE_MOCK.mockResolvedValue( makeCheckResult({ - severity: Severity.WARNING, - finalScore: 8, - percentage: 80, - message: "Found issues", violations: [ makeCheckViolation(), makeCheckViolation({ @@ -244,10 +228,6 @@ describe("CLI violation filtering", () => { EVALUATE_MOCK.mockResolvedValue( makeCheckResult({ - severity: Severity.ERROR, - finalScore: 2, - percentage: 20, - message: "Found issue", violations: [ makeCheckViolation({ confidence: 0.2, @@ -266,10 +246,6 @@ describe("CLI violation filtering", () => { process.env.CONFIDENCE_THRESHOLD = "0.0"; EVALUATE_MOCK.mockResolvedValue( makeCheckResult({ - severity: Severity.ERROR, - finalScore: 2, - percentage: 20, - message: "Found issue", violations: [ makeCheckViolation({ confidence: 0.2, @@ -286,6 +262,49 @@ describe("CLI violation filtering", () => { expect(zeroThresholdRun.hadSeverityErrors).toBe(true); }); + it("score reflects only surfaced violations, not filtered-out ones", async () => { + // 100-word file: 2 violations from model, 1 fails confidence gate + // With default threshold, only 1 violation surfaces + // Density: 1/100 * 100 * 10 = 10 penalty → score = 9.0 + // If bug were present (scoring all 2): 2/100 * 100 * 10 = 20 penalty → score = 8.0 + + const content = new Array(100).fill("word").join(" ") + "\n"; + const targetFile = createTempFile(content); + + const prompt = createPrompt({ + id: "ScorePrompt", + name: "Score Prompt", + type: "check", + severity: Severity.WARNING, + }); + + EVALUATE_MOCK.mockResolvedValue( + makeCheckResult({ + violations: [ + makeCheckViolation({ quoted_text: content.split(" ")[0] ?? "word" }), + makeCheckViolation({ + quoted_text: content.split(" ")[1] ?? "word", + confidence: 0.2, // fails confidence gate — should NOT affect score + }), + ], + wordCount: 100, + }) + ); + + const logCalls: string[] = []; + vi.spyOn(console, "log").mockImplementation((...args) => { + logCalls.push(args.map(String).join(" ")); + }); + + await evaluateFiles([targetFile], createBaseOptions([prompt])); + + // Score should reflect 1 surfaced violation, not 2 + const scoreLine = logCalls.find(l => l.includes("/10")); + expect(scoreLine).toBeDefined(); + expect(scoreLine).toContain("9.0/10"); + expect(scoreLine).not.toContain("8.0/10"); + }); + it("filters low-confidence judge violations from CLI counts by default", async () => { const targetFile = createTempFile("Alpha text\nBeta text\n"); const prompt = createPrompt({ diff --git a/tests/scoring-types.test.ts b/tests/scoring-types.test.ts index ce90055..7e9e018 100644 --- a/tests/scoring-types.test.ts +++ b/tests/scoring-types.test.ts @@ -128,10 +128,9 @@ describe("Scoring Types", () => { if (result.type !== EvaluationType.CHECK) throw new Error("Wrong result type"); - // Calculation: 2 violations = score of 8 (10 - 2) - expect(result.final_score).toBe(8.0); - expect(result.percentage).toBe(80); - expect(result.violation_count).toBe(2); + // Evaluator now returns raw violations and word count — scoring deferred to orchestrator + expect(result.violations).toHaveLength(2); + expect(result.word_count).toBe(100); }); it("should handle empty violations list (perfect score)", async () => { @@ -152,10 +151,8 @@ describe("Scoring Types", () => { if (result.type !== EvaluationType.CHECK) throw new Error("Wrong result type"); - // No violations = perfect score - expect(result.final_score).toBe(10); - expect(result.percentage).toBe(100); - expect(result.violation_count).toBe(0); + expect(result.violations).toHaveLength(0); + expect(result.word_count).toBeGreaterThan(0); }); }); @@ -201,8 +198,8 @@ describe("Scoring Types", () => { if (result.type !== EvaluationType.CHECK) throw new Error("Wrong result type"); - expect(result.final_score).toBe(10); - expect(result.items).toEqual([]); + expect(result.violations).toHaveLength(0); + expect(result.word_count).toBeGreaterThan(0); }); }); }); From 45e69053c9ee7777c48d26d5ea6c092c5803acc5 Mon Sep 17 00:00:00 2001 From: Osho Emmanuel Date: Wed, 4 Mar 2026 14:41:55 +0100 Subject: [PATCH 2/3] refactor(scoring): respect explicit promptSeverity over defaultSeverity - promptSeverity !== undefined now wins unconditionally; previously only Severity.ERROR was treated as authoritative, so an explicit 'severity: warning' in rule frontmatter was silently overridden by DefaultSeverity=error from config - Remove unused isRawCheckResult type guard (duplicate of isCheckResult) --- src/prompts/schema.ts | 6 +----- src/scoring/scorer.ts | 4 ++-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/prompts/schema.ts b/src/prompts/schema.ts index 1ccc0e6..593bda5 100644 --- a/src/prompts/schema.ts +++ b/src/prompts/schema.ts @@ -373,8 +373,4 @@ export function isCheckResult( return result.type === EvaluationType.CHECK; } -export function isRawCheckResult( - result: PromptEvaluationResult -): result is RawCheckResult { - return result.type === EvaluationType.CHECK; -} + diff --git a/src/scoring/scorer.ts b/src/scoring/scorer.ts index b36bd1d..d5f9648 100644 --- a/src/scoring/scorer.ts +++ b/src/scoring/scorer.ts @@ -66,8 +66,8 @@ export function calculateCheckScore( Severity.WARNING; if (finalScore < 10) { - if (options.promptSeverity === Severity.ERROR) { - severity = Severity.ERROR; + if (options.promptSeverity !== undefined) { + severity = options.promptSeverity as typeof Severity.WARNING | typeof Severity.ERROR; } else if (options.defaultSeverity) { severity = options.defaultSeverity; } From a287fb84a8fe30e1777c6c58de79245101324b12 Mon Sep 17 00:00:00 2001 From: Osho Emmanuel Date: Wed, 4 Mar 2026 14:44:38 +0100 Subject: [PATCH 3/3] refactor(scoring): tighten promptSeverity type, remove unsafe cast - Remove | string from CheckScoringOptions.promptSeverity; callers pass meta.severity which is Zod-validated as nativeEnum(Severity) at the config boundary, so the string widening was unnecessary - Drop the as-cast that was required to compensate --- src/scoring/scorer.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/scoring/scorer.ts b/src/scoring/scorer.ts index d5f9648..ecea0db 100644 --- a/src/scoring/scorer.ts +++ b/src/scoring/scorer.ts @@ -10,11 +10,7 @@ export interface CheckScoringOptions { // Strictness factor. Higher = more penalty per violation. strictness?: number | "lenient" | "strict" | "standard" | undefined; defaultSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined; - promptSeverity?: - | typeof Severity.WARNING - | typeof Severity.ERROR - | string - | undefined; + promptSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined; } export interface JudgeScoringOptions { @@ -67,7 +63,7 @@ export function calculateCheckScore( if (finalScore < 10) { if (options.promptSeverity !== undefined) { - severity = options.promptSeverity as typeof Severity.WARNING | typeof Severity.ERROR; + severity = options.promptSeverity; } else if (options.defaultSeverity) { severity = options.defaultSeverity; }