Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions src/cli/orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
calculateCost,
TokenUsageStats
} from '../providers/token-usage';
import { calculateCheckScore } from '../scoring';
import { locateQuotedText } from "../output/location";
import {
computeFilterDecision,
Expand Down Expand Up @@ -615,12 +616,22 @@ function routePromptResult(

// Handle Check Result
if (!isJudgeResult(result)) {
const severity = result.severity;
const { decisions, surfacedViolations } = getViolationFilterResults(
result.violations
);
const violationCount = surfacedViolations.length;

// Score calculated from surfaced violations only — matches what user sees
const scored = calculateCheckScore(
surfacedViolations,
result.word_count,
{
strictness: promptFile.meta.strictness,
promptSeverity: promptFile.meta.severity,
}
);
const severity = scored.severity;

// Group violations by criterionName
const violationsByCriterion = new Map<
string | undefined,
Expand Down Expand Up @@ -671,14 +682,14 @@ function routePromptResult(
}

// If no violations but we have a message (JSON output), report it
if (violationCount === 0 && (outputFormat === OutputFormat.Json || outputFormat === OutputFormat.ValeJson) && result.message) {
if (violationCount === 0 && (outputFormat === OutputFormat.Json || outputFormat === OutputFormat.ValeJson) && scored.message) {
const ruleName = buildRuleName(promptFile.pack, promptId, undefined);
reportIssue({
file: relFile,
line: 1,
column: 1,
severity,
summary: result.message,
summary: scored.message,
ruleName,
outputFormat,
jsonFormatter,
Expand All @@ -689,8 +700,8 @@ function routePromptResult(
// Create scoreEntry for Quality Scores display
const scoreEntry: EvaluationSummary = {
id: buildRuleName(promptFile.pack, promptId, undefined),
scoreText: `${result.final_score.toFixed(1)}/10`,
score: result.final_score,
scoreText: `${scored.final_score.toFixed(1)}/10`,
score: scored.final_score,
};

if (debugJson) {
Expand Down
17 changes: 7 additions & 10 deletions src/evaluators/accuracy-evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@ import { registerEvaluator } from "./evaluator-registry";
import type { LLMProvider } from "../providers/llm-provider";
import type { SearchProvider } from "../providers/search-provider";
import type { PromptFile } from "../schemas/prompt-schemas";
import type { PromptEvaluationResult } from "../prompts/schema";
import type { PromptEvaluationResult, RawCheckResult } from "../prompts/schema";
import type { TokenUsage } from "../providers/token-usage";
import { renderTemplate } from "../prompts/template-renderer";
import { getPrompt } from "./prompt-loader";
import { z } from "zod";
import { Type, type Severity } from "./types";
import { Type, EvaluationType, type Severity } from "./types";
import { MissingDependencyError } from "../errors/index";
import { calculateCheckScore } from "../scoring/scorer";
import { countWords } from "../chunking";

// Schema for claim extraction response
Expand Down Expand Up @@ -65,15 +64,13 @@ export class TechnicalAccuracyEvaluator extends BaseEvaluator {
// No claims extracted: return a raw, unscored check result with an empty violations list
if (claims.length === 0) {
const wordCount = countWords(content) || 1;
const result = calculateCheckScore([], wordCount, {
strictness: this.prompt.meta.strictness,
defaultSeverity: this.defaultSeverity,
promptSeverity: this.prompt.meta.severity,
});
return {
...result,
const raw: RawCheckResult = {
type: EvaluationType.CHECK,
violations: [],
word_count: wordCount,
...(claimUsage && { usage: claimUsage }),
};
return raw;
}

// Step 2: Search for evidence for each claim
Expand Down
20 changes: 5 additions & 15 deletions src/evaluators/base-evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {
type JudgeLLMResult,
type CheckLLMResult,
type JudgeResult,
type CheckResult,
type RawCheckResult,
type PromptEvaluationResult,
} from "../prompts/schema";
import { registerEvaluator } from "./evaluator-registry";
Expand All @@ -22,7 +22,6 @@ import {
type Chunk,
} from "../chunking";
import {
calculateCheckScore,
calculateJudgeScore,
averageJudgeScores,
} from "../scoring";
Expand Down Expand Up @@ -197,7 +196,7 @@ export class BaseEvaluator implements Evaluator {
protected async runCheckEvaluation(
content: string,
context?: EvalContext
): Promise<CheckResult> {
): Promise<RawCheckResult> {
const schema = buildCheckLLMSchema();

// Prepend line numbers for deterministic line reporting
Expand Down Expand Up @@ -228,22 +227,13 @@ export class BaseEvaluator implements Evaluator {
// Merge and deduplicate violations
const mergedViolations = mergeViolations(allChunkViolations);

// Calculate score once from all violations
const result = calculateCheckScore(
mergedViolations,
totalWordCount,
{
strictness: this.prompt.meta.strictness,
defaultSeverity: this.defaultSeverity,
promptSeverity: this.prompt.meta.severity,
}
);

const aggregatedUsage = this.aggregateUsage(usages);
const reasoning = chunkReasonings.join(" ").trim() || undefined;

return {
...result,
type: EvaluationType.CHECK,
violations: mergedViolations,
word_count: totalWordCount,
...(reasoning && { reasoning }),
raw_model_output: rawChunkOutputs.length === 1 ? rawChunkOutputs[0] : rawChunkOutputs,
...(aggregatedUsage && { usage: aggregatedUsage }),
Expand Down
15 changes: 13 additions & 2 deletions src/prompts/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,16 @@ export type CheckResult = {
raw_model_output?: unknown;
};

export type PromptEvaluationResult = JudgeResult | CheckResult;
/**
 * Unscored result of a check evaluation.
 *
 * Carries the violations and the word count, but none of the derived score
 * fields (final score, severity, message). Consumers compute those from
 * `violations` and `word_count` when they need them.
 */
export type RawCheckResult = {
// Discriminant tag; `isCheckResult` compares this against EvaluationType.CHECK.
type: typeof EvaluationType.CHECK;
// Violations in the same shape as `CheckResult["violations"]`.
violations: CheckResult["violations"];
// Word count of the evaluated content; used together with the violations when scoring.
word_count: number;
// Optional free-text reasoning attached to the result.
reasoning?: string;
// Optional token usage recorded for the evaluation.
usage?: TokenUsage;
// Optional unparsed model output; typed `unknown`, so narrow before use.
raw_model_output?: unknown;
};

export type PromptEvaluationResult = JudgeResult | RawCheckResult;

export function isJudgeResult(
result: PromptEvaluationResult
Expand All @@ -360,6 +369,8 @@ export function isJudgeResult(

export function isCheckResult(
result: PromptEvaluationResult
): result is CheckResult {
): result is RawCheckResult {
return result.type === EvaluationType.CHECK;
}


10 changes: 3 additions & 7 deletions src/scoring/scorer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@ export interface CheckScoringOptions {
// Strictness factor. Higher = more penalty per violation.
strictness?: number | "lenient" | "strict" | "standard" | undefined;
defaultSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined;
promptSeverity?:
| typeof Severity.WARNING
| typeof Severity.ERROR
| string
| undefined;
promptSeverity?: typeof Severity.WARNING | typeof Severity.ERROR | undefined;
}

export interface JudgeScoringOptions {
Expand Down Expand Up @@ -66,8 +62,8 @@ export function calculateCheckScore(
Severity.WARNING;

if (finalScore < 10) {
if (options.promptSeverity === Severity.ERROR) {
severity = Severity.ERROR;
if (options.promptSeverity !== undefined) {
severity = options.promptSeverity;
} else if (options.defaultSeverity) {
severity = options.defaultSeverity;
}
Expand Down
77 changes: 48 additions & 29 deletions tests/orchestrator-filtering.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import { evaluateFiles } from "../src/cli/orchestrator";
import { OutputFormat, type EvaluationOptions } from "../src/cli/types";
import { EvaluationType, Severity } from "../src/evaluators/types";
import type { PromptFile } from "../src/prompts/prompt-loader";
import type { CheckResult, JudgeResult } from "../src/prompts/schema";
import type { JudgeResult, RawCheckResult } from "../src/prompts/schema";

const { EVALUATE_MOCK } = vi.hoisted(() => ({
EVALUATE_MOCK: vi.fn(),
}));

type CheckViolation = CheckResult["violations"][number];
type CheckViolation = RawCheckResult["violations"][number];
type JudgeViolation = JudgeResult["criteria"][number]["violations"][number];

vi.mock("../src/evaluators/index", () => ({
Expand Down Expand Up @@ -113,21 +113,13 @@ function makeJudgeViolation(
}

function makeCheckResult(params: {
severity: Severity;
finalScore: number;
percentage: number;
message: string;
violations: CheckViolation[];
}): CheckResult {
wordCount?: number;
}): RawCheckResult {
return {
type: EvaluationType.CHECK,
final_score: params.finalScore,
percentage: params.percentage,
violation_count: params.violations.length,
items: [],
severity: params.severity,
message: params.message,
violations: params.violations,
word_count: params.wordCount ?? 100,
};
}

Expand Down Expand Up @@ -181,10 +173,6 @@ describe("CLI violation filtering", () => {

EVALUATE_MOCK.mockResolvedValue(
makeCheckResult({
severity: Severity.WARNING,
finalScore: 8,
percentage: 80,
message: "Found issues",
violations: [
makeCheckViolation(),
makeCheckViolation({
Expand All @@ -208,10 +196,6 @@ describe("CLI violation filtering", () => {
process.env.CONFIDENCE_THRESHOLD = "0.0";
EVALUATE_MOCK.mockResolvedValue(
makeCheckResult({
severity: Severity.WARNING,
finalScore: 8,
percentage: 80,
message: "Found issues",
violations: [
makeCheckViolation(),
makeCheckViolation({
Expand Down Expand Up @@ -244,10 +228,6 @@ describe("CLI violation filtering", () => {

EVALUATE_MOCK.mockResolvedValue(
makeCheckResult({
severity: Severity.ERROR,
finalScore: 2,
percentage: 20,
message: "Found issue",
violations: [
makeCheckViolation({
confidence: 0.2,
Expand All @@ -266,10 +246,6 @@ describe("CLI violation filtering", () => {
process.env.CONFIDENCE_THRESHOLD = "0.0";
EVALUATE_MOCK.mockResolvedValue(
makeCheckResult({
severity: Severity.ERROR,
finalScore: 2,
percentage: 20,
message: "Found issue",
violations: [
makeCheckViolation({
confidence: 0.2,
Expand All @@ -286,6 +262,49 @@ describe("CLI violation filtering", () => {
expect(zeroThresholdRun.hadSeverityErrors).toBe(true);
});

it("score reflects only surfaced violations, not filtered-out ones", async () => {
// 100-word file: 2 violations from model, 1 fails confidence gate
// With default threshold, only 1 violation surfaces
// Density: 1/100 * 100 * 10 = 10 penalty → score = 9.0
// If bug were present (scoring all 2): 2/100 * 100 * 10 = 20 penalty → score = 8.0

const content = new Array(100).fill("word").join(" ") + "\n";
const targetFile = createTempFile(content);

const prompt = createPrompt({
id: "ScorePrompt",
name: "Score Prompt",
type: "check",
severity: Severity.WARNING,
});

EVALUATE_MOCK.mockResolvedValue(
makeCheckResult({
violations: [
makeCheckViolation({ quoted_text: content.split(" ")[0] ?? "word" }),
makeCheckViolation({
quoted_text: content.split(" ")[1] ?? "word",
confidence: 0.2, // fails confidence gate — should NOT affect score
}),
],
wordCount: 100,
})
);

const logCalls: string[] = [];
vi.spyOn(console, "log").mockImplementation((...args) => {
logCalls.push(args.map(String).join(" "));
});

await evaluateFiles([targetFile], createBaseOptions([prompt]));

// Score should reflect 1 surfaced violation, not 2
const scoreLine = logCalls.find(l => l.includes("/10"));
expect(scoreLine).toBeDefined();
expect(scoreLine).toContain("9.0/10");
expect(scoreLine).not.toContain("8.0/10");
});

it("filters low-confidence judge violations from CLI counts by default", async () => {
const targetFile = createTempFile("Alpha text\nBeta text\n");
const prompt = createPrompt({
Expand Down
17 changes: 7 additions & 10 deletions tests/scoring-types.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,9 @@ describe("Scoring Types", () => {
if (result.type !== EvaluationType.CHECK)
throw new Error("Wrong result type");

// Calculation: 2 violations = score of 8 (10 - 2)
expect(result.final_score).toBe(8.0);
expect(result.percentage).toBe(80);
expect(result.violation_count).toBe(2);
// Evaluator now returns raw violations and word count — scoring deferred to orchestrator
expect(result.violations).toHaveLength(2);
expect(result.word_count).toBe(100);
});

it("should handle empty violations list (perfect score)", async () => {
Expand All @@ -152,10 +151,8 @@ describe("Scoring Types", () => {
if (result.type !== EvaluationType.CHECK)
throw new Error("Wrong result type");

// No violations = perfect score
expect(result.final_score).toBe(10);
expect(result.percentage).toBe(100);
expect(result.violation_count).toBe(0);
expect(result.violations).toHaveLength(0);
expect(result.word_count).toBeGreaterThan(0);
});
});

Expand Down Expand Up @@ -201,8 +198,8 @@ describe("Scoring Types", () => {

if (result.type !== EvaluationType.CHECK)
throw new Error("Wrong result type");
expect(result.final_score).toBe(10);
expect(result.items).toEqual([]);
expect(result.violations).toHaveLength(0);
expect(result.word_count).toBeGreaterThan(0);
});
});
});