Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion scripts/lib_grading.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,8 @@ def _build_judge_prompt(task: Task, transcript_summary: str, rubric: str) -> str
f"{transcript_summary}\n\n"
"## Grading Rubric\n"
f"{rubric}\n\n"
"Score each criterion from 0.0 to 1.0.\n\n"
"Score each criterion from 0.0 to 1.0.\n"
'The "total" field must also be between 0.0 and 1.0, and it must be the arithmetic mean of the criterion scores, not their sum.\n\n'
"Respond with ONLY this JSON structure (no markdown, no code fences, no extra text):\n"
'{"scores": {"criterion_name": 0.0}, "total": 0.0, "notes": "brief justification"}'
)
Expand Down Expand Up @@ -428,6 +429,17 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:
values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
if values:
result["total"] = sum(values) / len(values)

# Some judge models return a summed total across criteria even though each
# criterion is scored on a 0..1 scale. Normalize that back to a 0..1 mean.
values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
if (
values
and result["total"] is not None
and result["total"] > 1.0
and all(0.0 <= float(v) <= 1.0 for v in values)
):
result["total"] = sum(values) / len(values)

# Extract notes/justification
if "notes" in parsed:
Expand Down
64 changes: 64 additions & 0 deletions tests/test_lib_grading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from __future__ import annotations

import sys
import unittest
from pathlib import Path


# Make the scripts/ directory importable so `lib_grading` resolves when the
# tests are run from the repository root — there is no installed package, so
# we prepend the path manually (guarded to avoid duplicate entries).
ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = ROOT / "scripts"
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

# NOTE: this import must come after the sys.path mutation above.
from lib_grading import _combine_grades, _normalize_judge_response, GradeResult


class JudgeNormalizationTests(unittest.TestCase):
    """Cover judge-response normalization and the hybrid grade combination."""

    def test_normalize_judge_response_averages_summed_total_when_breakdown_is_unit_scale(
        self,
    ) -> None:
        # Five criteria, each on a 0..1 scale, whose reported "total" (3.85)
        # is the sum of the criteria rather than their mean (0.77).
        criterion_scores = {
            "coverage": 0.75,
            "synthesis": 0.75,
            "structure": 0.75,
            "tone": 0.8,
            "conciseness": 0.8,
        }
        raw_response = {
            "scores": criterion_scores,
            "total": 3.85,
            "notes": "Summed by mistake",
        }

        result = _normalize_judge_response(raw_response)

        # The normalizer must rescale the summed total back to the mean.
        self.assertAlmostEqual(result["total"], 0.77)

    def test_hybrid_score_uses_normalized_judge_total(self) -> None:
        automated_grade = GradeResult(
            task_id="task_16_email_triage",
            score=0.7062937062937062,
            max_score=1.0,
            grading_type="automated",
            breakdown={},
            notes="",
        )
        judge_grade = GradeResult(
            task_id="task_16_email_triage",
            score=0.87,
            max_score=1.0,
            grading_type="llm_judge",
            breakdown={},
            notes="",
        )

        # Minimal stand-in for a Task: only the attributes _combine_grades reads.
        class _Task:
            task_id = "task_16_email_triage"
            grading_weights = {"automated": 0.4, "llm_judge": 0.6}

        combined_result = _combine_grades(_Task(), automated_grade, judge_grade)

        # Expected: 0.4 * 0.7062937062937062 + 0.6 * 0.87
        self.assertAlmostEqual(combined_result.score, 0.8045174825174824)


# Allow running this file directly: `python tests/test_lib_grading.py`.
if __name__ == "__main__":
    unittest.main()
Loading