diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
index 46961bb..53ac254 100644
--- a/scripts/lib_grading.py
+++ b/scripts/lib_grading.py
@@ -286,7 +286,8 @@ def _build_judge_prompt(task: Task, transcript_summary: str, rubric: str) -> str
         f"{transcript_summary}\n\n"
         "## Grading Rubric\n"
         f"{rubric}\n\n"
-        "Score each criterion from 0.0 to 1.0.\n\n"
+        "Score each criterion from 0.0 to 1.0.\n"
+        'The "total" field must also be between 0.0 and 1.0, and it must be the arithmetic mean of the criterion scores, not their sum.\n\n'
         "Respond with ONLY this JSON structure (no markdown, no code fences, no extra text):\n"
         '{"scores": {"criterion_name": 0.0}, "total": 0.0, "notes": "brief justification"}'
     )
@@ -428,6 +429,19 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:
         values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
         if values:
             result["total"] = sum(values) / len(values)
+
+    # Some judge models return a summed total across criteria even though each
+    # criterion is scored on a 0..1 scale. Normalize that back to a 0..1 mean.
+    # The isinstance check skips a missing (None) or non-numeric total, which
+    # could otherwise raise on the "> 1.0" comparison.
+    values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
+    if (
+        values
+        and isinstance(result["total"], (int, float))
+        and result["total"] > 1.0
+        and all(0.0 <= v <= 1.0 for v in values)
+    ):
+        result["total"] = sum(values) / len(values)
 
     # Extract notes/justification
     if "notes" in parsed:
diff --git a/tests/test_lib_grading.py b/tests/test_lib_grading.py
new file mode 100644
index 0000000..8588715
--- /dev/null
+++ b/tests/test_lib_grading.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import sys
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SCRIPTS_DIR = ROOT / "scripts"
+if str(SCRIPTS_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPTS_DIR))
+
+from lib_grading import _combine_grades, _normalize_judge_response, GradeResult
+
+
+class JudgeNormalizationTests(unittest.TestCase):
+    """Regression tests for judge totals that arrive as a sum, not a mean."""
+
+    def test_normalize_judge_response_averages_summed_total_when_breakdown_is_unit_scale(
+        self,
+    ) -> None:
+        # Five unit-scale criteria whose sum (3.85) was reported as the total.
+        parsed = {
+            "scores": {
+                "coverage": 0.75,
+                "synthesis": 0.75,
+                "structure": 0.75,
+                "tone": 0.8,
+                "conciseness": 0.8,
+            },
+            "total": 3.85,
+            "notes": "Summed by mistake",
+        }
+
+        normalized = _normalize_judge_response(parsed)
+
+        self.assertAlmostEqual(normalized["total"], 0.77)
+
+    def test_hybrid_score_uses_normalized_judge_total(self) -> None:
+        # Feed a summed total through the normalizer so the hybrid score is
+        # computed from the value this test's name promises.
+        normalized = _normalize_judge_response(
+            {
+                "scores": {"accuracy": 0.9, "tone": 0.84},
+                "total": 1.74,
+                "notes": "Summed by mistake",
+            }
+        )
+        auto = GradeResult(
+            task_id="task_16_email_triage",
+            score=0.7062937062937062,
+            max_score=1.0,
+            grading_type="automated",
+            breakdown={},
+            notes="",
+        )
+        judge = GradeResult(
+            task_id="task_16_email_triage",
+            score=normalized["total"],
+            max_score=1.0,
+            grading_type="llm_judge",
+            breakdown={},
+            notes="",
+        )
+
+        class _Task:
+            # Stand-in for Task; presumably only these attributes are read - verify.
+            task_id = "task_16_email_triage"
+            grading_weights = {"automated": 0.4, "llm_judge": 0.6}
+
+        combined = _combine_grades(_Task(), auto, judge)
+
+        # Expected value: 0.4 * 0.7062937062937062 + 0.6 * 0.87
+        self.assertAlmostEqual(combined.score, 0.8045174825174824)
+
+
+if __name__ == "__main__":
+    unittest.main()