diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index bf25b2d..16ca079 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -464,6 +464,93 @@ def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]: return {} +def _coerce_score_value(value: Any) -> float | None: + if isinstance(value, bool): + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + try: + return float(value.strip()) + except ValueError: + return None + if isinstance(value, dict): + for key in ("score", "value", "weighted_score"): + if key in value: + return _coerce_score_value(value[key]) + return None + + +def _extract_named_scores(parsed: Dict[str, Any]) -> Dict[str, float]: + scores: Dict[str, float] = {} + + if "scores" in parsed and isinstance(parsed["scores"], dict): + for key, value in parsed["scores"].items(): + coerced = _coerce_score_value(value) + if coerced is not None: + scores[str(key)] = coerced + + if "criteria_scores" in parsed: + criteria = parsed["criteria_scores"] + if isinstance(criteria, dict): + for key, value in criteria.items(): + coerced = _coerce_score_value(value) + if coerced is not None: + scores[str(key)] = coerced + + if "criterion_scores" in parsed: + criteria = parsed["criterion_scores"] + if isinstance(criteria, dict): + for key, value in criteria.items(): + coerced = _coerce_score_value(value) + if coerced is not None: + scores[str(key)] = coerced + elif isinstance(criteria, list): + for idx, item in enumerate(criteria, start=1): + if isinstance(item, dict): + name = ( + item.get("name") + or item.get("criterion") + or item.get("label") + or f"criterion_{idx}" + ) + coerced = _coerce_score_value(item) + else: + name = f"criterion_{idx}" + coerced = _coerce_score_value(item) + if coerced is not None: + scores[str(name)] = coerced + + for key, value in parsed.items(): + if re.fullmatch(r"criterion\d+", str(key), re.IGNORECASE): + coerced = _coerce_score_value(value) + if coerced is not None: + scores[str(key)] = coerced + + return scores + + +def _extract_total_score(parsed: Dict[str, Any], scores: Dict[str, float]) -> float | None: + for key in ("total", "score", "overall_score", "completionScore", "total_score"): + if key in parsed: + coerced = _coerce_score_value(parsed[key]) + if coerced is not None: + return coerced + + overall = parsed.get("overall") + if isinstance(overall, dict): + coerced = _coerce_score_value(overall) + if coerced is not None: + return coerced + + if scores: + values = [v for v in scores.values() if isinstance(v, (int, float))] + if values: + return sum(values) / len(values) + + return None + + def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]: """ Normalize judge response to expected format with 'scores', 'total', and 'notes'. @@ -474,39 +561,9 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]: - {"score": 0.9, "justification": "..."} (simplified format) """ result: Dict[str, Any] = {"scores": {}, "total": None, "notes": ""} - - # Extract scores from various keys - if "scores" in parsed: - scores_data = parsed["scores"] - if isinstance(scores_data, dict): - # Handle nested structure: {"criterion": {"score": 0.9, "weight": 0.3}} - for key, value in scores_data.items(): - if isinstance(value, dict) and "score" in value: - result["scores"][key] = float(value["score"]) if isinstance(value["score"], (int, float, str)) else value["score"] - elif isinstance(value, (int, float)): - result["scores"][key] = value - elif "criteria_scores" in parsed: - # Handle Claude's alternate format - criteria = parsed["criteria_scores"] - if isinstance(criteria, dict): - for key, value in criteria.items(): - if isinstance(value, dict) and "score" in value: - result["scores"][key] = value["score"] - elif isinstance(value, (int, float)): - result["scores"][key] = value - - # Extract total score - if "total" in parsed and parsed["total"] is not None: - result["total"] = float(parsed["total"]) if isinstance(parsed["total"], (int, float)) else None - elif "score" in parsed and isinstance(parsed["score"], (int, float)): - result["total"] = float(parsed["score"]) - elif "overall_score" in parsed and isinstance(parsed["overall_score"], (int, float)): - result["total"] = float(parsed["overall_score"]) - elif result["scores"]: - # Calculate average if we have individual scores but no total - values = [v for v in result["scores"].values() if isinstance(v, (int, float))] - if values: - result["total"] = sum(values) / len(values) + + result["scores"] = _extract_named_scores(parsed) + result["total"] = _extract_total_score(parsed, result["scores"]) # Some judge models return a summed total across criteria even though each # criterion is scored on a 0..1 scale. Normalize that back to a 0..1 mean. diff --git a/tasks/task_08_memory.md b/tasks/task_08_memory.md index 9cb5f6f..1b96556 100644 --- a/tasks/task_08_memory.md +++ b/tasks/task_08_memory.md @@ -151,9 +151,15 @@ def grade(transcript: list, workspace_path: str) -> dict: # - read_file / readFile (Cursor, Windsurf, Claude Code) if tool_name in ["read", "read_file", "readFile"]: args = item.get("arguments", item.get("params", {})) - path_val = str(args.get("path", args.get("file_path", ""))) files = args.get("files", []) - if "notes.md" in path_val or any("notes.md" in str(f) for f in files): + path_candidates = [ + args.get("path", ""), + args.get("file_path", ""), + args.get("file", ""), + ] + if any("notes.md" in str(f) for f in files) or any( + "notes.md" in str(path) for path in path_candidates if path + ): read_notes = True break diff --git a/tasks/task_10_workflow.md b/tasks/task_10_workflow.md index c01603f..c8af21a 100644 --- a/tasks/task_10_workflow.md +++ b/tasks/task_10_workflow.md @@ -98,14 +98,20 @@ def grade(transcript: list, workspace_path: str) -> dict: for item in msg.get("content", []): if item.get("type") == "toolCall": tool_name = item.get("name", "") - params = item.get("params", {}) + params = item.get("arguments", item.get("params", {})) if tool_name.lower() in ["read_file", "readfile", "read"]: # Support multiple param formats across different agents: # - files: ["config.json"] (Cursor, Windsurf) - # - path/file_path: "config.json" (OpenClaw, Claude Code) + # - path/file_path/file: "config.json" (OpenClaw, Claude Code) files = params.get("files", []) - path_val = str(params.get("path", params.get("file_path", ""))) - if any("config.json" in str(f) for f in files) or "config.json" in path_val: + path_candidates = [ + params.get("path", ""), + params.get("file_path", ""), + params.get("file", ""), + ] + if any("config.json" in str(f) for f in files) or any( + "config.json" in str(path) for path in path_candidates if path + ): read_config = True break diff --git a/tasks/task_18_market_research.md b/tasks/task_18_market_research.md index 81e5239..87a0706 100644 --- a/tasks/task_18_market_research.md +++ b/tasks/task_18_market_research.md @@ -169,7 +169,7 @@ def grade(transcript: list, workspace_path: str) -> dict: for item in msg.get("content", []): if item.get("type") == "toolCall": tool_name = item.get("name", "").lower() - params = item.get("params", {}) + params = item.get("arguments", item.get("params", {})) # Check for web search / fetch tools if any(t in tool_name for t in [ "web_search", "websearch", "search",