Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 90 additions & 33 deletions scripts/lib_grading.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,93 @@ def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
return {}


def _coerce_score_value(value: Any) -> float | None:
if isinstance(value, bool):
return None
if isinstance(value, (int, float)):
return float(value)
if isinstance(value, str):
try:
return float(value.strip())
except ValueError:
return None
if isinstance(value, dict):
for key in ("score", "value", "weighted_score"):
if key in value:
return _coerce_score_value(value[key])
return None


def _extract_named_scores(parsed: Dict[str, Any]) -> Dict[str, float]:
    """Collect per-criterion scores from the judge response's various shapes.

    Supported layouts (later sources overwrite earlier ones on duplicate
    criterion names; values that cannot be coerced to a number are dropped):

      - ``{"scores": {name: number-or-{"score": ...}}}``
      - ``{"criteria_scores": {...}}`` (dict form)
      - ``{"criterion_scores": {...}}`` (dict form) or
        ``{"criterion_scores": [{"name"/"criterion"/"label": ..., "score": ...}, ...]}``
        (list form; unnamed entries become ``criterion_<idx>``)
      - flat top-level ``"criterion1"``, ``"criterion2"``, ... keys
    """
    scores: Dict[str, float] = {}

    def _merge_mapping(mapping: Any) -> None:
        # Shared handler for every dict-shaped score container; silently
        # ignores non-dict values so callers can pass parsed.get(...) directly.
        if not isinstance(mapping, dict):
            return
        for key, value in mapping.items():
            coerced = _coerce_score_value(value)
            if coerced is not None:
                scores[str(key)] = coerced

    _merge_mapping(parsed.get("scores"))
    _merge_mapping(parsed.get("criteria_scores"))

    criteria = parsed.get("criterion_scores")
    if isinstance(criteria, dict):
        _merge_mapping(criteria)
    elif isinstance(criteria, list):
        for idx, item in enumerate(criteria, start=1):
            name: Any = f"criterion_{idx}"
            if isinstance(item, dict):
                # Prefer an explicit label; fall back to a positional name.
                name = (
                    item.get("name")
                    or item.get("criterion")
                    or item.get("label")
                    or name
                )
            coerced = _coerce_score_value(item)
            if coerced is not None:
                scores[str(name)] = coerced

    # Some judges emit flat top-level keys like "criterion1", "Criterion2".
    for key, value in parsed.items():
        if re.fullmatch(r"criterion\d+", str(key), re.IGNORECASE):
            coerced = _coerce_score_value(value)
            if coerced is not None:
                scores[str(key)] = coerced

    return scores


def _extract_total_score(parsed: Dict[str, Any], scores: Dict[str, float]) -> float | None:
    """Pick an overall score for the response.

    Preference order: an explicit total-like top-level key, then a numeric
    ``"overall"`` dict, then the mean of the per-criterion *scores*.
    Returns None when no usable number is found.
    """
    # Explicit total keys, in priority order.
    for candidate_key in ("total", "score", "overall_score", "completionScore", "total_score"):
        if candidate_key not in parsed:
            continue
        total = _coerce_score_value(parsed[candidate_key])
        if total is not None:
            return total

    # Some judges nest the total under {"overall": {"score": ...}}.
    overall_section = parsed.get("overall")
    if isinstance(overall_section, dict):
        total = _coerce_score_value(overall_section)
        if total is not None:
            return total

    # Fall back to averaging the individual criterion scores.
    numeric_values = [v for v in scores.values() if isinstance(v, (int, float))]
    if numeric_values:
        return sum(numeric_values) / len(numeric_values)

    return None


def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:
"""
Normalize judge response to expected format with 'scores', 'total', and 'notes'.
Expand All @@ -474,39 +561,9 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:
- {"score": 0.9, "justification": "..."} (simplified format)
"""
result: Dict[str, Any] = {"scores": {}, "total": None, "notes": ""}

# Extract scores from various keys
if "scores" in parsed:
scores_data = parsed["scores"]
if isinstance(scores_data, dict):
# Handle nested structure: {"criterion": {"score": 0.9, "weight": 0.3}}
for key, value in scores_data.items():
if isinstance(value, dict) and "score" in value:
result["scores"][key] = float(value["score"]) if isinstance(value["score"], (int, float, str)) else value["score"]
elif isinstance(value, (int, float)):
result["scores"][key] = value
elif "criteria_scores" in parsed:
# Handle Claude's alternate format
criteria = parsed["criteria_scores"]
if isinstance(criteria, dict):
for key, value in criteria.items():
if isinstance(value, dict) and "score" in value:
result["scores"][key] = value["score"]
elif isinstance(value, (int, float)):
result["scores"][key] = value

# Extract total score
if "total" in parsed and parsed["total"] is not None:
result["total"] = float(parsed["total"]) if isinstance(parsed["total"], (int, float)) else None
elif "score" in parsed and isinstance(parsed["score"], (int, float)):
result["total"] = float(parsed["score"])
elif "overall_score" in parsed and isinstance(parsed["overall_score"], (int, float)):
result["total"] = float(parsed["overall_score"])
elif result["scores"]:
# Calculate average if we have individual scores but no total
values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
if values:
result["total"] = sum(values) / len(values)

result["scores"] = _extract_named_scores(parsed)
result["total"] = _extract_total_score(parsed, result["scores"])

# Some judge models return a summed total across criteria even though each
# criterion is scored on a 0..1 scale. Normalize that back to a 0..1 mean.
Expand Down
10 changes: 8 additions & 2 deletions tasks/task_08_memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,15 @@ def grade(transcript: list, workspace_path: str) -> dict:
# - read_file / readFile (Cursor, Windsurf, Claude Code)
if tool_name in ["read", "read_file", "readFile"]:
args = item.get("arguments", item.get("params", {}))
path_val = str(args.get("path", args.get("file_path", "")))
files = args.get("files", [])
if "notes.md" in path_val or any("notes.md" in str(f) for f in files):
path_candidates = [
args.get("path", ""),
args.get("file_path", ""),
args.get("file", ""),
]
if any("notes.md" in str(f) for f in files) or any(
"notes.md" in str(path) for path in path_candidates if path
):
read_notes = True
break

Expand Down
14 changes: 10 additions & 4 deletions tasks/task_10_workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,20 @@ def grade(transcript: list, workspace_path: str) -> dict:
for item in msg.get("content", []):
if item.get("type") == "toolCall":
tool_name = item.get("name", "")
params = item.get("params", {})
params = item.get("arguments", item.get("params", {}))
if tool_name.lower() in ["read_file", "readfile", "read"]:
# Support multiple param formats across different agents:
# - files: ["config.json"] (Cursor, Windsurf)
# - path/file_path: "config.json" (OpenClaw, Claude Code)
# - path/file_path/file: "config.json" (OpenClaw, Claude Code)
files = params.get("files", [])
path_val = str(params.get("path", params.get("file_path", "")))
if any("config.json" in str(f) for f in files) or "config.json" in path_val:
path_candidates = [
params.get("path", ""),
params.get("file_path", ""),
params.get("file", ""),
]
if any("config.json" in str(f) for f in files) or any(
"config.json" in str(path) for path in path_candidates if path
):
read_config = True
break

Expand Down
2 changes: 1 addition & 1 deletion tasks/task_18_market_research.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def grade(transcript: list, workspace_path: str) -> dict:
for item in msg.get("content", []):
if item.get("type") == "toolCall":
tool_name = item.get("name", "").lower()
params = item.get("params", {})
params = item.get("arguments", item.get("params", {}))
# Check for web search / fetch tools
if any(t in tool_name for t in [
"web_search", "websearch", "search",
Expand Down