Skip to content

Commit 85b5ca6

Browse files
authored
Merge pull request #40 from softpudding/codex/top-layer-visibility-guard
Codex/top layer visibility guard
2 parents c53af12 + 1186493 commit 85b5ca6

33 files changed, with 2091 additions and 524 deletions.

eval/dataset/dataflow.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ difficulty: medium
44
description: "Test visual understanding capabilities through realistic dashboard tasks"
55
start_url: "http://localhost:16605/dataflow/"
66
instruction: "Please complete the following tasks: 1) Enable the weekly reports feature in settings. 2) View the detailed report for the quarter with the highest revenue. 3) Navigate to the Revenue tab."
7-
time_limit: 300.0
7+
time_limit: 600.0
88
cost_limit: 0.5
99

1010
criteria:
@@ -34,4 +34,4 @@ criteria:
3434
expected:
3535
event_type: tab_click
3636
tab: "revenue"
37-
page: "/dataflow/"
37+
page: "/dataflow/"

eval/evaluate_browser_agent.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,9 @@ def create_conversation(
215215
base_url: Optional base URL override
216216
model_alias: Optional configured model alias
217217
"""
218-
if self.chrome_uuid and not self.wait_for_browser_validity(timeout_seconds=30.0):
218+
if self.chrome_uuid and not self.wait_for_browser_validity(
219+
timeout_seconds=30.0
220+
):
219221
return None
220222

221223
request_json: Dict[str, Any] = {}
@@ -2031,9 +2033,11 @@ def _generate_json_report(
20312033
result.get("composite_score") or 0, 4
20322034
),
20332035
"total_score": round(
2034-
result.get("total_score")
2035-
if result.get("total_score") is not None
2036-
else result["task_score"],
2036+
(
2037+
result.get("total_score")
2038+
if result.get("total_score") is not None
2039+
else result["task_score"]
2040+
),
20372041
2,
20382042
),
20392043
"duration": round(result.get("duration") or 0, 2),
@@ -2241,9 +2245,7 @@ def main():
22412245
passed_float = 1.0 if result.passed else 0.0
22422246
eff_score = result.efficiency_score or 0.0
22432247
usage_score_val = result.usage_score or 0.0
2244-
test_composite = (
2245-
passed_float * 3 + eff_score + usage_score_val
2246-
) / 5.0
2248+
test_composite = (passed_float * 3 + eff_score + usage_score_val) / 5.0
22472249
print(f" Composite score: {test_composite:.2f}/1.0")
22482250
print(f" Total score: {result.total_score or result.score:.1f}")
22492251
print(
@@ -2282,9 +2284,7 @@ def main():
22822284
)
22832285
print(f" Status: {'PASS' if result.passed else 'FAIL'}")
22842286
print(f" Task score: {result.score:.1f}/{result.max_score:.1f}")
2285-
print(
2286-
f" Efficiency score: {result.efficiency_score or 0:.2f}/1.0"
2287-
)
2287+
print(f" Efficiency score: {result.efficiency_score or 0:.2f}/1.0")
22882288
print(f" Usage score: {result.usage_score or 0:.2f}/1.0")
22892289
# Calculate composite score for this test
22902290
passed_float = 1.0 if result.passed else 0.0

0 commit comments

Comments (0)