Skip to content

Commit 36b8658

Browse files
authored
Merge pull request #42 from softpudding/feat/clean-eval
Clean up managed tabs after eval tests
2 parents 5fbd1cd + bd0a86c commit 36b8658

File tree

2 files changed

+232
-1
lines changed

2 files changed

+232
-1
lines changed

eval/evaluate_browser_agent.py

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,116 @@ def delete_conversation(self, conversation_id: str) -> bool:
435435
except Exception:
436436
return False
437437

438+
def get_managed_tabs(self, conversation_id: str) -> List[Dict[str, Any]]:
439+
"""Return managed tabs for a conversation."""
440+
if not self.chrome_uuid:
441+
return []
442+
443+
try:
444+
response = self.session.get(
445+
f"{self.base_url}/tabs",
446+
params={
447+
"browser_id": self.chrome_uuid,
448+
"conversation_id": conversation_id,
449+
"managed_only": "true",
450+
},
451+
timeout=5,
452+
)
453+
if response.status_code != 200:
454+
logger.warning(
455+
"Failed to fetch managed tabs for %s: status=%s body=%s",
456+
conversation_id,
457+
response.status_code,
458+
response.text,
459+
)
460+
return []
461+
462+
data = response.json()
463+
if not data.get("success"):
464+
logger.warning(
465+
"Managed tab fetch was unsuccessful for %s: %s",
466+
conversation_id,
467+
data,
468+
)
469+
return []
470+
471+
tabs = data.get("data", {}).get("tabs", [])
472+
return tabs if isinstance(tabs, list) else []
473+
except Exception as e:
474+
logger.warning(
475+
"Failed to fetch managed tabs for %s: %s", conversation_id, e
476+
)
477+
return []
478+
479+
def close_tab(self, conversation_id: str, tab_id: int) -> bool:
480+
"""Close a managed tab for a conversation."""
481+
if not self.chrome_uuid:
482+
return False
483+
484+
try:
485+
response = self.session.post(
486+
f"{self.base_url}/tabs",
487+
params={
488+
"action": "close",
489+
"browser_id": self.chrome_uuid,
490+
"conversation_id": conversation_id,
491+
"tab_id": tab_id,
492+
},
493+
timeout=5,
494+
)
495+
if response.status_code != 200:
496+
logger.warning(
497+
"Failed to close tab %s for %s: status=%s body=%s",
498+
tab_id,
499+
conversation_id,
500+
response.status_code,
501+
response.text,
502+
)
503+
return False
504+
505+
data = response.json()
506+
success = bool(data.get("success"))
507+
if not success:
508+
logger.warning(
509+
"Close tab command failed for tab %s in %s: %s",
510+
tab_id,
511+
conversation_id,
512+
data,
513+
)
514+
return success
515+
except Exception as e:
516+
logger.warning(
517+
"Failed to close tab %s for %s: %s",
518+
tab_id,
519+
conversation_id,
520+
e,
521+
)
522+
return False
523+
524+
def cleanup_managed_tabs(self, conversation_id: str) -> bool:
525+
"""Close all managed tabs opened for a conversation."""
526+
tabs = self.get_managed_tabs(conversation_id)
527+
if not tabs:
528+
return True
529+
530+
all_closed = True
531+
for tab in tabs:
532+
tab_id = tab.get("tabId")
533+
if not isinstance(tab_id, int):
534+
tab_id = tab.get("tab_id")
535+
if not isinstance(tab_id, int):
536+
logger.warning(
537+
"Skipping managed tab cleanup for %s due to missing tab id: %s",
538+
conversation_id,
539+
tab,
540+
)
541+
all_closed = False
542+
continue
543+
544+
if not self.close_tab(conversation_id, tab_id):
545+
all_closed = False
546+
547+
return all_closed
438548

439549
class EvalServerClient:
440550
"""Client for evaluation server tracking API"""
@@ -700,6 +810,19 @@ def ensure_services(
700810

701811
return True
702812

813+
def _cleanup_openbrowser_conversation(self, conversation_id: Optional[str]) -> None:
814+
"""Close managed tabs and delete the OpenBrowser conversation."""
815+
if not conversation_id:
816+
return
817+
818+
cleaned_up = self.openbrowser.cleanup_managed_tabs(conversation_id)
819+
if not cleaned_up:
820+
logger.warning(
821+
"Managed tab cleanup did not fully succeed for conversation %s",
822+
conversation_id,
823+
)
824+
self.openbrowser.delete_conversation(conversation_id)
825+
703826
def load_test_cases(self) -> List[TestCase]:
704827
"""Load all test cases from dataset directory"""
705828
test_cases = []
@@ -912,7 +1035,7 @@ def run_test(self, test_case: TestCase) -> TestResult:
9121035
model=self.current_model,
9131036
)
9141037
finally:
915-
self.openbrowser.delete_conversation(conversation_id)
1038+
self._cleanup_openbrowser_conversation(conversation_id)
9161039

9171040
def _extract_images(
9181041
self,

server/tests/unit/test_eval_client.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
EvaluationRunLock,
1010
Evaluator,
1111
LLMTarget,
12+
MessageRunResult,
1213
OpenBrowserClient,
1314
)
1415

@@ -167,3 +168,110 @@ def test_extract_cost_uses_latest_usage_metrics_event() -> None:
167168
assert evaluator._extract_cost_from_sse_events(sse_events) == pytest.approx(
168169
0.9652088
169170
)
171+
172+
173+
def test_cleanup_managed_tabs_closes_all_tabs() -> None:
174+
"""Eval client should close every managed tab for the conversation."""
175+
client = OpenBrowserClient(
176+
base_url="http://example.test", chrome_uuid="browser-uuid-123"
177+
)
178+
client.session = MagicMock()
179+
180+
get_response = MagicMock()
181+
get_response.status_code = 200
182+
get_response.json.return_value = {
183+
"success": True,
184+
"data": {
185+
"tabs": [
186+
{"tabId": 11, "url": "https://example.com/a"},
187+
{"tabId": 22, "url": "https://example.com/b"},
188+
]
189+
},
190+
}
191+
close_response = MagicMock()
192+
close_response.status_code = 200
193+
close_response.json.return_value = {"success": True}
194+
195+
client.session.get.return_value = get_response
196+
client.session.post.return_value = close_response
197+
198+
assert client.cleanup_managed_tabs("conv-123") is True
199+
200+
client.session.get.assert_called_once_with(
201+
"http://example.test/tabs",
202+
params={
203+
"browser_id": "browser-uuid-123",
204+
"conversation_id": "conv-123",
205+
"managed_only": "true",
206+
},
207+
timeout=5,
208+
)
209+
assert client.session.post.call_count == 2
210+
assert client.session.post.call_args_list[0].kwargs == {
211+
"params": {
212+
"action": "close",
213+
"browser_id": "browser-uuid-123",
214+
"conversation_id": "conv-123",
215+
"tab_id": 11,
216+
},
217+
"timeout": 5,
218+
}
219+
assert client.session.post.call_args_list[1].kwargs == {
220+
"params": {
221+
"action": "close",
222+
"browser_id": "browser-uuid-123",
223+
"conversation_id": "conv-123",
224+
"tab_id": 22,
225+
},
226+
"timeout": 5,
227+
}
228+
229+
230+
def test_run_test_cleans_managed_tabs_before_delete(tmp_path) -> None:
231+
"""Test teardown should close managed tabs before deleting the conversation."""
232+
evaluator = Evaluator(chrome_uuid="browser-uuid-123")
233+
evaluator.output_dir = tmp_path
234+
evaluator.current_model = "dashscope/qwen3.5-plus"
235+
evaluator.current_target = LLMTarget(
236+
name="dashscope/qwen3.5-plus",
237+
alias="plus",
238+
model_name="dashscope/qwen3.5-plus",
239+
)
240+
evaluator.eval_server = MagicMock()
241+
evaluator.eval_server.clear_events.return_value = True
242+
evaluator.eval_server.get_events.return_value = []
243+
evaluator._save_track_events = MagicMock(return_value=None)
244+
evaluator._extract_images = MagicMock(return_value=[])
245+
evaluator._save_sse_events = MagicMock(return_value=None)
246+
evaluator._extract_cost_from_sse_events = MagicMock(return_value=0.0)
247+
evaluator._evaluate_criteria = MagicMock(return_value=(True, 1.0, 1.0))
248+
249+
teardown_calls: list[str] = []
250+
251+
evaluator.openbrowser = MagicMock()
252+
evaluator.openbrowser.create_conversation.return_value = "conv-123"
253+
evaluator.openbrowser.send_message.return_value = MessageRunResult(events=[])
254+
evaluator.openbrowser.cleanup_managed_tabs.side_effect = (
255+
lambda conversation_id: teardown_calls.append(
256+
f"cleanup:{conversation_id}"
257+
)
258+
or False
259+
)
260+
evaluator.openbrowser.delete_conversation.side_effect = (
261+
lambda conversation_id: teardown_calls.append(f"delete:{conversation_id}")
262+
or True
263+
)
264+
265+
test_case = eval_module.TestCase(
266+
id="demo",
267+
name="Demo",
268+
description="",
269+
instruction="Do the thing",
270+
start_url="",
271+
criteria=[],
272+
)
273+
274+
result = evaluator.run_test(test_case)
275+
276+
assert result.conversation_id == "conv-123"
277+
assert teardown_calls == ["cleanup:conv-123", "delete:conv-123"]

0 commit comments

Comments
 (0)