Merge pull request #42 from softpudding/feat/clean-eval

softpudding · web-flow · commit 36b8658752a7 · 2026-03-23T12:58:22.000+08:00
Clean up managed tabs after eval tests
diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py
@@ -435,6 +435,116 @@ def delete_conversation(self, conversation_id: str) -> bool:
         except Exception:
             return False
 
+    def get_managed_tabs(self, conversation_id: str) -> List[Dict[str, Any]]:
+        """List the managed tabs the browser reports for a conversation.
+
+        Returns ``[]`` when no browser UUID is set or the lookup fails in any way.
+        """
+        if not self.chrome_uuid:
+            return []
+
+        query = {
+            "browser_id": self.chrome_uuid,
+            "conversation_id": conversation_id,
+            "managed_only": "true",
+        }
+        try:
+            reply = self.session.get(f"{self.base_url}/tabs", params=query, timeout=5)
+            if reply.status_code != 200:
+                logger.warning(
+                    "Failed to fetch managed tabs for %s: status=%s body=%s",
+                    conversation_id,
+                    reply.status_code,
+                    reply.text,
+                )
+                return []
+
+            payload = reply.json()
+            if not payload.get("success"):
+                logger.warning(
+                    "Managed tab fetch was unsuccessful for %s: %s",
+                    conversation_id,
+                    payload,
+                )
+                return []
+
+            found = payload.get("data", {}).get("tabs", [])
+        except Exception as exc:
+            logger.warning(
+                "Failed to fetch managed tabs for %s: %s", conversation_id, exc
+            )
+            return []
+        return found if isinstance(found, list) else []
+
+    def close_tab(self, conversation_id: str, tab_id: int) -> bool:
+        """Ask the browser to close one managed tab of a conversation.
+
+        Returns True only when the reply is HTTP 200 with a truthy ``success``.
+        """
+        if not self.chrome_uuid:
+            return False
+
+        query = {
+            "action": "close",
+            "browser_id": self.chrome_uuid,
+            "conversation_id": conversation_id,
+            "tab_id": tab_id,
+        }
+        try:
+            reply = self.session.post(f"{self.base_url}/tabs", params=query, timeout=5)
+            if reply.status_code != 200:
+                logger.warning(
+                    "Failed to close tab %s for %s: status=%s body=%s",
+                    tab_id,
+                    conversation_id,
+                    reply.status_code,
+                    reply.text,
+                )
+                return False
+
+            payload = reply.json()
+            if payload.get("success"):
+                return True
+            logger.warning(
+                "Close tab command failed for tab %s in %s: %s",
+                tab_id,
+                conversation_id,
+                payload,
+            )
+            return False
+        except Exception as exc:
+            logger.warning(
+                "Failed to close tab %s for %s: %s",
+                tab_id,
+                conversation_id,
+                exc,
+            )
+            return False
+
+    def cleanup_managed_tabs(self, conversation_id: str) -> bool:
+        """Close every managed tab of a conversation; True if none was left open.
+
+        An empty (or failed) tab lookup counts as success. Entries that are not
+        dicts or carry no integer id are logged, skipped and reported as False.
+        """
+        tabs = self.get_managed_tabs(conversation_id)
+        all_closed = True
+        for tab in tabs or ():
+            fields = tab if isinstance(tab, dict) else {}
+            raw_ids = (fields.get("tabId"), fields.get("tab_id"))
+            # bool subclasses int, so exclude it explicitly from valid tab ids.
+            ids = [i for i in raw_ids if isinstance(i, int) and not isinstance(i, bool)]
+            if not ids:
+                logger.warning(
+                    "Skipping managed tab cleanup for %s due to missing tab id: %s",
+                    conversation_id,
+                    tab,
+                )
+                all_closed = False
+            elif not self.close_tab(conversation_id, ids[0]):
+                all_closed = False
+
+        return all_closed
 
 class EvalServerClient:
     """Client for evaluation server tracking API"""
@@ -700,6 +810,19 @@ def ensure_services(
 
         return True
 
+    def _cleanup_openbrowser_conversation(self, conversation_id: Optional[str]) -> None:
+        """Close the conversation's managed tabs, then always delete it."""
+        if not conversation_id:
+            return
+        try:  # a tab-cleanup error must not leave the conversation undeleted
+            if not self.openbrowser.cleanup_managed_tabs(conversation_id):
+                logger.warning(
+                    "Managed tab cleanup did not fully succeed for conversation %s",
+                    conversation_id,
+                )
+        finally:
+            self.openbrowser.delete_conversation(conversation_id)
+
     def load_test_cases(self) -> List[TestCase]:
         """Load all test cases from dataset directory"""
         test_cases = []
@@ -912,7 +1035,7 @@ def run_test(self, test_case: TestCase) -> TestResult:
                 model=self.current_model,
             )
         finally:
-            self.openbrowser.delete_conversation(conversation_id)
+            self._cleanup_openbrowser_conversation(conversation_id)
 
     def _extract_images(
         self,
diff --git a/server/tests/unit/test_eval_client.py b/server/tests/unit/test_eval_client.py
@@ -9,6 +9,7 @@
     EvaluationRunLock,
     Evaluator,
     LLMTarget,
+    MessageRunResult,
     OpenBrowserClient,
 )
 
@@ -167,3 +168,110 @@ def test_extract_cost_uses_latest_usage_metrics_event() -> None:
     assert evaluator._extract_cost_from_sse_events(sse_events) == pytest.approx(
         0.9652088
     )
+
+
+def test_cleanup_managed_tabs_closes_all_tabs() -> None:
+    """Eval client should close every managed tab for the conversation.
+
+    Both the tab listing and each close request must target ``/tabs`` with
+    the browser and conversation ids, closing tabs in the order listed.
+    """
+    client = OpenBrowserClient(
+        base_url="http://example.test", chrome_uuid="browser-uuid-123"
+    )
+    client.session = MagicMock()
+
+    get_response = MagicMock()
+    get_response.status_code = 200
+    get_response.json.return_value = {
+        "success": True,
+        "data": {
+            "tabs": [
+                {"tabId": 11, "url": "https://example.com/a"},
+                {"tabId": 22, "url": "https://example.com/b"},
+            ]
+        },
+    }
+    close_response = MagicMock()
+    close_response.status_code = 200
+    close_response.json.return_value = {"success": True}
+
+    client.session.get.return_value = get_response
+    client.session.post.return_value = close_response
+
+    assert client.cleanup_managed_tabs("conv-123") is True
+
+    client.session.get.assert_called_once_with(
+        "http://example.test/tabs",
+        params={
+            "browser_id": "browser-uuid-123",
+            "conversation_id": "conv-123",
+            "managed_only": "true",
+        },
+        timeout=5,
+    )
+
+    post_calls = client.session.post.call_args_list
+    assert len(post_calls) == 2
+    for post_call, expected_tab_id in zip(post_calls, (11, 22)):
+        # The URL is passed positionally, so kwargs alone would not cover it.
+        assert post_call.args == ("http://example.test/tabs",)
+        assert post_call.kwargs == {
+            "params": {
+                "action": "close",
+                "browser_id": "browser-uuid-123",
+                "conversation_id": "conv-123",
+                "tab_id": expected_tab_id,
+            },
+            "timeout": 5,
+        }
+
+
+def test_run_test_cleans_managed_tabs_before_delete(tmp_path) -> None:
+    """run_test teardown should close managed tabs, then delete the conversation."""
+    teardown_calls: list[str] = []
+
+    def record_cleanup(conversation_id):
+        teardown_calls.append(f"cleanup:{conversation_id}")
+        return False  # the delete must still follow a failed tab cleanup
+
+    def record_delete(conversation_id):
+        teardown_calls.append(f"delete:{conversation_id}")
+        return True
+
+    browser = MagicMock()
+    browser.create_conversation.return_value = "conv-123"
+    browser.send_message.return_value = MessageRunResult(events=[])
+    browser.cleanup_managed_tabs.side_effect = record_cleanup
+    browser.delete_conversation.side_effect = record_delete
+
+    evaluator = Evaluator(chrome_uuid="browser-uuid-123")
+    evaluator.output_dir = tmp_path
+    evaluator.current_model = "dashscope/qwen3.5-plus"
+    evaluator.current_target = LLMTarget(
+        name="dashscope/qwen3.5-plus",
+        alias="plus",
+        model_name="dashscope/qwen3.5-plus",
+    )
+    evaluator.eval_server = MagicMock()
+    evaluator.eval_server.clear_events.return_value = True
+    evaluator.eval_server.get_events.return_value = []
+    evaluator._save_track_events = MagicMock(return_value=None)
+    evaluator._extract_images = MagicMock(return_value=[])
+    evaluator._save_sse_events = MagicMock(return_value=None)
+    evaluator._extract_cost_from_sse_events = MagicMock(return_value=0.0)
+    evaluator._evaluate_criteria = MagicMock(return_value=(True, 1.0, 1.0))
+    evaluator.openbrowser = browser
+
+    demo_case = eval_module.TestCase(
+        id="demo",
+        name="Demo",
+        description="",
+        instruction="Do the thing",
+        start_url="",
+        criteria=[],
+    )
+    result = evaluator.run_test(demo_case)
+
+    assert result.conversation_id == "conv-123"
+    assert teardown_calls == ["cleanup:conv-123", "delete:conv-123"]