|
9 | 9 | EvaluationRunLock, |
10 | 10 | Evaluator, |
11 | 11 | LLMTarget, |
| 12 | + MessageRunResult, |
12 | 13 | OpenBrowserClient, |
13 | 14 | ) |
14 | 15 |
|
@@ -167,3 +168,110 @@ def test_extract_cost_uses_latest_usage_metrics_event() -> None: |
167 | 168 | assert evaluator._extract_cost_from_sse_events(sse_events) == pytest.approx( |
168 | 169 | 0.9652088 |
169 | 170 | ) |
| 171 | + |
| 172 | + |
def test_cleanup_managed_tabs_closes_all_tabs() -> None:
    """Each managed tab listed for the conversation receives a close request.

    The HTTP session is mocked: the GET returns two managed tabs and every
    POST reports success. The test then checks the listing query and one
    close call per tab, in listing order.
    """
    browser_id = "browser-uuid-123"
    conversation_id = "conv-123"
    expected_tab_ids = (11, 22)

    client = OpenBrowserClient(
        base_url="http://example.test", chrome_uuid=browser_id
    )
    client.session = MagicMock()
    session = client.session

    # Listing response: two managed tabs belonging to the conversation.
    listing = MagicMock(status_code=200)
    listing.json.return_value = {
        "success": True,
        "data": {
            "tabs": [
                {"tabId": 11, "url": "https://example.com/a"},
                {"tabId": 22, "url": "https://example.com/b"},
            ]
        },
    }
    session.get.return_value = listing

    # Every close request succeeds.
    closed = MagicMock(status_code=200)
    closed.json.return_value = {"success": True}
    session.post.return_value = closed

    outcome = client.cleanup_managed_tabs(conversation_id)
    assert outcome is True

    session.get.assert_called_once_with(
        "http://example.test/tabs",
        params={
            "browser_id": browser_id,
            "conversation_id": conversation_id,
            "managed_only": "true",
        },
        timeout=5,
    )

    # One close call per listed tab, issued in the order the tabs were listed.
    assert session.post.call_count == 2
    for recorded, tab_id in zip(session.post.call_args_list, expected_tab_ids):
        assert recorded.kwargs == {
            "params": {
                "action": "close",
                "browser_id": browser_id,
                "conversation_id": conversation_id,
                "tab_id": tab_id,
            },
            "timeout": 5,
        }
| 228 | + |
| 229 | + |
def test_run_test_cleans_managed_tabs_before_delete(tmp_path) -> None:
    """run_test closes managed tabs first, then deletes the conversation.

    The cleanup stub reports failure (returns False); the conversation delete
    is still expected to follow it.
    """
    model = "dashscope/qwen3.5-plus"

    evaluator = Evaluator(chrome_uuid="browser-uuid-123")
    evaluator.output_dir = tmp_path
    evaluator.current_model = model
    evaluator.current_target = LLMTarget(
        name=model,
        alias="plus",
        model_name=model,
    )

    server = MagicMock()
    server.clear_events.return_value = True
    server.get_events.return_value = []
    evaluator.eval_server = server

    # Replace the evaluator's private helpers with mocks returning fixed values.
    stubbed_returns = {
        "_save_track_events": None,
        "_extract_images": [],
        "_save_sse_events": None,
        "_extract_cost_from_sse_events": 0.0,
        "_evaluate_criteria": (True, 1.0, 1.0),
    }
    for attr_name, canned in stubbed_returns.items():
        setattr(evaluator, attr_name, MagicMock(return_value=canned))

    teardown_calls: list[str] = []

    def record_cleanup(conversation_id):
        # Log the call, then report that tab cleanup did not succeed.
        teardown_calls.append(f"cleanup:{conversation_id}")
        return False

    def record_delete(conversation_id):
        # Log the call, then report a successful delete.
        teardown_calls.append(f"delete:{conversation_id}")
        return True

    browser = MagicMock()
    browser.create_conversation.return_value = "conv-123"
    browser.send_message.return_value = MessageRunResult(events=[])
    browser.cleanup_managed_tabs.side_effect = record_cleanup
    browser.delete_conversation.side_effect = record_delete
    evaluator.openbrowser = browser

    test_case = eval_module.TestCase(
        id="demo",
        name="Demo",
        description="",
        instruction="Do the thing",
        start_url="",
        criteria=[],
    )

    result = evaluator.run_test(test_case)

    assert result.conversation_id == "conv-123"
    # Order matters: cleanup must be recorded before delete.
    assert teardown_calls == ["cleanup:conv-123", "delete:conv-123"]
0 commit comments