diff --git a/bluebox/agents/principal_investigator.py b/bluebox/agents/principal_investigator.py index ed9a0312..cf43e223 100644 --- a/bluebox/agents/principal_investigator.py +++ b/bluebox/agents/principal_investigator.py @@ -1865,6 +1865,16 @@ def _submit_routine( attempt.execution_error = "Execution unavailable (no browser or execution crashed)" self._persist(f"attempt_{attempt.id}_executed") + # Persist attempt record with execution result immediately (before inspection). + # The final record will overwrite this once inspection completes. + self._record_attempt( + spec=spec, + attempt=attempt, + routine_json=routine_json, + test_parameters=test_parameters, + execution_result=execution_result, + inspection_result=None, + ) # Step 3: Send to inspector for quality review attempt.status = RoutineAttemptStatus.INSPECTING @@ -1949,8 +1959,30 @@ def _submit_routine( "description (>=8 words, explain action+inputs+outputs), and " "parameter descriptions (>=3 words, explain where to get values)." ) - if hints: - response["remediation_hints"] = hints + if any(kw in issues_text for kw in [ + "500", "nullreferenceexception", "null reference", "server error", + "internal server error", "bad request", "400", "422", + "unprocessable", "object reference not set", + ]): + hints.append( + "SERVER ERROR FIX: The API returned a server-side error, which usually " + "means the request body has wrong field types or structure. Use " + "execute_python to load the raw captured network data from your " + "workspace raw/ directory and compare the EXACT captured request body " + "(JSON types, field names, nesting) against what your routine sends. " + "Pay close attention to: arrays of objects vs arrays of primitives, " + "boolean vs string values, and required fields that may be missing. " + "Also compare against the execution result to see the actual request " + "that was sent." + ) + # Always remind about Python inspection on any failure + hints.append( + "REMINDER: You have execute_python available. Use it to load and inspect " + "the raw captured network events (JSONL in workspace raw/), compare exact " + "request bodies, and examine routine execution results. Don't guess — " + "verify the actual data." + ) + response["remediation_hints"] = hints # ----- Persist unified attempt record ----- # Overwrite the initial record for this attempt with final verdict/results. diff --git a/bluebox/agents/workers/experiment_worker.py b/bluebox/agents/workers/experiment_worker.py index 2cfe7f25..ae291431 100644 --- a/bluebox/agents/workers/experiment_worker.py +++ b/bluebox/agents/workers/experiment_worker.py @@ -591,7 +591,7 @@ def _search_recorded_transactions( "results": results, } - @agent_tool(availability=lambda self: self._network_data_loader is not None, token_optimized=True) + @agent_tool(availability=lambda self: self._network_data_loader is not None) def _get_recorded_transaction(self, request_id: str) -> dict[str, Any]: """ Get the full recorded request/response for a specific transaction. diff --git a/bluebox/cdp/async_cdp_session.py b/bluebox/cdp/async_cdp_session.py index 845edf3d..51be7805 100644 --- a/bluebox/cdp/async_cdp_session.py +++ b/bluebox/cdp/async_cdp_session.py @@ -45,6 +45,7 @@ def __init__( event_callback_fn: Callable[[str, BaseCDPEvent], Awaitable[None]], paths: dict[str, str] | None = None, target_id: str | None = None, + navigate_url: str | None = None, ) -> None: """ Initialize AsyncCDPSession. @@ -58,6 +59,8 @@ def __init__( target_id: Optional CDP target ID to attach to directly. If provided, skips the Target.getTargets scan and attaches to this specific target. Use when you know which tab to monitor (e.g. a proxy-context tab created by the caller). + navigate_url: Optional URL to navigate to after CDP setup completes. + If provided, Page.navigate is called after all monitors are ready. NOTE: The CDP sessionId will be obtained automatically in run() after connecting. CDP sessionIds are only valid for the specific WebSocket connection where Target.attachToTarget was called. @@ -69,6 +72,7 @@ def __init__( self.session_start_dtm = session_start_dtm self.paths = paths or {} self.target_id = target_id + self.navigate_url = navigate_url self.ws: ClientConnection | None = None self.seq = 0 # sequence ID for CDP commands @@ -474,6 +478,21 @@ async def message_receiver() -> None: await self.setup_cdp() logger.info("✅ CDP setup complete, message loop running") + # Navigate after all monitors are ready so every event is captured + if self.navigate_url and self.navigate_url != "about:blank": + url = self.navigate_url + if not url.startswith(("http://", "https://", "chrome://")): + url = f"https://{url}" + logger.info("🌐 Navigating to %s", url) + try: + await self.send_and_wait( + method="Page.navigate", + params={"url": url}, + timeout=10.0, + ) + except Exception as e: + logger.error("❌ Navigation failed: %s", e) + try: # wait for message receiver to complete await receiver_task diff --git a/bluebox/data_models/resource_base.py b/bluebox/data_models/resource_base.py index d557da31..0de2c069 100644 --- a/bluebox/data_models/resource_base.py +++ b/bluebox/data_models/resource_base.py @@ -9,10 +9,11 @@ from abc import ABC from datetime import datetime, timezone +from hashlib import sha256 from typing import Any from uuid import UUID, uuid4 -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, computed_field class ResourceBase(BaseModel, ABC): @@ -142,3 +143,14 @@ def uuid(self) -> UUID: def resource_type(self) -> str: """Return the resource type name (class name) for this instance.""" return self.__class__.__name__ + + def _compute_hash(self) -> str: + """Compute a SHA-256 hash of all fields except ``hash`` itself.""" + json_bytes = super().model_dump_json(exclude={"hash"}).encode("utf-8") + return sha256(json_bytes).hexdigest() + + @computed_field # type: ignore[prop-decorator] + @property + def hash(self) -> str: + """Content hash computed from all other fields.""" + return self._compute_hash() diff --git a/bluebox/scripts/api_indexing/run_api_indexing.py b/bluebox/scripts/api_indexing/run_api_indexing.py index c2cbd170..880acc3f 100644 --- a/bluebox/scripts/api_indexing/run_api_indexing.py +++ b/bluebox/scripts/api_indexing/run_api_indexing.py @@ -148,6 +148,7 @@ def run_explorations( cdp_captures_dir: Path, output_dir: Path, llm_model: LLMModel, + task: str | None = None, ) -> dict[str, str]: """ Run all 4 exploration specialists in parallel. @@ -178,6 +179,7 @@ def run_explorations( cdp_captures_dir, llm_model, workspace_dir=workspace_root / f"{domain}_exploration", + task=task, ): domain for domain, fn in runners.items() } @@ -358,6 +360,7 @@ def run_pi_with_recovery( num_workers: int = 3, num_inspectors: int = 1, max_pi_attempts: int = DEFAULT_MAX_PI_ATTEMPTS, + ledger: DiscoveryLedger | None = None, ) -> RoutineCatalog | None: """ Run the PrincipalInvestigator with automatic recovery. @@ -419,7 +422,6 @@ def _make_inspector_workspace() -> LocalAgentWorkspace: _mount_capture_inputs(inspector_workspace, capture_inputs) return inspector_workspace - ledger: DiscoveryLedger | None = None catalog: RoutineCatalog | None = None print("\n=== Phase 2: Routine Construction (PI loop) ===\n", file=sys.stderr) @@ -492,6 +494,7 @@ def run_api_indexing( llm_model: LLMModel | None = None, remote_debugging_address: str = "http://127.0.0.1:9222", skip_exploration: bool = False, + continue_from_ledger: bool = False, max_pi_iterations: int = 200, min_experiments_before_fail: int = 10, num_workers: int = 3, @@ -512,6 +515,9 @@ def run_api_indexing( llm_model: LLM model to use. remote_debugging_address: Chrome debugging URL for live browser experiments. skip_exploration: Skip Phase 1, load existing summaries from output_dir. + continue_from_ledger: Continue from existing ledger.json in output_dir. + Implies skip_exploration. The PI gets a fresh context but inherits + all prior routine specs, experiments, and shipped routines. max_pi_iterations: Max PI loop iterations per session. min_experiments_before_fail: Min experiments before PI can call mark_failed. num_workers: Max concurrent ExperimentWorker agents (default 3). @@ -534,6 +540,8 @@ def run_api_indexing( print(f" Captures: {cdp_captures_dir}", file=sys.stderr) print(f" Output: {output_dir}", file=sys.stderr) print(f" Model: {resolved_llm_model.value}", file=sys.stderr) + if continue_from_ledger: + print(f" Mode: --continue (resuming from existing ledger)", file=sys.stderr) remote_debugging_address = remote_debugging_address.strip() print(f" Browser: {remote_debugging_address}", file=sys.stderr) @@ -555,12 +563,16 @@ def run_api_indexing( ) return None + # --continue implies --skip-exploration + if continue_from_ledger: + skip_exploration = True + # Phase 1: Exploration if skip_exploration: print("\n Skipping exploration (--skip-exploration), loading from disk...", file=sys.stderr) summaries = load_explorations(output_dir) else: - summaries = run_explorations(cdp_captures_dir, output_dir, resolved_llm_model) + summaries = run_explorations(cdp_captures_dir, output_dir, resolved_llm_model, task=task) if not summaries: print("\n [!] No exploration summaries available. Cannot proceed.", file=sys.stderr) @@ -568,23 +580,41 @@ def run_api_indexing( _run_post_run_analysis(output_dir) return None - # Clean up Phase 2 artifacts from previous runs (preserve exploration + workspaces) - for subdir in [ - "experiments", - "attempts", # legacy output dir from older runs - "attempt_records", - "routines", - "agent_threads", - ]: - p = output_dir / subdir - if p.exists(): - shutil.rmtree(p) - logger.info("Cleaned up %s", p) - for f in ["ledger.json", "catalog.json"]: - p = output_dir / f - if p.exists(): - p.unlink() - logger.info("Cleaned up %s", p) + # Load existing ledger if continuing, otherwise clean up Phase 2 artifacts + existing_ledger: DiscoveryLedger | None = None + if continue_from_ledger: + ledger_path = output_dir / "ledger.json" + if ledger_path.exists(): + existing_ledger = DiscoveryLedger.model_validate_json(ledger_path.read_text()) + shipped = sum(1 for s in existing_ledger.routine_specs if s.shipped_attempt_id) + remaining = sum(1 for s in existing_ledger.routine_specs if s.shipped_attempt_id is None) + print( + f"\n Continuing from existing ledger: " + f"{len(existing_ledger.routine_specs)} specs " + f"({shipped} shipped, {remaining} remaining), " + f"{len(existing_ledger.experiments)} experiments\n", + file=sys.stderr, + ) + else: + print("\n [!] No ledger.json found in output dir, starting fresh.\n", file=sys.stderr) + else: + # Clean up Phase 2 artifacts from previous runs (preserve exploration + workspaces) + for subdir in [ + "experiments", + "attempts", # legacy output dir from older runs + "attempt_records", + "routines", + "agent_threads", + ]: + p = output_dir / subdir + if p.exists(): + shutil.rmtree(p) + logger.info("Cleaned up %s", p) + for f in ["ledger.json", "catalog.json"]: + p = output_dir / f + if p.exists(): + p.unlink() + logger.info("Cleaned up %s", p) # Phase 2: PI loop catalog = run_pi_with_recovery( @@ -599,6 +629,7 @@ def run_api_indexing( num_workers=num_workers, num_inspectors=num_inspectors, max_pi_attempts=max_pi_attempts, + ledger=existing_ledger, ) elapsed = time.time() - start_time @@ -669,6 +700,12 @@ def main() -> None: action="store_true", help="Skip Phase 1 exploration, load existing summaries from output-dir", ) + parser.add_argument( + "--continue", + action="store_true", + dest="continue_from_ledger", + help="Continue from existing ledger in output-dir (skips exploration, preserves all prior progress)", + ) parser.add_argument( "--max-pi-iterations", type=int, @@ -734,6 +771,7 @@ def main() -> None: llm_model=llm_model, remote_debugging_address=args.remote_debugging_address, skip_exploration=args.skip_exploration, + continue_from_ledger=args.continue_from_ledger, max_pi_iterations=args.max_pi_iterations, min_experiments_before_fail=args.min_experiments_before_fail, num_workers=args.num_workers, diff --git a/bluebox/scripts/api_indexing/run_dom_exploration.py b/bluebox/scripts/api_indexing/run_dom_exploration.py index 6979558d..9dafaca3 100644 --- a/bluebox/scripts/api_indexing/run_dom_exploration.py +++ b/bluebox/scripts/api_indexing/run_dom_exploration.py @@ -167,6 +167,7 @@ def run_dom_exploration( min_iterations: int = 3, max_iterations: int = 15, workspace_dir: Path | None = None, + task: str | None = None, ) -> DOMExplorationSummary | None: """ Run DOM exploration on a CDP captures directory. @@ -246,7 +247,14 @@ def _exploration_initial_message(task_text: str) -> str: specialist._get_autonomous_initial_message = _exploration_initial_message # type: ignore[assignment] # Build task message - task = ( + task_prefix = "" + if task: + task_prefix = ( + f"Here is the user's task: \"{task}\". " + "Keep this in mind when exploring, but do not limit yourself to this task.\n\n" + ) + exploration_task = ( + f"{task_prefix}" "Explore ALL DOM snapshots in this capture. Survey pages, scan forms, " "find embedded tokens (meta tags, hidden inputs), discover data blobs " "(scripts with __NEXT_DATA__, ld+json), examine tables, and infer the " @@ -260,7 +268,7 @@ def _exploration_initial_message(task_text: str) -> str: ) result = specialist.run_autonomous( - task=task, + task=exploration_task, config=config, output_schema=DOM_EXPLORATION_OUTPUT_SCHEMA, output_description=( diff --git a/bluebox/scripts/api_indexing/run_network_exploration.py b/bluebox/scripts/api_indexing/run_network_exploration.py index a75189b7..002f08e1 100644 --- a/bluebox/scripts/api_indexing/run_network_exploration.py +++ b/bluebox/scripts/api_indexing/run_network_exploration.py @@ -229,6 +229,7 @@ def run_network_exploration( min_iterations: int = 3, max_iterations: int = 15, workspace_dir: Path | None = None, + task: str | None = None, ) -> NetworkExplorationSummary | None: """ Run network exploration on a CDP captures directory. @@ -330,7 +331,14 @@ def _exploration_initial_message(task_text: str) -> str: specialist._get_autonomous_initial_message = _exploration_initial_message # type: ignore[assignment] # Run autonomous exploration - task = ( + task_prefix = "" + if task: + task_prefix = ( + f"Here is the user's task: \"{task}\". " + "Keep this in mind when exploring, but do not limit yourself to this task.\n\n" + ) + exploration_task = ( + f"{task_prefix}" "Explore ALL network traffic in this capture. Use the tools to survey URLs, " "inspect interesting entries, and identify auth patterns. Then call " "finalize_with_output(output={...}) with a COMPLETE JSON object containing: " @@ -346,7 +354,7 @@ def _exploration_initial_message(task_text: str) -> str: ) result = specialist.run_autonomous( - task=task, + task=exploration_task, config=config, output_schema=NETWORK_EXPLORATION_OUTPUT_SCHEMA, output_description=( diff --git a/bluebox/scripts/api_indexing/run_storage_exploration.py b/bluebox/scripts/api_indexing/run_storage_exploration.py index 2b781453..14d9de81 100644 --- a/bluebox/scripts/api_indexing/run_storage_exploration.py +++ b/bluebox/scripts/api_indexing/run_storage_exploration.py @@ -182,6 +182,7 @@ def run_storage_exploration( min_iterations: int = 3, max_iterations: int = 15, workspace_dir: Path | None = None, + task: str | None = None, ) -> StorageExplorationSummary | None: """ Run storage exploration on a CDP captures directory. @@ -314,7 +315,14 @@ def _exploration_initial_message(task_text: str) -> str: specialist._get_autonomous_initial_message = _exploration_initial_message # type: ignore[assignment] # Run autonomous exploration - task = ( + task_prefix = "" + if task: + task_prefix = ( + f"Here is the user's task: \"{task}\". " + "Keep this in mind when exploring, but do not limit yourself to this task.\n\n" + ) + exploration_task = ( + f"{task_prefix}" "Explore ALL storage and window property data in this capture. " "Find tokens (JWTs, session IDs, CSRF tokens, API keys) and data blocks " "(large JSON objects, cached responses, config). Count the noise. " @@ -329,7 +337,7 @@ def _exploration_initial_message(task_text: str) -> str: ) result = specialist.run_autonomous( - task=task, + task=exploration_task, config=config, output_schema=STORAGE_EXPLORATION_OUTPUT_SCHEMA, output_description=( diff --git a/bluebox/scripts/api_indexing/run_ui_exploration.py b/bluebox/scripts/api_indexing/run_ui_exploration.py index ad3d32e5..2383c98c 100644 --- a/bluebox/scripts/api_indexing/run_ui_exploration.py +++ b/bluebox/scripts/api_indexing/run_ui_exploration.py @@ -163,6 +163,7 @@ def run_ui_exploration( min_iterations: int = 3, max_iterations: int = 15, workspace_dir: Path | None = None, + task: str | None = None, ) -> UIExplorationSummary | None: """ Run UI/interaction exploration on a CDP captures directory. @@ -266,7 +267,14 @@ def _exploration_initial_message(task_text: str) -> str: specialist._get_autonomous_initial_message = _exploration_initial_message # type: ignore[assignment] # Build task message - task = ( + task_prefix = "" + if task: + task_prefix = ( + f"Here is the user's task: \"{task}\". " + "Keep this in mind when exploring, but do not limit yourself to this task.\n\n" + ) + exploration_task = ( + f"{task_prefix}" "Analyze ALL interaction events in this capture. Discover what the user " "typed, clicked, and selected. Map the navigation flow. Infer what the " f"user was trying to accomplish. There are {interaction_loader.stats.total_events} " @@ -280,7 +288,7 @@ def _exploration_initial_message(task_text: str) -> str: ) result = specialist.run_autonomous( - task=task, + task=exploration_task, config=config, output_schema=UI_EXPLORATION_OUTPUT_SCHEMA, output_description=( diff --git a/bluebox/scripts/browser_monitor.py b/bluebox/scripts/browser_monitor.py index 5999e95b..19018260 100644 --- a/bluebox/scripts/browser_monitor.py +++ b/bluebox/scripts/browser_monitor.py @@ -168,7 +168,7 @@ async def async_main(args: argparse.Namespace, tab_id: str | None) -> None: tab_id, context_id, browser_ws = cdp_new_tab( remote_debugging_address=remote_debugging_address, incognito=args.incognito, - url=args.url if not args.no_navigate else "about:blank" + url="about:blank", ) try: browser_ws.close() @@ -190,11 +190,14 @@ async def async_main(args: argparse.Namespace, tab_id: str | None) -> None: logger.info(f"Target URL: {args.url if not args.no_navigate else 'No navigation (attach only)'}") logger.info(f"Tab ID: {tab_id}") + navigate_url = args.url if not args.no_navigate else None session = AsyncCDPSession( ws_url=ws_url, session_start_dtm=datetime.now(timezone.utc).isoformat(), event_callback_fn=writer.write_event, paths=writer.paths, + target_id=tab_id, + navigate_url=navigate_url, ) try: diff --git a/tests/unit/data_models/test_resource_base.py b/tests/unit/data_models/test_resource_base.py index 16627c0b..7d4d4574 100644 --- a/tests/unit/data_models/test_resource_base.py +++ b/tests/unit/data_models/test_resource_base.py @@ -563,3 +563,107 @@ class GrandChild(SampleResource): resource = GrandChild(name="test") assert resource.id.startswith("GrandChild_") assert isinstance(resource.uuid, UUID) + + +class TestHash: + """Tests for ResourceBase.hash computed field.""" + + def test_hash_is_string(self) -> None: + """hash should return a hex string.""" + resource = SampleResource(name="test") + assert isinstance(resource.hash, str) + + def test_hash_is_sha256_hex(self) -> None: + """hash should be a 64-character hex string (SHA-256).""" + resource = SampleResource(name="test") + assert len(resource.hash) == 64 + assert all(c in "0123456789abcdef" for c in resource.hash) + + def test_hash_deterministic(self) -> None: + """Accessing hash multiple times should return the same value.""" + resource = SampleResource(name="test", id="fixed_id", created_at=0.0, updated_at=0.0) + assert resource.hash == resource.hash + + def test_hash_changes_when_base_field_changes(self) -> None: + """Changing an inherited field should change the hash.""" + resource = SampleResource(name="test") + h1 = resource.hash + resource.updated_at = 999999.0 + assert resource.hash != h1 + + def test_hash_changes_when_subclass_field_changes(self) -> None: + """Changing a subclass-defined field should change the hash.""" + resource = SampleResource(name="original", id="fixed", created_at=0.0, updated_at=0.0) + h1 = resource.hash + resource.name = "changed" + assert resource.hash != h1 + + def test_hash_includes_all_subclass_fields(self) -> None: + """All subclass fields must contribute to the hash.""" + r1 = SampleResourceWithCustomFields( + title="Title", value=1, tags=["a"], + id="fixed", created_at=0.0, updated_at=0.0, + ) + r2 = SampleResourceWithCustomFields( + title="Title", value=1, tags=["a", "b"], + id="fixed", created_at=0.0, updated_at=0.0, + ) + assert r1.hash != r2.hash + + def test_hash_includes_nested_subclass_fields(self) -> None: + """Fields from multi-level inheritance should contribute to the hash.""" + class GrandChild(SampleResource): + extra: int = 0 + + r1 = GrandChild(name="test", extra=1, id="fixed", created_at=0.0, updated_at=0.0) + r2 = GrandChild(name="test", extra=2, id="fixed", created_at=0.0, updated_at=0.0) + assert r1.hash != r2.hash + + def test_hash_in_model_dump(self) -> None: + """model_dump() should include the hash key.""" + resource = SampleResource(name="test") + data = resource.model_dump() + assert "hash" in data + assert data["hash"] == resource.hash + + def test_hash_in_model_dump_json(self) -> None: + """model_dump_json() should include the hash.""" + resource = SampleResource(name="test") + json_str = resource.model_dump_json() + assert '"hash"' in json_str + assert resource.hash in json_str + + def test_hash_excludes_itself(self) -> None: + """Hash must not depend on its own value (no circularity).""" + resource = SampleResource(name="test", id="fixed", created_at=0.0, updated_at=0.0) + # If hash depended on itself, calling it twice could yield different results + # or raise a recursion error. Verify it's stable. + hashes = [resource.hash for _ in range(5)] + assert len(set(hashes)) == 1 + + def test_same_content_same_hash(self) -> None: + """Two instances with identical field values should have the same hash.""" + kwargs = dict(name="test", description="desc", id="fixed", created_at=0.0, updated_at=0.0) + r1 = SampleResource(**kwargs) + r2 = SampleResource(**kwargs) + assert r1.hash == r2.hash + + def test_different_content_different_hash(self) -> None: + """Two instances with different field values should have different hashes.""" + kwargs = dict(id="fixed", created_at=0.0, updated_at=0.0) + r1 = SampleResource(name="alpha", **kwargs) + r2 = SampleResource(name="beta", **kwargs) + assert r1.hash != r2.hash + + def test_hash_changes_on_metadata_change(self) -> None: + """Changing the metadata dict should change the hash.""" + resource = SampleResource(name="test", id="fixed", created_at=0.0, updated_at=0.0) + h1 = resource.hash + resource.metadata = {"key": "value"} + assert resource.hash != h1 + + def test_hash_on_resource_base_directly(self) -> None: + """hash should work on ResourceBase itself.""" + resource = ResourceBase(id="fixed", created_at=0.0, updated_at=0.0) + assert isinstance(resource.hash, str) + assert len(resource.hash) == 64