diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py index 62f4795..33d93fe 100644 --- a/benchmarks/perception/runner.py +++ b/benchmarks/perception/runner.py @@ -38,9 +38,14 @@ class BenchmarkConfig: enable_verification: bool = True # Multi-phase verification with subagents # Test settings + start_timepoint: int = 0 max_timepoints_per_embryo: Optional[int] = None embryo_ids: Optional[List[str]] = None # None = all + # Ablation toggles + include_temporal_context: bool = True + include_previous_observations: bool = True + # Custom system prompt override system_prompt_override: Optional[str] = None @@ -57,6 +62,9 @@ def to_dict(self) -> Dict[str, Any]: "enable_view_reference": self.enable_view_reference, "enable_view_previous": self.enable_view_previous, "enable_verification": self.enable_verification, + "include_temporal_context": self.include_temporal_context, + "include_previous_observations": self.include_previous_observations, + "start_timepoint": self.start_timepoint, "max_timepoints_per_embryo": self.max_timepoints_per_embryo, "embryo_ids": self.embryo_ids, "system_prompt_override": self.system_prompt_override, @@ -254,6 +262,8 @@ async def _get_engine(self): claude_client=client, examples_path=examples_path, enable_verification=self.config.enable_verification, + include_temporal_context=self.config.include_temporal_context, + include_previous_observations=self.config.include_previous_observations, ) self._engine = engine @@ -287,7 +297,11 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult: if self.config.max_timepoints_per_embryo: end_tp = self.config.max_timepoints_per_embryo - for test_case in self.testset.iter_embryo(embryo_id, end_timepoint=end_tp): + for test_case in self.testset.iter_embryo( + embryo_id, + start_timepoint=self.config.start_timepoint, + end_timepoint=end_tp, + ): logger.info( f"[{embryo_id}] Processing T{test_case.timepoint} " f"(GT: {test_case.ground_truth_stage})" @@ -343,6 +357,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult: reasoning=perception_result.reasoning, is_transitional=perception_result.is_transitional, transition_between=perception_result.transition_between, + timestamp=test_case.acquired_at, ) logger.info( @@ -423,10 +438,26 @@ async def main(): action="append", help="Specific embryo(s) to run (can specify multiple)", ) + parser.add_argument( + "--start-timepoint", + type=int, + default=0, + help="First timepoint index to process (skip earlier frames)", + ) parser.add_argument( "--max-timepoints", type=int, - help="Maximum timepoints per embryo", + help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).", + ) + parser.add_argument( + "--no-temporal-context", + action="store_true", + help="Ablation: omit the TEMPORAL CONTEXT block from the prompt", + ) + parser.add_argument( + "--no-previous-observations", + action="store_true", + help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt", ) parser.add_argument( "--description", @@ -447,6 +478,11 @@ async def main(): format="%(asctime)s %(levelname)s %(message)s", ) + # The perception engine reads stage definitions etc. from the active + # organism module, which is normally loaded by launch_gently.py. + from gently.organisms import load_organism + load_organism("celegans") + # Find session path session_path = Path(args.session) if not session_path.exists(): @@ -486,7 +522,10 @@ async def main(): # Create config config = BenchmarkConfig( embryo_ids=args.embryo, + start_timepoint=args.start_timepoint, max_timepoints_per_embryo=args.max_timepoints, + include_temporal_context=not args.no_temporal_context, + include_previous_observations=not args.no_previous_observations, description=args.description, ) diff --git a/benchmarks/perception/testset.py b/benchmarks/perception/testset.py index 5dc3f34..658cc01 100644 --- a/benchmarks/perception/testset.py +++ b/benchmarks/perception/testset.py @@ -7,6 +7,7 @@ import base64 import io from dataclasses import dataclass +from datetime import datetime from pathlib import Path from typing import Iterator, List, Optional, Tuple, Dict @@ -43,12 +44,13 @@ class TestCase: side_image_b64: Optional[str] # SIDE view only volume: Optional[np.ndarray] ground_truth_stage: Optional[str] + acquired_at: Optional[datetime] = None -def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dict[str, List[Path]]: - """Discover volume files in a session directory.""" - from datetime import datetime - +def _discover_volumes( + session_dir: Path, embryo_id: Optional[str] = None +) -> Dict[str, List[Tuple[datetime, Path]]]: + """Discover volume files (with parsed acquisition timestamps) in a session directory.""" if not session_dir.exists(): return {} @@ -79,7 +81,7 @@ def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dic result = {} for eid, volumes in embryo_volumes.items(): volumes.sort(key=lambda x: x[0]) - result[eid] = [v[1] for v in volumes] + result[eid] = volumes return result @@ -286,7 +288,7 @@ def iter_embryo( end_timepoint = len(volumes) for timepoint in range(start_timepoint, min(end_timepoint, len(volumes))): - vol_path = volumes[timepoint] + acquired_at, vol_path = volumes[timepoint] # Load volume volume = _load_volume(vol_path) if self.load_volumes else None @@ -313,6 +315,7 @@ def iter_embryo( side_image_b64=side_b64, volume=volume, ground_truth_stage=gt_stage, + acquired_at=acquired_at, ) def iter_all(self) -> Iterator[Tuple[str, Iterator[TestCase]]]: diff --git a/gently/app/orchestration/timelapse.py b/gently/app/orchestration/timelapse.py index 6deaf00..61fd1c5 100644 --- a/gently/app/orchestration/timelapse.py +++ b/gently/app/orchestration/timelapse.py @@ -1007,7 +1007,6 @@ def _check_interval_rules( embryo_id=embryo_id, detector_name=detector_name, stage=stage, - timepoint=estate.timepoints_acquired, ): # Round-based: interval rules now modify the global interval old_interval = self._base_interval_seconds diff --git a/gently/harness/perception/engine.py b/gently/harness/perception/engine.py index b732296..440d5f1 100644 --- a/gently/harness/perception/engine.py +++ b/gently/harness/perception/engine.py @@ -332,6 +332,8 @@ def __init__( volume_accessor: Optional[Callable[[str, int], Optional[np.ndarray]]] = None, enable_verification: bool = True, multishot_turns: int = 0, + include_temporal_context: bool = True, + include_previous_observations: bool = True, ): """ Parameters @@ -355,6 +357,8 @@ def __init__( self.volume_accessor = volume_accessor self.enable_verification = enable_verification self.multishot_turns = multishot_turns + self.include_temporal_context = include_temporal_context + self.include_previous_observations = include_previous_observations # Load examples if provided if example_store: @@ -533,7 +537,8 @@ async def _run_reasoning_loop( """ Run the interleaved reasoning loop with tool use. - Returns (PerceptionResult, ReasoningTrace) + Returns (PerceptionResult, ReasoningTrace, messages) where messages is the + full conversation history, so callers can continue the conversation. """ trace = ReasoningTrace() @@ -591,8 +596,9 @@ async def _run_reasoning_loop( )) # Parse and return + messages.append({"role": "assistant", "content": [{"type": "text", "text": text_response}]}) result = self._parse_response(text_response) - return result, trace + return result, trace, messages # Handle tool use if response.stop_reason == "tool_use": @@ -620,12 +626,33 @@ async def _run_reasoning_loop( tool_input=block.input, )) + assistant_content = [] + for block in response.content: + if block.type == "text": + assistant_content.append({"type": "text", "text": block.text}) + elif block.type == "tool_use": + assistant_content.append({ + "type": "tool_use", + "id": block.id, + "name": block.name, + "input": block.input, + }) + messages.append({"role": "assistant", "content": assistant_content}) + # Run verification and return result result = await self._handle_verification_request( verification_block.input, trace, ) - return result, trace + messages.append({ + "role": "user", + "content": [{ + "type": "tool_result", + "tool_use_id": verification_block.id, + "content": f"Verification complete: stage={result.stage}, confidence={result.confidence:.0%}", + }], + }) + return result, trace, messages # Build assistant message with the response assistant_content = [] @@ -688,7 +715,7 @@ async def _run_reasoning_loop( # Max iterations reached - parse last response logger.warning(f"Max reasoning iterations ({max_iterations}) reached") - return self._parse_response(""), trace + return self._parse_response(""), trace, messages def _handle_tool_call( self, @@ -1060,6 +1087,18 @@ def _build_cached_system_prompt(self) -> List[Dict]: "cache_control": {"type": "ephemeral", "ttl": "1h"} }] + def _build_reconsider_prompt(self, result: "PerceptionResult", turn: int) -> str: + raise NotImplementedError( + "Multishot reconsideration is not yet implemented. " + "Set multishot_turns=0 (the default) to disable." + ) + + async def _call_claude(self, messages: List[Dict], include_tools: bool = True) -> Any: + raise NotImplementedError( + "Multishot reconsideration is not yet implemented. " + "Set multishot_turns=0 (the default) to disable." + ) + async def _call_claude_with_tools(self, messages: List[Dict]) -> Any: """Call Claude API with tools enabled for interleaved reasoning.""" try: @@ -1148,32 +1187,34 @@ def _build_prompt( }) # 2. Previous observations (last 3) - recent = session.get_recent_observations(3) - if recent: - obs_text = "\nPREVIOUS OBSERVATIONS:\n" - for obs in recent: - obs_text += f"- T{obs.timepoint}: {obs.stage}" - if obs.is_hatching: - obs_text += " (hatching in progress)" - obs_text += "\n" - content.append({"type": "text", "text": obs_text}) + if self.include_previous_observations: + recent = session.get_recent_observations(3) + if recent: + obs_text = "\nPREVIOUS OBSERVATIONS:\n" + for obs in recent: + obs_text += f"- T{obs.timepoint}: {obs.stage}" + if obs.is_hatching: + obs_text += " (hatching in progress)" + obs_text += "\n" + content.append({"type": "text", "text": obs_text}) # 3. Temporal context (for detecting arrested/dead embryos) - temporal = session.compute_temporal_analysis() - if temporal: - temporal_text = f""" + if self.include_temporal_context: + temporal = session.compute_temporal_analysis() + if temporal: + temporal_text = f""" TEMPORAL CONTEXT: - Current stage: {temporal.current_stage} - Time at this stage: {temporal.time_in_current_stage_min:.0f} minutes - Expected duration: {temporal.expected_duration_min or 'N/A'} minutes - Overtime ratio: {temporal.overtime_ratio:.1f}x (>2x is unusual, >3x is concerning) """ - if temporal.is_potentially_arrested: - temporal_text += f""" + if temporal.is_potentially_arrested: + temporal_text += f""" WARNING - POTENTIAL DEVELOPMENTAL ARREST: - {temporal.arrest_reason} """ - content.append({"type": "text", "text": temporal_text}) + content.append({"type": "text", "text": temporal_text}) # 4. Current image to analyze use_separate_images = top_image_b64 is not None and side_image_b64 is not None diff --git a/gently/harness/perception/session.py b/gently/harness/perception/session.py index aeabb46..ddd7a01 100644 --- a/gently/harness/perception/session.py +++ b/gently/harness/perception/session.py @@ -207,7 +207,9 @@ def compute_temporal_analysis(self) -> Optional[TemporalAnalysis]: return None current_stage = self.get_current_stage() - now = datetime.now() + # Reference "now" against the most recent observation's timestamp so + # time-in-stage reflects acquisition time, not benchmark wallclock. + now = self.observations[-1].timestamp # Find when current stage started (walk backwards through observations) stage_start_time = None @@ -634,3 +636,7 @@ class PerceptionResult: candidate_stages: Optional[List[CandidateStage]] = None # Candidates from Phase 1 multi_phase_trace: Optional[MultiPhaseReasoningTrace] = None # Full multi-phase trace phase_count: int = 1 # Number of phases executed (1, 2, or 3) + + # Multishot reconsideration: the first answer before any reconsider turns + initial_stage: Optional[str] = None + initial_confidence: Optional[float] = None