From 0b6ecc1750246e4924df2c6a7bdc06233dc196c7 Mon Sep 17 00:00:00 2001 From: Trisha Bansal Date: Tue, 7 Apr 2026 17:44:30 +0000 Subject: [PATCH 1/4] Fix perception engine unpack crash and interval rule kwarg mismatch _run_reasoning_loop returned 2-tuples but perceive() unpacked 3, and _check_interval_rules passed an unsupported timepoint= kwarg to IntervalRule.matches(). Both errors were swallowed by broad except handlers, silently degrading every prediction to "early" and preventing interval rules from ever firing. Also adds initial_stage/initial_confidence to PerceptionResult and completes the messages history at early returns so the multishot scaffold is type-clean and continuable. --- gently/app/orchestration/timelapse.py | 1 - gently/harness/perception/engine.py | 43 ++++++++++++++++++++++++--- gently/harness/perception/session.py | 4 +++ 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/gently/app/orchestration/timelapse.py b/gently/app/orchestration/timelapse.py index 6deaf00..61fd1c5 100644 --- a/gently/app/orchestration/timelapse.py +++ b/gently/app/orchestration/timelapse.py @@ -1007,7 +1007,6 @@ def _check_interval_rules( embryo_id=embryo_id, detector_name=detector_name, stage=stage, - timepoint=estate.timepoints_acquired, ): # Round-based: interval rules now modify the global interval old_interval = self._base_interval_seconds diff --git a/gently/harness/perception/engine.py b/gently/harness/perception/engine.py index b732296..83eed33 100644 --- a/gently/harness/perception/engine.py +++ b/gently/harness/perception/engine.py @@ -533,7 +533,8 @@ async def _run_reasoning_loop( """ Run the interleaved reasoning loop with tool use. - Returns (PerceptionResult, ReasoningTrace) + Returns (PerceptionResult, ReasoningTrace, messages) where messages is the + full conversation history, so callers can continue the conversation. """ trace = ReasoningTrace() @@ -591,8 +592,9 @@ async def _run_reasoning_loop( )) # Parse and return + messages.append({"role": "assistant", "content": [{"type": "text", "text": text_response}]}) result = self._parse_response(text_response) - return result, trace + return result, trace, messages # Handle tool use if response.stop_reason == "tool_use": @@ -620,12 +622,33 @@ async def _run_reasoning_loop( tool_input=block.input, )) + assistant_content = [] + for block in response.content: + if block.type == "text": + assistant_content.append({"type": "text", "text": block.text}) + elif block.type == "tool_use": + assistant_content.append({ + "type": "tool_use", + "id": block.id, + "name": block.name, + "input": block.input, + }) + messages.append({"role": "assistant", "content": assistant_content}) + # Run verification and return result result = await self._handle_verification_request( verification_block.input, trace, ) - return result, trace + messages.append({ + "role": "user", + "content": [{ + "type": "tool_result", + "tool_use_id": verification_block.id, + "content": f"Verification complete: stage={result.stage}, confidence={result.confidence:.0%}", + }], + }) + return result, trace, messages # Build assistant message with the response assistant_content = [] @@ -688,7 +711,7 @@ async def _run_reasoning_loop( # Max iterations reached - parse last response logger.warning(f"Max reasoning iterations ({max_iterations}) reached") - return self._parse_response(""), trace + return self._parse_response(""), trace, messages def _handle_tool_call( self, @@ -1060,6 +1083,18 @@ def _build_cached_system_prompt(self) -> List[Dict]: "cache_control": {"type": "ephemeral", "ttl": "1h"} }] + def _build_reconsider_prompt(self, result: "PerceptionResult", turn: int) -> str: + raise NotImplementedError( + "Multishot reconsideration is not yet implemented. " + "Set multishot_turns=0 (the default) to disable." + ) + + async def _call_claude(self, messages: List[Dict], include_tools: bool = True) -> Any: + raise NotImplementedError( + "Multishot reconsideration is not yet implemented. " + "Set multishot_turns=0 (the default) to disable." + ) + async def _call_claude_with_tools(self, messages: List[Dict]) -> Any: """Call Claude API with tools enabled for interleaved reasoning.""" try: diff --git a/gently/harness/perception/session.py b/gently/harness/perception/session.py index aeabb46..e3fbe24 100644 --- a/gently/harness/perception/session.py +++ b/gently/harness/perception/session.py @@ -634,3 +634,7 @@ class PerceptionResult: candidate_stages: Optional[List[CandidateStage]] = None # Candidates from Phase 1 multi_phase_trace: Optional[MultiPhaseReasoningTrace] = None # Full multi-phase trace phase_count: int = 1 # Number of phases executed (1, 2, or 3) + + # Multishot reconsideration: the first answer before any reconsider turns + initial_stage: Optional[str] = None + initial_confidence: Optional[float] = None From 2d374f2e8d3b40eb55cf9380f87626de1546708b Mon Sep 17 00:00:00 2001 From: Trisha Bansal Date: Thu, 9 Apr 2026 13:26:19 +0000 Subject: [PATCH 2/4] benchmarks: load organism + add --start-timepoint to perception runner The runner previously crashed with 'No organism loaded' because it never called load_organism() (launch_gently.py does that, but the benchmark is a separate entrypoint). Hardcode celegans for now. Also add --start-timepoint so hard stages (comma/1.5fold/2fold/pretzel, which start at T39-T90) can be targeted without burning API calls on the ~33-50 easy 'early' frames that precede them. iter_embryo() already supported the parameter; this just threads it through the CLI/config. --- benchmarks/perception/runner.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py index 62f4795..fc29fd2 100644 --- a/benchmarks/perception/runner.py +++ b/benchmarks/perception/runner.py @@ -38,6 +38,7 @@ class BenchmarkConfig: enable_verification: bool = True # Multi-phase verification with subagents # Test settings + start_timepoint: int = 0 max_timepoints_per_embryo: Optional[int] = None embryo_ids: Optional[List[str]] = None # None = all @@ -57,6 +58,7 @@ def to_dict(self) -> Dict[str, Any]: "enable_view_reference": self.enable_view_reference, "enable_view_previous": self.enable_view_previous, "enable_verification": self.enable_verification, + "start_timepoint": self.start_timepoint, "max_timepoints_per_embryo": self.max_timepoints_per_embryo, "embryo_ids": self.embryo_ids, "system_prompt_override": self.system_prompt_override, @@ -287,7 +289,11 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult: if self.config.max_timepoints_per_embryo: end_tp = self.config.max_timepoints_per_embryo - for test_case in self.testset.iter_embryo(embryo_id, end_timepoint=end_tp): + for test_case in self.testset.iter_embryo( + embryo_id, + start_timepoint=self.config.start_timepoint, + end_timepoint=end_tp, + ): logger.info( f"[{embryo_id}] Processing T{test_case.timepoint} " f"(GT: {test_case.ground_truth_stage})" @@ -423,10 +429,16 @@ async def main(): action="append", help="Specific embryo(s) to run (can specify multiple)", ) + parser.add_argument( + "--start-timepoint", + type=int, + default=0, + help="First timepoint index to process (skip earlier frames)", + ) parser.add_argument( "--max-timepoints", type=int, - help="Maximum timepoints per embryo", + help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).", ) parser.add_argument( "--description", @@ -447,6 +459,11 @@ async def main(): format="%(asctime)s %(levelname)s %(message)s", ) + # The perception engine reads stage definitions etc. from the active + # organism module, which is normally loaded by launch_gently.py. + from gently.organisms import load_organism + load_organism("celegans") + # Find session path session_path = Path(args.session) if not session_path.exists(): @@ -486,6 +503,7 @@ async def main(): # Create config config = BenchmarkConfig( embryo_ids=args.embryo, + start_timepoint=args.start_timepoint, max_timepoints_per_embryo=args.max_timepoints, description=args.description, ) From ea10ea1541ac88e28f9042c741eda1b801ac0640 Mon Sep 17 00:00:00 2001 From: Trisha Bansal Date: Thu, 9 Apr 2026 15:16:15 +0000 Subject: [PATCH 3/4] Ablation toggles + real-timestamp option for perception benchmark Adds three runner CLI flags for isolating the comma-lock cause: --no-temporal-context omit TEMPORAL CONTEXT block from prompt --no-previous-observations omit PREVIOUS OBSERVATIONS block --real-timestamps pass TIFF acquisition time to session Threaded through BenchmarkConfig -> to_dict() -> PerceptionEngine ctor / session.add_observation(timestamp=). All default to current behavior. testset.py: expose TestCase.acquired_at parsed from TIFF YYYYmmdd_HHMMSS filenames (was already parsed for sorting, then discarded). session.py: compute_temporal_analysis now uses observations[-1].timestamp instead of datetime.now() so real (historical) timestamps work. NOTE: in the live-agent path (manager.py), perceive() runs before add_observation, so this shifts the reported time-in-stage down by one acquisition interval (~4 min) and the 225-min arrest threshold fires one frame later. Arguably more correct (time *observed* in stage), but it is a small behavior change, not a no-op. total_session_duration_min goes negative under --real-timestamps (created_at is wallclock vs now is TIFF time); cosmetic only, never reaches the prompt. --- benchmarks/perception/runner.py | 29 ++++++++++++++++++++++ benchmarks/perception/testset.py | 15 +++++++----- gently/harness/perception/engine.py | 36 ++++++++++++++++------------ gently/harness/perception/session.py | 4 +++- 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py index fc29fd2..8f1b8f3 100644 --- a/benchmarks/perception/runner.py +++ b/benchmarks/perception/runner.py @@ -42,6 +42,11 @@ class BenchmarkConfig: max_timepoints_per_embryo: Optional[int] = None embryo_ids: Optional[List[str]] = None # None = all + # Ablation toggles + include_temporal_context: bool = True + include_previous_observations: bool = True + real_timestamps: bool = False + # Custom system prompt override system_prompt_override: Optional[str] = None @@ -58,6 +63,9 @@ def to_dict(self) -> Dict[str, Any]: "enable_view_reference": self.enable_view_reference, "enable_view_previous": self.enable_view_previous, "enable_verification": self.enable_verification, + "include_temporal_context": self.include_temporal_context, + "include_previous_observations": self.include_previous_observations, + "real_timestamps": self.real_timestamps, "start_timepoint": self.start_timepoint, "max_timepoints_per_embryo": self.max_timepoints_per_embryo, "embryo_ids": self.embryo_ids, @@ -256,6 +264,8 @@ async def _get_engine(self): claude_client=client, examples_path=examples_path, enable_verification=self.config.enable_verification, + include_temporal_context=self.config.include_temporal_context, + include_previous_observations=self.config.include_previous_observations, ) self._engine = engine @@ -349,6 +359,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult: reasoning=perception_result.reasoning, is_transitional=perception_result.is_transitional, transition_between=perception_result.transition_between, + timestamp=test_case.acquired_at if self.config.real_timestamps else None, ) logger.info( @@ -440,6 +451,21 @@ async def main(): type=int, help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).", ) + parser.add_argument( + "--no-temporal-context", + action="store_true", + help="Ablation: omit the TEMPORAL CONTEXT block from the prompt", + ) + parser.add_argument( + "--no-previous-observations", + action="store_true", + help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt", + ) + parser.add_argument( + "--real-timestamps", + action="store_true", + help="Use TIFF acquisition timestamps for temporal context instead of wallclock", + ) parser.add_argument( "--description", default="", @@ -505,6 +531,9 @@ async def main(): embryo_ids=args.embryo, start_timepoint=args.start_timepoint, max_timepoints_per_embryo=args.max_timepoints, + include_temporal_context=not args.no_temporal_context, + include_previous_observations=not args.no_previous_observations, + real_timestamps=args.real_timestamps, description=args.description, ) diff --git a/benchmarks/perception/testset.py b/benchmarks/perception/testset.py index 5dc3f34..658cc01 100644 --- a/benchmarks/perception/testset.py +++ b/benchmarks/perception/testset.py @@ -7,6 +7,7 @@ import base64 import io from dataclasses import dataclass +from datetime import datetime from pathlib import Path from typing import Iterator, List, Optional, Tuple, Dict @@ -43,12 +44,13 @@ class TestCase: side_image_b64: Optional[str] # SIDE view only volume: Optional[np.ndarray] ground_truth_stage: Optional[str] + acquired_at: Optional[datetime] = None -def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dict[str, List[Path]]: - """Discover volume files in a session directory.""" - from datetime import datetime - +def _discover_volumes( + session_dir: Path, embryo_id: Optional[str] = None +) -> Dict[str, List[Tuple[datetime, Path]]]: + """Discover volume files (with parsed acquisition timestamps) in a session directory.""" if not session_dir.exists(): return {} @@ -79,7 +81,7 @@ def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dic result = {} for eid, volumes in embryo_volumes.items(): volumes.sort(key=lambda x: x[0]) - result[eid] = [v[1] for v in volumes] + result[eid] = volumes return result @@ -286,7 +288,7 @@ def iter_embryo( end_timepoint = len(volumes) for timepoint in range(start_timepoint, min(end_timepoint, len(volumes))): - vol_path = volumes[timepoint] + acquired_at, vol_path = volumes[timepoint] # Load volume volume = _load_volume(vol_path) if self.load_volumes else None @@ -313,6 +315,7 @@ def iter_embryo( side_image_b64=side_b64, volume=volume, ground_truth_stage=gt_stage, + acquired_at=acquired_at, ) def iter_all(self) -> Iterator[Tuple[str, Iterator[TestCase]]]: diff --git a/gently/harness/perception/engine.py b/gently/harness/perception/engine.py index 83eed33..440d5f1 100644 --- a/gently/harness/perception/engine.py +++ b/gently/harness/perception/engine.py @@ -332,6 +332,8 @@ def __init__( volume_accessor: Optional[Callable[[str, int], Optional[np.ndarray]]] = None, enable_verification: bool = True, multishot_turns: int = 0, + include_temporal_context: bool = True, + include_previous_observations: bool = True, ): """ Parameters @@ -355,6 +357,8 @@ def __init__( self.volume_accessor = volume_accessor self.enable_verification = enable_verification self.multishot_turns = multishot_turns + self.include_temporal_context = include_temporal_context + self.include_previous_observations = include_previous_observations # Load examples if provided if example_store: @@ -1183,32 +1187,34 @@ def _build_prompt( }) # 2. Previous observations (last 3) - recent = session.get_recent_observations(3) - if recent: - obs_text = "\nPREVIOUS OBSERVATIONS:\n" - for obs in recent: - obs_text += f"- T{obs.timepoint}: {obs.stage}" - if obs.is_hatching: - obs_text += " (hatching in progress)" - obs_text += "\n" - content.append({"type": "text", "text": obs_text}) + if self.include_previous_observations: + recent = session.get_recent_observations(3) + if recent: + obs_text = "\nPREVIOUS OBSERVATIONS:\n" + for obs in recent: + obs_text += f"- T{obs.timepoint}: {obs.stage}" + if obs.is_hatching: + obs_text += " (hatching in progress)" + obs_text += "\n" + content.append({"type": "text", "text": obs_text}) # 3. Temporal context (for detecting arrested/dead embryos) - temporal = session.compute_temporal_analysis() - if temporal: - temporal_text = f""" + if self.include_temporal_context: + temporal = session.compute_temporal_analysis() + if temporal: + temporal_text = f""" TEMPORAL CONTEXT: - Current stage: {temporal.current_stage} - Time at this stage: {temporal.time_in_current_stage_min:.0f} minutes - Expected duration: {temporal.expected_duration_min or 'N/A'} minutes - Overtime ratio: {temporal.overtime_ratio:.1f}x (>2x is unusual, >3x is concerning) """ - if temporal.is_potentially_arrested: - temporal_text += f""" + if temporal.is_potentially_arrested: + temporal_text += f""" WARNING - POTENTIAL DEVELOPMENTAL ARREST: - {temporal.arrest_reason} """ - content.append({"type": "text", "text": temporal_text}) + content.append({"type": "text", "text": temporal_text}) # 4. Current image to analyze use_separate_images = top_image_b64 is not None and side_image_b64 is not None diff --git a/gently/harness/perception/session.py b/gently/harness/perception/session.py index e3fbe24..ddd7a01 100644 --- a/gently/harness/perception/session.py +++ b/gently/harness/perception/session.py @@ -207,7 +207,9 @@ def compute_temporal_analysis(self) -> Optional[TemporalAnalysis]: return None current_stage = self.get_current_stage() - now = datetime.now() + # Reference "now" against the most recent observation's timestamp so + # time-in-stage reflects acquisition time, not benchmark wallclock. + now = self.observations[-1].timestamp # Find when current stage started (walk backwards through observations) stage_start_time = None From 074a4d20530b2582760646f72c2c3ae69143f054 Mon Sep 17 00:00:00 2001 From: Trisha Bansal Date: Thu, 9 Apr 2026 17:13:14 +0000 Subject: [PATCH 4/4] benchmarks: always use TIFF acquisition timestamps (wallclock fix) Ablation confirmed the wallclock-time bug is the dominant cause of the perception engine's stage-lock in benchmarks. Make the fix unconditional: pass test_case.acquired_at to session.add_observation() instead of letting it default to datetime.now(). Removes the --real-timestamps flag (was for ablation only). --- benchmarks/perception/runner.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py index 8f1b8f3..33d93fe 100644 --- a/benchmarks/perception/runner.py +++ b/benchmarks/perception/runner.py @@ -45,7 +45,6 @@ class BenchmarkConfig: # Ablation toggles include_temporal_context: bool = True include_previous_observations: bool = True - real_timestamps: bool = False # Custom system prompt override system_prompt_override: Optional[str] = None @@ -65,7 +64,6 @@ def to_dict(self) -> Dict[str, Any]: "enable_verification": self.enable_verification, "include_temporal_context": self.include_temporal_context, "include_previous_observations": self.include_previous_observations, - "real_timestamps": self.real_timestamps, "start_timepoint": self.start_timepoint, "max_timepoints_per_embryo": self.max_timepoints_per_embryo, "embryo_ids": self.embryo_ids, @@ -359,7 +357,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult: reasoning=perception_result.reasoning, is_transitional=perception_result.is_transitional, transition_between=perception_result.transition_between, - timestamp=test_case.acquired_at if self.config.real_timestamps else None, + timestamp=test_case.acquired_at, ) logger.info( @@ -461,11 +459,6 @@ async def main(): action="store_true", help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt", ) - parser.add_argument( - "--real-timestamps", - action="store_true", - help="Use TIFF acquisition timestamps for temporal context instead of wallclock", - ) parser.add_argument( "--description", default="", @@ -533,7 +526,6 @@ async def main(): max_timepoints_per_embryo=args.max_timepoints, include_temporal_context=not args.no_temporal_context, include_previous_observations=not args.no_previous_observations, - real_timestamps=args.real_timestamps, description=args.description, )