From 0b6ecc1750246e4924df2c6a7bdc06233dc196c7 Mon Sep 17 00:00:00 2001
From: Trisha Bansal <trisha@anthropic.com>
Date: Tue, 7 Apr 2026 17:44:30 +0000
Subject: [PATCH 1/4] Fix perception engine unpack crash and interval rule
 kwarg mismatch

_run_reasoning_loop returned a 2-tuple but perceive() unpacked three
values, and _check_interval_rules passed an unsupported timepoint= kwarg
to IntervalRule.matches(). Both errors were swallowed by broad except
handlers, silently degrading every prediction to "early" and preventing
interval rules from ever firing. Also adds initial_stage/initial_confidence
to PerceptionResult, completes the messages history at early returns, and
adds NotImplementedError stubs for _build_reconsider_prompt/_call_claude
so the multishot scaffold is type-clean and continuable.
---
 gently/app/orchestration/timelapse.py |  1 -
 gently/harness/perception/engine.py   | 43 ++++++++++++++++++++++++---
 gently/harness/perception/session.py  |  4 +++
 3 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/gently/app/orchestration/timelapse.py b/gently/app/orchestration/timelapse.py
index 6deaf00..61fd1c5 100644
--- a/gently/app/orchestration/timelapse.py
+++ b/gently/app/orchestration/timelapse.py
@@ -1007,7 +1007,6 @@ def _check_interval_rules(
                 embryo_id=embryo_id,
                 detector_name=detector_name,
                 stage=stage,
-                timepoint=estate.timepoints_acquired,
             ):
                 # Round-based: interval rules now modify the global interval
                 old_interval = self._base_interval_seconds
diff --git a/gently/harness/perception/engine.py b/gently/harness/perception/engine.py
index b732296..83eed33 100644
--- a/gently/harness/perception/engine.py
+++ b/gently/harness/perception/engine.py
@@ -533,7 +533,8 @@ async def _run_reasoning_loop(
         """
         Run the interleaved reasoning loop with tool use.
 
-        Returns (PerceptionResult, ReasoningTrace)
+        Returns (PerceptionResult, ReasoningTrace, messages) where messages is the
+        full conversation history, so callers can continue the conversation.
         """
         trace = ReasoningTrace()
 
@@ -591,8 +592,9 @@ async def _run_reasoning_loop(
                 ))
 
                 # Parse and return
+                messages.append({"role": "assistant", "content": [{"type": "text", "text": text_response}]})
                 result = self._parse_response(text_response)
-                return result, trace
+                return result, trace, messages
 
             # Handle tool use
             if response.stop_reason == "tool_use":
@@ -620,12 +622,33 @@ async def _run_reasoning_loop(
                                 tool_input=block.input,
                             ))
 
+                    assistant_content = []
+                    for block in response.content:
+                        if block.type == "text":
+                            assistant_content.append({"type": "text", "text": block.text})
+                        elif block.type == "tool_use":
+                            assistant_content.append({
+                                "type": "tool_use",
+                                "id": block.id,
+                                "name": block.name,
+                                "input": block.input,
+                            })
+                    messages.append({"role": "assistant", "content": assistant_content})
+
                     # Run verification and return result
                     result = await self._handle_verification_request(
                         verification_block.input,
                         trace,
                     )
-                    return result, trace
+                    messages.append({
+                        "role": "user",
+                        "content": [{
+                            "type": "tool_result",
+                            "tool_use_id": verification_block.id,
+                            "content": f"Verification complete: stage={result.stage}, confidence={result.confidence:.0%}",
+                        }],
+                    })
+                    return result, trace, messages
 
                 # Build assistant message with the response
                 assistant_content = []
@@ -688,7 +711,7 @@ async def _run_reasoning_loop(
 
         # Max iterations reached - parse last response
         logger.warning(f"Max reasoning iterations ({max_iterations}) reached")
-        return self._parse_response(""), trace
+        return self._parse_response(""), trace, messages
 
     def _handle_tool_call(
         self,
@@ -1060,6 +1083,18 @@ def _build_cached_system_prompt(self) -> List[Dict]:
             "cache_control": {"type": "ephemeral", "ttl": "1h"}
         }]
 
+    def _build_reconsider_prompt(self, result: "PerceptionResult", turn: int) -> str:
+        raise NotImplementedError(
+            "Multishot reconsideration is not yet implemented. "
+            "Set multishot_turns=0 (the default) to disable."
+        )
+
+    async def _call_claude(self, messages: List[Dict], include_tools: bool = True) -> Any:
+        raise NotImplementedError(
+            "Multishot reconsideration is not yet implemented. "
+            "Set multishot_turns=0 (the default) to disable."
+        )
+
     async def _call_claude_with_tools(self, messages: List[Dict]) -> Any:
         """Call Claude API with tools enabled for interleaved reasoning."""
         try:
diff --git a/gently/harness/perception/session.py b/gently/harness/perception/session.py
index aeabb46..e3fbe24 100644
--- a/gently/harness/perception/session.py
+++ b/gently/harness/perception/session.py
@@ -634,3 +634,7 @@ class PerceptionResult:
     candidate_stages: Optional[List[CandidateStage]] = None  # Candidates from Phase 1
     multi_phase_trace: Optional[MultiPhaseReasoningTrace] = None  # Full multi-phase trace
     phase_count: int = 1  # Number of phases executed (1, 2, or 3)
+
+    # Multishot reconsideration: the first answer before any reconsider turns
+    initial_stage: Optional[str] = None
+    initial_confidence: Optional[float] = None

From 2d374f2e8d3b40eb55cf9380f87626de1546708b Mon Sep 17 00:00:00 2001
From: Trisha Bansal <trisha@anthropic.com>
Date: Thu, 9 Apr 2026 13:26:19 +0000
Subject: [PATCH 2/4] benchmarks: load organism + add --start-timepoint to
 perception runner

The runner previously crashed with 'No organism loaded' because it never
called load_organism() (launch_gently.py does that, but the benchmark is
a separate entrypoint). Hardcode celegans for now.

Also add --start-timepoint so hard stages (comma/1.5fold/2fold/pretzel,
which start at T39-T90) can be targeted without burning API calls on the
~33-50 easy 'early' frames that precede them. iter_embryo() already
supported the parameter; this just threads it through the CLI/config.
---
 benchmarks/perception/runner.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py
index 62f4795..fc29fd2 100644
--- a/benchmarks/perception/runner.py
+++ b/benchmarks/perception/runner.py
@@ -38,6 +38,7 @@ class BenchmarkConfig:
     enable_verification: bool = True  # Multi-phase verification with subagents
 
     # Test settings
+    start_timepoint: int = 0
     max_timepoints_per_embryo: Optional[int] = None
     embryo_ids: Optional[List[str]] = None  # None = all
 
@@ -57,6 +58,7 @@ def to_dict(self) -> Dict[str, Any]:
             "enable_view_reference": self.enable_view_reference,
             "enable_view_previous": self.enable_view_previous,
             "enable_verification": self.enable_verification,
+            "start_timepoint": self.start_timepoint,
             "max_timepoints_per_embryo": self.max_timepoints_per_embryo,
             "embryo_ids": self.embryo_ids,
             "system_prompt_override": self.system_prompt_override,
@@ -287,7 +289,11 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
             if self.config.max_timepoints_per_embryo:
                 end_tp = self.config.max_timepoints_per_embryo
 
-            for test_case in self.testset.iter_embryo(embryo_id, end_timepoint=end_tp):
+            for test_case in self.testset.iter_embryo(
+                embryo_id,
+                start_timepoint=self.config.start_timepoint,
+                end_timepoint=end_tp,
+            ):
                 logger.info(
                     f"[{embryo_id}] Processing T{test_case.timepoint} "
                     f"(GT: {test_case.ground_truth_stage})"
@@ -423,10 +429,16 @@ async def main():
         action="append",
         help="Specific embryo(s) to run (can specify multiple)",
     )
+    parser.add_argument(
+        "--start-timepoint",
+        type=int,
+        default=0,
+        help="First timepoint index to process (skip earlier frames)",
+    )
     parser.add_argument(
         "--max-timepoints",
         type=int,
-        help="Maximum timepoints per embryo",
+        help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).",
     )
     parser.add_argument(
         "--description",
@@ -447,6 +459,11 @@ async def main():
         format="%(asctime)s %(levelname)s %(message)s",
     )
 
+    # The perception engine reads stage definitions etc. from the active
+    # organism module, which is normally loaded by launch_gently.py.
+    from gently.organisms import load_organism
+    load_organism("celegans")
+
     # Find session path
     session_path = Path(args.session)
     if not session_path.exists():
@@ -486,6 +503,7 @@ async def main():
     # Create config
     config = BenchmarkConfig(
         embryo_ids=args.embryo,
+        start_timepoint=args.start_timepoint,
         max_timepoints_per_embryo=args.max_timepoints,
         description=args.description,
     )

From ea10ea1541ac88e28f9042c741eda1b801ac0640 Mon Sep 17 00:00:00 2001
From: Trisha Bansal <trisha@anthropic.com>
Date: Thu, 9 Apr 2026 15:16:15 +0000
Subject: [PATCH 3/4] Ablation toggles + real-timestamp option for perception
 benchmark

Adds three runner CLI flags for isolating the comma-lock cause:
  --no-temporal-context        omit TEMPORAL CONTEXT block from prompt
  --no-previous-observations   omit PREVIOUS OBSERVATIONS block
  --real-timestamps            pass TIFF acquisition time to session

Threaded through BenchmarkConfig -> to_dict() -> PerceptionEngine ctor /
session.add_observation(timestamp=). All default to current behavior.

testset.py: expose TestCase.acquired_at parsed from TIFF YYYYmmdd_HHMMSS
filenames (was already parsed for sorting, then discarded).

session.py: compute_temporal_analysis now uses observations[-1].timestamp
instead of datetime.now() so real (historical) timestamps work. NOTE: in
the live-agent path (manager.py), perceive() runs before add_observation,
so this shifts the reported time-in-stage down by one acquisition interval
(~4 min) and the 225-min arrest threshold fires one frame later. Arguably
more correct (time *observed* in stage), but it is a small behavior change,
not a no-op.

total_session_duration_min goes negative under --real-timestamps because
created_at is wallclock time while "now" is TIFF acquisition time. This is
cosmetic only; the value never reaches the prompt.
---
 benchmarks/perception/runner.py      | 29 ++++++++++++++++++++++
 benchmarks/perception/testset.py     | 15 +++++++-----
 gently/harness/perception/engine.py  | 36 ++++++++++++++++------------
 gently/harness/perception/session.py |  4 +++-
 4 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py
index fc29fd2..8f1b8f3 100644
--- a/benchmarks/perception/runner.py
+++ b/benchmarks/perception/runner.py
@@ -42,6 +42,11 @@ class BenchmarkConfig:
     max_timepoints_per_embryo: Optional[int] = None
     embryo_ids: Optional[List[str]] = None  # None = all
 
+    # Ablation toggles
+    include_temporal_context: bool = True
+    include_previous_observations: bool = True
+    real_timestamps: bool = False
+
     # Custom system prompt override
     system_prompt_override: Optional[str] = None
 
@@ -58,6 +63,9 @@ def to_dict(self) -> Dict[str, Any]:
             "enable_view_reference": self.enable_view_reference,
             "enable_view_previous": self.enable_view_previous,
             "enable_verification": self.enable_verification,
+            "include_temporal_context": self.include_temporal_context,
+            "include_previous_observations": self.include_previous_observations,
+            "real_timestamps": self.real_timestamps,
             "start_timepoint": self.start_timepoint,
             "max_timepoints_per_embryo": self.max_timepoints_per_embryo,
             "embryo_ids": self.embryo_ids,
@@ -256,6 +264,8 @@ async def _get_engine(self):
             claude_client=client,
             examples_path=examples_path,
             enable_verification=self.config.enable_verification,
+            include_temporal_context=self.config.include_temporal_context,
+            include_previous_observations=self.config.include_previous_observations,
         )
 
         self._engine = engine
@@ -349,6 +359,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
                     reasoning=perception_result.reasoning,
                     is_transitional=perception_result.is_transitional,
                     transition_between=perception_result.transition_between,
+                    timestamp=test_case.acquired_at if self.config.real_timestamps else None,
                 )
 
                 logger.info(
@@ -440,6 +451,21 @@ async def main():
         type=int,
         help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).",
     )
+    parser.add_argument(
+        "--no-temporal-context",
+        action="store_true",
+        help="Ablation: omit the TEMPORAL CONTEXT block from the prompt",
+    )
+    parser.add_argument(
+        "--no-previous-observations",
+        action="store_true",
+        help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt",
+    )
+    parser.add_argument(
+        "--real-timestamps",
+        action="store_true",
+        help="Use TIFF acquisition timestamps for temporal context instead of wallclock",
+    )
     parser.add_argument(
         "--description",
         default="",
@@ -505,6 +531,9 @@ async def main():
         embryo_ids=args.embryo,
         start_timepoint=args.start_timepoint,
         max_timepoints_per_embryo=args.max_timepoints,
+        include_temporal_context=not args.no_temporal_context,
+        include_previous_observations=not args.no_previous_observations,
+        real_timestamps=args.real_timestamps,
         description=args.description,
     )
 
diff --git a/benchmarks/perception/testset.py b/benchmarks/perception/testset.py
index 5dc3f34..658cc01 100644
--- a/benchmarks/perception/testset.py
+++ b/benchmarks/perception/testset.py
@@ -7,6 +7,7 @@
 import base64
 import io
 from dataclasses import dataclass
+from datetime import datetime
 from pathlib import Path
 from typing import Iterator, List, Optional, Tuple, Dict
 
@@ -43,12 +44,13 @@ class TestCase:
     side_image_b64: Optional[str]  # SIDE view only
     volume: Optional[np.ndarray]
     ground_truth_stage: Optional[str]
+    acquired_at: Optional[datetime] = None
 
 
-def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dict[str, List[Path]]:
-    """Discover volume files in a session directory."""
-    from datetime import datetime
-
+def _discover_volumes(
+    session_dir: Path, embryo_id: Optional[str] = None
+) -> Dict[str, List[Tuple[datetime, Path]]]:
+    """Discover volume files (with parsed acquisition timestamps) in a session directory."""
     if not session_dir.exists():
         return {}
 
@@ -79,7 +81,7 @@ def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dic
     result = {}
     for eid, volumes in embryo_volumes.items():
         volumes.sort(key=lambda x: x[0])
-        result[eid] = [v[1] for v in volumes]
+        result[eid] = volumes
 
     return result
 
@@ -286,7 +288,7 @@ def iter_embryo(
             end_timepoint = len(volumes)
 
         for timepoint in range(start_timepoint, min(end_timepoint, len(volumes))):
-            vol_path = volumes[timepoint]
+            acquired_at, vol_path = volumes[timepoint]
 
             # Load volume
             volume = _load_volume(vol_path) if self.load_volumes else None
@@ -313,6 +315,7 @@ def iter_embryo(
                 side_image_b64=side_b64,
                 volume=volume,
                 ground_truth_stage=gt_stage,
+                acquired_at=acquired_at,
             )
 
     def iter_all(self) -> Iterator[Tuple[str, Iterator[TestCase]]]:
diff --git a/gently/harness/perception/engine.py b/gently/harness/perception/engine.py
index 83eed33..440d5f1 100644
--- a/gently/harness/perception/engine.py
+++ b/gently/harness/perception/engine.py
@@ -332,6 +332,8 @@ def __init__(
         volume_accessor: Optional[Callable[[str, int], Optional[np.ndarray]]] = None,
         enable_verification: bool = True,
         multishot_turns: int = 0,
+        include_temporal_context: bool = True,
+        include_previous_observations: bool = True,
     ):
         """
         Parameters
@@ -355,6 +357,8 @@ def __init__(
         self.volume_accessor = volume_accessor
         self.enable_verification = enable_verification
         self.multishot_turns = multishot_turns
+        self.include_temporal_context = include_temporal_context
+        self.include_previous_observations = include_previous_observations
 
         # Load examples if provided
         if example_store:
@@ -1183,32 +1187,34 @@ def _build_prompt(
         })
 
         # 2. Previous observations (last 3)
-        recent = session.get_recent_observations(3)
-        if recent:
-            obs_text = "\nPREVIOUS OBSERVATIONS:\n"
-            for obs in recent:
-                obs_text += f"- T{obs.timepoint}: {obs.stage}"
-                if obs.is_hatching:
-                    obs_text += " (hatching in progress)"
-                obs_text += "\n"
-            content.append({"type": "text", "text": obs_text})
+        if self.include_previous_observations:
+            recent = session.get_recent_observations(3)
+            if recent:
+                obs_text = "\nPREVIOUS OBSERVATIONS:\n"
+                for obs in recent:
+                    obs_text += f"- T{obs.timepoint}: {obs.stage}"
+                    if obs.is_hatching:
+                        obs_text += " (hatching in progress)"
+                    obs_text += "\n"
+                content.append({"type": "text", "text": obs_text})
 
         # 3. Temporal context (for detecting arrested/dead embryos)
-        temporal = session.compute_temporal_analysis()
-        if temporal:
-            temporal_text = f"""
+        if self.include_temporal_context:
+            temporal = session.compute_temporal_analysis()
+            if temporal:
+                temporal_text = f"""
 TEMPORAL CONTEXT:
 - Current stage: {temporal.current_stage}
 - Time at this stage: {temporal.time_in_current_stage_min:.0f} minutes
 - Expected duration: {temporal.expected_duration_min or 'N/A'} minutes
 - Overtime ratio: {temporal.overtime_ratio:.1f}x (>2x is unusual, >3x is concerning)
 """
-            if temporal.is_potentially_arrested:
-                temporal_text += f"""
+                if temporal.is_potentially_arrested:
+                    temporal_text += f"""
 WARNING - POTENTIAL DEVELOPMENTAL ARREST:
 - {temporal.arrest_reason}
 """
-            content.append({"type": "text", "text": temporal_text})
+                content.append({"type": "text", "text": temporal_text})
 
         # 4. Current image to analyze
         use_separate_images = top_image_b64 is not None and side_image_b64 is not None
diff --git a/gently/harness/perception/session.py b/gently/harness/perception/session.py
index e3fbe24..ddd7a01 100644
--- a/gently/harness/perception/session.py
+++ b/gently/harness/perception/session.py
@@ -207,7 +207,9 @@ def compute_temporal_analysis(self) -> Optional[TemporalAnalysis]:
             return None
 
         current_stage = self.get_current_stage()
-        now = datetime.now()
+        # Reference "now" against the most recent observation's timestamp so
+        # time-in-stage reflects acquisition time, not benchmark wallclock.
+        now = self.observations[-1].timestamp
 
         # Find when current stage started (walk backwards through observations)
         stage_start_time = None

From 074a4d20530b2582760646f72c2c3ae69143f054 Mon Sep 17 00:00:00 2001
From: Trisha Bansal <trisha@anthropic.com>
Date: Thu, 9 Apr 2026 17:13:14 +0000
Subject: [PATCH 4/4] benchmarks: always use TIFF acquisition timestamps
 (wallclock fix)

Ablation confirmed the wallclock-time bug is the dominant cause of the
perception engine's stage-lock in benchmarks. Make the fix unconditional:
pass test_case.acquired_at to session.add_observation() instead of letting
it default to datetime.now(). Removes the --real-timestamps flag (was for
ablation only).
---
 benchmarks/perception/runner.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/benchmarks/perception/runner.py b/benchmarks/perception/runner.py
index 8f1b8f3..33d93fe 100644
--- a/benchmarks/perception/runner.py
+++ b/benchmarks/perception/runner.py
@@ -45,7 +45,6 @@ class BenchmarkConfig:
     # Ablation toggles
     include_temporal_context: bool = True
     include_previous_observations: bool = True
-    real_timestamps: bool = False
 
     # Custom system prompt override
     system_prompt_override: Optional[str] = None
@@ -65,7 +64,6 @@ def to_dict(self) -> Dict[str, Any]:
             "enable_verification": self.enable_verification,
             "include_temporal_context": self.include_temporal_context,
             "include_previous_observations": self.include_previous_observations,
-            "real_timestamps": self.real_timestamps,
             "start_timepoint": self.start_timepoint,
             "max_timepoints_per_embryo": self.max_timepoints_per_embryo,
             "embryo_ids": self.embryo_ids,
@@ -359,7 +357,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
                     reasoning=perception_result.reasoning,
                     is_transitional=perception_result.is_transitional,
                     transition_between=perception_result.transition_between,
-                    timestamp=test_case.acquired_at if self.config.real_timestamps else None,
+                    timestamp=test_case.acquired_at,
                 )
 
                 logger.info(
@@ -461,11 +459,6 @@ async def main():
         action="store_true",
         help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt",
     )
-    parser.add_argument(
-        "--real-timestamps",
-        action="store_true",
-        help="Use TIFF acquisition timestamps for temporal context instead of wallclock",
-    )
     parser.add_argument(
         "--description",
         default="",
@@ -533,7 +526,6 @@ async def main():
         max_timepoints_per_embryo=args.max_timepoints,
         include_temporal_context=not args.no_temporal_context,
         include_previous_observations=not args.no_previous_observations,
-        real_timestamps=args.real_timestamps,
         description=args.description,
     )