Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions benchmarks/perception/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,14 @@ class BenchmarkConfig:
enable_verification: bool = True # Multi-phase verification with subagents

# Test settings
start_timepoint: int = 0
max_timepoints_per_embryo: Optional[int] = None
embryo_ids: Optional[List[str]] = None # None = all

# Ablation toggles
include_temporal_context: bool = True
include_previous_observations: bool = True

# Custom system prompt override
system_prompt_override: Optional[str] = None

Expand All @@ -57,6 +62,9 @@ def to_dict(self) -> Dict[str, Any]:
"enable_view_reference": self.enable_view_reference,
"enable_view_previous": self.enable_view_previous,
"enable_verification": self.enable_verification,
"include_temporal_context": self.include_temporal_context,
"include_previous_observations": self.include_previous_observations,
"start_timepoint": self.start_timepoint,
"max_timepoints_per_embryo": self.max_timepoints_per_embryo,
"embryo_ids": self.embryo_ids,
"system_prompt_override": self.system_prompt_override,
Expand Down Expand Up @@ -254,6 +262,8 @@ async def _get_engine(self):
claude_client=client,
examples_path=examples_path,
enable_verification=self.config.enable_verification,
include_temporal_context=self.config.include_temporal_context,
include_previous_observations=self.config.include_previous_observations,
)

self._engine = engine
Expand Down Expand Up @@ -287,7 +297,11 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
if self.config.max_timepoints_per_embryo:
end_tp = self.config.max_timepoints_per_embryo

for test_case in self.testset.iter_embryo(embryo_id, end_timepoint=end_tp):
for test_case in self.testset.iter_embryo(
embryo_id,
start_timepoint=self.config.start_timepoint,
end_timepoint=end_tp,
):
logger.info(
f"[{embryo_id}] Processing T{test_case.timepoint} "
f"(GT: {test_case.ground_truth_stage})"
Expand Down Expand Up @@ -343,6 +357,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
reasoning=perception_result.reasoning,
is_transitional=perception_result.is_transitional,
transition_between=perception_result.transition_between,
timestamp=test_case.acquired_at,
)

logger.info(
Expand Down Expand Up @@ -423,10 +438,26 @@ async def main():
action="append",
help="Specific embryo(s) to run (can specify multiple)",
)
parser.add_argument(
"--start-timepoint",
type=int,
default=0,
help="First timepoint index to process (skip earlier frames)",
)
parser.add_argument(
"--max-timepoints",
type=int,
help="Maximum timepoints per embryo",
help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).",
)
parser.add_argument(
"--no-temporal-context",
action="store_true",
help="Ablation: omit the TEMPORAL CONTEXT block from the prompt",
)
parser.add_argument(
"--no-previous-observations",
action="store_true",
help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt",
)
parser.add_argument(
"--description",
Expand All @@ -447,6 +478,11 @@ async def main():
format="%(asctime)s %(levelname)s %(message)s",
)

# The perception engine reads stage definitions etc. from the active
# organism module, which is normally loaded by launch_gently.py.
from gently.organisms import load_organism
load_organism("celegans")

# Find session path
session_path = Path(args.session)
if not session_path.exists():
Expand Down Expand Up @@ -486,7 +522,10 @@ async def main():
# Create config
config = BenchmarkConfig(
embryo_ids=args.embryo,
start_timepoint=args.start_timepoint,
max_timepoints_per_embryo=args.max_timepoints,
include_temporal_context=not args.no_temporal_context,
include_previous_observations=not args.no_previous_observations,
description=args.description,
)

Expand Down
15 changes: 9 additions & 6 deletions benchmarks/perception/testset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import base64
import io
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterator, List, Optional, Tuple, Dict

Expand Down Expand Up @@ -43,12 +44,13 @@ class TestCase:
side_image_b64: Optional[str] # SIDE view only
volume: Optional[np.ndarray]
ground_truth_stage: Optional[str]
acquired_at: Optional[datetime] = None


def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dict[str, List[Path]]:
"""Discover volume files in a session directory."""
from datetime import datetime

def _discover_volumes(
session_dir: Path, embryo_id: Optional[str] = None
) -> Dict[str, List[Tuple[datetime, Path]]]:
"""Discover volume files (with parsed acquisition timestamps) in a session directory."""
if not session_dir.exists():
return {}

Expand Down Expand Up @@ -79,7 +81,7 @@ def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dic
result = {}
for eid, volumes in embryo_volumes.items():
volumes.sort(key=lambda x: x[0])
result[eid] = [v[1] for v in volumes]
result[eid] = volumes

return result

Expand Down Expand Up @@ -286,7 +288,7 @@ def iter_embryo(
end_timepoint = len(volumes)

for timepoint in range(start_timepoint, min(end_timepoint, len(volumes))):
vol_path = volumes[timepoint]
acquired_at, vol_path = volumes[timepoint]

# Load volume
volume = _load_volume(vol_path) if self.load_volumes else None
Expand All @@ -313,6 +315,7 @@ def iter_embryo(
side_image_b64=side_b64,
volume=volume,
ground_truth_stage=gt_stage,
acquired_at=acquired_at,
)

def iter_all(self) -> Iterator[Tuple[str, Iterator[TestCase]]]:
Expand Down
1 change: 0 additions & 1 deletion gently/app/orchestration/timelapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,6 @@ def _check_interval_rules(
embryo_id=embryo_id,
detector_name=detector_name,
stage=stage,
timepoint=estate.timepoints_acquired,
):
# Round-based: interval rules now modify the global interval
old_interval = self._base_interval_seconds
Expand Down
79 changes: 60 additions & 19 deletions gently/harness/perception/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@ def __init__(
volume_accessor: Optional[Callable[[str, int], Optional[np.ndarray]]] = None,
enable_verification: bool = True,
multishot_turns: int = 0,
include_temporal_context: bool = True,
include_previous_observations: bool = True,
):
"""
Parameters
Expand All @@ -355,6 +357,8 @@ def __init__(
self.volume_accessor = volume_accessor
self.enable_verification = enable_verification
self.multishot_turns = multishot_turns
self.include_temporal_context = include_temporal_context
self.include_previous_observations = include_previous_observations

# Load examples if provided
if example_store:
Expand Down Expand Up @@ -533,7 +537,8 @@ async def _run_reasoning_loop(
"""
Run the interleaved reasoning loop with tool use.

Returns (PerceptionResult, ReasoningTrace)
Returns (PerceptionResult, ReasoningTrace, messages) where messages is the
full conversation history, so callers can continue the conversation.
"""
trace = ReasoningTrace()

Expand Down Expand Up @@ -591,8 +596,9 @@ async def _run_reasoning_loop(
))

# Parse and return
messages.append({"role": "assistant", "content": [{"type": "text", "text": text_response}]})
result = self._parse_response(text_response)
return result, trace
return result, trace, messages

# Handle tool use
if response.stop_reason == "tool_use":
Expand Down Expand Up @@ -620,12 +626,33 @@ async def _run_reasoning_loop(
tool_input=block.input,
))

assistant_content = []
for block in response.content:
if block.type == "text":
assistant_content.append({"type": "text", "text": block.text})
elif block.type == "tool_use":
assistant_content.append({
"type": "tool_use",
"id": block.id,
"name": block.name,
"input": block.input,
})
messages.append({"role": "assistant", "content": assistant_content})

# Run verification and return result
result = await self._handle_verification_request(
verification_block.input,
trace,
)
return result, trace
messages.append({
"role": "user",
"content": [{
"type": "tool_result",
"tool_use_id": verification_block.id,
"content": f"Verification complete: stage={result.stage}, confidence={result.confidence:.0%}",
}],
})
return result, trace, messages

# Build assistant message with the response
assistant_content = []
Expand Down Expand Up @@ -688,7 +715,7 @@ async def _run_reasoning_loop(

# Max iterations reached - parse last response
logger.warning(f"Max reasoning iterations ({max_iterations}) reached")
return self._parse_response(""), trace
return self._parse_response(""), trace, messages

def _handle_tool_call(
self,
Expand Down Expand Up @@ -1060,6 +1087,18 @@ def _build_cached_system_prompt(self) -> List[Dict]:
"cache_control": {"type": "ephemeral", "ttl": "1h"}
}]

def _build_reconsider_prompt(self, result: "PerceptionResult", turn: int) -> str:
    """Build the follow-up prompt asking the model to reconsider *result*.

    NOTE(review): deliberate stub — multishot reconsideration is configured
    via ``multishot_turns`` but not implemented yet, so this fails loudly
    rather than silently producing an empty prompt. Presumably *turn* is the
    0-based reconsideration round — confirm when implementing.

    Raises:
        NotImplementedError: always, until the multishot feature lands.
    """
    raise NotImplementedError(
        "Multishot reconsideration is not yet implemented. "
        "Set multishot_turns=0 (the default) to disable."
    )

async def _call_claude(self, messages: List[Dict], include_tools: bool = True) -> Any:
    """Single Claude API call intended for the multishot-reconsideration path.

    NOTE(review): deliberate stub paired with ``_build_reconsider_prompt``;
    kept so the unimplemented multishot code path has a defined, loud failure
    mode instead of an AttributeError.

    Raises:
        NotImplementedError: always, until the multishot feature lands.
    """
    raise NotImplementedError(
        "Multishot reconsideration is not yet implemented. "
        "Set multishot_turns=0 (the default) to disable."
    )

async def _call_claude_with_tools(self, messages: List[Dict]) -> Any:
"""Call Claude API with tools enabled for interleaved reasoning."""
try:
Expand Down Expand Up @@ -1148,32 +1187,34 @@ def _build_prompt(
})

# 2. Previous observations (last 3)
recent = session.get_recent_observations(3)
if recent:
obs_text = "\nPREVIOUS OBSERVATIONS:\n"
for obs in recent:
obs_text += f"- T{obs.timepoint}: {obs.stage}"
if obs.is_hatching:
obs_text += " (hatching in progress)"
obs_text += "\n"
content.append({"type": "text", "text": obs_text})
if self.include_previous_observations:
recent = session.get_recent_observations(3)
if recent:
obs_text = "\nPREVIOUS OBSERVATIONS:\n"
for obs in recent:
obs_text += f"- T{obs.timepoint}: {obs.stage}"
if obs.is_hatching:
obs_text += " (hatching in progress)"
obs_text += "\n"
content.append({"type": "text", "text": obs_text})

# 3. Temporal context (for detecting arrested/dead embryos)
temporal = session.compute_temporal_analysis()
if temporal:
temporal_text = f"""
if self.include_temporal_context:
temporal = session.compute_temporal_analysis()
if temporal:
temporal_text = f"""
TEMPORAL CONTEXT:
- Current stage: {temporal.current_stage}
- Time at this stage: {temporal.time_in_current_stage_min:.0f} minutes
- Expected duration: {temporal.expected_duration_min or 'N/A'} minutes
- Overtime ratio: {temporal.overtime_ratio:.1f}x (>2x is unusual, >3x is concerning)
"""
if temporal.is_potentially_arrested:
temporal_text += f"""
if temporal.is_potentially_arrested:
temporal_text += f"""
WARNING - POTENTIAL DEVELOPMENTAL ARREST:
- {temporal.arrest_reason}
"""
content.append({"type": "text", "text": temporal_text})
content.append({"type": "text", "text": temporal_text})

# 4. Current image to analyze
use_separate_images = top_image_b64 is not None and side_image_b64 is not None
Expand Down
8 changes: 7 additions & 1 deletion gently/harness/perception/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@ def compute_temporal_analysis(self) -> Optional[TemporalAnalysis]:
return None

current_stage = self.get_current_stage()
now = datetime.now()
# Reference "now" against the most recent observation's timestamp so
# time-in-stage reflects acquisition time, not benchmark wallclock.
now = self.observations[-1].timestamp

# Find when current stage started (walk backwards through observations)
stage_start_time = None
Expand Down Expand Up @@ -634,3 +636,7 @@ class PerceptionResult:
candidate_stages: Optional[List[CandidateStage]] = None # Candidates from Phase 1
multi_phase_trace: Optional[MultiPhaseReasoningTrace] = None # Full multi-phase trace
phase_count: int = 1 # Number of phases executed (1, 2, or 3)

# Multishot reconsideration: the first answer before any reconsider turns
initial_stage: Optional[str] = None
initial_confidence: Optional[float] = None