Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 41 additions & 2 deletions benchmarks/perception/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,14 @@ class BenchmarkConfig:
enable_verification: bool = True # Multi-phase verification with subagents

# Test settings
start_timepoint: int = 0
max_timepoints_per_embryo: Optional[int] = None
embryo_ids: Optional[List[str]] = None # None = all

# Ablation toggles
include_temporal_context: bool = True
include_previous_observations: bool = True

# Custom system prompt override
system_prompt_override: Optional[str] = None

Expand All @@ -57,6 +62,9 @@ def to_dict(self) -> Dict[str, Any]:
"enable_view_reference": self.enable_view_reference,
"enable_view_previous": self.enable_view_previous,
"enable_verification": self.enable_verification,
"include_temporal_context": self.include_temporal_context,
"include_previous_observations": self.include_previous_observations,
"start_timepoint": self.start_timepoint,
"max_timepoints_per_embryo": self.max_timepoints_per_embryo,
"embryo_ids": self.embryo_ids,
"system_prompt_override": self.system_prompt_override,
Expand Down Expand Up @@ -254,6 +262,8 @@ async def _get_engine(self):
claude_client=client,
examples_path=examples_path,
enable_verification=self.config.enable_verification,
include_temporal_context=self.config.include_temporal_context,
include_previous_observations=self.config.include_previous_observations,
)

self._engine = engine
Expand Down Expand Up @@ -287,7 +297,11 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
if self.config.max_timepoints_per_embryo:
end_tp = self.config.max_timepoints_per_embryo

for test_case in self.testset.iter_embryo(embryo_id, end_timepoint=end_tp):
for test_case in self.testset.iter_embryo(
embryo_id,
start_timepoint=self.config.start_timepoint,
end_timepoint=end_tp,
):
logger.info(
f"[{embryo_id}] Processing T{test_case.timepoint} "
f"(GT: {test_case.ground_truth_stage})"
Expand Down Expand Up @@ -343,6 +357,7 @@ async def run_embryo(self, embryo_id: str) -> EmbryoResult:
reasoning=perception_result.reasoning,
is_transitional=perception_result.is_transitional,
transition_between=perception_result.transition_between,
timestamp=test_case.acquired_at,
)

logger.info(
Expand Down Expand Up @@ -423,10 +438,26 @@ async def main():
action="append",
help="Specific embryo(s) to run (can specify multiple)",
)
parser.add_argument(
"--start-timepoint",
type=int,
default=0,
help="First timepoint index to process (skip earlier frames)",
)
parser.add_argument(
"--max-timepoints",
type=int,
help="Maximum timepoints per embryo",
help="End timepoint index (exclusive). With --start-timepoint, processes [start, max).",
)
parser.add_argument(
"--no-temporal-context",
action="store_true",
help="Ablation: omit the TEMPORAL CONTEXT block from the prompt",
)
parser.add_argument(
"--no-previous-observations",
action="store_true",
help="Ablation: omit the PREVIOUS OBSERVATIONS block from the prompt",
)
parser.add_argument(
"--description",
Expand All @@ -447,6 +478,11 @@ async def main():
format="%(asctime)s %(levelname)s %(message)s",
)

# The perception engine reads stage definitions etc. from the active
# organism module, which is normally loaded by launch_gently.py.
from gently.organisms import load_organism
load_organism("celegans")

# Find session path
session_path = Path(args.session)
if not session_path.exists():
Expand Down Expand Up @@ -486,7 +522,10 @@ async def main():
# Create config
config = BenchmarkConfig(
embryo_ids=args.embryo,
start_timepoint=args.start_timepoint,
max_timepoints_per_embryo=args.max_timepoints,
include_temporal_context=not args.no_temporal_context,
include_previous_observations=not args.no_previous_observations,
description=args.description,
)

Expand Down
15 changes: 9 additions & 6 deletions benchmarks/perception/testset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import base64
import io
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterator, List, Optional, Tuple, Dict

Expand Down Expand Up @@ -43,12 +44,13 @@ class TestCase:
side_image_b64: Optional[str] # SIDE view only
volume: Optional[np.ndarray]
ground_truth_stage: Optional[str]
acquired_at: Optional[datetime] = None


def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dict[str, List[Path]]:
"""Discover volume files in a session directory."""
from datetime import datetime

def _discover_volumes(
session_dir: Path, embryo_id: Optional[str] = None
) -> Dict[str, List[Tuple[datetime, Path]]]:
"""Discover volume files (with parsed acquisition timestamps) in a session directory."""
if not session_dir.exists():
return {}

Expand Down Expand Up @@ -79,7 +81,7 @@ def _discover_volumes(session_dir: Path, embryo_id: Optional[str] = None) -> Dic
result = {}
for eid, volumes in embryo_volumes.items():
volumes.sort(key=lambda x: x[0])
result[eid] = [v[1] for v in volumes]
result[eid] = volumes

return result

Expand Down Expand Up @@ -286,7 +288,7 @@ def iter_embryo(
end_timepoint = len(volumes)

for timepoint in range(start_timepoint, min(end_timepoint, len(volumes))):
vol_path = volumes[timepoint]
acquired_at, vol_path = volumes[timepoint]

# Load volume
volume = _load_volume(vol_path) if self.load_volumes else None
Expand All @@ -313,6 +315,7 @@ def iter_embryo(
side_image_b64=side_b64,
volume=volume,
ground_truth_stage=gt_stage,
acquired_at=acquired_at,
)

def iter_all(self) -> Iterator[Tuple[str, Iterator[TestCase]]]:
Expand Down
1 change: 0 additions & 1 deletion gently/app/orchestration/timelapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,6 @@ def _check_interval_rules(
embryo_id=embryo_id,
detector_name=detector_name,
stage=stage,
timepoint=estate.timepoints_acquired,
):
# Round-based: interval rules now modify the global interval
old_interval = self._base_interval_seconds
Expand Down
79 changes: 60 additions & 19 deletions gently/harness/perception/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,8 @@ def __init__(
volume_accessor: Optional[Callable[[str, int], Optional[np.ndarray]]] = None,
enable_verification: bool = True,
multishot_turns: int = 0,
include_temporal_context: bool = True,
include_previous_observations: bool = True,
):
"""
Parameters
Expand All @@ -355,6 +357,8 @@ def __init__(
self.volume_accessor = volume_accessor
self.enable_verification = enable_verification
self.multishot_turns = multishot_turns
self.include_temporal_context = include_temporal_context
self.include_previous_observations = include_previous_observations

# Load examples if provided
if example_store:
Expand Down Expand Up @@ -533,7 +537,8 @@ async def _run_reasoning_loop(
"""
Run the interleaved reasoning loop with tool use.

Returns (PerceptionResult, ReasoningTrace)
Returns (PerceptionResult, ReasoningTrace, messages) where messages is the
full conversation history, so callers can continue the conversation.
"""
trace = ReasoningTrace()

Expand Down Expand Up @@ -591,8 +596,9 @@ async def _run_reasoning_loop(
))

# Parse and return
messages.append({"role": "assistant", "content": [{"type": "text", "text": text_response}]})
result = self._parse_response(text_response)
return result, trace
return result, trace, messages

# Handle tool use
if response.stop_reason == "tool_use":
Expand Down Expand Up @@ -620,12 +626,33 @@ async def _run_reasoning_loop(
tool_input=block.input,
))

assistant_content = []
for block in response.content:
if block.type == "text":
assistant_content.append({"type": "text", "text": block.text})
elif block.type == "tool_use":
assistant_content.append({
"type": "tool_use",
"id": block.id,
"name": block.name,
"input": block.input,
})
messages.append({"role": "assistant", "content": assistant_content})

# Run verification and return result
result = await self._handle_verification_request(
verification_block.input,
trace,
)
return result, trace
messages.append({
"role": "user",
"content": [{
"type": "tool_result",
"tool_use_id": verification_block.id,
"content": f"Verification complete: stage={result.stage}, confidence={result.confidence:.0%}",
}],
})
return result, trace, messages

# Build assistant message with the response
assistant_content = []
Expand Down Expand Up @@ -688,7 +715,7 @@ async def _run_reasoning_loop(

# Max iterations reached - parse last response
logger.warning(f"Max reasoning iterations ({max_iterations}) reached")
return self._parse_response(""), trace
return self._parse_response(""), trace, messages

def _handle_tool_call(
self,
Expand Down Expand Up @@ -1060,6 +1087,18 @@ def _build_cached_system_prompt(self) -> List[Dict]:
"cache_control": {"type": "ephemeral", "ttl": "1h"}
}]

def _build_reconsider_prompt(self, result: "PerceptionResult", turn: int) -> str:
    """Build the follow-up prompt asking the model to reconsider *result*.

    NOTE(review): deliberate stub — multishot reconsideration is configured
    via ``multishot_turns`` but not implemented yet, so this fails loudly
    rather than silently producing an empty prompt. Presumably *turn* is the
    0-based reconsideration round — confirm when implementing.

    Raises:
        NotImplementedError: always, until the multishot feature lands.
    """
    raise NotImplementedError(
        "Multishot reconsideration is not yet implemented. "
        "Set multishot_turns=0 (the default) to disable."
    )

async def _call_claude(self, messages: List[Dict], include_tools: bool = True) -> Any:
    """Single Claude API call intended for the multishot-reconsideration path.

    NOTE(review): deliberate stub paired with ``_build_reconsider_prompt``;
    kept so the unimplemented multishot code path has a defined, loud failure
    mode instead of an AttributeError.

    Raises:
        NotImplementedError: always, until the multishot feature lands.
    """
    raise NotImplementedError(
        "Multishot reconsideration is not yet implemented. "
        "Set multishot_turns=0 (the default) to disable."
    )

async def _call_claude_with_tools(self, messages: List[Dict]) -> Any:
"""Call Claude API with tools enabled for interleaved reasoning."""
try:
Expand Down Expand Up @@ -1148,32 +1187,34 @@ def _build_prompt(
})

# 2. Previous observations (last 3)
recent = session.get_recent_observations(3)
if recent:
obs_text = "\nPREVIOUS OBSERVATIONS:\n"
for obs in recent:
obs_text += f"- T{obs.timepoint}: {obs.stage}"
if obs.is_hatching:
obs_text += " (hatching in progress)"
obs_text += "\n"
content.append({"type": "text", "text": obs_text})
if self.include_previous_observations:
recent = session.get_recent_observations(3)
if recent:
obs_text = "\nPREVIOUS OBSERVATIONS:\n"
for obs in recent:
obs_text += f"- T{obs.timepoint}: {obs.stage}"
if obs.is_hatching:
obs_text += " (hatching in progress)"
obs_text += "\n"
content.append({"type": "text", "text": obs_text})

# 3. Temporal context (for detecting arrested/dead embryos)
temporal = session.compute_temporal_analysis()
if temporal:
temporal_text = f"""
if self.include_temporal_context:
temporal = session.compute_temporal_analysis()
if temporal:
temporal_text = f"""
TEMPORAL CONTEXT:
- Current stage: {temporal.current_stage}
- Time at this stage: {temporal.time_in_current_stage_min:.0f} minutes
- Expected duration: {temporal.expected_duration_min or 'N/A'} minutes
- Overtime ratio: {temporal.overtime_ratio:.1f}x (>2x is unusual, >3x is concerning)
"""
if temporal.is_potentially_arrested:
temporal_text += f"""
if temporal.is_potentially_arrested:
temporal_text += f"""
WARNING - POTENTIAL DEVELOPMENTAL ARREST:
- {temporal.arrest_reason}
"""
content.append({"type": "text", "text": temporal_text})
content.append({"type": "text", "text": temporal_text})

# 4. Current image to analyze
use_separate_images = top_image_b64 is not None and side_image_b64 is not None
Expand Down
8 changes: 7 additions & 1 deletion gently/harness/perception/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@ def compute_temporal_analysis(self) -> Optional[TemporalAnalysis]:
return None

current_stage = self.get_current_stage()
now = datetime.now()
# Reference "now" against the most recent observation's timestamp so
# time-in-stage reflects acquisition time, not benchmark wallclock.
now = self.observations[-1].timestamp

# Find when current stage started (walk backwards through observations)
stage_start_time = None
Expand Down Expand Up @@ -634,3 +636,7 @@ class PerceptionResult:
candidate_stages: Optional[List[CandidateStage]] = None # Candidates from Phase 1
multi_phase_trace: Optional[MultiPhaseReasoningTrace] = None # Full multi-phase trace
phase_count: int = 1 # Number of phases executed (1, 2, or 3)

# Multishot reconsideration: the first answer before any reconsider turns
initial_stage: Optional[str] = None
initial_confidence: Optional[float] = None