Skip to content
Closed
76 changes: 7 additions & 69 deletions pantheon/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,12 +128,8 @@ class AgentRunContext:

agent: "Agent"
memory: "Memory | None"
execution_context_id: str | None = None
process_step_message: Callable | None = None
process_chunk: Callable | None = None
cache_safe_runtime_params: Any | None = None
cache_safe_prompt_messages: list[dict] | None = None
cache_safe_tool_definitions: list[dict] | None = None
process_step_message: Callable | None
process_chunk: Callable | None


_RUN_CONTEXT: ContextVar[AgentRunContext | None] = ContextVar(
Expand Down Expand Up @@ -879,8 +875,7 @@ async def get_tools_for_llm(self) -> list[dict]:
# Providers return ToolInfo with pre-generated inputSchema (the "function" part)
logger.debug(f"get tools for llm: {self.providers} ")
provider_tools = []
for provider_name in sorted(self.providers):
provider = self.providers[provider_name]
for provider_name, provider in self.providers.items():
try:
# Get tools from provider (uses cached list if available)
tools = await provider.list_tools()
Expand Down Expand Up @@ -950,9 +945,7 @@ async def get_tools_for_llm(self) -> list[dict]:
if not self.force_litellm:
func["parameters"].setdefault("required", []).append("_background")

from pantheon.utils.token_optimization import stabilize_tool_definitions

return stabilize_tool_definitions(all_tools)
return all_tools

def _should_inject_context_variables(self, prefixed_name: str) -> bool:
"""Determine if context_variables should be injected for a tool.
Expand Down Expand Up @@ -1353,8 +1346,7 @@ async def _run_single_tool_call(call: dict) -> dict:
# Process and truncate tool result in one step
content = process_tool_result(
result,
max_length=self.max_tool_content_length,
tool_name=func_name,
max_length=self.max_tool_content_length
)

tool_message.update({
Expand Down Expand Up @@ -1418,39 +1410,7 @@ async def _acompletion(

# Step 1: Process messages for the model
async with tracker.measure("message_processing"):
from pantheon.utils.token_optimization import (
build_llm_view_async,
inject_cache_control_markers,
is_anthropic_model,
)

run_context = get_current_run_context()
optimization_memory = run_context.memory if run_context else None
is_main_thread = (
run_context.execution_context_id is None if run_context else True
)
messages = await build_llm_view_async(
messages,
memory=optimization_memory,
is_main_thread=is_main_thread,
autocompact_model=model,
)
messages = process_messages_for_model(messages, model)
# Inject Anthropic prompt-cache markers so the server-side cache
# activates — mirrors Claude Code's getCacheControl() strategy.
if is_anthropic_model(model):
messages = inject_cache_control_markers(messages)
if run_context is not None:
# Selective copy: shallow for messages with string content,
# deepcopy only for messages with list content (Anthropic blocks
# from inject_cache_control_markers) to avoid mutation issues.
cached = []
for m in messages:
if isinstance(m.get("content"), list):
cached.append(copy.deepcopy(m))
else:
cached.append({**m})
run_context.cache_safe_prompt_messages = cached

# Step 2: Detect provider and get configuration
provider_config = detect_provider(model, self.relaxed_schema)
Expand Down Expand Up @@ -1486,8 +1446,6 @@ async def _acompletion(
# Use get_tools_for_llm() for unified tool access
# This includes both base_functions and provider tools
tools = await self.get_tools_for_llm() or None
if run_context is not None and tools is not None:
run_context.cache_safe_tool_definitions = copy.deepcopy(tools)

# For non-OpenAI providers or OpenAI-compatible providers, adjust tool format
# OpenAI-compatible providers (e.g. minimax) have api_key set in config
Expand Down Expand Up @@ -1530,15 +1488,6 @@ async def _acompletion(
if context_variables and "model_params" in context_variables:
# Runtime overrides instance defaults
model_params = {**self.model_params, **context_variables["model_params"]}

if run_context is not None:
from pantheon.utils.token_optimization import build_cache_safe_runtime_params

run_context.cache_safe_runtime_params = build_cache_safe_runtime_params(
model=model,
model_params=model_params,
response_format=response_format,
)

# Step 8: Call LLM provider (unified interface)
# logger.info(f"Raw messages: {messages}")
Expand Down Expand Up @@ -2153,11 +2102,6 @@ async def _prepare_execution_context(
# Determine whether to use memory
should_use_memory = use_memory if use_memory is not None else self.use_memory
memory_instance = memory or self.memory
working_context_variables = (context_variables or {}).copy()
fork_context_messages = working_context_variables.pop(
"_cache_safe_fork_context_messages",
None,
)

input_messages = None # Only set for normal user input, not AgentTransfer

Expand Down Expand Up @@ -2187,21 +2131,16 @@ async def _prepare_execution_context(
conversation_history = (
memory_instance.get_messages(
execution_context_id=execution_context_id,
for_llm=False
for_llm=True
)
if (should_use_memory and memory_instance)
else []
)
if isinstance(fork_context_messages, list) and fork_context_messages:
conversation_history = [
*copy.deepcopy(fork_context_messages),
*conversation_history,
]
conversation_history += input_messages
conversation_history = self._sanitize_messages(conversation_history)

# preserve execution_context_id in case tools need it
context_variables = working_context_variables
context_variables = (context_variables or {}).copy()

# Inject global context variables from settings
from .settings import get_settings
Expand Down Expand Up @@ -2341,7 +2280,6 @@ async def _process_chunk(chunk: dict):
run_context = AgentRunContext(
agent=self,
memory=exec_context.memory_instance,
execution_context_id=exec_context.execution_context_id,
process_step_message=_process_step_message,
process_chunk=_process_chunk,
)
Expand Down
21 changes: 21 additions & 0 deletions pantheon/factory/templates/prompts/delegation.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,24 @@ call_agent(
```python
call_agent("researcher", "Do analysis fast.")
```

### Failure Recovery

Tool failures and sub-agent errors are expected — **never terminate without producing output.**

When a tool call fails, apply the following recovery ladder in order:

**File write failures** (e.g. content too large, output truncation):
1. **Use Two-Phase Write Protocol**: `write_file` (skeleton only) → `update_file` (one section at a time) → `append_file` (BibTeX / list batches). Never retry `write_file` with the same large content.
2. **Downgrade format**: If `.tex` fails after protocol, write `.md`; if `.md` fails, write `.txt`
3. **Inline output**: If all file writes fail, output the full content as a code block in the chat

**Sub-agent failures** (researcher or illustrator returns error or empty result):
1. **Retry with narrower scope**: Re-delegate with a smaller, more focused Task Brief
2. **Self-execute fallback**: Handle the task directly if sub-agent repeatedly fails
3. **Partial output**: Deliver what was completed and clearly state what is missing

**Hard rule — no silent failures:**
- Always produce at least one artifact per session, even if degraded
- When falling back to a simpler format, tell the user explicitly: what you tried, why it failed, what you're delivering instead
- A partial result delivered is always better than a perfect result abandoned
8 changes: 4 additions & 4 deletions pantheon/factory/templates/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
// - "thread": Execute in separate thread (isolated, for heavy tasks)
"local_toolset_execution_mode": "direct",
// === Tool Output Limits ===
// Maximum characters for tool output (fallback; per-tool thresholds take priority)
"max_tool_content_length": 50000,
// Maximum characters for tool output (used for smart truncation)
"max_tool_content_length": 10000,
// Maximum lines for file read operations
"max_file_read_lines": 1000,
// Maximum characters for file read operations (safety valve; per-tool thresholds handle LLM sizing)
"max_file_read_chars": 500000,
// Maximum characters for file read operations (prevents single-line overflow)
"max_file_read_chars": 100000,
// Maximum results for glob/search operations
"max_glob_results": 50,
// Enable notebook execution logging (JSONL files in .pantheon/logs/notebook/)
Expand Down
10 changes: 7 additions & 3 deletions pantheon/factory/templates/teams/default.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,12 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa
- Data analysis, EDA, statistical analysis
- Literature review and multi-source research

**Scientific writing gate (MANDATORY):** Before writing any report, paper, or document that requires domain knowledge or citations, you MUST first delegate a research task to `researcher`. Writing without a prior research delegation is not allowed for these task types.

#### Scientific Illustrator

**Delegate for:** Scientific diagrams, publication-quality visualizations, complex figures
**Execute directly:** Simple chart embedding, displaying existing charts
**Delegate for:** Schematic diagrams, pathway figures, cell structure illustrations, BioRender-style publication figures — tasks where the output is a conceptual diagram, not a data-driven chart.
**Execute directly (or via Researcher):** Data visualizations, statistical plots, matplotlib/seaborn charts derived from analysis results.

### Decision Summary

Expand All @@ -100,9 +102,11 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa
| Explore/read/understand codebase | **MUST delegate** to researcher |
| Web search or documentation lookup | **MUST delegate** to researcher |
| Data analysis or research | **MUST delegate** to researcher |
| Scientific writing (report/paper) | **MUST delegate research first**, then write |
| Multiple independent research tasks | **MUST parallelize** with multiple researchers |
| Schematic/pathway/cell diagrams | **Delegate** to scientific_illustrator |
| Read 1 known file | Execute directly |
| Write/edit/create files | Execute directly |
| Write/edit/create files (post-research) | Execute directly |
| Synthesize researcher results | Execute directly (your core role) |

{{delegation}}
25 changes: 12 additions & 13 deletions pantheon/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,13 +661,11 @@ def tool_timeout(self) -> int:
def max_tool_content_length(self) -> int:
"""
Maximum characters for tool output content.
Used as fallback for smart truncation at agent level.
Per-tool thresholds (from token_optimization.py) take priority
when available.
Defaults to 50000 (~12.5K tokens).
Used for smart truncation at agent level.
Defaults to 10000 (~2.5K tokens).
"""
self._ensure_loaded()
return self._settings.get("endpoint", {}).get("max_tool_content_length", 50000)
return self._settings.get("endpoint", {}).get("max_tool_content_length", 10000)

@property
def max_file_read_lines(self) -> int:
Expand All @@ -681,16 +679,17 @@ def max_file_read_lines(self) -> int:
@property
def max_file_read_chars(self) -> int:
"""
Maximum characters for read_file output (safety valve).

Acts as an upper bound to prevent unbounded output from pathological
files. Per-tool thresholds (from token_optimization.py) handle the
actual LLM-context sizing at Layer 2.

Defaults to 500000 characters.
Maximum characters for read_file output.

Set higher than max_tool_content_length to allow reading larger files
while preventing unbounded output. When exceeded, read_file returns
truncated content with a 'truncated' flag to prevent infinite loops.

Industry reference: Cursor uses 100K limit.
Defaults to 50000 characters.
"""
self._ensure_loaded()
return self._settings.get("endpoint", {}).get("max_file_read_chars", 500000)
return self._settings.get("endpoint", {}).get("max_file_read_chars", 50000)

@property
def max_glob_results(self) -> int:
Expand Down
Loading
Loading