Skip to content
Closed
76 changes: 7 additions & 69 deletions pantheon/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,12 +128,8 @@ class AgentRunContext:

agent: "Agent"
memory: "Memory | None"
execution_context_id: str | None = None
process_step_message: Callable | None = None
process_chunk: Callable | None = None
cache_safe_runtime_params: Any | None = None
cache_safe_prompt_messages: list[dict] | None = None
cache_safe_tool_definitions: list[dict] | None = None
process_step_message: Callable | None
process_chunk: Callable | None


_RUN_CONTEXT: ContextVar[AgentRunContext | None] = ContextVar(
Expand Down Expand Up @@ -879,8 +875,7 @@ async def get_tools_for_llm(self) -> list[dict]:
# Providers return ToolInfo with pre-generated inputSchema (the "function" part)
logger.debug(f"get tools for llm: {self.providers} ")
provider_tools = []
for provider_name in sorted(self.providers):
provider = self.providers[provider_name]
for provider_name, provider in self.providers.items():
try:
# Get tools from provider (uses cached list if available)
tools = await provider.list_tools()
Expand Down Expand Up @@ -950,9 +945,7 @@ async def get_tools_for_llm(self) -> list[dict]:
if not self.force_litellm:
func["parameters"].setdefault("required", []).append("_background")

from pantheon.utils.token_optimization import stabilize_tool_definitions

return stabilize_tool_definitions(all_tools)
return all_tools

def _should_inject_context_variables(self, prefixed_name: str) -> bool:
"""Determine if context_variables should be injected for a tool.
Expand Down Expand Up @@ -1353,8 +1346,7 @@ async def _run_single_tool_call(call: dict) -> dict:
# Process and truncate tool result in one step
content = process_tool_result(
result,
max_length=self.max_tool_content_length,
tool_name=func_name,
max_length=self.max_tool_content_length
)

tool_message.update({
Expand Down Expand Up @@ -1418,39 +1410,7 @@ async def _acompletion(

# Step 1: Process messages for the model
async with tracker.measure("message_processing"):
from pantheon.utils.token_optimization import (
build_llm_view_async,
inject_cache_control_markers,
is_anthropic_model,
)

run_context = get_current_run_context()
optimization_memory = run_context.memory if run_context else None
is_main_thread = (
run_context.execution_context_id is None if run_context else True
)
messages = await build_llm_view_async(
messages,
memory=optimization_memory,
is_main_thread=is_main_thread,
autocompact_model=model,
)
messages = process_messages_for_model(messages, model)
# Inject Anthropic prompt-cache markers so the server-side cache
# activates — mirrors Claude Code's getCacheControl() strategy.
if is_anthropic_model(model):
messages = inject_cache_control_markers(messages)
if run_context is not None:
# Selective copy: shallow for messages with string content,
# deepcopy only for messages with list content (Anthropic blocks
# from inject_cache_control_markers) to avoid mutation issues.
cached = []
for m in messages:
if isinstance(m.get("content"), list):
cached.append(copy.deepcopy(m))
else:
cached.append({**m})
run_context.cache_safe_prompt_messages = cached

# Step 2: Detect provider and get configuration
provider_config = detect_provider(model, self.relaxed_schema)
Expand Down Expand Up @@ -1486,8 +1446,6 @@ async def _acompletion(
# Use get_tools_for_llm() for unified tool access
# This includes both base_functions and provider tools
tools = await self.get_tools_for_llm() or None
if run_context is not None and tools is not None:
run_context.cache_safe_tool_definitions = copy.deepcopy(tools)

# For non-OpenAI providers or OpenAI-compatible providers, adjust tool format
# OpenAI-compatible providers (e.g. minimax) have api_key set in config
Expand Down Expand Up @@ -1530,15 +1488,6 @@ async def _acompletion(
if context_variables and "model_params" in context_variables:
# Runtime overrides instance defaults
model_params = {**self.model_params, **context_variables["model_params"]}

if run_context is not None:
from pantheon.utils.token_optimization import build_cache_safe_runtime_params

run_context.cache_safe_runtime_params = build_cache_safe_runtime_params(
model=model,
model_params=model_params,
response_format=response_format,
)

# Step 8: Call LLM provider (unified interface)
# logger.info(f"Raw messages: {messages}")
Expand Down Expand Up @@ -2153,11 +2102,6 @@ async def _prepare_execution_context(
# Determine whether to use memory
should_use_memory = use_memory if use_memory is not None else self.use_memory
memory_instance = memory or self.memory
working_context_variables = (context_variables or {}).copy()
fork_context_messages = working_context_variables.pop(
"_cache_safe_fork_context_messages",
None,
)

input_messages = None # Only set for normal user input, not AgentTransfer

Expand Down Expand Up @@ -2187,21 +2131,16 @@ async def _prepare_execution_context(
conversation_history = (
memory_instance.get_messages(
execution_context_id=execution_context_id,
for_llm=False
for_llm=True
)
if (should_use_memory and memory_instance)
else []
)
if isinstance(fork_context_messages, list) and fork_context_messages:
conversation_history = [
*copy.deepcopy(fork_context_messages),
*conversation_history,
]
conversation_history += input_messages
conversation_history = self._sanitize_messages(conversation_history)

# preserve execution_context_id in case tools need it
context_variables = working_context_variables
context_variables = (context_variables or {}).copy()

# Inject global context variables from settings
from .settings import get_settings
Expand Down Expand Up @@ -2341,7 +2280,6 @@ async def _process_chunk(chunk: dict):
run_context = AgentRunContext(
agent=self,
memory=exec_context.memory_instance,
execution_context_id=exec_context.execution_context_id,
process_step_message=_process_step_message,
process_chunk=_process_chunk,
)
Expand Down
21 changes: 21 additions & 0 deletions pantheon/factory/templates/prompts/delegation.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,24 @@ call_agent(
```python
call_agent("researcher", "Do analysis fast.")
```

### Failure Recovery

Tool failures and sub-agent errors are expected — **never terminate without producing output.**

When a tool call fails, apply the following recovery ladder in order:

**File write failures** (e.g. content too large, output truncation):
1. **Use Two-Phase Write Protocol**: `write_file` (skeleton only) → `update_file` (one section at a time) → `append_file` (BibTeX / list batches). Never retry `write_file` with the same large content.
2. **Downgrade format**: If `.tex` fails after protocol, write `.md`; if `.md` fails, write `.txt`
3. **Inline output**: If all file writes fail, output the full content as a code block in the chat

**Sub-agent failures** (researcher or illustrator returns error or empty result):
1. **Retry with narrower scope**: Re-delegate with a smaller, more focused Task Brief
2. **Self-execute fallback**: Handle the task directly if sub-agent repeatedly fails
3. **Partial output**: Deliver what was completed and clearly state what is missing

**Hard rule — no silent failures:**
- Always produce at least one artifact per session, even if degraded
- When falling back to a simpler format, tell the user explicitly: what you tried, why it failed, what you're delivering instead
- A partial result delivered is always better than a perfect result abandoned
8 changes: 4 additions & 4 deletions pantheon/factory/templates/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
// - "thread": Execute in separate thread (isolated, for heavy tasks)
"local_toolset_execution_mode": "direct",
// === Tool Output Limits ===
// Maximum characters for tool output (fallback; per-tool thresholds take priority)
"max_tool_content_length": 50000,
// Maximum characters for tool output (used for smart truncation)
"max_tool_content_length": 10000,
// Maximum lines for file read operations
"max_file_read_lines": 1000,
// Maximum characters for file read operations (safety valve; per-tool thresholds handle LLM sizing)
"max_file_read_chars": 500000,
// Maximum characters for file read operations (prevents single-line overflow)
"max_file_read_chars": 100000,
// Maximum results for glob/search operations
"max_glob_results": 50,
// Enable notebook execution logging (JSONL files in .pantheon/logs/notebook/)
Expand Down
10 changes: 7 additions & 3 deletions pantheon/factory/templates/teams/default.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,12 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa
- Data analysis, EDA, statistical analysis
- Literature review and multi-source research

**Scientific writing gate (MANDATORY):** Before writing any report, paper, or document that requires domain knowledge or citations, you MUST first delegate a research task to `researcher`. Writing without a prior research delegation is not allowed for these task types.

#### Scientific Illustrator

**Delegate for:** Scientific diagrams, publication-quality visualizations, complex figures
**Execute directly:** Simple chart embedding, displaying existing charts
**Delegate for:** Schematic diagrams, pathway figures, cell structure illustrations, BioRender-style publication figures — tasks where the output is a conceptual diagram, not a data-driven chart.
**Execute directly (or via Researcher):** Data visualizations, statistical plots, matplotlib/seaborn charts derived from analysis results.

### Decision Summary

Expand All @@ -100,9 +102,11 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa
| Explore/read/understand codebase | **MUST delegate** to researcher |
| Web search or documentation lookup | **MUST delegate** to researcher |
| Data analysis or research | **MUST delegate** to researcher |
| Scientific writing (report/paper) | **MUST delegate research first**, then write |
| Multiple independent research tasks | **MUST parallelize** with multiple researchers |
| Schematic/pathway/cell diagrams | **Delegate** to scientific_illustrator |
| Read 1 known file | Execute directly |
| Write/edit/create files | Execute directly |
| Write/edit/create files (post-research) | Execute directly |
| Synthesize researcher results | Execute directly (your core role) |

{{delegation}}
25 changes: 12 additions & 13 deletions pantheon/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,13 +661,11 @@ def tool_timeout(self) -> int:
def max_tool_content_length(self) -> int:
"""
Maximum characters for tool output content.
Used as fallback for smart truncation at agent level.
Per-tool thresholds (from token_optimization.py) take priority
when available.
Defaults to 50000 (~12.5K tokens).
Used for smart truncation at agent level.
Defaults to 10000 (~2.5K tokens).
"""
self._ensure_loaded()
return self._settings.get("endpoint", {}).get("max_tool_content_length", 50000)
return self._settings.get("endpoint", {}).get("max_tool_content_length", 10000)

@property
def max_file_read_lines(self) -> int:
Expand All @@ -681,16 +679,17 @@ def max_file_read_lines(self) -> int:
@property
def max_file_read_chars(self) -> int:
"""
Maximum characters for read_file output (safety valve).

Acts as an upper bound to prevent unbounded output from pathological
files. Per-tool thresholds (from token_optimization.py) handle the
actual LLM-context sizing at Layer 2.

Defaults to 500000 characters.
Maximum characters for read_file output.

Set higher than max_tool_content_length to allow reading larger files
while preventing unbounded output. When exceeded, read_file returns
truncated content with a 'truncated' flag to prevent infinite loops.

Industry reference: Cursor uses 100K limit.
Defaults to 50000 characters.
"""
self._ensure_loaded()
return self._settings.get("endpoint", {}).get("max_file_read_chars", 500000)
return self._settings.get("endpoint", {}).get("max_file_read_chars", 50000)

@property
def max_glob_results(self) -> int:
Expand Down
Loading
Loading