aristoteleo · Starlitnightly · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/pantheon/agent.py b/pantheon/agent.py
@@ -128,8 +128,12 @@ class AgentRunContext:
 
     agent: "Agent"
     memory: "Memory | None"
-    process_step_message: Callable | None
-    process_chunk: Callable | None
+    execution_context_id: str | None = None
+    process_step_message: Callable | None = None
+    process_chunk: Callable | None = None
+    cache_safe_runtime_params: Any | None = None
+    cache_safe_prompt_messages: list[dict] | None = None
+    cache_safe_tool_definitions: list[dict] | None = None
 
 
 _RUN_CONTEXT: ContextVar[AgentRunContext | None] = ContextVar(
@@ -944,7 +948,9 @@ async def get_tools_for_llm(self) -> list[dict]:
             # All params must be in required for strict mode
             func["parameters"].setdefault("required", []).append("_background")
 
-        return all_tools
+        from pantheon.utils.token_optimization import stabilize_tool_definitions
+
+        return stabilize_tool_definitions(all_tools)
 
     def _should_inject_context_variables(self, prefixed_name: str) -> bool:
         """Determine if context_variables should be injected for a tool.
@@ -1345,7 +1351,8 @@ async def _run_single_tool_call(call: dict) -> dict:
                 # Process and truncate tool result in one step
                 content = process_tool_result(
                     result,
-                    max_length=self.max_tool_content_length
+                    max_length=self.max_tool_content_length,
+                    tool_name=func_name,
                 )
 
                 tool_message.update({
@@ -1409,7 +1416,41 @@ async def _acompletion(
 
         # Step 1: Process messages for the model
         async with tracker.measure("message_processing"):
+            from pantheon.utils.token_optimization import (
+                build_llm_view_async,
+                inject_cache_control_markers,
+                supports_explicit_cache_control,
+            )
+
+            run_context = get_current_run_context()
+            optimization_memory = run_context.memory if run_context else None
+            is_main_thread = (
+                run_context.execution_context_id is None if run_context else True
+            )
+            messages = await build_llm_view_async(
+                messages,
+                memory=optimization_memory,
+                is_main_thread=is_main_thread,
+                autocompact_model=model,
+            )
             messages = process_messages_for_model(messages, model)
+            # Inject prompt-cache markers for providers that support
+            # explicit cache_control (Anthropic, Qwen).
+            # OpenAI/DeepSeek/Gemini use automatic prefix caching —
+            # stabilize_tool_definitions() ensures stable prefixes for them.
+            if supports_explicit_cache_control(model):
+                messages = inject_cache_control_markers(messages)
+            if run_context is not None:
+                # Selective copy: shallow-copy messages with non-list content;
+                # deep-copy messages with list content (e.g. Anthropic blocks
+                # from inject_cache_control_markers) to avoid mutation issues.
+                cached = []
+                for m in messages:
+                    if isinstance(m.get("content"), list):
+                        cached.append(copy.deepcopy(m))
+                    else:
+                        cached.append({**m})
+                run_context.cache_safe_prompt_messages = cached
 
         # Step 2: Detect provider and get configuration
         provider_config = detect_provider(model, self.relaxed_schema)
@@ -1445,6 +1486,8 @@ async def _acompletion(
                 # Use get_tools_for_llm() for unified tool access
                 # This includes both base_functions and provider tools
                 tools = await self.get_tools_for_llm() or None
+                if run_context is not None and tools is not None:
+                    run_context.cache_safe_tool_definitions = copy.deepcopy(tools)
 
                 # For non-OpenAI providers or OpenAI-compatible providers, adjust tool format
                 # OpenAI-compatible providers (e.g. minimax) have api_key set in config
@@ -1487,7 +1530,16 @@ async def _acompletion(
         if context_variables and "model_params" in context_variables:
             # Runtime overrides instance defaults
             model_params = {**self.model_params, **context_variables["model_params"]}
-
+
+        if run_context is not None:
+            from pantheon.utils.token_optimization import build_cache_safe_runtime_params
+
+            run_context.cache_safe_runtime_params = build_cache_safe_runtime_params(
+                model=model,
+                model_params=model_params,
+                response_format=response_format,
+            )
+
         # Step 8: Call LLM provider (unified interface)
         # logger.info(f"Raw messages: {messages}")
 
@@ -2101,6 +2153,11 @@ async def _prepare_execution_context(
         # Determine whether to use memory
         should_use_memory = use_memory if use_memory is not None else self.use_memory
         memory_instance = memory or self.memory
+        working_context_variables = (context_variables or {}).copy()
+        fork_context_messages = working_context_variables.pop(
+            "_cache_safe_fork_context_messages",
+            None,
+        )
 
         input_messages = None  # Only set for normal user input, not AgentTransfer
 
@@ -2130,16 +2187,21 @@ async def _prepare_execution_context(
             conversation_history = (
                 memory_instance.get_messages(
                     execution_context_id=execution_context_id,
-                    for_llm=True
+                    for_llm=False
                 )
                 if (should_use_memory and memory_instance)
                 else []
             )
+            if isinstance(fork_context_messages, list) and fork_context_messages:
+                conversation_history = [
+                    *copy.deepcopy(fork_context_messages),
+                    *conversation_history,
+                ]
             conversation_history += input_messages
             conversation_history = self._sanitize_messages(conversation_history)
 
         # preserve execution_context_id if tool need
-        context_variables = (context_variables or {}).copy()
+        context_variables = working_context_variables
 
         # Inject global context variables from settings
         from .settings import get_settings
@@ -2279,6 +2341,7 @@ async def _process_chunk(chunk: dict):
             run_context = AgentRunContext(
                 agent=self,
                 memory=exec_context.memory_instance,
+                execution_context_id=exec_context.execution_context_id,
                 process_step_message=_process_step_message,
                 process_chunk=_process_chunk,
             )

diff --git a/pantheon/factory/templates/settings.json b/pantheon/factory/templates/settings.json
@@ -25,12 +25,12 @@
         // - "thread": Execute in separate thread (isolated, for heavy tasks)
         "local_toolset_execution_mode": "direct",
         // === Tool Output Limits ===
-        // Maximum characters for tool output (used for smart truncation)
-        "max_tool_content_length": 10000,
+        // Maximum characters for tool output (fallback; per-tool thresholds take priority)
+        "max_tool_content_length": 50000,
         // Maximum lines for file read operations
         "max_file_read_lines": 1000,
-        // Maximum characters for file read operations (prevents single-line overflow)
-        "max_file_read_chars": 100000,
+        // Maximum characters for file read operations (safety valve; per-tool thresholds handle LLM sizing)
+        "max_file_read_chars": 500000,
         // Maximum results for glob/search operations
         "max_glob_results": 50,
         // Enable notebook execution logging (JSONL files in .pantheon/logs/notebook/)

diff --git a/pantheon/settings.py b/pantheon/settings.py
@@ -661,11 +661,13 @@ def tool_timeout(self) -> int:
     def max_tool_content_length(self) -> int:
         """
         Maximum characters for tool output content.
-        Used for smart truncation at agent level.
-        Defaults to 10000 (~5K tokens).
+        Used as fallback for smart truncation at agent level.
+        Per-tool thresholds (from token_optimization.py) take priority
+        when available.
+        Defaults to 50000 (~12.5K tokens).
         """
         self._ensure_loaded()
-        return self._settings.get("endpoint", {}).get("max_tool_content_length", 10000)
+        return self._settings.get("endpoint", {}).get("max_tool_content_length", 50000)
 
     @property
     def max_file_read_lines(self) -> int:
@@ -679,17 +681,16 @@ def max_file_read_lines(self) -> int:
     @property
     def max_file_read_chars(self) -> int:
         """
-        Maximum characters for read_file output.
-
-        Set higher than max_tool_content_length to allow reading larger files
-        while preventing unbounded output. When exceeded, read_file returns
-        truncated content with a 'truncated' flag to prevent infinite loops.
-
-        Industry reference: Cursor uses 100K limit.
-        Defaults to 50000 characters.
+        Maximum characters for read_file output (safety valve).
+
+        Acts as an upper bound to prevent unbounded output from pathological
+        files. Per-tool thresholds (from token_optimization.py) handle the
+        actual LLM-context sizing at Layer 2.
+
+        Defaults to 500000 characters.
         """
         self._ensure_loaded()
-        return self._settings.get("endpoint", {}).get("max_file_read_chars", 50000)
+        return self._settings.get("endpoint", {}).get("max_file_read_chars", 500000)
 
     @property
     def max_glob_results(self) -> int: