Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 70 additions & 7 deletions pantheon/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,12 @@ class AgentRunContext:

agent: "Agent"
memory: "Memory | None"
process_step_message: Callable | None
process_chunk: Callable | None
execution_context_id: str | None = None
process_step_message: Callable | None = None
process_chunk: Callable | None = None
cache_safe_runtime_params: Any | None = None
cache_safe_prompt_messages: list[dict] | None = None
cache_safe_tool_definitions: list[dict] | None = None


_RUN_CONTEXT: ContextVar[AgentRunContext | None] = ContextVar(
Expand Down Expand Up @@ -944,7 +948,9 @@ async def get_tools_for_llm(self) -> list[dict]:
# Strict mode requires every parameter to be listed in "required"
func["parameters"].setdefault("required", []).append("_background")

return all_tools
from pantheon.utils.token_optimization import stabilize_tool_definitions

return stabilize_tool_definitions(all_tools)

def _should_inject_context_variables(self, prefixed_name: str) -> bool:
"""Determine if context_variables should be injected for a tool.
Expand Down Expand Up @@ -1345,7 +1351,8 @@ async def _run_single_tool_call(call: dict) -> dict:
# Process and truncate tool result in one step
content = process_tool_result(
result,
max_length=self.max_tool_content_length
max_length=self.max_tool_content_length,
tool_name=func_name,
)

tool_message.update({
Expand Down Expand Up @@ -1409,7 +1416,41 @@ async def _acompletion(

# Step 1: Process messages for the model
async with tracker.measure("message_processing"):
from pantheon.utils.token_optimization import (
build_llm_view_async,
inject_cache_control_markers,
supports_explicit_cache_control,
)

run_context = get_current_run_context()
optimization_memory = run_context.memory if run_context else None
is_main_thread = (
run_context.execution_context_id is None if run_context else True
)
messages = await build_llm_view_async(
messages,
memory=optimization_memory,
is_main_thread=is_main_thread,
autocompact_model=model,
)
messages = process_messages_for_model(messages, model)
# Inject prompt-cache markers for providers that support
# explicit cache_control (Anthropic, Qwen).
# OpenAI/DeepSeek/Gemini use automatic prefix caching —
# stabilize_tool_definitions() ensures stable prefixes for them.
if supports_explicit_cache_control(model):
messages = inject_cache_control_markers(messages)
if run_context is not None:
# Selective copy: shallow for messages with string content,
# deepcopy only for messages with list content (Anthropic blocks
# from inject_cache_control_markers) to avoid mutation issues.
cached = []
for m in messages:
if isinstance(m.get("content"), list):
cached.append(copy.deepcopy(m))
else:
cached.append({**m})
run_context.cache_safe_prompt_messages = cached

# Step 2: Detect provider and get configuration
provider_config = detect_provider(model, self.relaxed_schema)
Expand Down Expand Up @@ -1445,6 +1486,8 @@ async def _acompletion(
# Use get_tools_for_llm() for unified tool access
# This includes both base_functions and provider tools
tools = await self.get_tools_for_llm() or None
if run_context is not None and tools is not None:
run_context.cache_safe_tool_definitions = copy.deepcopy(tools)

# For non-OpenAI providers or OpenAI-compatible providers, adjust tool format
# OpenAI-compatible providers (e.g. minimax) have api_key set in config
Expand Down Expand Up @@ -1487,7 +1530,16 @@ async def _acompletion(
if context_variables and "model_params" in context_variables:
# Runtime overrides instance defaults
model_params = {**self.model_params, **context_variables["model_params"]}


if run_context is not None:
from pantheon.utils.token_optimization import build_cache_safe_runtime_params

run_context.cache_safe_runtime_params = build_cache_safe_runtime_params(
model=model,
model_params=model_params,
response_format=response_format,
)

# Step 8: Call LLM provider (unified interface)
# logger.info(f"Raw messages: {messages}")

Expand Down Expand Up @@ -2101,6 +2153,11 @@ async def _prepare_execution_context(
# Determine whether to use memory
should_use_memory = use_memory if use_memory is not None else self.use_memory
memory_instance = memory or self.memory
working_context_variables = (context_variables or {}).copy()
fork_context_messages = working_context_variables.pop(
"_cache_safe_fork_context_messages",
None,
)

input_messages = None # Only set for normal user input, not AgentTransfer

Expand Down Expand Up @@ -2130,16 +2187,21 @@ async def _prepare_execution_context(
conversation_history = (
memory_instance.get_messages(
execution_context_id=execution_context_id,
for_llm=True
for_llm=False
)
if (should_use_memory and memory_instance)
else []
)
if isinstance(fork_context_messages, list) and fork_context_messages:
conversation_history = [
*copy.deepcopy(fork_context_messages),
*conversation_history,
]
conversation_history += input_messages
conversation_history = self._sanitize_messages(conversation_history)

# Preserve execution_context_id in case a tool needs it
context_variables = (context_variables or {}).copy()
context_variables = working_context_variables

# Inject global context variables from settings
from .settings import get_settings
Expand Down Expand Up @@ -2279,6 +2341,7 @@ async def _process_chunk(chunk: dict):
run_context = AgentRunContext(
agent=self,
memory=exec_context.memory_instance,
execution_context_id=exec_context.execution_context_id,
process_step_message=_process_step_message,
process_chunk=_process_chunk,
)
Expand Down
8 changes: 4 additions & 4 deletions pantheon/factory/templates/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
// - "thread": Execute in separate thread (isolated, for heavy tasks)
"local_toolset_execution_mode": "direct",
// === Tool Output Limits ===
// Maximum characters for tool output (used for smart truncation)
"max_tool_content_length": 10000,
// Maximum characters for tool output (fallback; per-tool thresholds take priority)
"max_tool_content_length": 50000,
// Maximum lines for file read operations
"max_file_read_lines": 1000,
// Maximum characters for file read operations (prevents single-line overflow)
"max_file_read_chars": 100000,
// Maximum characters for file read operations (safety valve; per-tool thresholds handle LLM sizing)
"max_file_read_chars": 500000,
// Maximum results for glob/search operations
"max_glob_results": 50,
// Enable notebook execution logging (JSONL files in .pantheon/logs/notebook/)
Expand Down
25 changes: 13 additions & 12 deletions pantheon/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,11 +661,13 @@ def tool_timeout(self) -> int:
def max_tool_content_length(self) -> int:
"""
Maximum characters for tool output content.
Used for smart truncation at agent level.
Defaults to 10000 (~5K tokens).
Used as fallback for smart truncation at agent level.
Per-tool thresholds (from token_optimization.py) take priority
when available.
Defaults to 50000 (~12.5K tokens).
"""
self._ensure_loaded()
return self._settings.get("endpoint", {}).get("max_tool_content_length", 10000)
return self._settings.get("endpoint", {}).get("max_tool_content_length", 50000)

@property
def max_file_read_lines(self) -> int:
Expand All @@ -679,17 +681,16 @@ def max_file_read_lines(self) -> int:
@property
def max_file_read_chars(self) -> int:
"""
Maximum characters for read_file output.

Set higher than max_tool_content_length to allow reading larger files
while preventing unbounded output. When exceeded, read_file returns
truncated content with a 'truncated' flag to prevent infinite loops.

Industry reference: Cursor uses 100K limit.
Defaults to 50000 characters.
Maximum characters for read_file output (safety valve).

Acts as an upper bound to prevent unbounded output from pathological
files. Per-tool thresholds (from token_optimization.py) handle the
actual LLM-context sizing at Layer 2.

Defaults to 500000 characters.
"""
self._ensure_loaded()
return self._settings.get("endpoint", {}).get("max_file_read_chars", 50000)
return self._settings.get("endpoint", {}).get("max_file_read_chars", 500000)

@property
def max_glob_results(self) -> int:
Expand Down
Loading
Loading