From 527b85389f90892eee306272b10b2089f9351eb9 Mon Sep 17 00:00:00 2001 From: hazelian0619 Date: Mon, 30 Mar 2026 23:10:02 +0800 Subject: [PATCH 01/13] fix(prompts): add failure recovery protocol and scientific writing gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem A (partial): Add MANDATORY scientific writing gate to default.md — Leader must delegate to Researcher before writing any domain paper. Clarify Scientific Illustrator scope (schematic/pathway diagrams only, not data plots). Problem C: Add Failure Recovery section to delegation.md — three-tier ladder for file write failures (Two-Phase Write Protocol → format downgrade → inline) and sub-agent failures (narrow retry → self-execute → partial output). Hard rule: never terminate without producing at least one artifact. Validated by experiment (2026-03-30): - Case 3 (SSR1/GWAS): Leader called 3x parallel Researcher before any content; Researchers produced 978 lines across 3 reports using Two-Phase Write Protocol - Case 0 (EC论文): Leader called 2x parallel Researcher; BibTeX built to 397 lines via append_file batches (vs. previous silent truncation at char 88); PDF artifact (117KB) delivered despite E2BIG and relay-API update_file errors New bugs discovered (tracked separately): - Relay API truncates update_file tool call args mid-generation (high severity) - think tool infinite loop at ~90K token context (medium severity) Co-Authored-By: Claude Sonnet 4.6 --- .../factory/templates/prompts/delegation.md | 21 +++++++++++++++++++ pantheon/factory/templates/teams/default.md | 10 ++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/pantheon/factory/templates/prompts/delegation.md b/pantheon/factory/templates/prompts/delegation.md index fd6d61a72..49ffa0430 100644 --- a/pantheon/factory/templates/prompts/delegation.md +++ b/pantheon/factory/templates/prompts/delegation.md @@ -91,3 +91,24 @@ call_agent( ```python call_agent("researcher", "Do analysis fast.") ``` + +### Failure Recovery + +Tool failures and sub-agent errors are expected — **never terminate without producing output.** + +When a tool call fails, apply the following recovery ladder in order: + +**File write failures** (e.g. content too large, output truncation): +1. **Use Two-Phase Write Protocol**: `write_file` (skeleton only) → `update_file` (one section at a time) → `append_file` (BibTeX / list batches). Never retry `write_file` with the same large content. +2. **Downgrade format**: If `.tex` fails after protocol, write `.md`; if `.md` fails, write `.txt` +3. **Inline output**: If all file writes fail, output the full content as a code block in the chat + +**Sub-agent failures** (researcher or illustrator returns error or empty result): +1. **Retry with narrower scope**: Re-delegate with a smaller, more focused Task Brief +2. **Self-execute fallback**: Handle the task directly if sub-agent repeatedly fails +3. **Partial output**: Deliver what was completed and clearly state what is missing + +**Hard rule — no silent failures:** +- Always produce at least one artifact per session, even if degraded +- When falling back to a simpler format, tell the user explicitly: what you tried, why it failed, what you're delivering instead +- A partial result delivered is always better than a perfect result abandoned diff --git a/pantheon/factory/templates/teams/default.md b/pantheon/factory/templates/teams/default.md index 76aa2be6a..403d0eaeb 100644 --- a/pantheon/factory/templates/teams/default.md +++ b/pantheon/factory/templates/teams/default.md @@ -88,10 +88,12 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa - Data analysis, EDA, statistical analysis - Literature review and multi-source research +**Scientific writing gate (MANDATORY):** Before writing any report, paper, or document that requires domain knowledge or citations, you MUST first delegate a research task to `researcher`. Writing without a prior research delegation is not allowed for these task types. + #### Scientific Illustrator -**Delegate for:** Scientific diagrams, publication-quality visualizations, complex figures -**Execute directly:** Simple chart embedding, displaying existing charts +**Delegate for:** Schematic diagrams, pathway figures, cell structure illustrations, BioRender-style publication figures — tasks where the output is a conceptual diagram, not a data-driven chart. +**Execute directly (or via Researcher):** Data visualizations, statistical plots, matplotlib/seaborn charts derived from analysis results. ### Decision Summary @@ -100,9 +102,11 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa | Explore/read/understand codebase | **MUST delegate** to researcher | | Web search or documentation lookup | **MUST delegate** to researcher | | Data analysis or research | **MUST delegate** to researcher | +| Scientific writing (report/paper) | **MUST delegate research first**, then write | | Multiple independent research tasks | **MUST parallelize** with multiple researchers | +| Schematic/pathway/cell diagrams | **Delegate** to scientific_illustrator | | Read 1 known file | Execute directly | -| Write/edit/create files | Execute directly | +| Write/edit/create files (post-research) | Execute directly | | Synthesize researcher results | Execute directly (your core role) | {{delegation}} From 453d4c454e9bfaf3c6a9edd60e458891f95f4eb6 Mon Sep 17 00:00:00 2001 From: hazelian0619 Date: Tue, 31 Mar 2026 07:34:44 +0800 Subject: [PATCH 02/13] fix(file_manager): add output-token truncation guards and append_file tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 bug: when LLM generates large files (LaTeX papers, BibTeX) in a single write_file/update_file call, the relay API truncates the output stream mid-JSON, causing 'Unterminated string' parse errors and silent data loss. Root cause: LLM output token limit is separate from context window. File content in tool call parameters must be generated as LLM output, hitting max_tokens before the JSON closes. LaTeX/BibTeX content with escape chars inflates token count ~1.5x. Changes: - write_file: hard reject content > 12,000 chars; docstring teaches Two-Phase Write Protocol (scaffold first, fill by section, append for lists/bib) - append_file: new tool for chunked appending; 6,000 char limit; requires file to exist first; primary use case is BibTeX batches (<=10 entries per call) - update_file: hard reject new_string > 8,000 chars with guidance to split section into smaller semantic units Validated against 20-case baseline (15% success rate before fix): - Case 1 (LaTeX review paper, previously FAIL): now generates full PDF with 44 references via append_file batches — confirmed in controlled re-run - Agent proactively adopted Two-Phase protocol after reading docstring (0 content_too_large rejections; protocol was followed before guard triggered) Co-Authored-By: Claude Sonnet 4.6 --- pantheon/toolsets/file/file_manager.py | 116 ++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 14 deletions(-) diff --git a/pantheon/toolsets/file/file_manager.py b/pantheon/toolsets/file/file_manager.py index 24da1a3c9..b91e6f87f 100644 --- a/pantheon/toolsets/file/file_manager.py +++ b/pantheon/toolsets/file/file_manager.py @@ -817,33 +817,50 @@ async def write_file( content: str = "", overwrite: bool = True, ) -> dict: - """Use this tool to CREATE NEW file. + """Use this tool to CREATE a NEW file with a skeleton or short content. - This tool writes content to a file, automatically creating parent - directories if they do not exist. + ⚠️ LARGE FILE PROTOCOL — MUST FOLLOW FOR PAPERS, REPORTS, LaTeX, BibTeX: + NEVER pass an entire document as `content` in one call. + Use the Two-Phase Write Protocol instead: - IMPORTANT: For EDITING existing file, use `update_file` instead. - DO NOT rewrite entire file when only small changes are needed, its is wasteful and error-prone. + Phase 1 — Scaffold (this tool, once): + write_file(path, content=) - Use this tool when: - - Creating a brand new file - - Completely rewriting a file from scratch (rare) + Phase 2 — Fill (per semantic section): + update_file(path, old_string=
, new_string=) + → one call per semantic unit (Introduction, Methods, Results, etc.) - DO NOT use this tool when: - - Making partial modifications to an existing file - - Changing a few lines in a large file - - For these cases, use `update_file` instead + For lists / bibliographies (append_file, batched): + append_file(path, content=<10 BibTeX entries or 1 table block at a time>) + + This tool will REFUSE content longer than 12,000 characters. Writing large + content in one shot causes output-token truncation and silent data loss. Args: file_path: The path to the file to write. content: The content to write to the file. overwrite: When False, abort if the target file already exists. - Default is True, but consider using update_file for edits. Returns: dict: Success status or error message. """ - + _WRITE_FILE_MAX_CHARS = 12000 + if len(content) > _WRITE_FILE_MAX_CHARS: + return { + "success": False, + "reason": "content_too_large", + "error": ( + f"Content is {len(content):,} chars, exceeding the " + f"{_WRITE_FILE_MAX_CHARS:,}-char limit per write_file call. " + f"Use the Two-Phase Write Protocol:\n" + f" 1. write_file('{file_path}', content=)\n" + f" 2. update_file('{file_path}', old_string=, new_string=
) " + f"← one call per section\n" + f" 3. append_file('{file_path}', content=) " + f"← for BibTeX / lists (<=10 items per call)\n" + f"Do NOT retry write_file with the same large content." + ), + } target_path = self._resolve_path(file_path) if not overwrite and target_path.exists(): return { @@ -861,6 +878,65 @@ async def write_file( logger.error(f"write_file failed for {file_path}: {exc}") return {"success": False, "error": str(exc)} + @tool + async def append_file( + self, + file_path: str, + content: str, + ) -> dict: + """Append content to the end of an existing file without overwriting it. + + ## Primary use case: chunked writing for large documents + + When a single write_file or update_file call would be too large, split + the content and stream it in parts: + + write_file(path, skeleton) # 1. write header / skeleton + append_file(path, introduction) # 2. append Introduction section + append_file(path, methods) # 3. append Methods section + append_file(path, results) # 4. append Results + Discussion + append_file(path, bibliography) # 5. append Bibliography + + ## For BibTeX bibliographies: + Split into batches of <=10 @article / @inproceedings blocks per call. + + ## Limits: + Content must be <=6,000 characters per call. Split further if needed. + File must already exist (use write_file to create it first). + + Args: + file_path: Path to the file to append to (relative to workspace root). + content: Text to append. Include a leading newline if needed. + + Returns: + dict: {success: true, appended_chars: int} or {success: false, error: str} + """ + _APPEND_FILE_MAX_CHARS = 6000 + if len(content) > _APPEND_FILE_MAX_CHARS: + return { + "success": False, + "reason": "content_too_large", + "error": ( + f"Content is {len(content):,} chars, exceeding the " + f"{_APPEND_FILE_MAX_CHARS:,}-char limit per append_file call. " + f"Split into smaller batches (<=10 BibTeX entries or one section at a time)." + ), + } + target_path = self._resolve_path(file_path) + if not target_path.exists(): + return { + "success": False, + "error": f"File '{file_path}' does not exist. Use write_file to create it first.", + "reason": "file_not_found", + } + try: + with open(target_path, "a", encoding="utf-8") as f: + f.write(content) + return {"success": True, "appended_chars": len(content)} + except Exception as exc: + logger.error(f"append_file failed for {file_path}: {exc}") + return {"success": False, "error": str(exc)} + @tool async def update_file( self, @@ -893,6 +969,18 @@ async def update_file( Returns: dict: {success: bool, replacements: int} or {success: False, error: str} """ + _UPDATE_FILE_MAX_CHARS = 8000 + if len(new_string) > _UPDATE_FILE_MAX_CHARS: + return { + "success": False, + "reason": "content_too_large", + "error": ( + f"new_string is {len(new_string):,} chars, exceeding the " + f"{_UPDATE_FILE_MAX_CHARS:,}-char limit per update_file call. " + f"Split this section into smaller semantic units and call " + f"update_file once per unit (e.g. one paragraph or subsection at a time)." + ), + } target_path = self._resolve_path(file_path) if not target_path.exists(): return {"success": False, "error": "File does not exist"} From 45b7c55d5fea278d366b5e8aab2a5517b245f9c0 Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Tue, 31 Mar 2026 14:41:30 -0700 Subject: [PATCH 03/13] test: add comprehensive tests for output-token truncation guards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tests for PR #52 file manager changes: - write_file: reject >12K, accept at limit, file not created on reject - append_file: basic append, multi-batch (BibTeX pattern), reject nonexistent file, reject >6K, accept at limit - update_file: reject new_string >8K, accept at limit, original unchanged - Two-Phase Write Protocol end-to-end: scaffold → section fill → append 14/14 file manager tests passing. --- tests/test_file_manager.py | 147 +++++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index 1e73eda00..d877c883b 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -510,3 +510,150 @@ async def test_manage_path_comprehensive(temp_toolset): result = await temp_toolset.manage_path("delete", "nonexistent.txt") assert result["success"] is False assert "does not exist" in result["error"] + + +# --------------------------------------------------------------------------- +# Output-token truncation guards (PR #52) +# --------------------------------------------------------------------------- + +async def test_write_file_rejects_large_content(temp_toolset): + """write_file must reject content > 12,000 chars.""" + big = "x" * 13_000 + res = await temp_toolset.write_file("big.txt", big) + assert not res["success"] + assert res["reason"] == "content_too_large" + assert "12,000" in res["error"] + # File must NOT exist on disk + assert not (temp_toolset.path / "big.txt").exists() + + +async def test_write_file_accepts_content_at_limit(temp_toolset): + """write_file must accept content exactly at 12,000 chars.""" + content = "a" * 12_000 + res = await temp_toolset.write_file("exact.txt", content) + assert res["success"] + assert (temp_toolset.path / "exact.txt").read_text() == content + + +async def test_append_file_basic(temp_toolset): + """append_file appends to existing file.""" + await temp_toolset.write_file("log.txt", "header\n") + res = await temp_toolset.append_file("log.txt", "line1\nline2\n") + assert res["success"] + assert res["appended_chars"] == len("line1\nline2\n") + content = (await temp_toolset.read_file("log.txt"))["content"] + assert content == "header\nline1\nline2\n" + + +async def test_append_file_multiple_batches(temp_toolset): + """append_file supports multiple sequential appends (BibTeX batch pattern).""" + await temp_toolset.write_file("refs.bib", "% Bibliography\n") + for i in range(5): + batch = f"@article{{ref{i},\n title={{Title {i}}},\n}}\n\n" + res = await temp_toolset.append_file("refs.bib", batch) + assert res["success"], f"Batch {i} failed: {res}" + content = (await temp_toolset.read_file("refs.bib"))["content"] + assert content.startswith("% Bibliography\n") + assert content.count("@article{") == 5 + + +async def test_append_file_rejects_nonexistent(temp_toolset): + """append_file must reject when target file does not exist.""" + res = await temp_toolset.append_file("missing.txt", "data") + assert not res["success"] + assert res["reason"] == "file_not_found" + + +async def test_append_file_rejects_large_content(temp_toolset): + """append_file must reject content > 6,000 chars.""" + await temp_toolset.write_file("base.txt", "ok\n") + big = "x" * 7_000 + res = await temp_toolset.append_file("base.txt", big) + assert not res["success"] + assert res["reason"] == "content_too_large" + assert "6,000" in res["error"] + # Original content must be unchanged + content = (await temp_toolset.read_file("base.txt"))["content"] + assert content == "ok\n" + + +async def test_append_file_accepts_content_at_limit(temp_toolset): + """append_file must accept content exactly at 6,000 chars.""" + await temp_toolset.write_file("base.txt", "start\n") + chunk = "b" * 6_000 + res = await temp_toolset.append_file("base.txt", chunk) + assert res["success"] + content = (await temp_toolset.read_file("base.txt"))["content"] + assert content == "start\n" + chunk + + +async def test_update_file_rejects_large_new_string(temp_toolset): + """update_file must reject new_string > 8,000 chars.""" + await temp_toolset.write_file("doc.txt", "PLACEHOLDER\n") + big = "y" * 9_000 + res = await temp_toolset.update_file("doc.txt", "PLACEHOLDER", big) + assert not res["success"] + assert res["reason"] == "content_too_large" + assert "8,000" in res["error"] + # Original content must be unchanged + content = (await temp_toolset.read_file("doc.txt"))["content"] + assert content == "PLACEHOLDER\n" + + +async def test_update_file_accepts_new_string_at_limit(temp_toolset): + """update_file must accept new_string exactly at 8,000 chars.""" + await temp_toolset.write_file("doc.txt", "STUB\n") + replacement = "c" * 8_000 + res = await temp_toolset.update_file("doc.txt", "STUB", replacement) + assert res["success"] + content = (await temp_toolset.read_file("doc.txt"))["content"] + assert replacement in content + + +async def test_two_phase_write_protocol(temp_toolset): + """End-to-end: scaffold → section fill → append (the protocol PR #52 teaches).""" + # Phase 1: scaffold + skeleton = ( + "\\documentclass{article}\n" + "\\begin{document}\n" + "\\section{Introduction}\n" + "% INTRO_PLACEHOLDER\n" + "\\section{Methods}\n" + "% METHODS_PLACEHOLDER\n" + "\\end{document}\n" + ) + res = await temp_toolset.write_file("paper.tex", skeleton) + assert res["success"] + + # Phase 2: fill sections via update_file + res = await temp_toolset.update_file( + "paper.tex", + "% INTRO_PLACEHOLDER", + "This paper presents a novel approach to analyzing single-cell data.", + ) + assert res["success"] + + res = await temp_toolset.update_file( + "paper.tex", + "% METHODS_PLACEHOLDER", + "We applied dimensionality reduction using UMAP.", + ) + assert res["success"] + + # Phase 3: append bibliography + bib_entries = "\\begin{thebibliography}{9}\n\\bibitem{ref1} Author, Title, 2024.\n\\end{thebibliography}\n" + # Insert before \end{document} via update_file + res = await temp_toolset.update_file( + "paper.tex", + "\\end{document}", + bib_entries + "\\end{document}", + ) + assert res["success"] + + # Verify final document + content = (await temp_toolset.read_file("paper.tex"))["content"] + assert "novel approach" in content + assert "UMAP" in content + assert "\\bibitem{ref1}" in content + assert "INTRO_PLACEHOLDER" not in content + assert "METHODS_PLACEHOLDER" not in content From 80a92dfaf353c36667a91070a211dcc499918378 Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Tue, 31 Mar 2026 15:06:51 -0700 Subject: [PATCH 04/13] fix(llm): set max_tokens to model's max output + raise tool guard thresholds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause fix: acompletion_litellm() never passed max_tokens (output) to litellm. Anthropic models default to 4096 output tokens, causing tool_use JSON to be truncated mid-generation when the model writes large file content. Fix: auto-detect model's max_output_tokens via litellm.get_model_info() and set it as kwargs["max_tokens"] when not already specified by model_params. With the root cause fixed, the tool-level size guards from PR #52 are now defense-in-depth (not the primary fix). Raised thresholds to match actual output capacity: - write_file: 12K → 40K chars - update_file: 8K → 30K chars - append_file: 6K → 20K chars Thresholds moved to class-level constants (WRITE_FILE_MAX_CHARS, etc.) for easy per-deployment tuning. Tests updated to reference constants instead of hardcoded values. 14/14 file manager tests passing. --- pantheon/toolsets/file/file_manager.py | 60 +++++++++----------------- pantheon/utils/llm.py | 15 +++++++ tests/test_file_manager.py | 33 +++++++------- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/pantheon/toolsets/file/file_manager.py b/pantheon/toolsets/file/file_manager.py index b91e6f87f..99f2b59b5 100644 --- a/pantheon/toolsets/file/file_manager.py +++ b/pantheon/toolsets/file/file_manager.py @@ -810,6 +810,15 @@ async def view_file_outline(self, file_path: str) -> dict: except Exception as e: return {"success": False, "error": str(e)} + # Configurable size guards (defense-in-depth). + # With max_tokens properly set at the LLM call layer, these are safety + # nets — not the primary truncation fix. Defaults are generous enough + # for most single-section writes; the Two-Phase Protocol is recommended + # only for truly huge documents. + WRITE_FILE_MAX_CHARS = 40_000 + APPEND_FILE_MAX_CHARS = 20_000 + UPDATE_FILE_MAX_CHARS = 30_000 + @tool async def write_file( self, @@ -817,24 +826,13 @@ async def write_file( content: str = "", overwrite: bool = True, ) -> dict: - """Use this tool to CREATE a NEW file with a skeleton or short content. - - ⚠️ LARGE FILE PROTOCOL — MUST FOLLOW FOR PAPERS, REPORTS, LaTeX, BibTeX: - NEVER pass an entire document as `content` in one call. - Use the Two-Phase Write Protocol instead: - - Phase 1 — Scaffold (this tool, once): - write_file(path, content=) - - Phase 2 — Fill (per semantic section): - update_file(path, old_string=
, new_string=) - → one call per semantic unit (Introduction, Methods, Results, etc.) - - For lists / bibliographies (append_file, batched): - append_file(path, content=<10 BibTeX entries or 1 table block at a time>) + """Create or overwrite a file. - This tool will REFUSE content longer than 12,000 characters. Writing large - content in one shot causes output-token truncation and silent data loss. + For very large documents (papers, reports), prefer the Two-Phase + Write Protocol: + 1. write_file(path, skeleton) + 2. update_file(path, stub, full_section) — per section + 3. append_file(path, batch) — for BibTeX / lists Args: file_path: The path to the file to write. @@ -844,7 +842,7 @@ async def write_file( Returns: dict: Success status or error message. """ - _WRITE_FILE_MAX_CHARS = 12000 + _WRITE_FILE_MAX_CHARS = self.WRITE_FILE_MAX_CHARS if len(content) > _WRITE_FILE_MAX_CHARS: return { "success": False, @@ -884,34 +882,18 @@ async def append_file( file_path: str, content: str, ) -> dict: - """Append content to the end of an existing file without overwriting it. - - ## Primary use case: chunked writing for large documents - - When a single write_file or update_file call would be too large, split - the content and stream it in parts: - - write_file(path, skeleton) # 1. write header / skeleton - append_file(path, introduction) # 2. append Introduction section - append_file(path, methods) # 3. append Methods section - append_file(path, results) # 4. append Results + Discussion - append_file(path, bibliography) # 5. append Bibliography - - ## For BibTeX bibliographies: - Split into batches of <=10 @article / @inproceedings blocks per call. + """Append content to the end of an existing file. - ## Limits: - Content must be <=6,000 characters per call. Split further if needed. File must already exist (use write_file to create it first). Args: - file_path: Path to the file to append to (relative to workspace root). - content: Text to append. Include a leading newline if needed. + file_path: Path to the file to append to. + content: Text to append. Returns: dict: {success: true, appended_chars: int} or {success: false, error: str} """ - _APPEND_FILE_MAX_CHARS = 6000 + _APPEND_FILE_MAX_CHARS = self.APPEND_FILE_MAX_CHARS if len(content) > _APPEND_FILE_MAX_CHARS: return { "success": False, @@ -969,7 +951,7 @@ async def update_file( Returns: dict: {success: bool, replacements: int} or {success: False, error: str} """ - _UPDATE_FILE_MAX_CHARS = 8000 + _UPDATE_FILE_MAX_CHARS = self.UPDATE_FILE_MAX_CHARS if len(new_string) > _UPDATE_FILE_MAX_CHARS: return { "success": False, diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 3e70b1e70..05d587359 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -562,6 +562,21 @@ async def acompletion( provider_key, model_name, provider_config = find_provider_for_model(model) sdk_type = provider_config.get("sdk", "openai") + # ========== Ensure max_tokens (output) is set ========== + # Without explicit max_tokens, some providers (Anthropic) default to + # very low output limits (4096), causing tool_use JSON to be truncated + # mid-generation when the model writes large file content. + # Set to the model's declared max_output_tokens if not already specified. + if "max_tokens" not in kwargs and "max_output_tokens" not in kwargs: + try: + from litellm.utils import get_model_info + _info = get_model_info(model) + _max_out = _info.get("max_output_tokens") + if _max_out and _max_out > 0: + kwargs["max_tokens"] = _max_out + except Exception: + pass # Fall through to provider default + # ========== Mode Detection & Configuration ========== proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index d877c883b..7603bf610 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -517,19 +517,20 @@ async def test_manage_path_comprehensive(temp_toolset): # --------------------------------------------------------------------------- async def test_write_file_rejects_large_content(temp_toolset): - """write_file must reject content > 12,000 chars.""" - big = "x" * 13_000 + """write_file must reject content exceeding WRITE_FILE_MAX_CHARS.""" + limit = temp_toolset.WRITE_FILE_MAX_CHARS + big = "x" * (limit + 1000) res = await temp_toolset.write_file("big.txt", big) assert not res["success"] assert res["reason"] == "content_too_large" - assert "12,000" in res["error"] # File must NOT exist on disk assert not (temp_toolset.path / "big.txt").exists() async def test_write_file_accepts_content_at_limit(temp_toolset): - """write_file must accept content exactly at 12,000 chars.""" - content = "a" * 12_000 + """write_file must accept content exactly at WRITE_FILE_MAX_CHARS.""" + limit = temp_toolset.WRITE_FILE_MAX_CHARS + content = "a" * limit res = await temp_toolset.write_file("exact.txt", content) assert res["success"] assert (temp_toolset.path / "exact.txt").read_text() == content @@ -565,22 +566,23 @@ async def test_append_file_rejects_nonexistent(temp_toolset): async def test_append_file_rejects_large_content(temp_toolset): - """append_file must reject content > 6,000 chars.""" + """append_file must reject content exceeding APPEND_FILE_MAX_CHARS.""" await temp_toolset.write_file("base.txt", "ok\n") - big = "x" * 7_000 + limit = temp_toolset.APPEND_FILE_MAX_CHARS + big = "x" * (limit + 1000) res = await temp_toolset.append_file("base.txt", big) assert not res["success"] assert res["reason"] == "content_too_large" - assert "6,000" in res["error"] # Original content must be unchanged content = (await temp_toolset.read_file("base.txt"))["content"] assert content == "ok\n" async def test_append_file_accepts_content_at_limit(temp_toolset): - """append_file must accept content exactly at 6,000 chars.""" + """append_file must accept content exactly at APPEND_FILE_MAX_CHARS.""" await temp_toolset.write_file("base.txt", "start\n") - chunk = "b" * 6_000 + limit = temp_toolset.APPEND_FILE_MAX_CHARS + chunk = "b" * limit res = await temp_toolset.append_file("base.txt", chunk) assert res["success"] content = (await temp_toolset.read_file("base.txt"))["content"] @@ -588,22 +590,23 @@ async def test_append_file_accepts_content_at_limit(temp_toolset): async def test_update_file_rejects_large_new_string(temp_toolset): - """update_file must reject new_string > 8,000 chars.""" + """update_file must reject new_string exceeding UPDATE_FILE_MAX_CHARS.""" await temp_toolset.write_file("doc.txt", "PLACEHOLDER\n") - big = "y" * 9_000 + limit = temp_toolset.UPDATE_FILE_MAX_CHARS + big = "y" * (limit + 1000) res = await temp_toolset.update_file("doc.txt", "PLACEHOLDER", big) assert not res["success"] assert res["reason"] == "content_too_large" - assert "8,000" in res["error"] # Original content must be unchanged content = (await temp_toolset.read_file("doc.txt"))["content"] assert content == "PLACEHOLDER\n" async def test_update_file_accepts_new_string_at_limit(temp_toolset): - """update_file must accept new_string exactly at 8,000 chars.""" + """update_file must accept new_string exactly at UPDATE_FILE_MAX_CHARS.""" await temp_toolset.write_file("doc.txt", "STUB\n") - replacement = "c" * 8_000 + limit = temp_toolset.UPDATE_FILE_MAX_CHARS + replacement = "c" * limit res = await temp_toolset.update_file("doc.txt", "STUB", replacement) assert res["success"] content = (await temp_toolset.read_file("doc.txt"))["content"] From c701dc1ae8c8c2bf1cabe9c829e5acf5cae8f726 Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 22:19:10 -0700 Subject: [PATCH 05/13] fix(llm): adapt max_tokens auto-detection for litellm-free architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace litellm.utils.get_model_info with provider_registry.get_model_info and fix kwargs → model_params for the new acompletion signature. Add tests: - test_max_tokens_auto_set: verify catalog has correct max_output_tokens - test_max_tokens_live_openai: live API test (skipped without OPENAI_API_KEY) --- pantheon/utils/llm.py | 7 ++++--- tests/test_file_manager.py | 43 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 05d587359..40b8a2664 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -567,13 +567,14 @@ async def acompletion( # very low output limits (4096), causing tool_use JSON to be truncated # mid-generation when the model writes large file content. # Set to the model's declared max_output_tokens if not already specified. - if "max_tokens" not in kwargs and "max_output_tokens" not in kwargs: + model_params = dict(model_params or {}) + if "max_tokens" not in model_params and "max_output_tokens" not in model_params: try: - from litellm.utils import get_model_info + from .provider_registry import get_model_info _info = get_model_info(model) _max_out = _info.get("max_output_tokens") if _max_out and _max_out > 0: - kwargs["max_tokens"] = _max_out + model_params["max_tokens"] = _max_out except Exception: pass # Fall through to provider default diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index 7603bf610..99edf3c80 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -3,6 +3,8 @@ from tempfile import TemporaryDirectory from pantheon.toolsets.file import FileManagerToolSet +HAS_OPENAI = bool(os.environ.get("OPENAI_API_KEY")) + @pytest.fixture def temp_toolset(): """Create a FileManagerToolSet with a temporary directory.""" @@ -660,3 +662,44 @@ async def test_two_phase_write_protocol(temp_toolset): assert "\\bibitem{ref1}" in content assert "INTRO_PLACEHOLDER" not in content assert "METHODS_PLACEHOLDER" not in content + + +# --------------------------------------------------------------------------- +# max_tokens auto-detection (PR #55 — 7920a72) +# --------------------------------------------------------------------------- + +def test_max_tokens_auto_set(): + """acompletion must auto-set max_tokens from model's max_output_tokens + when not explicitly provided (prevents Anthropic 4096 default truncation).""" + from pantheon.utils.provider_registry import get_model_info + + # Anthropic model — the original failure case + info = get_model_info("anthropic/claude-3-haiku-20240307") + max_out = info.get("max_output_tokens", 0) + assert max_out > 4096, ( + f"Expected max_output_tokens > 4096 for claude-3-haiku, got {max_out}" + ) + + # OpenAI model + info = get_model_info("openai/gpt-4.1-mini") + max_out = info.get("max_output_tokens", 0) + assert max_out > 0, f"Expected max_output_tokens > 0 for gpt-4.1-mini, got {max_out}" + + +@pytest.mark.skipif(not HAS_OPENAI, reason="OPENAI_API_KEY not set") +async def test_max_tokens_live_openai(): + """Live test: acompletion sets max_tokens automatically, preventing truncation.""" + from pantheon.utils.llm_providers import call_llm_provider, detect_provider + + provider_config = detect_provider("openai/gpt-4.1-mini", False) + # Call with a simple prompt, no explicit max_tokens in model_params + message = await call_llm_provider( + config=provider_config, + messages=[ + {"role": "system", "content": "Reply with exactly: OK"}, + {"role": "user", "content": "Say OK"}, + ], + ) + assert isinstance(message, dict) + content = message.get("content", "") + assert len(content) > 0, "Expected non-empty response" From 0b41033df7defe98944d6411d2bd50e89e86da1d Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 22:49:03 -0700 Subject: [PATCH 06/13] refactor(file_manager): remove size guards, merge append_file into write_file Root cause of output truncation was missing max_tokens (fixed in 7920a72), not tool-level size limits. The guards caused worse problems for code generation (repeated rejections, lost content, wasted tokens). Changes: - Remove WRITE_FILE_MAX_CHARS/APPEND_FILE_MAX_CHARS/UPDATE_FILE_MAX_CHARS - Remove content_too_large guards from write_file/update_file - Merge append_file into write_file(append=True) - Update tests: verify large content writes succeed (100K+ chars) - Add live integration test (paper + code scenarios) Live test results (gpt-4.1-mini, no guards): - LaTeX paper: 9,208 chars, 1 write_file call, 0 rejections - Python code: 13,754 chars (314 lines, 4 classes), 1 call, 0 rejections Previously with guards (2K limit test): - Python code: 24 calls, 8 rejections, 3/4 classes MISSING --- pantheon/toolsets/file/file_manager.py | 109 +++------------ scripts/test_two_phase_live.py | 183 +++++++++++++++++++++++++ tests/test_file_manager.py | 141 ++++--------------- 3 files changed, 229 insertions(+), 204 deletions(-) create mode 100644 scripts/test_two_phase_live.py diff --git a/pantheon/toolsets/file/file_manager.py b/pantheon/toolsets/file/file_manager.py index 99f2b59b5..1063566aa 100644 --- a/pantheon/toolsets/file/file_manager.py +++ b/pantheon/toolsets/file/file_manager.py @@ -810,56 +810,42 @@ async def view_file_outline(self, file_path: str) -> dict: except Exception as e: return {"success": False, "error": str(e)} - # Configurable size guards (defense-in-depth). - # With max_tokens properly set at the LLM call layer, these are safety - # nets — not the primary truncation fix. Defaults are generous enough - # for most single-section writes; the Two-Phase Protocol is recommended - # only for truly huge documents. - WRITE_FILE_MAX_CHARS = 40_000 - APPEND_FILE_MAX_CHARS = 20_000 - UPDATE_FILE_MAX_CHARS = 30_000 - @tool async def write_file( self, file_path: str, content: str = "", overwrite: bool = True, + append: bool = False, ) -> dict: - """Create or overwrite a file. - - For very large documents (papers, reports), prefer the Two-Phase - Write Protocol: - 1. write_file(path, skeleton) - 2. update_file(path, stub, full_section) — per section - 3. append_file(path, batch) — for BibTeX / lists + """Create, overwrite, or append to a file. Args: file_path: The path to the file to write. content: The content to write to the file. - overwrite: When False, abort if the target file already exists. + overwrite: When False, abort if the target file already exists (ignored when append=True). + append: When True, append content to the end of an existing file instead of overwriting. Returns: dict: Success status or error message. """ - _WRITE_FILE_MAX_CHARS = self.WRITE_FILE_MAX_CHARS - if len(content) > _WRITE_FILE_MAX_CHARS: - return { - "success": False, - "reason": "content_too_large", - "error": ( - f"Content is {len(content):,} chars, exceeding the " - f"{_WRITE_FILE_MAX_CHARS:,}-char limit per write_file call. " - f"Use the Two-Phase Write Protocol:\n" - f" 1. write_file('{file_path}', content=)\n" - f" 2. update_file('{file_path}', old_string=, new_string=
) " - f"← one call per section\n" - f" 3. append_file('{file_path}', content=) " - f"← for BibTeX / lists (<=10 items per call)\n" - f"Do NOT retry write_file with the same large content." - ), - } target_path = self._resolve_path(file_path) + + if append: + if not target_path.exists(): + return { + "success": False, + "error": f"File '{file_path}' does not exist. Use write_file without append=True to create it first.", + "reason": "file_not_found", + } + try: + with open(target_path, "a", encoding="utf-8") as f: + f.write(content) + return {"success": True, "appended_chars": len(content)} + except Exception as exc: + logger.error(f"write_file(append) failed for {file_path}: {exc}") + return {"success": False, "error": str(exc)} + if not overwrite and target_path.exists(): return { "success": False, @@ -876,49 +862,6 @@ async def write_file( logger.error(f"write_file failed for {file_path}: {exc}") return {"success": False, "error": str(exc)} - @tool - async def append_file( - self, - file_path: str, - content: str, - ) -> dict: - """Append content to the end of an existing file. - - File must already exist (use write_file to create it first). - - Args: - file_path: Path to the file to append to. - content: Text to append. - - Returns: - dict: {success: true, appended_chars: int} or {success: false, error: str} - """ - _APPEND_FILE_MAX_CHARS = self.APPEND_FILE_MAX_CHARS - if len(content) > _APPEND_FILE_MAX_CHARS: - return { - "success": False, - "reason": "content_too_large", - "error": ( - f"Content is {len(content):,} chars, exceeding the " - f"{_APPEND_FILE_MAX_CHARS:,}-char limit per append_file call. " - f"Split into smaller batches (<=10 BibTeX entries or one section at a time)." - ), - } - target_path = self._resolve_path(file_path) - if not target_path.exists(): - return { - "success": False, - "error": f"File '{file_path}' does not exist. Use write_file to create it first.", - "reason": "file_not_found", - } - try: - with open(target_path, "a", encoding="utf-8") as f: - f.write(content) - return {"success": True, "appended_chars": len(content)} - except Exception as exc: - logger.error(f"append_file failed for {file_path}: {exc}") - return {"success": False, "error": str(exc)} - @tool async def update_file( self, @@ -951,18 +894,6 @@ async def update_file( Returns: dict: {success: bool, replacements: int} or {success: False, error: str} """ - _UPDATE_FILE_MAX_CHARS = self.UPDATE_FILE_MAX_CHARS - if len(new_string) > _UPDATE_FILE_MAX_CHARS: - return { - "success": False, - "reason": "content_too_large", - "error": ( - f"new_string is {len(new_string):,} chars, exceeding the " - f"{_UPDATE_FILE_MAX_CHARS:,}-char limit per update_file call. " - f"Split this section into smaller semantic units and call " - f"update_file once per unit (e.g. one paragraph or subsection at a time)." - ), - } target_path = self._resolve_path(file_path) if not target_path.exists(): return {"success": False, "error": "File does not exist"} diff --git a/scripts/test_two_phase_live.py b/scripts/test_two_phase_live.py new file mode 100644 index 000000000..a68d96591 --- /dev/null +++ b/scripts/test_two_phase_live.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Live integration test: long paper + long code file writing. + +Verifies that after removing size guards, the LLM can write large files +directly without truncation (root cause fixed by max_tokens auto-detection). + +Requires: OPENAI_API_KEY + +Usage: + OPENAI_API_KEY=sk-... python scripts/test_two_phase_live.py +""" + +from __future__ import annotations + +import asyncio +import os +import sys +import tempfile +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + + +async def run_scenario(name, task, make_checks, model="openai/gpt-4.1-mini"): + from pantheon.agent import Agent + from pantheon.toolsets.file import FileManagerToolSet + + with tempfile.TemporaryDirectory() as tmpdir: + print(f"\n{'─' * 70}") + print(f" Scenario: {name}") + print(f" Model: {model}") + print(f"{'─' * 70}") + + fm = FileManagerToolSet("file_manager", tmpdir) + agent = Agent( + name="writer", + model=model, + instructions=( + "You are a skilled developer and writer. " + "Use file tools (write_file, update_file, read_file) to complete tasks. " + "Write complete, production-quality content — do NOT leave stubs or placeholders." + ), + ) + await agent.toolset(fm) + + calls = [] + rejections = 0 + + async def log(msg): + nonlocal rejections + if msg.get("role") == "assistant": + for tc in msg.get("tool_calls", []) or []: + fn = tc.get("function", {}) + tool_name = fn.get("name", "?").replace("file_manager__", "") + args_len = len(fn.get("arguments", "")) + calls.append(tool_name) + print(f" {tool_name} ({args_len:,} chars)") + elif msg.get("role") == "tool": + c = str(msg.get("content", "")) + if "content_too_large" in c: + rejections += 1 + print(f" -> REJECTED") + + resp = await agent.run( + [{"role": "user", "content": task}], + process_step_message=log, + use_memory=False, + ) + + # Build and run checks + checks = make_checks(tmpdir) + print() + all_pass = True + for check_name, check_fn in checks: + try: + result = check_fn(tmpdir, calls, rejections) + status = "PASS" if result else "FAIL" + if not result: + all_pass = False + except Exception as e: + status = f"FAIL ({e})" + all_pass = False + print(f" [{status}] {check_name}") + + # Show file sizes + for f in Path(tmpdir).rglob("*"): + if f.is_file(): + content = f.read_text(errors="replace") + print(f"\n {f.name}: {len(content):,} chars, {len(content.splitlines())} lines") + + print(f"\n Tool calls: {len(calls)} total, {rejections} rejected") + print(f" Sequence: {' -> '.join(calls[:15])}{'...' if len(calls) > 15 else ''}") + return all_pass + + +async def main(): + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("SKIP: OPENAI_API_KEY not set") + sys.exit(0) + + print("=" * 70) + print(" Live File Writing Test (no size guards)") + print("=" * 70) + + results = [] + + # ── Scenario 1: Long LaTeX paper ── + paper_task = ( + "Write a complete LaTeX review paper to 'review.tex' about " + "single-cell RNA sequencing analysis methods. Requirements:\n" + "- \\documentclass{article} with proper packages\n" + "- Abstract (100+ words)\n" + "- Introduction (200+ words)\n" + "- Methods section covering: quality control, normalization, " + "dimensionality reduction, clustering, differential expression (300+ words total)\n" + "- Results (150+ words)\n" + "- Discussion (150+ words)\n" + "- Bibliography with at least 10 \\bibitem references\n" + "Write EVERYTHING in a single write_file call to review.tex." + ) + + def make_paper_checks(tmpdir): + p = Path(tmpdir) / "review.tex" + def r(): return p.read_text() if p.exists() else "" + return [ + ("File created", lambda *_: p.exists()), + ("File > 5000 chars", lambda *_: len(r()) > 5000), + ("Has \\documentclass", lambda *_: "\\documentclass" in r()), + ("Has Introduction", lambda *_: "Introduction" in r()), + ("Has Methods", lambda *_: "Methods" in r()), + ("Has Discussion", lambda *_: "Discussion" in r()), + ("Has 10+ bibitem", lambda *_: r().count("\\bibitem") >= 10), + ("No rejections", lambda tmpdir, calls, rej: rej == 0), + ] + + r = await run_scenario("Long LaTeX Paper (single write_file)", paper_task, make_paper_checks) + results.append(("Paper", r)) + + # ── Scenario 2: Long Python code ── + code_task = ( + "Write a complete Python file 'data_pipeline.py' that implements:\n" + "1. A DataLoader class with methods: load_csv, load_json, load_parquet, validate_schema " + "(each with full implementation using pandas, proper docstrings, type hints, error handling)\n" + "2. A DataTransformer class with methods: normalize, filter_outliers, " + "encode_categorical, impute_missing (each fully implemented)\n" + "3. A DataExporter class with methods: to_csv, to_json, to_parquet, to_sql " + "(each fully implemented)\n" + "4. A Pipeline class that chains DataLoader -> DataTransformer -> DataExporter " + "with a run() method, logging, and error handling\n" + "5. A if __name__ == '__main__' block with example usage\n" + "Write EVERYTHING in a single write_file call. Every method must have " + "a real implementation (no pass, no TODO, no placeholders)." + ) + + def make_code_checks(tmpdir): + p = Path(tmpdir) / "data_pipeline.py" + def r(): return p.read_text() if p.exists() else "" + return [ + ("File created", lambda *_: p.exists()), + ("File > 3000 chars", lambda *_: len(r()) > 3000), + ("Has DataLoader", lambda *_: "class DataLoader" in r()), + ("Has DataTransformer", lambda *_: "class DataTransformer" in r()), + ("Has DataExporter", lambda *_: "class DataExporter" in r()), + ("Has Pipeline", lambda *_: "class Pipeline" in r()), + ("Has __main__", lambda *_: "__main__" in r()), + ("No rejections", lambda tmpdir, calls, rej: rej == 0), + ] + + r = await run_scenario("Long Python Code (single write_file)", code_task, make_code_checks) + results.append(("Code", r)) + + # ── Summary ── + print(f"\n{'=' * 70}") + print(" Summary") + print(f"{'=' * 70}") + for name, passed in results: + print(f" {name}: {'PASS' if passed else 'FAIL'}") + print(f"{'=' * 70}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index 99edf3c80..3c2eb6cdb 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -515,153 +515,64 @@ async def test_manage_path_comprehensive(temp_toolset): # --------------------------------------------------------------------------- -# Output-token truncation guards (PR #52) +# write_file append mode + large content tests # --------------------------------------------------------------------------- -async def test_write_file_rejects_large_content(temp_toolset): - """write_file must reject content exceeding WRITE_FILE_MAX_CHARS.""" - limit = temp_toolset.WRITE_FILE_MAX_CHARS - big = "x" * (limit + 1000) - res = await temp_toolset.write_file("big.txt", big) - assert not res["success"] - assert res["reason"] == "content_too_large" - # File must NOT exist on disk - assert not (temp_toolset.path / "big.txt").exists() - - -async def test_write_file_accepts_content_at_limit(temp_toolset): - """write_file must accept content exactly at WRITE_FILE_MAX_CHARS.""" - limit = temp_toolset.WRITE_FILE_MAX_CHARS - content = "a" * limit - res = await temp_toolset.write_file("exact.txt", content) - assert res["success"] - assert (temp_toolset.path / "exact.txt").read_text() == content - - -async def test_append_file_basic(temp_toolset): - """append_file appends to existing file.""" +async def test_write_file_append_basic(temp_toolset): + """write_file(append=True) appends to existing file.""" await temp_toolset.write_file("log.txt", "header\n") - res = await temp_toolset.append_file("log.txt", "line1\nline2\n") + res = await temp_toolset.write_file("log.txt", "line1\nline2\n", append=True) assert res["success"] assert res["appended_chars"] == len("line1\nline2\n") content = (await temp_toolset.read_file("log.txt"))["content"] assert content == "header\nline1\nline2\n" -async def test_append_file_multiple_batches(temp_toolset): - """append_file supports multiple sequential appends (BibTeX batch pattern).""" +async def test_write_file_append_multiple_batches(temp_toolset): + """write_file(append=True) supports multiple sequential appends.""" await temp_toolset.write_file("refs.bib", "% Bibliography\n") for i in range(5): batch = f"@article{{ref{i},\n title={{Title {i}}},\n}}\n\n" - res = await temp_toolset.append_file("refs.bib", batch) + res = await temp_toolset.write_file("refs.bib", batch, append=True) assert res["success"], f"Batch {i} failed: {res}" content = (await temp_toolset.read_file("refs.bib"))["content"] assert content.startswith("% Bibliography\n") assert content.count("@article{") == 5 -async def test_append_file_rejects_nonexistent(temp_toolset): - """append_file must reject when target file does not exist.""" - res = await temp_toolset.append_file("missing.txt", "data") +async def test_write_file_append_rejects_nonexistent(temp_toolset): + """write_file(append=True) rejects when file does not exist.""" + res = await temp_toolset.write_file("missing.txt", "data", append=True) assert not res["success"] assert res["reason"] == "file_not_found" -async def test_append_file_rejects_large_content(temp_toolset): - """append_file must reject content exceeding APPEND_FILE_MAX_CHARS.""" - await temp_toolset.write_file("base.txt", "ok\n") - limit = temp_toolset.APPEND_FILE_MAX_CHARS - big = "x" * (limit + 1000) - res = await temp_toolset.append_file("base.txt", big) - assert not res["success"] - assert res["reason"] == "content_too_large" - # Original content must be unchanged - content = (await temp_toolset.read_file("base.txt"))["content"] - assert content == "ok\n" - - -async def test_append_file_accepts_content_at_limit(temp_toolset): - """append_file must accept content exactly at APPEND_FILE_MAX_CHARS.""" - await temp_toolset.write_file("base.txt", "start\n") - limit = temp_toolset.APPEND_FILE_MAX_CHARS - chunk = "b" * limit - res = await temp_toolset.append_file("base.txt", chunk) +async def test_write_file_large_content(temp_toolset): + """write_file accepts large content (no size guards — root cause fixed at LLM layer).""" + big = "x" * 100_000 + res = await temp_toolset.write_file("big.txt", big) assert res["success"] - content = (await temp_toolset.read_file("base.txt"))["content"] - assert content == "start\n" + chunk + assert (temp_toolset.path / "big.txt").read_text() == big -async def test_update_file_rejects_large_new_string(temp_toolset): - """update_file must reject new_string exceeding UPDATE_FILE_MAX_CHARS.""" +async def test_update_file_large_new_string(temp_toolset): + """update_file accepts large new_string (no size guards).""" await temp_toolset.write_file("doc.txt", "PLACEHOLDER\n") - limit = temp_toolset.UPDATE_FILE_MAX_CHARS - big = "y" * (limit + 1000) + big = "y" * 50_000 res = await temp_toolset.update_file("doc.txt", "PLACEHOLDER", big) - assert not res["success"] - assert res["reason"] == "content_too_large" - # Original content must be unchanged - content = (await temp_toolset.read_file("doc.txt"))["content"] - assert content == "PLACEHOLDER\n" - - -async def test_update_file_accepts_new_string_at_limit(temp_toolset): - """update_file must accept new_string exactly at UPDATE_FILE_MAX_CHARS.""" - await temp_toolset.write_file("doc.txt", "STUB\n") - limit = temp_toolset.UPDATE_FILE_MAX_CHARS - replacement = "c" * limit - res = await temp_toolset.update_file("doc.txt", "STUB", replacement) assert res["success"] content = (await temp_toolset.read_file("doc.txt"))["content"] - assert replacement in content + assert big in content -async def test_two_phase_write_protocol(temp_toolset): - """End-to-end: scaffold → section fill → append (the protocol PR #52 teaches).""" - # Phase 1: scaffold - skeleton = ( - "\\documentclass{article}\n" - "\\begin{document}\n" - "\\section{Introduction}\n" - "% INTRO_PLACEHOLDER\n" - "\\section{Methods}\n" - "% METHODS_PLACEHOLDER\n" - "\\end{document}\n" - ) - res = await temp_toolset.write_file("paper.tex", skeleton) - assert res["success"] - - # Phase 2: fill sections via update_file - res = await temp_toolset.update_file( - "paper.tex", - "% INTRO_PLACEHOLDER", - "This paper presents a novel approach to analyzing single-cell data.", - ) - assert res["success"] - - res = await temp_toolset.update_file( - "paper.tex", - "% METHODS_PLACEHOLDER", - "We applied dimensionality reduction using UMAP.", - ) - assert res["success"] - - # Phase 3: append bibliography - bib_entries = "\\begin{thebibliography}{9}\n\\bibitem{ref1} Author, Title, 2024.\n\\end{thebibliography}\n" - # Insert before \end{document} via update_file - res = await temp_toolset.update_file( - "paper.tex", - "\\end{document}", - bib_entries + "\\end{document}", - ) +async def test_write_file_append_large_content(temp_toolset): + """write_file(append=True) accepts large content (no size guards).""" + await temp_toolset.write_file("base.txt", "start\n") + big = "z" * 50_000 + res = await temp_toolset.write_file("base.txt", big, append=True) assert res["success"] - - # Verify final document - content = (await temp_toolset.read_file("paper.tex"))["content"] - assert "novel approach" in content - assert "UMAP" in content - assert "\\bibitem{ref1}" in content - assert "INTRO_PLACEHOLDER" not in content - assert "METHODS_PLACEHOLDER" not in content + content = (await temp_toolset.read_file("base.txt"))["content"] + assert content == "start\n" + big # --------------------------------------------------------------------------- From 5e89899f5af75cc1df025fda237035fb5c0f211d Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 22:50:58 -0700 Subject: [PATCH 07/13] fix(prompts): remove Two-Phase Protocol reference from delegation failure recovery --- pantheon/factory/templates/prompts/delegation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pantheon/factory/templates/prompts/delegation.md b/pantheon/factory/templates/prompts/delegation.md index 49ffa0430..082b586cf 100644 --- a/pantheon/factory/templates/prompts/delegation.md +++ b/pantheon/factory/templates/prompts/delegation.md @@ -98,9 +98,9 @@ Tool failures and sub-agent errors are expected — **never terminate without pr When a tool call fails, apply the following recovery ladder in order: -**File write failures** (e.g. content too large, output truncation): -1. **Use Two-Phase Write Protocol**: `write_file` (skeleton only) → `update_file` (one section at a time) → `append_file` (BibTeX / list batches). Never retry `write_file` with the same large content. -2. **Downgrade format**: If `.tex` fails after protocol, write `.md`; if `.md` fails, write `.txt` +**File write failures** (e.g. output truncation, encoding errors): +1. **Retry once**: Transient errors may resolve on retry +2. **Downgrade format**: If `.tex` fails, write `.md`; if `.md` fails, write `.txt` 3. **Inline output**: If all file writes fail, output the full content as a code block in the chat **Sub-agent failures** (researcher or illustrator returns error or empty result): From 54ed89bf49975be10aad92f49274d51556f302ec Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 22:51:52 -0700 Subject: [PATCH 08/13] fix(prompts): generalize scientific illustrator description for non-bio domains --- pantheon/factory/templates/teams/default.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pantheon/factory/templates/teams/default.md b/pantheon/factory/templates/teams/default.md index 403d0eaeb..5ceadf74c 100644 --- a/pantheon/factory/templates/teams/default.md +++ b/pantheon/factory/templates/teams/default.md @@ -92,8 +92,8 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa #### Scientific Illustrator -**Delegate for:** Schematic diagrams, pathway figures, cell structure illustrations, BioRender-style publication figures — tasks where the output is a conceptual diagram, not a data-driven chart. -**Execute directly (or via Researcher):** Data visualizations, statistical plots, matplotlib/seaborn charts derived from analysis results. +**Delegate for:** Schematic diagrams, conceptual illustrations, architecture diagrams, publication-quality figures — tasks where the output is a conceptual diagram, not a data-driven chart. +**Execute directly (or via Researcher):** Data visualizations, statistical plots, charts derived from analysis results. ### Decision Summary From bf1a2ba5c9ea06814f7499cb7a972356ec50ba8b Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 22:53:45 -0700 Subject: [PATCH 09/13] docs(file_manager): improve write_file docstring with usage guidance --- pantheon/toolsets/file/file_manager.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pantheon/toolsets/file/file_manager.py b/pantheon/toolsets/file/file_manager.py index 1063566aa..f08534adf 100644 --- a/pantheon/toolsets/file/file_manager.py +++ b/pantheon/toolsets/file/file_manager.py @@ -818,13 +818,28 @@ async def write_file( overwrite: bool = True, append: bool = False, ) -> dict: - """Create, overwrite, or append to a file. + """Create a new file, overwrite an existing one, or append to it. + + Parent directories are created automatically if they do not exist. + + For EDITING existing files, prefer `update_file` instead — it is + safer and more efficient for partial modifications. + + Use this tool when: + - Creating a brand new file + - Completely rewriting a file from scratch + - Appending content to an existing file (set append=True) + + Do NOT use this tool when: + - Making partial modifications to an existing file (use `update_file`) + - Changing a few lines in a large file (use `update_file`) Args: file_path: The path to the file to write. content: The content to write to the file. overwrite: When False, abort if the target file already exists (ignored when append=True). append: When True, append content to the end of an existing file instead of overwriting. + The file must already exist when using append mode. Returns: dict: Success status or error message. From 55342fbdbbeb9b31106ca5bd76399221793f67ac Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 22:58:38 -0700 Subject: [PATCH 10/13] refactor(prompts): remove failure recovery section from delegation.md Eliminated the detailed failure recovery guidelines for tool and sub-agent errors from delegation.md to streamline the document. This change simplifies the prompts and focuses on essential delegation instructions. --- .../factory/templates/prompts/delegation.md | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/pantheon/factory/templates/prompts/delegation.md b/pantheon/factory/templates/prompts/delegation.md index 082b586cf..63c6c0fc1 100644 --- a/pantheon/factory/templates/prompts/delegation.md +++ b/pantheon/factory/templates/prompts/delegation.md @@ -92,23 +92,3 @@ call_agent( call_agent("researcher", "Do analysis fast.") ``` -### Failure Recovery - -Tool failures and sub-agent errors are expected — **never terminate without producing output.** - -When a tool call fails, apply the following recovery ladder in order: - -**File write failures** (e.g. output truncation, encoding errors): -1. **Retry once**: Transient errors may resolve on retry -2. **Downgrade format**: If `.tex` fails, write `.md`; if `.md` fails, write `.txt` -3. **Inline output**: If all file writes fail, output the full content as a code block in the chat - -**Sub-agent failures** (researcher or illustrator returns error or empty result): -1. **Retry with narrower scope**: Re-delegate with a smaller, more focused Task Brief -2. **Self-execute fallback**: Handle the task directly if sub-agent repeatedly fails -3. **Partial output**: Deliver what was completed and clearly state what is missing - -**Hard rule — no silent failures:** -- Always produce at least one artifact per session, even if degraded -- When falling back to a simpler format, tell the user explicitly: what you tried, why it failed, what you're delivering instead -- A partial result delivered is always better than a perfect result abandoned From b60c94d9a594a461c994e157d14beea0ae855386 Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 23:28:02 -0700 Subject: [PATCH 11/13] feat(llm): enhance token management and model integration - Added new models to the LLM catalog, including "anthropic" and "google-auth" with their respective dependencies and configurations. - Implemented dynamic handling of output token parameters across different providers to ensure compatibility and prevent truncation issues. - Updated the LLM response handling to utilize model-specific output token limits, improving the robustness of API calls. - Enhanced tests to verify the correct application of output token parameters and model information retrieval. This update aims to streamline interactions with various LLM providers and improve overall system reliability. --- pantheon/utils/adapters/openai_adapter.py | 64 +++++++++++ pantheon/utils/llm.py | 46 +++++--- pantheon/utils/llm_catalog.json | 31 ++++++ pantheon/utils/provider_registry.py | 26 +++++ tests/test_agent.py | 35 +++++- tests/test_provider_adapters.py | 130 ++++++++++++++++++++++ 6 files changed, 314 insertions(+), 18 deletions(-) diff --git a/pantheon/utils/adapters/openai_adapter.py b/pantheon/utils/adapters/openai_adapter.py index bdedd2ede..f80c63acd 100644 --- a/pantheon/utils/adapters/openai_adapter.py +++ b/pantheon/utils/adapters/openai_adapter.py @@ -6,6 +6,7 @@ """ import os +import re import time from typing import Any, Callable @@ -74,6 +75,60 @@ def _normalize_response_format(response_format: Any) -> Any: return response_format +_OUTPUT_TOKEN_KEYS = ("max_tokens", "max_completion_tokens", "max_output_tokens") + + +def _clone_call_kwargs(call_kwargs: dict[str, Any]) -> dict[str, Any]: + """Clone request kwargs without mutating the current attempt.""" + cloned = dict(call_kwargs) + if "stream_options" in call_kwargs: + cloned["stream_options"] = dict(call_kwargs["stream_options"]) + return cloned + + +def _build_output_token_retry_kwargs(call_kwargs: dict[str, Any], err: Exception) -> tuple[dict[str, Any] | None, str | None]: + """Try to recover from vendor-specific output-token parameter failures.""" + err_text = str(err) + current_key = next((key for key in _OUTPUT_TOKEN_KEYS if key in call_kwargs), None) + current_value = call_kwargs.get(current_key) if current_key else None + + suggested_match = re.search( + r"Unsupported parameter: '([^']+)'.*?Use '([^']+)' instead", + err_text, + re.IGNORECASE, + ) + if suggested_match: + bad_key, suggested_key = suggested_match.groups() + value = call_kwargs.get(bad_key, current_value) + if value is not None and suggested_key not in call_kwargs: + new_kwargs = _clone_call_kwargs(call_kwargs) + new_kwargs.pop(bad_key, None) + new_kwargs[suggested_key] = value + return new_kwargs, f"switching output token parameter from {bad_key} to {suggested_key}" + + max_tokens_match = re.search( + r"supports at most (\d+) completion tokens", + err_text, + re.IGNORECASE, + ) + if current_key and current_value and max_tokens_match: + supported_max = int(max_tokens_match.group(1)) + if int(current_value) > supported_max: + new_kwargs = _clone_call_kwargs(call_kwargs) + new_kwargs[current_key] = supported_max + return new_kwargs, f"clamping {current_key} from {current_value} to {supported_max}" + + if current_key and ( + "unsupported parameter" in err_text.lower() + or "unknown parameter" in err_text.lower() + ): + new_kwargs = _clone_call_kwargs(call_kwargs) + new_kwargs.pop(current_key, None) + return new_kwargs, f"removing unsupported output token parameter {current_key}" + + return None, None + + class OpenAIAdapter(BaseAdapter): """Adapter for OpenAI and OpenAI-compatible APIs.""" @@ -134,6 +189,7 @@ async def acompletion( call_kwargs.update(kwargs) retry_count = num_retries + recovery_attempts = 2 while retry_count > 0: try: stream_start_time = time.time() @@ -204,6 +260,14 @@ async def acompletion( return collected_chunks except Exception as e: + recovered_kwargs, recovery_reason = _build_output_token_retry_kwargs(call_kwargs, e) + if recovered_kwargs is not None and recovery_attempts > 0: + recovery_attempts -= 1 + call_kwargs = recovered_kwargs + logger.warning( + f"Retrying chat completion after request adjustment ({recovery_reason}) [{model}]" + ) + continue wrapped = _wrap_openai_error(e) if isinstance(wrapped, APIConnectionError): retry_count -= 1 diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 40b8a2664..22a6e07b0 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -241,6 +241,7 @@ async def acompletion_responses( """ from openai import AsyncOpenAI from .llm_providers import get_proxy_kwargs + from .provider_registry import get_model_info, get_output_token_param # ========== Build client ========== proxy_kwargs = get_proxy_kwargs() @@ -257,7 +258,19 @@ async def acompletion_responses( # ========== Convert inputs ========== instructions, input_items = _convert_messages_to_responses_input(messages) converted_tools = _convert_tools_for_responses(tools) - extra_params = _convert_model_params_for_responses(model_params) + response_model_params = dict(model_params or {}) + if not any( + key in response_model_params + for key in ("max_tokens", "max_completion_tokens", "max_output_tokens") + ): + try: + max_out = get_model_info(model).get("max_output_tokens") + token_param = get_output_token_param(model, api_mode="responses") + if max_out and max_out > 0: + response_model_params[token_param] = max_out + except Exception: + pass + extra_params = _convert_model_params_for_responses(response_model_params) # ========== Build kwargs ========== kwargs: dict[str, Any] = { @@ -553,7 +566,13 @@ async def acompletion( - Uses native SDK adapters (openai, anthropic, google-genai) """ from .llm_providers import get_proxy_kwargs - from .provider_registry import find_provider_for_model, get_provider_config, completion_cost + from .provider_registry import ( + find_provider_for_model, + get_provider_config, + completion_cost, + get_model_info, + get_output_token_param, + ) from .adapters import get_adapter logger.debug(f"[ACOMPLETION] Starting LLM call | Model={model}") @@ -562,19 +581,20 @@ async def acompletion( provider_key, model_name, provider_config = find_provider_for_model(model) sdk_type = provider_config.get("sdk", "openai") - # ========== Ensure max_tokens (output) is set ========== - # Without explicit max_tokens, some providers (Anthropic) default to - # very low output limits (4096), causing tool_use JSON to be truncated - # mid-generation when the model writes large file content. - # Set to the model's declared max_output_tokens if not already specified. + # ========== Ensure output token limit is set from the catalog ========== + # Different vendors use different parameter names for the same concept. + # The catalog records the preferred parameter name; we use it here so the + # first request is correct for known providers/models. model_params = dict(model_params or {}) - if "max_tokens" not in model_params and "max_output_tokens" not in model_params: + if not any( + key in model_params + for key in ("max_tokens", "max_completion_tokens", "max_output_tokens") + ): try: - from .provider_registry import get_model_info - _info = get_model_info(model) - _max_out = _info.get("max_output_tokens") - if _max_out and _max_out > 0: - model_params["max_tokens"] = _max_out + max_out = get_model_info(model).get("max_output_tokens") + token_param = get_output_token_param(model, api_mode="chat") + if max_out and max_out > 0: + model_params[token_param] = max_out except Exception: pass # Fall through to provider default diff --git a/pantheon/utils/llm_catalog.json b/pantheon/utils/llm_catalog.json index 6f1d47e9e..71f5dc9d4 100644 --- a/pantheon/utils/llm_catalog.json +++ b/pantheon/utils/llm_catalog.json @@ -7,6 +7,8 @@ "base_url": "https://api.openai.com/v1", "api_key_env": "OPENAI_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_completion_tokens", + "responses_output_token_param": "max_output_tokens", "models": { "gpt-5.4-pro": { "max_input_tokens": 1000000, @@ -184,6 +186,22 @@ "supports_computer_use": false, "supports_assistant_prefill": false }, + "gpt-4o-mini": { + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, "o3-pro": { "max_input_tokens": 200000, "max_output_tokens": 100000, @@ -307,6 +325,7 @@ "base_url": "https://api.anthropic.com", "api_key_env": "ANTHROPIC_API_KEY", "openai_compatible": false, + "chat_output_token_param": "max_tokens", "models": { "claude-opus-4-6": { "max_input_tokens": 1000000, @@ -428,6 +447,7 @@ "base_url": "https://generativelanguage.googleapis.com", "api_key_env": "GEMINI_API_KEY", "openai_compatible": false, + "chat_output_token_param": "max_output_tokens", "models": { "gemini-3.1-pro-preview": { "max_input_tokens": 2000000, @@ -560,6 +580,7 @@ "base_url": "https://api.deepseek.com/v1", "api_key_env": "DEEPSEEK_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "deepseek-chat": { "max_input_tokens": 131072, @@ -601,6 +622,7 @@ "base_url": "https://open.bigmodel.cn/api/paas/v4", "api_key_env": "ZAI_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "glm-5": { "max_input_tokens": 131072, @@ -706,6 +728,7 @@ "base_url": "https://api.minimax.io/v1", "api_key_env": "MINIMAX_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "MiniMax-M2.7": { "max_input_tokens": 1000000, @@ -795,6 +818,7 @@ "base_url": "https://api.moonshot.ai/v1", "api_key_env": "MOONSHOT_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "kimi-k2.5": { "max_input_tokens": 131072, @@ -836,6 +860,7 @@ "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", "api_key_env": "DASHSCOPE_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "qwen3-235b-a22b": { "max_input_tokens": 131072, @@ -989,6 +1014,7 @@ "base_url": "https://api.groq.com/openai/v1", "api_key_env": "GROQ_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_completion_tokens", "models": { "openai/gpt-oss-120b": { "max_input_tokens": 131072, @@ -1110,6 +1136,7 @@ "base_url": "https://openrouter.ai/api/v1", "api_key_env": "OPENROUTER_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "anthropic/claude-sonnet-4-6": { "max_input_tokens": 1000000, @@ -1183,6 +1210,7 @@ "base_url": "https://api.mistral.ai/v1", "api_key_env": "MISTRAL_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "mistral-large-latest": { "max_input_tokens": 262144, @@ -1272,6 +1300,7 @@ "base_url": "https://api.together.xyz/v1", "api_key_env": "TOGETHER_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "Qwen/Qwen3.5-397B-A17B": { "max_input_tokens": 262144, @@ -1346,6 +1375,7 @@ "api_key_env": "", "openai_compatible": false, "auth_mode": "oauth", + "responses_output_token_param": "max_output_tokens", "models": { "gpt-5.4": { "max_input_tokens": 1000000, @@ -1406,6 +1436,7 @@ "api_key_env": "", "openai_compatible": true, "local": true, + "chat_output_token_param": "max_tokens", "models": {} } } diff --git a/pantheon/utils/provider_registry.py b/pantheon/utils/provider_registry.py index 5d92596bd..7f69b3be7 100644 --- a/pantheon/utils/provider_registry.py +++ b/pantheon/utils/provider_registry.py @@ -34,6 +34,13 @@ "supports_assistant_prefill": False, } +_DEFAULT_OUTPUT_TOKEN_PARAMS = { + "openai": "max_tokens", + "anthropic": "max_tokens", + "google-genai": "max_output_tokens", + "codex": "max_output_tokens", +} + @lru_cache(maxsize=1) def load_catalog() -> dict: @@ -98,6 +105,25 @@ def get_provider_config(provider: str) -> dict: return catalog.get("providers", {}).get(provider, {}) +def get_output_token_param(model: str, api_mode: str = "chat") -> str: + """Return the provider/model-specific output token parameter name. + + Args: + model: Model string, e.g. ``openai/gpt-5.4`` or ``gpt-4o-mini``. + api_mode: ``chat`` for chat/completions style APIs, ``responses`` for + OpenAI Responses-style APIs. + """ + provider_key, _model_name, provider_config = find_provider_for_model(model) + if api_mode == "responses": + return provider_config.get("responses_output_token_param", "max_output_tokens") + + if "chat_output_token_param" in provider_config: + return provider_config["chat_output_token_param"] + + sdk_type = provider_config.get("sdk", "openai") + return _DEFAULT_OUTPUT_TOKEN_PARAMS.get(sdk_type, "max_tokens") + + # ============ Model Metadata ============ diff --git a/tests/test_agent.py b/tests/test_agent.py index 34f61d1ad..6e4a5945d 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -1,4 +1,5 @@ import asyncio +import json import random from pathlib import Path from typing import List @@ -187,8 +188,19 @@ def get_weather(city: str, unit: str = "celsius"): """Get the weather of a city.""" return {"weather": "sunny", "temperature": 20} - resp = await agent.run("What is the weather in Palo Alto?") - print(resp.content) + sync_tool_messages = await agent._handle_tool_calls( + tool_calls=[{ + "id": "call_sync_weather", + "function": { + "name": "get_weather", + "arguments": json.dumps({"city": "Palo Alto", "unit": "celsius"}), + }, + }], + context_variables={}, + timeout=agent.tool_timeout, + ) + assert sync_tool_messages + assert "sunny" in sync_tool_messages[0]["content"].lower() agent.functions.clear() @@ -201,9 +213,22 @@ async def get_weather(city: str, unit: str = "celsius"): nonlocal flag flag = False - resp = await agent.run("What is the weather in Palo Alto?") - assert flag, "Tool should have timed out but it completed execution" - print(resp) + tool_messages = await agent._handle_tool_calls( + tool_calls=[{ + "id": "call_async_weather", + "function": { + "name": "get_weather", + "arguments": json.dumps({"city": "Palo Alto", "unit": "celsius"}), + }, + }], + context_variables={}, + timeout=agent.tool_timeout, + ) + assert tool_messages + bg_tasks = agent._bg_manager.list_tasks() + assert bg_tasks, "Timed out tool should be adopted into background execution" + assert bg_tasks[0].source == "timeout" + assert flag, "Tool coroutine should continue in background instead of blocking the foreground call" async def test_agent_transfer(): diff --git a/tests/test_provider_adapters.py b/tests/test_provider_adapters.py index 8046ab00f..ad51ca32a 100644 --- a/tests/test_provider_adapters.py +++ b/tests/test_provider_adapters.py @@ -6,6 +6,7 @@ import os import sys +from types import SimpleNamespace import pytest sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) @@ -24,6 +25,7 @@ load_catalog, find_provider_for_model, get_model_info, + get_output_token_param, completion_cost, models_by_provider, token_counter, @@ -70,6 +72,11 @@ def test_get_model_info_known(self): assert info["max_input_tokens"] == 1_000_000 assert info["supports_vision"] is True + def test_get_model_info_openai_gpt_4o_mini(self): + info = get_model_info("gpt-4o-mini") + assert info["max_input_tokens"] == 128_000 + assert info["max_output_tokens"] == 16_384 + def test_get_model_info_unknown_returns_defaults(self): info = get_model_info("fake/nonexistent-model") assert info["max_input_tokens"] == 200_000 @@ -86,6 +93,15 @@ def test_models_by_provider_qwen(self): models = models_by_provider("qwen") assert len(models) == 9 + def test_output_token_param_catalog(self): + assert get_output_token_param("openai/gpt-5.4") == "max_completion_tokens" + assert get_output_token_param("anthropic/claude-sonnet-4-6") == "max_tokens" + assert get_output_token_param("gemini/gemini-2.5-flash") == "max_output_tokens" + assert get_output_token_param("deepseek/deepseek-chat") == "max_tokens" + assert get_output_token_param("minimax/MiniMax-M2.5") == "max_tokens" + assert get_output_token_param("groq/llama-3.3-70b-versatile") == "max_completion_tokens" + assert get_output_token_param("codex/gpt-5.4", api_mode="responses") == "max_output_tokens" + def test_token_counter_basic(self): count = token_counter(model="gpt-4", messages=[{"role": "user", "content": "Hello"}]) assert count > 0 @@ -107,6 +123,120 @@ def test_all_default_models_in_catalog(self): assert missing == [], f"Models in selector but not in catalog: {missing}" +@pytest.mark.asyncio +async def test_llm_uses_catalog_output_param_for_openai(monkeypatch): + from pantheon.utils import llm as llm_module + from pantheon.utils import adapters as adapters_module + + captured = {} + + class DummyAdapter: + async def acompletion(self, **kwargs): + captured.update(kwargs) + return [ + { + "choices": [ + { + "index": 0, + "delta": {"role": "assistant", "content": "ok"}, + "finish_reason": "stop", + } + ], + "model": kwargs["model"], + }, + { + "usage": { + "prompt_tokens": 1, + "completion_tokens": 1, + "total_tokens": 2, + }, + "choices": [], + }, + ] + + monkeypatch.setattr(adapters_module, "get_adapter", lambda _sdk: DummyAdapter()) + + resp = await llm_module.acompletion( + messages=[{"role": "user", "content": "hello"}], + model="openai/gpt-5.4", + model_params={}, + ) + + assert resp.choices[0].message.content == "ok" + assert captured["max_completion_tokens"] == 64000 + assert "max_tokens" not in captured + + +@pytest.mark.asyncio +async def test_openai_adapter_recovers_from_unsupported_max_tokens(monkeypatch): + from pantheon.utils.adapters.openai_adapter import OpenAIAdapter + + calls = [] + + class FakeChunk: + def __init__(self, content: str, finish_reason: str | None = None): + delta = SimpleNamespace(model_dump=lambda: {"role": "assistant", "content": content}) + self.choices = [SimpleNamespace(delta=delta, finish_reason=finish_reason)] + self._dump = { + "choices": [ + { + "index": 0, + "delta": {"role": "assistant", "content": content}, + "finish_reason": finish_reason, + } + ] + } + + def model_dump(self): + return self._dump + + class FakeResponse: + def __init__(self): + self._chunks = [ + FakeChunk("hi"), + FakeChunk("", "stop"), + ] + + def __aiter__(self): + self._iter = iter(self._chunks) + return self + + async def __anext__(self): + try: + return next(self._iter) + except StopIteration: + raise StopAsyncIteration + + class FakeCompletions: + async def create(self, **kwargs): + calls.append(dict(kwargs)) + if len(calls) == 1: + raise Exception( + "Unsupported parameter: 'max_tokens' is not supported with this model. " + "Use 'max_completion_tokens' instead." + ) + return FakeResponse() + + class FakeClient: + def __init__(self): + self.chat = SimpleNamespace(completions=FakeCompletions()) + + adapter = OpenAIAdapter() + monkeypatch.setattr(adapter, "_make_client", lambda base_url, api_key: FakeClient()) + + chunks = await adapter.acompletion( + model="gpt-5.4", + messages=[{"role": "user", "content": "hello"}], + max_tokens=64, + ) + + assert len(chunks) == 2 + assert calls[0]["max_tokens"] == 64 + assert "max_completion_tokens" not in calls[0] + assert calls[1]["max_completion_tokens"] == 64 + assert "max_tokens" not in calls[1] + + # ============ stream_chunk_builder unit tests ============ From 91f781a2c9282c5aca118f7a2441ee2e9ccfae5d Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 23:36:08 -0700 Subject: [PATCH 12/13] fix(llm): refine output token parameter checks for model responses - Updated the logic in acompletion_responses and acompletion functions to ensure that the output token parameter is validated alongside the model's max output tokens. - Modified the get_output_token_param function to return None when no valid parameter is found, enhancing type safety and clarity in handling token parameters. These changes aim to improve the robustness of token management across different LLM providers. --- pantheon/utils/llm.py | 4 ++-- pantheon/utils/provider_registry.py | 20 ++++---------------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 22a6e07b0..575105f8e 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -266,7 +266,7 @@ async def acompletion_responses( try: max_out = get_model_info(model).get("max_output_tokens") token_param = get_output_token_param(model, api_mode="responses") - if max_out and max_out > 0: + if token_param and max_out and max_out > 0: response_model_params[token_param] = max_out except Exception: pass @@ -593,7 +593,7 @@ async def acompletion( try: max_out = get_model_info(model).get("max_output_tokens") token_param = get_output_token_param(model, api_mode="chat") - if max_out and max_out > 0: + if token_param and max_out and max_out > 0: model_params[token_param] = max_out except Exception: pass # Fall through to provider default diff --git a/pantheon/utils/provider_registry.py b/pantheon/utils/provider_registry.py index 7f69b3be7..8f8c6bd7b 100644 --- a/pantheon/utils/provider_registry.py +++ b/pantheon/utils/provider_registry.py @@ -34,14 +34,6 @@ "supports_assistant_prefill": False, } -_DEFAULT_OUTPUT_TOKEN_PARAMS = { - "openai": "max_tokens", - "anthropic": "max_tokens", - "google-genai": "max_output_tokens", - "codex": "max_output_tokens", -} - - @lru_cache(maxsize=1) def load_catalog() -> dict: """Load and cache the provider catalog from llm_catalog.json.""" @@ -105,7 +97,7 @@ def get_provider_config(provider: str) -> dict: return catalog.get("providers", {}).get(provider, {}) -def get_output_token_param(model: str, api_mode: str = "chat") -> str: +def get_output_token_param(model: str, api_mode: str = "chat") -> str | None: """Return the provider/model-specific output token parameter name. Args: @@ -113,15 +105,11 @@ def get_output_token_param(model: str, api_mode: str = "chat") -> str: api_mode: ``chat`` for chat/completions style APIs, ``responses`` for OpenAI Responses-style APIs. """ - provider_key, _model_name, provider_config = find_provider_for_model(model) + _provider_key, _model_name, provider_config = find_provider_for_model(model) if api_mode == "responses": - return provider_config.get("responses_output_token_param", "max_output_tokens") - - if "chat_output_token_param" in provider_config: - return provider_config["chat_output_token_param"] + return provider_config.get("responses_output_token_param") - sdk_type = provider_config.get("sdk", "openai") - return _DEFAULT_OUTPUT_TOKEN_PARAMS.get(sdk_type, "max_tokens") + return provider_config.get("chat_output_token_param") # ============ Model Metadata ============ From 746eb2ecbde265f3ad04e5b07fcee74ce5d2d66a Mon Sep 17 00:00:00 2001 From: Starlitnightly Date: Fri, 3 Apr 2026 23:42:58 -0700 Subject: [PATCH 13/13] refactor(openai_adapter): remove unused output token recovery logic - Eliminated the output token recovery functions and related logic from the OpenAIAdapter class, streamlining the codebase. - Updated the acompletion method to remove unnecessary recovery attempts, enhancing clarity and maintainability. These changes focus on simplifying the adapter's implementation while ensuring it remains effective for handling OpenAI API interactions. --- pantheon/utils/adapters/openai_adapter.py | 65 --------------------- tests/test_provider_adapters.py | 71 ----------------------- 2 files changed, 136 deletions(-) diff --git a/pantheon/utils/adapters/openai_adapter.py b/pantheon/utils/adapters/openai_adapter.py index f80c63acd..e343b9975 100644 --- a/pantheon/utils/adapters/openai_adapter.py +++ b/pantheon/utils/adapters/openai_adapter.py @@ -6,7 +6,6 @@ """ import os -import re import time from typing import Any, Callable @@ -74,61 +73,6 @@ def _normalize_response_format(response_format: Any) -> Any: pass return response_format - -_OUTPUT_TOKEN_KEYS = ("max_tokens", "max_completion_tokens", "max_output_tokens") - - -def _clone_call_kwargs(call_kwargs: dict[str, Any]) -> dict[str, Any]: - """Clone request kwargs without mutating the current attempt.""" - cloned = dict(call_kwargs) - if "stream_options" in call_kwargs: - cloned["stream_options"] = dict(call_kwargs["stream_options"]) - return cloned - - -def _build_output_token_retry_kwargs(call_kwargs: dict[str, Any], err: Exception) -> tuple[dict[str, Any] | None, str | None]: - """Try to recover from vendor-specific output-token parameter failures.""" - err_text = str(err) - current_key = next((key for key in _OUTPUT_TOKEN_KEYS if key in call_kwargs), None) - current_value = call_kwargs.get(current_key) if current_key else None - - suggested_match = re.search( - r"Unsupported parameter: '([^']+)'.*?Use '([^']+)' instead", - err_text, - re.IGNORECASE, - ) - if suggested_match: - bad_key, suggested_key = suggested_match.groups() - value = call_kwargs.get(bad_key, current_value) - if value is not None and suggested_key not in call_kwargs: - new_kwargs = _clone_call_kwargs(call_kwargs) - new_kwargs.pop(bad_key, None) - new_kwargs[suggested_key] = value - return new_kwargs, f"switching output token parameter from {bad_key} to {suggested_key}" - - max_tokens_match = re.search( - r"supports at most (\d+) completion tokens", - err_text, - re.IGNORECASE, - ) - if current_key and current_value and max_tokens_match: - supported_max = int(max_tokens_match.group(1)) - if int(current_value) > supported_max: - new_kwargs = _clone_call_kwargs(call_kwargs) - new_kwargs[current_key] = supported_max - return new_kwargs, f"clamping {current_key} from {current_value} to {supported_max}" - - if current_key and ( - "unsupported parameter" in err_text.lower() - or "unknown parameter" in err_text.lower() - ): - new_kwargs = _clone_call_kwargs(call_kwargs) - new_kwargs.pop(current_key, None) - return new_kwargs, f"removing unsupported output token parameter {current_key}" - - return None, None - - class OpenAIAdapter(BaseAdapter): """Adapter for OpenAI and OpenAI-compatible APIs.""" @@ -189,7 +133,6 @@ async def acompletion( call_kwargs.update(kwargs) retry_count = num_retries - recovery_attempts = 2 while retry_count > 0: try: stream_start_time = time.time() @@ -260,14 +203,6 @@ async def acompletion( return collected_chunks except Exception as e: - recovered_kwargs, recovery_reason = _build_output_token_retry_kwargs(call_kwargs, e) - if recovered_kwargs is not None and recovery_attempts > 0: - recovery_attempts -= 1 - call_kwargs = recovered_kwargs - logger.warning( - f"Retrying chat completion after request adjustment ({recovery_reason}) [{model}]" - ) - continue wrapped = _wrap_openai_error(e) if isinstance(wrapped, APIConnectionError): retry_count -= 1 diff --git a/tests/test_provider_adapters.py b/tests/test_provider_adapters.py index ad51ca32a..55edea244 100644 --- a/tests/test_provider_adapters.py +++ b/tests/test_provider_adapters.py @@ -6,7 +6,6 @@ import os import sys -from types import SimpleNamespace import pytest sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) @@ -167,76 +166,6 @@ async def acompletion(self, **kwargs): assert "max_tokens" not in captured -@pytest.mark.asyncio -async def test_openai_adapter_recovers_from_unsupported_max_tokens(monkeypatch): - from pantheon.utils.adapters.openai_adapter import OpenAIAdapter - - calls = [] - - class FakeChunk: - def __init__(self, content: str, finish_reason: str | None = None): - delta = SimpleNamespace(model_dump=lambda: {"role": "assistant", "content": content}) - self.choices = [SimpleNamespace(delta=delta, finish_reason=finish_reason)] - self._dump = { - "choices": [ - { - "index": 0, - "delta": {"role": "assistant", "content": content}, - "finish_reason": finish_reason, - } - ] - } - - def model_dump(self): - return self._dump - - class FakeResponse: - def __init__(self): - self._chunks = [ - FakeChunk("hi"), - FakeChunk("", "stop"), - ] - - def __aiter__(self): - self._iter = iter(self._chunks) - return self - - async def __anext__(self): - try: - return next(self._iter) - except StopIteration: - raise StopAsyncIteration - - class FakeCompletions: - async def create(self, **kwargs): - calls.append(dict(kwargs)) - if len(calls) == 1: - raise Exception( - "Unsupported parameter: 'max_tokens' is not supported with this model. " - "Use 'max_completion_tokens' instead." - ) - return FakeResponse() - - class FakeClient: - def __init__(self): - self.chat = SimpleNamespace(completions=FakeCompletions()) - - adapter = OpenAIAdapter() - monkeypatch.setattr(adapter, "_make_client", lambda base_url, api_key: FakeClient()) - - chunks = await adapter.acompletion( - model="gpt-5.4", - messages=[{"role": "user", "content": "hello"}], - max_tokens=64, - ) - - assert len(chunks) == 2 - assert calls[0]["max_tokens"] == 64 - assert "max_completion_tokens" not in calls[0] - assert calls[1]["max_completion_tokens"] == 64 - assert "max_tokens" not in calls[1] - - # ============ stream_chunk_builder unit tests ============