diff --git a/pantheon/factory/templates/prompts/delegation.md b/pantheon/factory/templates/prompts/delegation.md index fd6d61a72..63c6c0fc1 100644 --- a/pantheon/factory/templates/prompts/delegation.md +++ b/pantheon/factory/templates/prompts/delegation.md @@ -91,3 +91,4 @@ call_agent( ```python call_agent("researcher", "Do analysis fast.") ``` + diff --git a/pantheon/factory/templates/teams/default.md b/pantheon/factory/templates/teams/default.md index 76aa2be6a..5ceadf74c 100644 --- a/pantheon/factory/templates/teams/default.md +++ b/pantheon/factory/templates/teams/default.md @@ -88,10 +88,12 @@ call_agent("researcher", "Search the web for best practices on X. Gather informa - Data analysis, EDA, statistical analysis - Literature review and multi-source research +**Scientific writing gate (MANDATORY):** Before writing any report, paper, or document that requires domain knowledge or citations, you MUST first delegate a research task to `researcher`. Writing without a prior research delegation is not allowed for these task types. + #### Scientific Illustrator -**Delegate for:** Scientific diagrams, publication-quality visualizations, complex figures -**Execute directly:** Simple chart embedding, displaying existing charts +**Delegate for:** Schematic diagrams, conceptual illustrations, architecture diagrams, publication-quality figures — tasks where the output is a conceptual diagram, not a data-driven chart. +**Execute directly (or via Researcher):** Data visualizations, statistical plots, charts derived from analysis results. ### Decision Summary @@ -100,9 +102,11 @@ call_agent("researcher", "Search the web for best practices on X. 
Gather informa | Explore/read/understand codebase | **MUST delegate** to researcher | | Web search or documentation lookup | **MUST delegate** to researcher | | Data analysis or research | **MUST delegate** to researcher | +| Scientific writing (report/paper) | **MUST delegate research first**, then write | | Multiple independent research tasks | **MUST parallelize** with multiple researchers | +| Schematic/pathway/cell diagrams | **Delegate** to scientific_illustrator | | Read 1 known file | Execute directly | -| Write/edit/create files | Execute directly | +| Write/edit/create files (post-research) | Execute directly | | Synthesize researcher results | Execute directly (your core role) | {{delegation}} diff --git a/pantheon/toolsets/file/file_manager.py b/pantheon/toolsets/file/file_manager.py index 24da1a3c9..f08534adf 100644 --- a/pantheon/toolsets/file/file_manager.py +++ b/pantheon/toolsets/file/file_manager.py @@ -816,35 +816,52 @@ async def write_file( file_path: str, content: str = "", overwrite: bool = True, + append: bool = False, ) -> dict: - """Use this tool to CREATE NEW file. + """Create a new file, overwrite an existing one, or append to it. - This tool writes content to a file, automatically creating parent - directories if they do not exist. + Parent directories are created automatically if they do not exist. - IMPORTANT: For EDITING existing file, use `update_file` instead. - DO NOT rewrite entire file when only small changes are needed, its is wasteful and error-prone. + For EDITING existing files, prefer `update_file` instead — it is + safer and more efficient for partial modifications. 
Use this tool when: - Creating a brand new file - - Completely rewriting a file from scratch (rare) + - Completely rewriting a file from scratch + - Appending content to an existing file (set append=True) - DO NOT use this tool when: - - Making partial modifications to an existing file - - Changing a few lines in a large file - - For these cases, use `update_file` instead + Do NOT use this tool when: + - Making partial modifications to an existing file (use `update_file`) + - Changing a few lines in a large file (use `update_file`) Args: file_path: The path to the file to write. content: The content to write to the file. - overwrite: When False, abort if the target file already exists. - Default is True, but consider using update_file for edits. + overwrite: When False, abort if the target file already exists (ignored when append=True). + append: When True, append content to the end of an existing file instead of overwriting. + The file must already exist when using append mode. Returns: dict: Success status or error message. """ target_path = self._resolve_path(file_path) + + if append: + if not target_path.exists(): + return { + "success": False, + "error": f"File '{file_path}' does not exist. 
Use write_file without append=True to create it first.", + "reason": "file_not_found", + } + try: + with open(target_path, "a", encoding="utf-8") as f: + f.write(content) + return {"success": True, "appended_chars": len(content)} + except Exception as exc: + logger.error(f"write_file(append) failed for {file_path}: {exc}") + return {"success": False, "error": str(exc)} + if not overwrite and target_path.exists(): return { "success": False, diff --git a/pantheon/utils/adapters/openai_adapter.py b/pantheon/utils/adapters/openai_adapter.py index bdedd2ede..e343b9975 100644 --- a/pantheon/utils/adapters/openai_adapter.py +++ b/pantheon/utils/adapters/openai_adapter.py @@ -73,7 +73,6 @@ def _normalize_response_format(response_format: Any) -> Any: pass return response_format - class OpenAIAdapter(BaseAdapter): """Adapter for OpenAI and OpenAI-compatible APIs.""" diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 3e70b1e70..575105f8e 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -241,6 +241,7 @@ async def acompletion_responses( """ from openai import AsyncOpenAI from .llm_providers import get_proxy_kwargs + from .provider_registry import get_model_info, get_output_token_param # ========== Build client ========== proxy_kwargs = get_proxy_kwargs() @@ -257,7 +258,19 @@ async def acompletion_responses( # ========== Convert inputs ========== instructions, input_items = _convert_messages_to_responses_input(messages) converted_tools = _convert_tools_for_responses(tools) - extra_params = _convert_model_params_for_responses(model_params) + response_model_params = dict(model_params or {}) + if not any( + key in response_model_params + for key in ("max_tokens", "max_completion_tokens", "max_output_tokens") + ): + try: + max_out = get_model_info(model).get("max_output_tokens") + token_param = get_output_token_param(model, api_mode="responses") + if token_param and max_out and max_out > 0: + response_model_params[token_param] = max_out + except 
Exception: + pass + extra_params = _convert_model_params_for_responses(response_model_params) # ========== Build kwargs ========== kwargs: dict[str, Any] = { @@ -553,7 +566,13 @@ async def acompletion( - Uses native SDK adapters (openai, anthropic, google-genai) """ from .llm_providers import get_proxy_kwargs - from .provider_registry import find_provider_for_model, get_provider_config, completion_cost + from .provider_registry import ( + find_provider_for_model, + get_provider_config, + completion_cost, + get_model_info, + get_output_token_param, + ) from .adapters import get_adapter logger.debug(f"[ACOMPLETION] Starting LLM call | Model={model}") @@ -562,6 +581,23 @@ async def acompletion( provider_key, model_name, provider_config = find_provider_for_model(model) sdk_type = provider_config.get("sdk", "openai") + # ========== Ensure output token limit is set from the catalog ========== + # Different vendors use different parameter names for the same concept. + # The catalog records the preferred parameter name; we use it here so the + # first request is correct for known providers/models. 
+ model_params = dict(model_params or {}) + if not any( + key in model_params + for key in ("max_tokens", "max_completion_tokens", "max_output_tokens") + ): + try: + max_out = get_model_info(model).get("max_output_tokens") + token_param = get_output_token_param(model, api_mode="chat") + if token_param and max_out and max_out > 0: + model_params[token_param] = max_out + except Exception: + pass # Fall through to provider default + # ========== Mode Detection & Configuration ========== proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: diff --git a/pantheon/utils/llm_catalog.json b/pantheon/utils/llm_catalog.json index 6f1d47e9e..71f5dc9d4 100644 --- a/pantheon/utils/llm_catalog.json +++ b/pantheon/utils/llm_catalog.json @@ -7,6 +7,8 @@ "base_url": "https://api.openai.com/v1", "api_key_env": "OPENAI_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_completion_tokens", + "responses_output_token_param": "max_output_tokens", "models": { "gpt-5.4-pro": { "max_input_tokens": 1000000, @@ -184,6 +186,22 @@ "supports_computer_use": false, "supports_assistant_prefill": false }, + "gpt-4o-mini": { + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, "o3-pro": { "max_input_tokens": 200000, "max_output_tokens": 100000, @@ -307,6 +325,7 @@ "base_url": "https://api.anthropic.com", "api_key_env": "ANTHROPIC_API_KEY", "openai_compatible": false, + "chat_output_token_param": "max_tokens", "models": { "claude-opus-4-6": { "max_input_tokens": 1000000, @@ -428,6 +447,7 @@ "base_url": "https://generativelanguage.googleapis.com", "api_key_env": "GEMINI_API_KEY", 
"openai_compatible": false, + "chat_output_token_param": "max_output_tokens", "models": { "gemini-3.1-pro-preview": { "max_input_tokens": 2000000, @@ -560,6 +580,7 @@ "base_url": "https://api.deepseek.com/v1", "api_key_env": "DEEPSEEK_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "deepseek-chat": { "max_input_tokens": 131072, @@ -601,6 +622,7 @@ "base_url": "https://open.bigmodel.cn/api/paas/v4", "api_key_env": "ZAI_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "glm-5": { "max_input_tokens": 131072, @@ -706,6 +728,7 @@ "base_url": "https://api.minimax.io/v1", "api_key_env": "MINIMAX_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "MiniMax-M2.7": { "max_input_tokens": 1000000, @@ -795,6 +818,7 @@ "base_url": "https://api.moonshot.ai/v1", "api_key_env": "MOONSHOT_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "kimi-k2.5": { "max_input_tokens": 131072, @@ -836,6 +860,7 @@ "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", "api_key_env": "DASHSCOPE_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "qwen3-235b-a22b": { "max_input_tokens": 131072, @@ -989,6 +1014,7 @@ "base_url": "https://api.groq.com/openai/v1", "api_key_env": "GROQ_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_completion_tokens", "models": { "openai/gpt-oss-120b": { "max_input_tokens": 131072, @@ -1110,6 +1136,7 @@ "base_url": "https://openrouter.ai/api/v1", "api_key_env": "OPENROUTER_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "anthropic/claude-sonnet-4-6": { "max_input_tokens": 1000000, @@ -1183,6 +1210,7 @@ "base_url": "https://api.mistral.ai/v1", "api_key_env": "MISTRAL_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "mistral-large-latest": { 
"max_input_tokens": 262144, @@ -1272,6 +1300,7 @@ "base_url": "https://api.together.xyz/v1", "api_key_env": "TOGETHER_API_KEY", "openai_compatible": true, + "chat_output_token_param": "max_tokens", "models": { "Qwen/Qwen3.5-397B-A17B": { "max_input_tokens": 262144, @@ -1346,6 +1375,7 @@ "api_key_env": "", "openai_compatible": false, "auth_mode": "oauth", + "responses_output_token_param": "max_output_tokens", "models": { "gpt-5.4": { "max_input_tokens": 1000000, @@ -1406,6 +1436,7 @@ "api_key_env": "", "openai_compatible": true, "local": true, + "chat_output_token_param": "max_tokens", "models": {} } } diff --git a/pantheon/utils/provider_registry.py b/pantheon/utils/provider_registry.py index 5d92596bd..8f8c6bd7b 100644 --- a/pantheon/utils/provider_registry.py +++ b/pantheon/utils/provider_registry.py @@ -34,7 +34,6 @@ "supports_assistant_prefill": False, } - @lru_cache(maxsize=1) def load_catalog() -> dict: """Load and cache the provider catalog from llm_catalog.json.""" @@ -98,6 +97,21 @@ def get_provider_config(provider: str) -> dict: return catalog.get("providers", {}).get(provider, {}) +def get_output_token_param(model: str, api_mode: str = "chat") -> str | None: + """Return the provider/model-specific output token parameter name. + + Args: + model: Model string, e.g. ``openai/gpt-5.4`` or ``gpt-4o-mini``. + api_mode: ``chat`` for chat/completions style APIs, ``responses`` for + OpenAI Responses-style APIs. + """ + _provider_key, _model_name, provider_config = find_provider_for_model(model) + if api_mode == "responses": + return provider_config.get("responses_output_token_param") + + return provider_config.get("chat_output_token_param") + + # ============ Model Metadata ============ diff --git a/scripts/test_two_phase_live.py b/scripts/test_two_phase_live.py new file mode 100644 index 000000000..a68d96591 --- /dev/null +++ b/scripts/test_two_phase_live.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Live integration test: long paper + long code file writing. 
+ +Verifies that after removing size guards, the LLM can write large files +directly without truncation (root cause fixed by max_tokens auto-detection). + +Requires: OPENAI_API_KEY + +Usage: + OPENAI_API_KEY=sk-... python scripts/test_two_phase_live.py +""" + +from __future__ import annotations + +import asyncio +import os +import sys +import tempfile +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + + +async def run_scenario(name, task, make_checks, model="openai/gpt-4.1-mini"): + from pantheon.agent import Agent + from pantheon.toolsets.file import FileManagerToolSet + + with tempfile.TemporaryDirectory() as tmpdir: + print(f"\n{'─' * 70}") + print(f" Scenario: {name}") + print(f" Model: {model}") + print(f"{'─' * 70}") + + fm = FileManagerToolSet("file_manager", tmpdir) + agent = Agent( + name="writer", + model=model, + instructions=( + "You are a skilled developer and writer. " + "Use file tools (write_file, update_file, read_file) to complete tasks. " + "Write complete, production-quality content — do NOT leave stubs or placeholders." 
+ ), + ) + await agent.toolset(fm) + + calls = [] + rejections = 0 + + async def log(msg): + nonlocal rejections + if msg.get("role") == "assistant": + for tc in msg.get("tool_calls", []) or []: + fn = tc.get("function", {}) + tool_name = fn.get("name", "?").replace("file_manager__", "") + args_len = len(fn.get("arguments", "")) + calls.append(tool_name) + print(f" {tool_name} ({args_len:,} chars)") + elif msg.get("role") == "tool": + c = str(msg.get("content", "")) + if "content_too_large" in c: + rejections += 1 + print(f" -> REJECTED") + + resp = await agent.run( + [{"role": "user", "content": task}], + process_step_message=log, + use_memory=False, + ) + + # Build and run checks + checks = make_checks(tmpdir) + print() + all_pass = True + for check_name, check_fn in checks: + try: + result = check_fn(tmpdir, calls, rejections) + status = "PASS" if result else "FAIL" + if not result: + all_pass = False + except Exception as e: + status = f"FAIL ({e})" + all_pass = False + print(f" [{status}] {check_name}") + + # Show file sizes + for f in Path(tmpdir).rglob("*"): + if f.is_file(): + content = f.read_text(errors="replace") + print(f"\n {f.name}: {len(content):,} chars, {len(content.splitlines())} lines") + + print(f"\n Tool calls: {len(calls)} total, {rejections} rejected") + print(f" Sequence: {' -> '.join(calls[:15])}{'...' if len(calls) > 15 else ''}") + return all_pass + + +async def main(): + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + print("SKIP: OPENAI_API_KEY not set") + sys.exit(0) + + print("=" * 70) + print(" Live File Writing Test (no size guards)") + print("=" * 70) + + results = [] + + # ── Scenario 1: Long LaTeX paper ── + paper_task = ( + "Write a complete LaTeX review paper to 'review.tex' about " + "single-cell RNA sequencing analysis methods. 
Requirements:\n" + "- \\documentclass{article} with proper packages\n" + "- Abstract (100+ words)\n" + "- Introduction (200+ words)\n" + "- Methods section covering: quality control, normalization, " + "dimensionality reduction, clustering, differential expression (300+ words total)\n" + "- Results (150+ words)\n" + "- Discussion (150+ words)\n" + "- Bibliography with at least 10 \\bibitem references\n" + "Write EVERYTHING in a single write_file call to review.tex." + ) + + def make_paper_checks(tmpdir): + p = Path(tmpdir) / "review.tex" + def r(): return p.read_text() if p.exists() else "" + return [ + ("File created", lambda *_: p.exists()), + ("File > 5000 chars", lambda *_: len(r()) > 5000), + ("Has \\documentclass", lambda *_: "\\documentclass" in r()), + ("Has Introduction", lambda *_: "Introduction" in r()), + ("Has Methods", lambda *_: "Methods" in r()), + ("Has Discussion", lambda *_: "Discussion" in r()), + ("Has 10+ bibitem", lambda *_: r().count("\\bibitem") >= 10), + ("No rejections", lambda tmpdir, calls, rej: rej == 0), + ] + + r = await run_scenario("Long LaTeX Paper (single write_file)", paper_task, make_paper_checks) + results.append(("Paper", r)) + + # ── Scenario 2: Long Python code ── + code_task = ( + "Write a complete Python file 'data_pipeline.py' that implements:\n" + "1. A DataLoader class with methods: load_csv, load_json, load_parquet, validate_schema " + "(each with full implementation using pandas, proper docstrings, type hints, error handling)\n" + "2. A DataTransformer class with methods: normalize, filter_outliers, " + "encode_categorical, impute_missing (each fully implemented)\n" + "3. A DataExporter class with methods: to_csv, to_json, to_parquet, to_sql " + "(each fully implemented)\n" + "4. A Pipeline class that chains DataLoader -> DataTransformer -> DataExporter " + "with a run() method, logging, and error handling\n" + "5. 
A if __name__ == '__main__' block with example usage\n" + "Write EVERYTHING in a single write_file call. Every method must have " + "a real implementation (no pass, no TODO, no placeholders)." + ) + + def make_code_checks(tmpdir): + p = Path(tmpdir) / "data_pipeline.py" + def r(): return p.read_text() if p.exists() else "" + return [ + ("File created", lambda *_: p.exists()), + ("File > 3000 chars", lambda *_: len(r()) > 3000), + ("Has DataLoader", lambda *_: "class DataLoader" in r()), + ("Has DataTransformer", lambda *_: "class DataTransformer" in r()), + ("Has DataExporter", lambda *_: "class DataExporter" in r()), + ("Has Pipeline", lambda *_: "class Pipeline" in r()), + ("Has __main__", lambda *_: "__main__" in r()), + ("No rejections", lambda tmpdir, calls, rej: rej == 0), + ] + + r = await run_scenario("Long Python Code (single write_file)", code_task, make_code_checks) + results.append(("Code", r)) + + # ── Summary ── + print(f"\n{'=' * 70}") + print(" Summary") + print(f"{'=' * 70}") + for name, passed in results: + print(f" {name}: {'PASS' if passed else 'FAIL'}") + print(f"{'=' * 70}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/test_agent.py b/tests/test_agent.py index 34f61d1ad..6e4a5945d 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -1,4 +1,5 @@ import asyncio +import json import random from pathlib import Path from typing import List @@ -187,8 +188,19 @@ def get_weather(city: str, unit: str = "celsius"): """Get the weather of a city.""" return {"weather": "sunny", "temperature": 20} - resp = await agent.run("What is the weather in Palo Alto?") - print(resp.content) + sync_tool_messages = await agent._handle_tool_calls( + tool_calls=[{ + "id": "call_sync_weather", + "function": { + "name": "get_weather", + "arguments": json.dumps({"city": "Palo Alto", "unit": "celsius"}), + }, + }], + context_variables={}, + timeout=agent.tool_timeout, + ) + assert sync_tool_messages + assert "sunny" in 
sync_tool_messages[0]["content"].lower() agent.functions.clear() @@ -201,9 +213,22 @@ async def get_weather(city: str, unit: str = "celsius"): nonlocal flag flag = False - resp = await agent.run("What is the weather in Palo Alto?") - assert flag, "Tool should have timed out but it completed execution" - print(resp) + tool_messages = await agent._handle_tool_calls( + tool_calls=[{ + "id": "call_async_weather", + "function": { + "name": "get_weather", + "arguments": json.dumps({"city": "Palo Alto", "unit": "celsius"}), + }, + }], + context_variables={}, + timeout=agent.tool_timeout, + ) + assert tool_messages + bg_tasks = agent._bg_manager.list_tasks() + assert bg_tasks, "Timed out tool should be adopted into background execution" + assert bg_tasks[0].source == "timeout" + assert flag, "Tool coroutine should continue in background instead of blocking the foreground call" async def test_agent_transfer(): diff --git a/tests/test_file_manager.py b/tests/test_file_manager.py index 1e73eda00..3c2eb6cdb 100644 --- a/tests/test_file_manager.py +++ b/tests/test_file_manager.py @@ -3,6 +3,8 @@ from tempfile import TemporaryDirectory from pantheon.toolsets.file import FileManagerToolSet +HAS_OPENAI = bool(os.environ.get("OPENAI_API_KEY")) + @pytest.fixture def temp_toolset(): """Create a FileManagerToolSet with a temporary directory.""" @@ -510,3 +512,105 @@ async def test_manage_path_comprehensive(temp_toolset): result = await temp_toolset.manage_path("delete", "nonexistent.txt") assert result["success"] is False assert "does not exist" in result["error"] + + +# --------------------------------------------------------------------------- +# write_file append mode + large content tests +# --------------------------------------------------------------------------- + +async def test_write_file_append_basic(temp_toolset): + """write_file(append=True) appends to existing file.""" + await temp_toolset.write_file("log.txt", "header\n") + res = await temp_toolset.write_file("log.txt", 
"line1\nline2\n", append=True) + assert res["success"] + assert res["appended_chars"] == len("line1\nline2\n") + content = (await temp_toolset.read_file("log.txt"))["content"] + assert content == "header\nline1\nline2\n" + + +async def test_write_file_append_multiple_batches(temp_toolset): + """write_file(append=True) supports multiple sequential appends.""" + await temp_toolset.write_file("refs.bib", "% Bibliography\n") + for i in range(5): + batch = f"@article{{ref{i},\n title={{Title {i}}},\n}}\n\n" + res = await temp_toolset.write_file("refs.bib", batch, append=True) + assert res["success"], f"Batch {i} failed: {res}" + content = (await temp_toolset.read_file("refs.bib"))["content"] + assert content.startswith("% Bibliography\n") + assert content.count("@article{") == 5 + + +async def test_write_file_append_rejects_nonexistent(temp_toolset): + """write_file(append=True) rejects when file does not exist.""" + res = await temp_toolset.write_file("missing.txt", "data", append=True) + assert not res["success"] + assert res["reason"] == "file_not_found" + + +async def test_write_file_large_content(temp_toolset): + """write_file accepts large content (no size guards — root cause fixed at LLM layer).""" + big = "x" * 100_000 + res = await temp_toolset.write_file("big.txt", big) + assert res["success"] + assert (temp_toolset.path / "big.txt").read_text() == big + + +async def test_update_file_large_new_string(temp_toolset): + """update_file accepts large new_string (no size guards).""" + await temp_toolset.write_file("doc.txt", "PLACEHOLDER\n") + big = "y" * 50_000 + res = await temp_toolset.update_file("doc.txt", "PLACEHOLDER", big) + assert res["success"] + content = (await temp_toolset.read_file("doc.txt"))["content"] + assert big in content + + +async def test_write_file_append_large_content(temp_toolset): + """write_file(append=True) accepts large content (no size guards).""" + await temp_toolset.write_file("base.txt", "start\n") + big = "z" * 50_000 + res = await 
temp_toolset.write_file("base.txt", big, append=True) + assert res["success"] + content = (await temp_toolset.read_file("base.txt"))["content"] + assert content == "start\n" + big + + +# --------------------------------------------------------------------------- +# max_tokens auto-detection (PR #55 — 7920a72) +# --------------------------------------------------------------------------- + +def test_max_tokens_auto_set(): + """acompletion must auto-set max_tokens from model's max_output_tokens + when not explicitly provided (prevents Anthropic 4096 default truncation).""" + from pantheon.utils.provider_registry import get_model_info + + # Anthropic model — the original failure case + info = get_model_info("anthropic/claude-3-haiku-20240307") + max_out = info.get("max_output_tokens", 0) + assert max_out > 4096, ( + f"Expected max_output_tokens > 4096 for claude-3-haiku, got {max_out}" + ) + + # OpenAI model + info = get_model_info("openai/gpt-4.1-mini") + max_out = info.get("max_output_tokens", 0) + assert max_out > 0, f"Expected max_output_tokens > 0 for gpt-4.1-mini, got {max_out}" + + +@pytest.mark.skipif(not HAS_OPENAI, reason="OPENAI_API_KEY not set") +async def test_max_tokens_live_openai(): + """Live test: acompletion sets max_tokens automatically, preventing truncation.""" + from pantheon.utils.llm_providers import call_llm_provider, detect_provider + + provider_config = detect_provider("openai/gpt-4.1-mini", False) + # Call with a simple prompt, no explicit max_tokens in model_params + message = await call_llm_provider( + config=provider_config, + messages=[ + {"role": "system", "content": "Reply with exactly: OK"}, + {"role": "user", "content": "Say OK"}, + ], + ) + assert isinstance(message, dict) + content = message.get("content", "") + assert len(content) > 0, "Expected non-empty response" diff --git a/tests/test_provider_adapters.py b/tests/test_provider_adapters.py index 8046ab00f..55edea244 100644 --- a/tests/test_provider_adapters.py +++ 
b/tests/test_provider_adapters.py @@ -24,6 +24,7 @@ load_catalog, find_provider_for_model, get_model_info, + get_output_token_param, completion_cost, models_by_provider, token_counter, @@ -70,6 +71,11 @@ def test_get_model_info_known(self): assert info["max_input_tokens"] == 1_000_000 assert info["supports_vision"] is True + def test_get_model_info_openai_gpt_4o_mini(self): + info = get_model_info("gpt-4o-mini") + assert info["max_input_tokens"] == 128_000 + assert info["max_output_tokens"] == 16_384 + def test_get_model_info_unknown_returns_defaults(self): info = get_model_info("fake/nonexistent-model") assert info["max_input_tokens"] == 200_000 @@ -86,6 +92,15 @@ def test_models_by_provider_qwen(self): models = models_by_provider("qwen") assert len(models) == 9 + def test_output_token_param_catalog(self): + assert get_output_token_param("openai/gpt-5.4") == "max_completion_tokens" + assert get_output_token_param("anthropic/claude-sonnet-4-6") == "max_tokens" + assert get_output_token_param("gemini/gemini-2.5-flash") == "max_output_tokens" + assert get_output_token_param("deepseek/deepseek-chat") == "max_tokens" + assert get_output_token_param("minimax/MiniMax-M2.5") == "max_tokens" + assert get_output_token_param("groq/llama-3.3-70b-versatile") == "max_completion_tokens" + assert get_output_token_param("codex/gpt-5.4", api_mode="responses") == "max_output_tokens" + def test_token_counter_basic(self): count = token_counter(model="gpt-4", messages=[{"role": "user", "content": "Hello"}]) assert count > 0 @@ -107,6 +122,50 @@ def test_all_default_models_in_catalog(self): assert missing == [], f"Models in selector but not in catalog: {missing}" +@pytest.mark.asyncio +async def test_llm_uses_catalog_output_param_for_openai(monkeypatch): + from pantheon.utils import llm as llm_module + from pantheon.utils import adapters as adapters_module + + captured = {} + + class DummyAdapter: + async def acompletion(self, **kwargs): + captured.update(kwargs) + return [ + { + 
"choices": [ + { + "index": 0, + "delta": {"role": "assistant", "content": "ok"}, + "finish_reason": "stop", + } + ], + "model": kwargs["model"], + }, + { + "usage": { + "prompt_tokens": 1, + "completion_tokens": 1, + "total_tokens": 2, + }, + "choices": [], + }, + ] + + monkeypatch.setattr(adapters_module, "get_adapter", lambda _sdk: DummyAdapter()) + + resp = await llm_module.acompletion( + messages=[{"role": "user", "content": "hello"}], + model="openai/gpt-5.4", + model_params={}, + ) + + assert resp.choices[0].message.content == "ok" + assert captured["max_completion_tokens"] == 64000 + assert "max_tokens" not in captured + + # ============ stream_chunk_builder unit tests ============