From 04b7c0ac914e79c000272cb7e0728f36192c2fc9 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Mon, 30 Mar 2026 23:53:28 -0700 Subject: [PATCH 01/13] refactor: replace litellm with lightweight provider catalog + native SDK adapters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove litellm dependency entirely and replace with a catalog-driven provider abstraction layer using native SDKs (openai, anthropic, google-genai). New architecture: - llm_catalog.json: single source of truth for 12 providers, 80+ models (OpenAI, Anthropic, Gemini, DeepSeek, Zhipu, MiniMax, Moonshot, Qwen, Groq, Mistral, Together AI, OpenRouter) - provider_registry.py: catalog loader + get_model_info(), completion_cost(), token_counter(), models_by_provider() - adapters/: per-SDK adapters (openai, anthropic, gemini) with unified interface - OpenAI adapter handles all OpenAI-compatible providers - Anthropic adapter converts message format + normalizes streaming events - Gemini adapter wraps google-genai SDK - stream_chunk_builder(): local replacement for litellm.stream_chunk_builder() with reasoning_content support Key changes: - All litellm imports removed from codebase - pyproject.toml: litellm → anthropic, google-genai, tiktoken - Proxy mode: backward-compat LITELLM_PROXY_* env vars + new LLM_PROXY_* - remove_metadata(): whitelist-based field cleanup (strict providers like Groq reject any non-standard fields) - Null field cleanup: tool_calls=null → field removed - Tool call error recovery: stream interruptions from server-side validation (e.g. Groq hallucinated tool names) return partial text instead of crashing - stream_chunk_builder: handles usage=null from partial/interrupted streams - Responses API support via OpenAI adapter for gpt-5.x-pro and codex models Tested with real API calls across all providers (52/52 tests passing). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- build_backend.spec | 6 +- docs/source/api/utils.rst | 2 +- pantheon/__init__.py | 6 - pantheon/agent.py | 26 +- pantheon/chatroom/room.py | 13 +- pantheon/internal/compression/compressor.py | 2 +- pantheon/repl/__init__.py | 5 - pantheon/repl/__main__.py | 29 +- pantheon/toolsets/image/image_gen.py | 57 +- .../toolsets/knowledge/knowledge_manager.py | 8 +- pantheon/utils/adapters/__init__.py | 54 + pantheon/utils/adapters/anthropic_adapter.py | 454 ++++++ pantheon/utils/adapters/base.py | 158 ++ pantheon/utils/adapters/gemini_adapter.py | 370 +++++ pantheon/utils/adapters/openai_adapter.py | 439 ++++++ pantheon/utils/llm.py | 322 ++-- pantheon/utils/llm_catalog.json | 1327 +++++++++++++++++ pantheon/utils/llm_providers.py | 72 +- pantheon/utils/misc.py | 10 +- pantheon/utils/model_selector.py | 72 +- pantheon/utils/provider_registry.py | 268 ++++ pyproject.toml | 31 +- tests/test_model_selector.py | 12 +- tests/test_provider_adapters.py | 237 +++ tests/test_scfm_router_real_queries.py | 21 +- 25 files changed, 3738 insertions(+), 263 deletions(-) create mode 100644 pantheon/utils/adapters/__init__.py create mode 100644 pantheon/utils/adapters/anthropic_adapter.py create mode 100644 pantheon/utils/adapters/base.py create mode 100644 pantheon/utils/adapters/gemini_adapter.py create mode 100644 pantheon/utils/adapters/openai_adapter.py create mode 100644 pantheon/utils/llm_catalog.json create mode 100644 pantheon/utils/provider_registry.py create mode 100644 tests/test_provider_adapters.py diff --git a/build_backend.spec b/build_backend.spec index 27bdedd9..8a8b66de 100644 --- a/build_backend.spec +++ b/build_backend.spec @@ -26,7 +26,7 @@ datas += copy_metadata('traitlets') datas += copy_metadata('pyzmq') datas += copy_metadata('tornado') datas += copy_metadata('nest_asyncio') -datas += collect_data_files('litellm', includes=['**/*.json']) +datas += collect_data_files('pantheon', subdir='utils', 
includes=['llm_catalog.json']) datas += collect_data_files('tiktoken_ext', includes=['**/*.py']) # fakeredis: model/_command_info.py loads os.path.join(dirname(__file__), '..', 'commands.json') # PyInstaller must include the JSON so the relative path resolves at runtime. @@ -65,9 +65,10 @@ a = Analysis( 'pantheon.toolsets.rag', 'pantheon.toolsets.scfm', 'nats', - 'litellm', 'openai', 'anthropic', + 'google.genai', + 'tiktoken', 'fastmcp', 'fastmcp.server', 'fastmcp.client', @@ -137,6 +138,7 @@ exe = EXE( pyz, a.scripts, [], + exclude_binaries=True, name='pantheon-backend-exe', debug=False, bootloader_ignore_signals=False, diff --git a/docs/source/api/utils.rst b/docs/source/api/utils.rst index 567e3f93..e8ce128e 100644 --- a/docs/source/api/utils.rst +++ b/docs/source/api/utils.rst @@ -58,7 +58,7 @@ Common Functions from pantheon.utils.llm import ( acompletion_openai, - acompletion_litellm, + acompletion_litellm, # adapter-based completion process_messages_for_model, remove_hidden_fields ) diff --git a/pantheon/__init__.py b/pantheon/__init__.py index b489187a..379b5037 100644 --- a/pantheon/__init__.py +++ b/pantheon/__init__.py @@ -21,12 +21,6 @@ except ImportError: pass -# Suppress litellm debug output via env vars (avoid importing litellm at startup, -# it costs ~1.5s. The actual suppress_debug_info/set_verbose flags are set in -# utils/llm.py:import_litellm() the first time litellm is actually used.) 
-os.environ.setdefault("LITELLM_LOG", "ERROR") -# Suppress CLIENT_IP_ENCRYPTION_KEY warning by setting a default value -os.environ.setdefault("CLIENT_IP_ENCRYPTION_KEY", "pantheon-default-key") # Suppress MCP SDK INFO logs ("Processing request of type...") that pollute CLI output import logging diff --git a/pantheon/agent.py b/pantheon/agent.py index 6564f053..1652ceb5 100644 --- a/pantheon/agent.py +++ b/pantheon/agent.py @@ -386,18 +386,15 @@ def __init__(self, partial_message: dict | None = None): def _is_retryable_error(error: Exception) -> bool: """Determine if an LLM API error is transient and worth retrying.""" - try: - from litellm.exceptions import ( - ServiceUnavailableError, - InternalServerError, - RateLimitError, - APIConnectionError, - ) - if isinstance(error, (ServiceUnavailableError, InternalServerError, - RateLimitError, APIConnectionError)): - return True - except ImportError: - pass + from pantheon.utils.adapters.base import ( + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, + ) + if isinstance(error, (ServiceUnavailableError, InternalServerError, + RateLimitError, APIConnectionError)): + return True # Fallback: string matching for common transient error indicators error_str = str(error).lower() return any(kw in error_str for kw in ( @@ -1479,6 +1476,9 @@ async def _acompletion( model_params=model_params, ) + if message is None: + message = {"role": "assistant", "content": "Error: Empty response from model."} + # Step 8: Add metadata to message end_timestamp = time.time() total_time = tracker.end("total") @@ -1603,6 +1603,8 @@ async def _acompletion_with_models( raise except Exception as e: last_error = e + import traceback + logger.error(f"[Agent:{self.name}] Full traceback:\n{traceback.format_exc()}") if _is_retryable_error(e) and attempt < max_retries: delay = min(base_delay * (2 ** attempt), max_delay) diff --git a/pantheon/chatroom/room.py b/pantheon/chatroom/room.py index 043d12b1..ef0287e9 100644 
--- a/pantheon/chatroom/room.py +++ b/pantheon/chatroom/room.py @@ -1657,9 +1657,9 @@ async def speech_to_text(self, bytes_data): bytes_data: The bytes data of the audio (bytes, base64 string, or list). """ try: - import litellm import base64 - from pantheon.utils.llm_providers import get_litellm_proxy_kwargs + from pantheon.utils.llm_providers import get_proxy_kwargs + from pantheon.utils.adapters import get_adapter logger.info(f"[STT] Received bytes_data type={type(bytes_data).__name__}, " f"len={len(bytes_data) if hasattr(bytes_data, '__len__') else 'N/A'}") @@ -1691,12 +1691,15 @@ async def speech_to_text(self, bytes_data): audio_file = io.BytesIO(bytes_data) audio_file.name = "audio.webm" - logger.info("[STT] Calling litellm.atranscription...") + logger.info("[STT] Calling transcription adapter...") + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") response = await asyncio.wait_for( - litellm.atranscription( + adapter.atranscription( model=self.speech_to_text_model, file=audio_file, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ), timeout=30, ) diff --git a/pantheon/internal/compression/compressor.py b/pantheon/internal/compression/compressor.py index 1bb63cc4..e7a9b042 100644 --- a/pantheon/internal/compression/compressor.py +++ b/pantheon/internal/compression/compressor.py @@ -120,7 +120,7 @@ def should_compress(self, messages: list[dict], model: str | None = None) -> boo # Fallback: try to fetch from model info if available if model: try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info info = get_model_info(model) max_tokens = (info.get("max_input_tokens") or 0) + ( diff --git a/pantheon/repl/__init__.py b/pantheon/repl/__init__.py index 7d29cf44..f6197b08 100644 --- a/pantheon/repl/__init__.py +++ b/pantheon/repl/__init__.py @@ -1,8 +1,3 @@ -import os - -# Prevent litellm from making blocking network calls to GitHub 
on startup -os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True") - from .core import Repl __all__ = ["Repl"] \ No newline at end of file diff --git a/pantheon/repl/__main__.py b/pantheon/repl/__main__.py index a626706e..b241fc93 100644 --- a/pantheon/repl/__main__.py +++ b/pantheon/repl/__main__.py @@ -15,7 +15,7 @@ import warnings from pathlib import Path -# Warning filters and litellm config are already set in pantheon/__init__.py +# Warning filters are already set in pantheon/__init__.py # which runs before this __main__.py import fire @@ -159,30 +159,6 @@ def start( ) -async def _update_litellm_cost_map(): - """Background task to update litellm model cost map. - - This runs after startup to fetch the latest model pricing data - from GitHub without blocking the UI. - """ - try: - await asyncio.sleep(2) # Wait for REPL to fully initialize - import litellm - import aiohttp - - # Manually fetch the latest model metadata from GitHub using aiohttp. - # We fetch manually because litellm.get_model_cost_map filters some models, - # and litellm.register_model triggers interactive authentication prompts. 
- url = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json" - - async with aiohttp.ClientSession() as session: - async with session.get(url, timeout=10) as response: - if response.status == 200: - new_map = await response.json(content_type=None) - if new_map: - litellm.model_cost.update(new_map) - except Exception: - pass # Silently ignore - this is a best-effort background update async def _start_async( @@ -328,9 +304,6 @@ def filter(self, record: logging.LogRecord) -> bool: # Disable logging unless explicitly set to DEBUG disable_logging = quiet and log_level != "DEBUG" - # Start background task to update litellm cost map (non-blocking) - asyncio.create_task(_update_litellm_cost_map()) - await repl.run(message=initial_input, disable_logging=disable_logging, log_level=log_level) diff --git a/pantheon/toolsets/image/image_gen.py b/pantheon/toolsets/image/image_gen.py index 7996d11c..82c5f372 100644 --- a/pantheon/toolsets/image/image_gen.py +++ b/pantheon/toolsets/image/image_gen.py @@ -6,18 +6,13 @@ and native image editing models (OpenAI gpt-image). """ -import litellm - -# Suppress litellm debug output (Provider List message) -litellm.suppress_debug_info = True -litellm.set_verbose = False from pantheon.toolset import ToolSet, tool from pantheon.utils.vision import ( ImageStore, get_image_store, expand_image_references_for_llm, ) -from pantheon.utils.llm_providers import get_litellm_proxy_kwargs +from pantheon.utils.llm_providers import get_proxy_kwargs # Multimodal models that support image input + output via acompletion API # Gemini Nano Banana series: Pro / Nano Banana 2 / Nano Banana first-gen @@ -91,16 +86,16 @@ def _get_chat_id(self) -> str: return "default" def _extract_cost_from_response(self, response) -> float: - """Extract cost from LiteLLM response. - + """Extract cost from API response. 
+ Args: - response: LiteLLM response object from acompletion or aimage_generation - + response: Response object from acompletion or aimage_generation + Returns: Cost in USD, or 0.0 if calculation fails """ try: - from litellm import completion_cost + from pantheon.utils.provider_registry import completion_cost cost = completion_cost(completion_response=response) or 0.0 from pantheon.utils.log import logger logger.debug(f"Image generation cost: ${cost:.6f}") @@ -171,12 +166,17 @@ async def _text_input_image_gen( model: str, ) -> dict: """Text-only image generation (DALL-E, Imagen).""" - response = await litellm.aimage_generation( + from pantheon.utils.adapters import get_adapter + + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") + response = await adapter.aimage_generation( model=model, prompt=prompt, size="1024x1024", n=1, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ) # Extract cost from response @@ -232,12 +232,26 @@ async def _multimodal_image_gen( self.image_store.process_message_images(messages[0], chat_id) messages = expand_image_references_for_llm(messages) - response = await litellm.acompletion( - model=model, + from pantheon.utils.adapters import get_adapter + from pantheon.utils.provider_registry import find_provider_for_model + + proxy_kwargs = get_proxy_kwargs() + provider_key, model_name, provider_config = find_provider_for_model(model) + sdk_type = provider_config.get("sdk", "openai") + if proxy_kwargs: + sdk_type = "openai" + adapter = get_adapter(sdk_type) + + collected_chunks = await adapter.acompletion( + model=model_name if not proxy_kwargs else model, messages=messages, - modalities=["text", "image"], # Enable image generation output - **get_litellm_proxy_kwargs(), # Use proxy for real API keys + stream=True, + base_url=proxy_kwargs.get("base_url") or provider_config.get("base_url"), + api_key=proxy_kwargs.get("api_key"), + modalities=["text", "image"], ) + 
from pantheon.utils.llm import stream_chunk_builder + response = stream_chunk_builder(collected_chunks) # Extract cost from response cost = self._extract_cost_from_response(response) @@ -283,13 +297,18 @@ async def _image_edit_gen( resolved = self.image_store.normalize_local_path(path) resolved_paths.append(resolved) - response = await litellm.aimage_edit( + from pantheon.utils.adapters import get_adapter + + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") + response = await adapter.aimage_edit( model=model, image=resolved_paths, prompt=prompt, size="1024x1024", n=1, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ) # Extract cost from response diff --git a/pantheon/toolsets/knowledge/knowledge_manager.py b/pantheon/toolsets/knowledge/knowledge_manager.py index 509b0bca..848e1473 100644 --- a/pantheon/toolsets/knowledge/knowledge_manager.py +++ b/pantheon/toolsets/knowledge/knowledge_manager.py @@ -87,7 +87,7 @@ async def run_setup(self): def _create_llm(): from llama_index.llms.openai import OpenAI from pantheon.settings import get_settings - from pantheon.utils.llm_providers import get_litellm_proxy_kwargs + from pantheon.utils.llm_providers import get_proxy_kwargs settings = get_settings() llm_kwargs = { @@ -99,10 +99,10 @@ def _create_llm(): if api_base: llm_kwargs["api_base"] = api_base - # Use LiteLLM proxy if enabled (overrides api_base/api_key) - proxy_kwargs = get_litellm_proxy_kwargs() + # Use proxy if enabled (overrides api_base/api_key) + proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: - llm_kwargs["api_base"] = proxy_kwargs["api_base"] + llm_kwargs["api_base"] = proxy_kwargs["base_url"] llm_kwargs["api_key"] = proxy_kwargs["api_key"] return OpenAI(**llm_kwargs) diff --git a/pantheon/utils/adapters/__init__.py b/pantheon/utils/adapters/__init__.py new file mode 100644 index 00000000..502a33ff --- /dev/null +++ b/pantheon/utils/adapters/__init__.py @@ -0,0 +1,54 @@ 
+""" +LLM Provider Adapters — unified async interface for different SDK types. + +Each adapter wraps a specific SDK (openai, anthropic, google-genai) and +exposes a common interface: acompletion, aembedding, aimage_generation, etc. +""" + +from functools import lru_cache + +from .base import ( + BaseAdapter, + LLMError, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + + +@lru_cache(maxsize=8) +def get_adapter(sdk_type: str) -> BaseAdapter: + """Get an adapter singleton for the given SDK type. + + Args: + sdk_type: One of 'openai', 'anthropic', 'google-genai' + + Returns: + BaseAdapter instance + """ + if sdk_type == "openai": + from .openai_adapter import OpenAIAdapter + return OpenAIAdapter() + elif sdk_type == "anthropic": + from .anthropic_adapter import AnthropicAdapter + return AnthropicAdapter() + elif sdk_type == "google-genai": + from .gemini_adapter import GeminiAdapter + return GeminiAdapter() + else: + # Default to OpenAI adapter for unknown SDK types + # (many providers are OpenAI-compatible) + from .openai_adapter import OpenAIAdapter + return OpenAIAdapter() + + +__all__ = [ + "get_adapter", + "BaseAdapter", + "LLMError", + "ServiceUnavailableError", + "InternalServerError", + "RateLimitError", + "APIConnectionError", +] diff --git a/pantheon/utils/adapters/anthropic_adapter.py b/pantheon/utils/adapters/anthropic_adapter.py new file mode 100644 index 00000000..9d1368fa --- /dev/null +++ b/pantheon/utils/adapters/anthropic_adapter.py @@ -0,0 +1,454 @@ +""" +Anthropic adapter — handles Claude models via the native Anthropic SDK. + +Converts between OpenAI message format (used internally by PantheonOS) +and Anthropic's native format, and normalizes streaming events. 
+""" + +import json +import time +from typing import Any, Callable + +from ..log import logger +from ..misc import run_func +from .base import ( + BaseAdapter, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + + +def _wrap_anthropic_error(e: Exception) -> Exception: + """Convert anthropic SDK exceptions to unified exception types.""" + try: + import anthropic as anthropic_mod + + if isinstance(e, anthropic_mod.RateLimitError): + return RateLimitError(str(e)) + elif isinstance(e, anthropic_mod.APIConnectionError): + return APIConnectionError(str(e)) + elif isinstance(e, anthropic_mod.InternalServerError): + return InternalServerError(str(e)) + elif isinstance(e, anthropic_mod.APIStatusError): + status = getattr(e, "status_code", 0) + if status == 503: + return ServiceUnavailableError(str(e)) + elif status == 429: + return RateLimitError(str(e)) + elif status >= 500: + return InternalServerError(str(e)) + except ImportError: + pass + return e + + +# ============ Message Format Conversion ============ + + +def _convert_messages_to_anthropic(messages: list[dict]) -> tuple[str | None, list[dict]]: + """Convert OpenAI-format messages to Anthropic format. 
+ + Key differences: + - System messages become top-level `system` parameter + - tool_calls in assistant messages become tool_use content blocks + - tool role messages become tool_result content blocks in user messages + + Returns: + (system_prompt, converted_messages) + """ + system_prompt = None + converted = [] + pending_tool_results = [] + + for msg in messages: + role = msg.get("role") + content = msg.get("content") + + if role == "system": + # First system message becomes the system parameter + if system_prompt is None: + system_prompt = content if isinstance(content, str) else str(content) + else: + # Additional system messages become user messages + converted.append({ + "role": "user", + "content": f"[System]: {content}" + }) + continue + + if role == "tool": + # Accumulate tool results to attach to next user message + pending_tool_results.append({ + "type": "tool_result", + "tool_use_id": msg.get("tool_call_id", ""), + "content": content or "", + }) + continue + + if role == "user": + # Flush pending tool results first + if pending_tool_results: + # Tool results must be in a user message + result_content = list(pending_tool_results) + if content: + if isinstance(content, str): + result_content.append({"type": "text", "text": content}) + elif isinstance(content, list): + result_content.extend(content) + converted.append({"role": "user", "content": result_content}) + pending_tool_results = [] + else: + converted.append({"role": "user", "content": content}) + continue + + if role == "assistant": + # Flush any pending tool results as a separate user message + if pending_tool_results: + converted.append({"role": "user", "content": list(pending_tool_results)}) + pending_tool_results = [] + + # Build content blocks for assistant + content_blocks = [] + + # Text content + if content: + if isinstance(content, str): + content_blocks.append({"type": "text", "text": content}) + elif isinstance(content, list): + content_blocks.extend(content) + + # Tool calls → 
tool_use blocks + tool_calls = msg.get("tool_calls") + if tool_calls: + for tc in tool_calls: + func = tc.get("function", {}) + # Parse arguments from JSON string + try: + input_data = json.loads(func.get("arguments", "{}")) + except (json.JSONDecodeError, TypeError): + input_data = {} + + content_blocks.append({ + "type": "tool_use", + "id": tc.get("id", ""), + "name": func.get("name", ""), + "input": input_data, + }) + + if content_blocks: + converted.append({"role": "assistant", "content": content_blocks}) + elif not content and not tool_calls: + # Empty assistant message — skip + pass + continue + + # Flush remaining tool results + if pending_tool_results: + converted.append({"role": "user", "content": list(pending_tool_results)}) + + # Anthropic requires alternating user/assistant messages + # Merge consecutive same-role messages + merged = [] + for msg in converted: + if merged and merged[-1]["role"] == msg["role"]: + prev = merged[-1]["content"] + curr = msg["content"] + # Normalize both to lists + if isinstance(prev, str): + prev = [{"type": "text", "text": prev}] + elif not isinstance(prev, list): + prev = [{"type": "text", "text": str(prev)}] + if isinstance(curr, str): + curr = [{"type": "text", "text": curr}] + elif not isinstance(curr, list): + curr = [{"type": "text", "text": str(curr)}] + merged[-1]["content"] = prev + curr + else: + merged.append(msg) + + return system_prompt, merged + + +def _convert_tools_to_anthropic(tools: list[dict] | None) -> list[dict] | None: + """Convert OpenAI tool format to Anthropic tool format. 
+ + From: {"type": "function", "function": {"name": ..., "description": ..., "parameters": {...}}} + To: {"name": ..., "description": ..., "input_schema": {...}} + """ + if not tools: + return None + + converted = [] + for tool in tools: + func = tool.get("function", {}) + anthropic_tool = { + "name": func.get("name", ""), + } + if "description" in func: + anthropic_tool["description"] = func["description"] + if "parameters" in func: + anthropic_tool["input_schema"] = func["parameters"] + else: + anthropic_tool["input_schema"] = {"type": "object", "properties": {}} + converted.append(anthropic_tool) + + return converted + + +# ============ Adapter ============ + + +class AnthropicAdapter(BaseAdapter): + """Adapter for Anthropic Claude API.""" + + def _make_client( + self, + base_url: str | None = None, + api_key: str | None = None, + ): + from anthropic import AsyncAnthropic + + kwargs = {} + if base_url: + kwargs["base_url"] = base_url + if api_key: + kwargs["api_key"] = api_key + return AsyncAnthropic(**kwargs) + + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + num_retries: int = 3, + **kwargs, + ): + """Streaming chat completion using the Anthropic SDK. + + Converts OpenAI messages to Anthropic format, streams events, + normalizes them to OpenAI-compatible chunk dicts, and returns + collected chunks. 
+ """ + client = self._make_client(base_url, api_key) + + # Convert messages and tools + system_prompt, anthropic_messages = _convert_messages_to_anthropic(messages) + anthropic_tools = _convert_tools_to_anthropic(tools) + + # Build call kwargs (stream() method implies streaming, don't pass stream=True) + call_kwargs = { + "model": model, + "messages": anthropic_messages, + "max_tokens": kwargs.pop("max_tokens", None) or kwargs.pop("max_output_tokens", 8192), + } + + if system_prompt: + call_kwargs["system"] = system_prompt + + if anthropic_tools: + call_kwargs["tools"] = anthropic_tools + + # Handle thinking parameter + thinking = kwargs.pop("thinking", None) + reasoning_effort = kwargs.pop("reasoning_effort", None) + if thinking: + call_kwargs["thinking"] = thinking + elif reasoning_effort: + # Map reasoning_effort to Anthropic thinking + call_kwargs["thinking"] = {"type": "enabled", "budget_tokens": 10000} + + # Temperature + temperature = kwargs.pop("temperature", None) + if temperature is not None: + call_kwargs["temperature"] = temperature + + # Top-p + top_p = kwargs.pop("top_p", None) + if top_p is not None: + call_kwargs["top_p"] = top_p + + # Extra headers + extra_headers = kwargs.pop("extra_headers", None) + + try: + stream_start_time = time.time() + first_chunk_time = None + chunk_count = 0 + collected_chunks = [] + + # Track state for building OpenAI-compatible chunks + current_text = "" + current_tool_calls = [] + tool_call_index = -1 + tool_call_json_accum = "" + usage_info = {} + + async with client.messages.stream( + **call_kwargs, + extra_headers=extra_headers or {}, + ) as stream_resp: + async for event in stream_resp: + event_type = event.type + + if event_type == "message_start": + # Extract initial usage + msg = getattr(event, "message", None) + if msg and hasattr(msg, "usage"): + usage_info["prompt_tokens"] = getattr(msg.usage, "input_tokens", 0) + + elif event_type == "content_block_start": + block = event.content_block + if block.type == 
"tool_use": + tool_call_index += 1 + tool_call_json_accum = "" + current_tool_calls.append({ + "index": tool_call_index, + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": "", + }, + }) + # Emit initial chunk with id and name + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [{ + "index": tool_call_index, + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": "", + }, + }], + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + elif event_type == "content_block_delta": + delta_obj = event.delta + + if delta_obj.type == "text_delta": + text = delta_obj.text + current_text += text + + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + + # Build OpenAI-compatible chunk + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk: + chunk_count += 1 + await run_func(process_chunk, { + "role": "assistant", + "content": text, + }) + + elif delta_obj.type == "input_json_delta": + # Accumulate tool call arguments + partial = delta_obj.partial_json + tool_call_json_accum += partial + if current_tool_calls: + current_tool_calls[-1]["function"]["arguments"] += partial + + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [{ + "index": tool_call_index, + "function": { + "arguments": partial, + }, + }], + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + elif delta_obj.type == "thinking_delta": + thinking_text = delta_obj.thinking + if process_chunk: + await run_func(process_chunk, { + "role": "assistant", + "reasoning_content": thinking_text, + }) + + elif event_type == 
"message_delta": + delta = event.delta + stop_reason = getattr(delta, "stop_reason", None) + + # Extract usage from message_delta + usage = getattr(event, "usage", None) + if usage: + usage_info["completion_tokens"] = getattr(usage, "output_tokens", 0) + + # Map Anthropic stop reasons to OpenAI finish reasons + finish_reason = None + if stop_reason == "end_turn": + finish_reason = "stop" + elif stop_reason == "tool_use": + finish_reason = "tool_calls" + elif stop_reason == "max_tokens": + finish_reason = "length" + + if finish_reason: + chunk_dict = { + "choices": [{ + "index": 0, + "delta": {}, + "finish_reason": finish_reason, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk and finish_reason == "stop": + await run_func(process_chunk, {"stop": True}) + + elif event_type == "message_stop": + pass + + # Add usage chunk at the end (OpenAI stream_options style) + total_tokens = usage_info.get("prompt_tokens", 0) + usage_info.get("completion_tokens", 0) + usage_info["total_tokens"] = total_tokens + collected_chunks.append({ + "usage": usage_info, + "choices": [], + }) + + total_time = time.time() - stream_start_time + logger.info(f"✅ Stream completed: {total_time:.3f}s, {chunk_count} chunks [{model}]") + + return collected_chunks + + except Exception as e: + raise _wrap_anthropic_error(e) from e diff --git a/pantheon/utils/adapters/base.py b/pantheon/utils/adapters/base.py new file mode 100644 index 00000000..2a2a2d14 --- /dev/null +++ b/pantheon/utils/adapters/base.py @@ -0,0 +1,158 @@ +""" +Base adapter — ABC for all provider adapters + unified exception types. 
+""" + +from abc import ABC, abstractmethod +from typing import Any, AsyncIterator, Callable + + +# ============ Unified Exception Types ============ +# These replace litellm.exceptions.* and are caught in agent.py _is_retryable_error() + + +class LLMError(Exception): + """Base exception for LLM provider errors.""" + pass + + +class ServiceUnavailableError(LLMError): + """Provider service is temporarily unavailable (503).""" + pass + + +class InternalServerError(LLMError): + """Provider encountered an internal error (500).""" + pass + + +class RateLimitError(LLMError): + """Request was rate-limited (429).""" + pass + + +class APIConnectionError(LLMError): + """Failed to connect to the provider API.""" + pass + + +# ============ Base Adapter ============ + + +class BaseAdapter(ABC): + """Abstract base class for LLM provider adapters. + + Each adapter wraps a specific SDK and normalizes responses to + a common format compatible with the existing codebase. + + Streaming responses yield dicts with OpenAI-compatible chunk format. + Complete responses are SimpleNamespace objects with .choices and .usage. + """ + + @abstractmethod + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + num_retries: int = 3, + **kwargs, + ) -> AsyncIterator: + """Async chat completion with streaming. 
+ + Args: + model: Model name (without provider prefix) + messages: Chat messages in OpenAI format + tools: Tool definitions in OpenAI format + response_format: Response format specification + stream: Whether to stream (always True for now) + process_chunk: Callback for processing stream chunks + base_url: Override API base URL + api_key: Override API key + num_retries: Number of retries on transient errors + **kwargs: Additional provider-specific parameters + + Yields: + Stream chunks (provider-specific format, collected by caller) + + Returns: + The async iterator of chunks + """ + ... + + async def aembedding( + self, + *, + model: str, + input: list[str], + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> list[list[float]]: + """Generate embeddings. + + Returns: + List of embedding vectors + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support embeddings" + ) + + async def aimage_generation( + self, + *, + model: str, + prompt: str, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Generate images from text prompt. + + Returns: + Provider-specific image response + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support image generation" + ) + + async def aimage_edit( + self, + *, + model: str, + image: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Edit an image. + + Returns: + Provider-specific image response + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support image editing" + ) + + async def atranscription( + self, + *, + model: str, + file: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Transcribe audio to text. 
"""
Gemini adapter — handles Google Gemini models via the google-genai SDK.

Converts between OpenAI message format and Gemini's native format,
and normalizes streaming events to OpenAI-compatible chunk dicts.
"""

import json
import os
import time
from typing import Any, Callable

from ..log import logger
from ..misc import run_func
from .base import (
    BaseAdapter,
    ServiceUnavailableError,
    InternalServerError,
    RateLimitError,
    APIConnectionError,
)


def _wrap_gemini_error(e: Exception) -> Exception:
    """Convert Gemini SDK exceptions to unified exception types.

    The google-genai SDK does not expose a stable exception hierarchy,
    so classification is done by substring matching on the message.
    """
    error_str = str(e).lower()
    if "429" in error_str or "resource exhausted" in error_str or "rate" in error_str:
        return RateLimitError(str(e))
    elif "503" in error_str or "unavailable" in error_str:
        return ServiceUnavailableError(str(e))
    elif "500" in error_str or "internal" in error_str:
        return InternalServerError(str(e))
    elif "connect" in error_str or "timeout" in error_str:
        return APIConnectionError(str(e))
    return e


# ============ Message Format Conversion ============


def _data_url_parts(url: str) -> tuple[str, str]:
    """Split a ``data:`` URL into (mime_type, base64_payload).

    Falls back to image/png when the header carries no media type.
    """
    header, _, payload = url.partition(",")
    # header looks like "data:image/jpeg;base64"
    mime = header[5:].split(";", 1)[0] or "image/png"
    return mime, payload


def _extract_text_parts(content: Any) -> list[dict]:
    """Collect Gemini text parts from a str or OpenAI content-item list."""
    parts: list[dict] = []
    if isinstance(content, str):
        parts.append({"text": content})
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                parts.append({"text": item["text"]})
    return parts


def _convert_messages_to_gemini(messages: list[dict]) -> tuple[str | None, list[dict]]:
    """Convert OpenAI-format messages to Gemini format.

    Returns:
        (system_instruction, gemini_contents)
    """
    system_instruction = None
    contents = []

    for msg in messages:
        role = msg.get("role")
        content = msg.get("content")

        if role == "system":
            if system_instruction is None:
                system_instruction = content if isinstance(content, str) else str(content)
            else:
                # Additional system messages as user context
                contents.append({
                    "role": "user",
                    "parts": [{"text": f"[System]: {content}"}],
                })
            continue

        if role == "user":
            parts = []
            if isinstance(content, str):
                parts.append({"text": content})
            elif isinstance(content, list):
                for item in content:
                    if isinstance(item, dict):
                        if item.get("type") == "text":
                            parts.append({"text": item["text"]})
                        elif item.get("type") == "image_url":
                            url = item.get("image_url", {}).get("url", "")
                            if url.startswith("data:"):
                                # Base64 inline data; honor the mime type from
                                # the data-URL header (jpeg/webp/... not just png)
                                mime, payload = _data_url_parts(url)
                                parts.append({
                                    "inline_data": {"mime_type": mime, "data": payload}
                                })
                            else:
                                parts.append({"text": f"[Image: {url}]"})
            contents.append({"role": "user", "parts": parts})
            continue

        if role == "assistant":
            # Assistant content may be a plain string or an OpenAI
            # content-item list; extract text either way.
            parts = _extract_text_parts(content) if content else []

            # Tool calls → function_call parts
            tool_calls = msg.get("tool_calls")
            if tool_calls:
                for tc in tool_calls:
                    func = tc.get("function", {})
                    try:
                        args = json.loads(func.get("arguments", "{}"))
                    except (json.JSONDecodeError, TypeError):
                        args = {}
                    parts.append({
                        "function_call": {
                            "name": func.get("name", ""),
                            "args": args,
                        }
                    })

            if parts:
                contents.append({"role": "model", "parts": parts})
            continue

        if role == "tool":
            # Tool results → function_response parts
            tool_call_id = msg.get("tool_call_id", "")
            # NOTE(review): Gemini expects the function *name* here; falling
            # back to the call id assumes upstream stores the name — confirm.
            tool_name = msg.get("name", tool_call_id)
            try:
                result = json.loads(content) if isinstance(content, str) else content
            except (json.JSONDecodeError, TypeError):
                result = {"result": content}

            contents.append({
                "role": "user",
                "parts": [{
                    "function_response": {
                        "name": tool_name,
                        "response": result if isinstance(result, dict) else {"result": str(result)},
                    }
                }],
            })
            continue

    return system_instruction, contents


def _convert_tools_to_gemini(tools: list[dict] | None) -> list[dict] | None:
    """Convert OpenAI tool format to Gemini function declarations.

    From: {"type": "function", "function": {"name": ..., "description": ..., "parameters": {...}}}
    To:   {"name": ..., "description": ..., "parameters": {...}}
    """
    if not tools:
        return None

    declarations = []
    for tool in tools:
        func = tool.get("function", {})
        decl = {"name": func.get("name", "")}
        if "description" in func:
            decl["description"] = func["description"]
        if "parameters" in func:
            params = dict(func["parameters"])
            # Gemini doesn't support 'strict' or 'additionalProperties' at top level
            params.pop("strict", None)
            params.pop("additionalProperties", None)
            decl["parameters"] = params
        declarations.append(decl)

    return declarations


# ============ Adapter ============


class GeminiAdapter(BaseAdapter):
    """Adapter for Google Gemini API via google-genai SDK."""

    def _make_client(self, api_key: str | None = None):
        """Create a google-genai client (falls back to GEMINI_API_KEY env)."""
        from google import genai

        key = api_key or os.environ.get("GEMINI_API_KEY", "")
        return genai.Client(api_key=key)

    async def acompletion(
        self,
        *,
        model: str,
        messages: list[dict],
        tools: list[dict] | None = None,
        response_format: Any | None = None,
        stream: bool = True,
        process_chunk: Callable | None = None,
        base_url: str | None = None,
        api_key: str | None = None,
        num_retries: int = 3,
        **kwargs,
    ):
        """Streaming chat completion using the Google GenAI SDK.

        Returns collected chunks in OpenAI-compatible format.
        """
        from google.genai import types

        client = self._make_client(api_key)

        # Convert messages and tools
        system_instruction, gemini_contents = _convert_messages_to_gemini(messages)
        gemini_tools = _convert_tools_to_gemini(tools)

        # Build config
        config_kwargs = {}

        if system_instruction:
            config_kwargs["system_instruction"] = system_instruction

        if gemini_tools:
            config_kwargs["tools"] = [types.Tool(function_declarations=gemini_tools)]

        temperature = kwargs.pop("temperature", None)
        if temperature is not None:
            config_kwargs["temperature"] = temperature

        max_tokens = kwargs.pop("max_tokens", None) or kwargs.pop("max_output_tokens", None)
        if max_tokens:
            config_kwargs["max_output_tokens"] = max_tokens

        # Response modalities (for multimodal image generation)
        modalities = kwargs.pop("modalities", None)
        if modalities:
            config_kwargs["response_modalities"] = modalities

        # Reasoning effort → thinking config; the effort value itself is
        # ignored and the budget is left on auto (-1).
        reasoning_effort = kwargs.pop("reasoning_effort", None)
        if reasoning_effort:
            config_kwargs["thinking_config"] = types.ThinkingConfig(
                thinking_budget=-1  # auto
            )

        config = types.GenerateContentConfig(**config_kwargs)

        try:
            stream_start_time = time.time()
            first_chunk_time = None
            chunk_count = 0
            collected_chunks = []
            full_text = ""
            prompt_tokens = 0
            completion_tokens = 0

            stream_iter = await client.aio.models.generate_content_stream(
                model=model,
                contents=gemini_contents,
                config=config,
            )
            async for response in stream_iter:
                # Extract text / function calls from response candidates
                text = ""
                tool_calls_data = []

                if response.candidates:
                    for candidate in response.candidates:
                        if candidate.content and candidate.content.parts:
                            for part in candidate.content.parts:
                                if hasattr(part, "text") and part.text:
                                    text += part.text
                                elif hasattr(part, "function_call") and part.function_call:
                                    fc = part.function_call
                                    tool_calls_data.append({
                                        "index": len(tool_calls_data),
                                        # Gemini has no call ids; synthesize one
                                        "id": f"call_{fc.name}_{len(tool_calls_data)}",
                                        "type": "function",
                                        "function": {
                                            "name": fc.name,
                                            "arguments": json.dumps(dict(fc.args)) if fc.args else "{}",
                                        },
                                    })

                if text:
                    if first_chunk_time is None:
                        first_chunk_time = time.time()
                        ttfb = first_chunk_time - stream_start_time
                        logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]")

                    full_text += text

                    chunk_dict = {
                        "choices": [{
                            "index": 0,
                            "delta": {
                                "role": "assistant",
                                "content": text,
                            },
                            "finish_reason": None,
                        }],
                    }
                    collected_chunks.append(chunk_dict)

                    if process_chunk:
                        chunk_count += 1
                        await run_func(process_chunk, {
                            "role": "assistant",
                            "content": text,
                        })

                if tool_calls_data:
                    chunk_dict = {
                        "choices": [{
                            "index": 0,
                            "delta": {
                                "role": "assistant",
                                "tool_calls": tool_calls_data,
                            },
                            "finish_reason": None,
                        }],
                    }
                    collected_chunks.append(chunk_dict)

                # Extract usage if available (last chunk wins)
                if hasattr(response, "usage_metadata") and response.usage_metadata:
                    um = response.usage_metadata
                    prompt_tokens = getattr(um, "prompt_token_count", 0) or 0
                    completion_tokens = getattr(um, "candidates_token_count", 0) or 0

            # Add finish chunk
            collected_chunks.append({
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop",
                }],
            })

            if process_chunk:
                await run_func(process_chunk, {"stop": True})

            # Add usage chunk (consumed by stream_chunk_builder)
            total_tokens = prompt_tokens + completion_tokens
            collected_chunks.append({
                "usage": {
                    "prompt_tokens": prompt_tokens,
                    "completion_tokens": completion_tokens,
                    "total_tokens": total_tokens,
                },
                "choices": [],
            })

            total_time = time.time() - stream_start_time
            logger.info(f"✅ Stream completed: {total_time:.3f}s, {chunk_count} chunks [{model}]")

            return collected_chunks

        except Exception as e:
            raise _wrap_gemini_error(e) from e

    async def aembedding(
        self,
        *,
        model: str,
        input: list[str],
        base_url: str | None = None,
        api_key: str | None = None,
        **kwargs,
    ) -> list[list[float]]:
        """Generate embeddings using the Gemini API.

        The google-genai SDK returns an EmbedContentResponse whose vectors
        live under ``.embeddings`` (a list of ContentEmbedding objects with
        ``.values``); the previous ``response.embedding`` access raised
        AttributeError. Both shapes are supported here for SDK compatibility.
        """
        client = self._make_client(api_key)
        try:
            results = []
            for text in input:
                response = await client.aio.models.embed_content(
                    model=model,
                    contents=text,
                )
                embeddings = getattr(response, "embeddings", None)
                if embeddings:
                    first = embeddings[0]
                    results.append(getattr(first, "values", first))
                else:
                    # Fallback for SDK builds exposing a single `.embedding`
                    results.append(response.embedding)
            return results
        except Exception as e:
            raise _wrap_gemini_error(e) from e
str | None = None, + ) -> AsyncOpenAI: + """Create an AsyncOpenAI client with optional overrides.""" + kwargs = {} + if base_url: + kwargs["base_url"] = base_url + if api_key: + kwargs["api_key"] = api_key + return AsyncOpenAI(**kwargs) + + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + num_retries: int = 3, + **kwargs, + ): + """Streaming chat completion using the OpenAI SDK. + + Returns an async iterator that yields raw chunk dicts. + The caller is responsible for assembling chunks (via stream_chunk_builder). + """ + client = self._make_client(base_url, api_key) + + _tools = tools or NOT_GIVEN + _pcall = (tools is not None) or NOT_GIVEN + + # Build call kwargs + call_kwargs = { + "model": model, + "messages": messages, + "tools": _tools, + "stream": True, + "stream_options": {"include_usage": True}, + } + + if response_format: + call_kwargs["response_format"] = response_format + + # reasoning models (o1, o3, o4 series) don't support parallel_tool_calls + if not model.startswith("o"): + call_kwargs["parallel_tool_calls"] = _pcall + + # Merge extra kwargs (reasoning_effort, temperature, etc.) 
+ call_kwargs.update(kwargs) + + retry_count = num_retries + while retry_count > 0: + try: + stream_start_time = time.time() + first_chunk_time = None + chunk_count = 0 + + response = await client.chat.completions.create(**call_kwargs) + + collected_chunks = [] + try: + async for chunk in response: + chunk_dict = chunk.model_dump() + collected_chunks.append(chunk_dict) + + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + + if ( + process_chunk + and chunk.choices + and len(chunk.choices) > 0 + ): + choice = chunk.choices[0] + if hasattr(choice, "delta") and choice.delta: + delta = choice.delta.model_dump() + chunk_count += 1 + await run_func(process_chunk, delta) + if hasattr(choice, "finish_reason") and choice.finish_reason == "stop": + await run_func(process_chunk, {"stop": True}) + except Exception as stream_err: + # Some providers (e.g. Groq) validate tool calls server-side + # and abort the stream mid-way with errors like: + # - "tool call validation failed: attempted to call tool X not in request.tools" + # - "Failed to parse tool call arguments as JSON" + # If we already collected text chunks, return them as a partial response + # instead of crashing the entire request. + err_str = str(stream_err).lower() + is_tool_error = "tool call" in err_str or "tool_call" in err_str + if is_tool_error and collected_chunks: + logger.warning( + f"⚠ Stream interrupted by tool call error, " + f"returning {len(collected_chunks)} partial chunks [{model}]: {stream_err}" + ) + # Strip tool_call deltas from partial chunks — they are incomplete + # and will cause downstream errors. Only keep text content. 
+ cleaned_chunks = [] + for c in collected_chunks: + choices = c.get("choices", []) + if choices: + delta = choices[0].get("delta", {}) + # Remove tool_calls from delta, keep only text content + delta.pop("tool_calls", None) + cleaned_chunks.append(c) + # Add a stop chunk + cleaned_chunks.append({ + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + }) + collected_chunks = cleaned_chunks + if process_chunk: + await run_func(process_chunk, {"stop": True}) + else: + raise + + total_time = time.time() - stream_start_time + logger.info(f"✅ Stream completed: {total_time:.3f}s, {chunk_count} chunks [{model}]") + return collected_chunks + + except Exception as e: + wrapped = _wrap_openai_error(e) + if isinstance(wrapped, APIConnectionError): + retry_count -= 1 + logger.warning(f"Connection error, retrying ({num_retries - retry_count}/{num_retries}): {e}") + if retry_count <= 0: + raise wrapped from e + else: + raise wrapped from e + + # Should not reach here, but just in case + raise APIConnectionError(f"Failed after {num_retries} retries") + + async def acompletion_responses( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> dict: + """Call OpenAI Responses API with streaming. + + Used for models that require the Responses API (gpt-5.x-pro, codex, etc.). + Returns a normalized message dict (not chunks). 
+ """ + client = self._make_client(base_url, api_key) + + # Convert messages to Responses API format + instructions = None + input_items = [] + for msg in messages: + role = msg.get("role") + content = msg.get("content") + if role == "system": + if instructions is None: + instructions = content + else: + input_items.append({"role": "developer", "content": content}) + elif role == "user": + input_items.append({"role": "user", "content": content}) + elif role == "assistant": + if content: + input_items.append({"role": "assistant", "content": content}) + for tc in msg.get("tool_calls") or []: + func = tc.get("function", {}) + input_items.append({ + "type": "function_call", + "call_id": tc["id"], + "name": func.get("name", ""), + "arguments": func.get("arguments", ""), + }) + elif role == "tool": + input_items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": content or "", + }) + + # Convert tools + converted_tools = None + if tools: + converted_tools = [] + for tool in tools: + func = tool.get("function", {}) + item = {"type": "function", "name": func.get("name", "")} + if "description" in func: + item["description"] = func["description"] + if "parameters" in func: + item["parameters"] = func["parameters"] + if "strict" in func: + item["strict"] = func["strict"] + converted_tools.append(item) + + # Build kwargs + call_kwargs = {"model": model, "input": input_items, "stream": True} + if instructions is not None: + call_kwargs["instructions"] = instructions + if converted_tools is not None: + call_kwargs["tools"] = converted_tools + if response_format is not None: + call_kwargs["text"] = response_format + + # Map model_params + if kwargs.get("max_tokens"): + call_kwargs["max_output_tokens"] = kwargs.pop("max_tokens") + if kwargs.get("max_completion_tokens"): + call_kwargs["max_output_tokens"] = kwargs.pop("max_completion_tokens") + if kwargs.get("max_output_tokens"): + call_kwargs["max_output_tokens"] = 
kwargs.pop("max_output_tokens") + reasoning_effort = kwargs.pop("reasoning_effort", None) + if reasoning_effort: + call_kwargs["reasoning"] = {"effort": reasoning_effort} + + # Stream + text_parts = [] + tool_calls_by_id = {} + _item_to_call = {} + response_obj = None + + try: + stream = await client.responses.create(**call_kwargs) + async for event in stream: + event_type = event.type + + if event_type == "response.output_text.delta": + text_parts.append(event.delta) + if process_chunk: + await run_func(process_chunk, {"content": event.delta, "role": "assistant"}) + + elif event_type == "response.output_item.added": + item = event.item + if getattr(item, "type", None) == "function_call": + call_id = getattr(item, "call_id", "") or "" + item_id = getattr(item, "id", "") or "" + _item_to_call[item_id] = call_id + tool_calls_by_id[call_id] = { + "name": getattr(item, "name", "") or "", + "arguments": "", + } + + elif event_type == "response.function_call_arguments.done": + item_id = getattr(event, "item_id", "") or "" + call_id = _item_to_call.get(item_id, "") + if call_id and call_id in tool_calls_by_id: + tool_calls_by_id[call_id]["arguments"] = event.arguments + if event.name: + tool_calls_by_id[call_id]["name"] = event.name + + elif event_type == "response.completed": + response_obj = event.response + if process_chunk: + await run_func(process_chunk, {"stop": True}) + + elif event_type == "response.failed": + error_msg = "" + if hasattr(event, "response") and hasattr(event.response, "error"): + error_msg = str(event.response.error) + raise RuntimeError(f"Responses API call failed: {error_msg}") + + except Exception as e: + wrapped = _wrap_openai_error(e) + if wrapped is not e: + raise wrapped from e + raise + + # Build output + aggregated_text = "".join(text_parts) if text_parts else None + final_tool_calls = None + if tool_calls_by_id: + final_tool_calls = [ + {"id": cid, "type": "function", "function": {"name": info["name"], "arguments": info["arguments"]}} + 
for cid, info in tool_calls_by_id.items() + ] + + # Cost + cost = 0.0 + usage_dict = {} + if response_obj and hasattr(response_obj, "usage") and response_obj.usage: + usage = response_obj.usage + input_tokens = getattr(usage, "input_tokens", 0) or 0 + output_tokens = getattr(usage, "output_tokens", 0) or 0 + usage_dict = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + } + try: + from ..provider_registry import completion_cost as calc_cost + cost = calc_cost(model=model, prompt_tokens=input_tokens, completion_tokens=output_tokens) or 0.0 + except Exception: + pass + if cost == 0.0 and (input_tokens or output_tokens): + cost = (input_tokens * 1.0 + output_tokens * 5.0) / 1_000_000 + + return { + "role": "assistant", + "content": aggregated_text, + "tool_calls": final_tool_calls, + "_metadata": {"_debug_cost": cost, "_debug_usage": usage_dict}, + } + + async def aembedding( + self, + *, + model: str, + input: list[str], + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> list[list[float]]: + """Generate embeddings using OpenAI API.""" + client = self._make_client(base_url, api_key) + try: + response = await client.embeddings.create(model=model, input=input) + return [d.embedding for d in response.data] + except Exception as e: + raise _wrap_openai_error(e) from e + + async def aimage_generation( + self, + *, + model: str, + prompt: str, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Generate images using OpenAI API (DALL-E, gpt-image).""" + client = self._make_client(base_url, api_key) + try: + response = await client.images.generate( + model=model, + prompt=prompt, + **kwargs, + ) + return response + except Exception as e: + raise _wrap_openai_error(e) from e + + async def aimage_edit( + self, + *, + model: str, + image: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Edit 
images using OpenAI API.""" + client = self._make_client(base_url, api_key) + try: + response = await client.images.edit( + model=model, + image=image, + **kwargs, + ) + return response + except Exception as e: + raise _wrap_openai_error(e) from e + + async def atranscription( + self, + *, + model: str, + file: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Transcribe audio using OpenAI Whisper API.""" + client = self._make_client(base_url, api_key) + try: + response = await client.audio.transcriptions.create( + model=model, + file=file, + **kwargs, + ) + return response + except Exception as e: + raise _wrap_openai_error(e) from e diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index bf346b07..53cfa315 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -1,7 +1,6 @@ import json import re import time -import warnings from contextlib import asynccontextmanager from copy import deepcopy from typing import Any, Callable @@ -241,13 +240,13 @@ async def acompletion_responses( Returns a normalised message dict compatible with ``extract_message_from_response``. 
def stream_chunk_builder(chunks: list[dict]) -> Any:
    """Assemble streaming chunks into a complete response object.

    Aggregates content deltas, tool_call deltas, and usage from collected chunks
    into a SimpleNamespace that mimics the shape of a chat completion response
    (.choices[0].message, .usage, .model, ._hidden_params).

    Replaces litellm.stream_chunk_builder().

    Args:
        chunks: Stream chunks, either plain dicts (from adapters) or SDK
            objects exposing .model_dump().

    Returns:
        SimpleNamespace with .choices, .model, .usage, ._hidden_params.
    """
    from types import SimpleNamespace

    # Normalize SDK chunk objects to plain dicts up front. (The previous
    # implementation returned early after converting only the FIRST object
    # chunk, discarding all other chunks in the stream.)
    chunks = [
        c.model_dump() if hasattr(c, "model_dump") else c
        for c in chunks
    ]

    full_content = ""
    full_reasoning = ""
    tool_calls_map: dict[int, dict] = {}  # index → tool_call dict
    finish_reason = None
    usage = {}
    model = ""
    role = "assistant"

    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue

        # Usage arrives in a trailing usage-only chunk; may be null for
        # partial/interrupted streams — only keep truthy values.
        if chunk.get("usage"):
            usage = chunk["usage"]
        if "model" in chunk:
            model = chunk["model"]

        choices = chunk.get("choices", [])
        if not choices:
            continue
        choice = choices[0]
        delta = choice.get("delta", {})

        # Accumulate content
        if delta.get("content"):
            full_content += delta["content"]

        # Accumulate reasoning_content (OpenAI/Zhipu/Kimi reasoning models)
        if delta.get("reasoning_content"):
            full_reasoning += delta["reasoning_content"]

        if delta.get("role"):
            role = delta["role"]

        # Accumulate tool calls, merging partial deltas per index
        for tc in delta.get("tool_calls") or []:
            idx = tc.get("index", 0)
            entry = tool_calls_map.get(idx)
            if entry is None:
                entry = tool_calls_map[idx] = {
                    "id": tc.get("id", ""),
                    "type": tc.get("type", "function"),
                    "function": {
                        "name": tc.get("function", {}).get("name", ""),
                        "arguments": "",
                    },
                }
            else:
                if tc.get("id"):
                    entry["id"] = tc["id"]
                func = tc.get("function", {})
                if func.get("name"):
                    entry["function"]["name"] = func["name"]

            # Always append argument fragments (they stream incrementally)
            args = tc.get("function", {}).get("arguments", "")
            if args:
                entry["function"]["arguments"] += args

        # Track finish reason (last non-null one wins)
        fr = choice.get("finish_reason")
        if fr:
            finish_reason = fr

    # Build final tool_calls list in index order
    final_tool_calls = None
    if tool_calls_map:
        final_tool_calls = [tool_calls_map[i] for i in sorted(tool_calls_map)]

    # For reasoning models that put everything in reasoning_content with no
    # content, fall back to reasoning_content so the response isn't empty.
    effective_content = full_content or None
    if not effective_content and full_reasoning:
        effective_content = full_reasoning

    message = SimpleNamespace(
        role=role,
        content=effective_content,
        tool_calls=final_tool_calls,
        reasoning_content=full_reasoning or None,
    )

    def message_model_dump():
        d = {"role": message.role, "content": message.content, "tool_calls": message.tool_calls}
        if message.reasoning_content:
            d["reasoning_content"] = message.reasoning_content
        return d
    message.model_dump = message_model_dump

    choice = SimpleNamespace(
        message=message,
        finish_reason=finish_reason,
    )

    usage_ns = SimpleNamespace(
        prompt_tokens=usage.get("prompt_tokens", 0),
        completion_tokens=usage.get("completion_tokens", 0),
        total_tokens=usage.get("total_tokens", 0),
    )

    return SimpleNamespace(
        choices=[choice],
        model=model,
        usage=usage_ns,
        _hidden_params={},
    )
STANDALONE MODE (agents running independently): - - LITELLM_PROXY_ENABLED not set or false + - LLM_PROXY_ENABLED not set or false - Falls back to reading real API keys from environment variables - - Suitable for local development and standalone agent operation + - Uses native SDK adapters (openai, anthropic, google-genai) """ - from pantheon.settings import get_settings - from .llm_providers import get_litellm_proxy_kwargs - - litellm = import_litellm() - logger.debug(f"[LITELLM.ACOMPLETION] Starting LLM call | Model={model}") + from .llm_providers import get_proxy_kwargs + from .provider_registry import find_provider_for_model, get_provider_config, completion_cost + from .adapters import get_adapter - settings = get_settings() + logger.debug(f"[ACOMPLETION] Starting LLM call | Model={model}") - # ========== Prepare LiteLLM Parameters ========== - kwargs = { - "model": model, - "messages": messages, - "tools": tools, - "response_format": response_format, - "stream": True, - "stream_options": {"include_usage": True}, - "num_retries": num_retries, - } - - if model_params: - kwargs.update(**model_params) + # ========== Resolve provider and adapter ========== + provider_key, model_name, provider_config = find_provider_for_model(model) + sdk_type = provider_config.get("sdk", "openai") # ========== Mode Detection & Configuration ========== - proxy_kwargs = get_litellm_proxy_kwargs() + proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: - kwargs.update(proxy_kwargs) + # Proxy mode: all calls go through OpenAI-compatible proxy + effective_base_url = proxy_kwargs.get("base_url") + effective_api_key = proxy_kwargs.get("api_key") + sdk_type = "openai" # proxy exposes OpenAI-compatible API + effective_model = model # pass full model string to proxy else: - if base_url: - kwargs["api_base"] = base_url - if api_key: - kwargs["api_key"] = api_key + effective_base_url = base_url or provider_config.get("base_url") + effective_api_key = api_key + if not effective_api_key: + import os 
+ api_key_env = provider_config.get("api_key_env", "") + if api_key_env: + effective_api_key = os.environ.get(api_key_env, "") + effective_model = model_name # use bare model name with native SDK + + adapter = get_adapter(sdk_type) + + # ========== Prepare adapter kwargs ========== + adapter_kwargs = dict(model_params or {}) # Kimi Coding API gates access by User-Agent header if "kimi-for-coding" in model: - kwargs.setdefault("extra_headers", {}) - kwargs["extra_headers"].setdefault("User-Agent", "claude-code/0.1.0") + adapter_kwargs.setdefault("extra_headers", {}) + adapter_kwargs["extra_headers"].setdefault("User-Agent", "claude-code/0.1.0") # ========== Execute Call ========== + from pantheon.agent import StopRunning + try: - logger.debug( - f"[LITELLM.ACOMPLETION] Calling litellm.acompletion with model={model}" + logger.debug(f"[ACOMPLETION] Calling {sdk_type} adapter for model={effective_model}") + collected_chunks = await adapter.acompletion( + model=effective_model, + messages=messages, + tools=tools, + response_format=response_format, + stream=True, + process_chunk=process_chunk, + base_url=effective_base_url, + api_key=effective_api_key, + num_retries=num_retries, + **adapter_kwargs, ) - response = await litellm.acompletion(**kwargs) - logger.debug(f"[LITELLM.ACOMPLETION] ✓ LiteLLM call succeeded for model={model}") + logger.debug(f"[ACOMPLETION] ✓ Call succeeded for model={effective_model}") + except StopRunning: + raise except Exception as e: logger.error( - f"[LITELLM.ACOMPLETION] ✗ LiteLLM call failed | " - f"Model={model} | Error={type(e).__name__}: {str(e)[:200]}" + f"[ACOMPLETION] ✗ Call failed | " + f"Model={effective_model} | Error={type(e).__name__}: {str(e)[:200]}" ) raise - # ========== Stream Processing & Cost Calculation ========== - from pantheon.agent import StopRunning - - collected_chunks = [] - try: - async for chunk in response: - collected_chunks.append(chunk) - if ( - process_chunk - and hasattr(chunk, "choices") - and chunk.choices - 
and len(chunk.choices) > 0 - ): - choice = chunk.choices[0] - if hasattr(choice, "delta"): - delta = choice.delta.model_dump() - # LiteLLM provides unified reasoning_content field - await run_func(process_chunk, delta) - if hasattr(choice, "finish_reason") and choice.finish_reason == "stop": - await run_func(process_chunk, {"stop": True}) - except StopRunning: - # Build partial message from chunks collected so far - partial_msg = None - if collected_chunks: - try: - partial_resp = litellm.stream_chunk_builder(collected_chunks) - if partial_resp and hasattr(partial_resp, "choices") and partial_resp.choices: - partial_msg = partial_resp.choices[0].message.model_dump() - partial_msg.setdefault("role", "assistant") - except Exception: - pass - raise StopRunning(partial_message=partial_msg) - - complete_resp = litellm.stream_chunk_builder(collected_chunks) + # ========== Build complete response ========== + complete_resp = stream_chunk_builder(collected_chunks) # Calculate and attach cost information try: - cost = litellm.completion_cost(completion_response=complete_resp) + cost = completion_cost(completion_response=complete_resp) if cost and cost > 0: - # Store cost in a way that count_tokens_in_messages can access if not hasattr(complete_resp, "_hidden_params"): complete_resp._hidden_params = {} complete_resp._hidden_params["response_cost"] = cost except Exception: - pass # Silently ignore cost calculation errors + pass return complete_resp @@ -761,14 +865,30 @@ def remove_ui_fields(messages: list[dict]) -> list[dict]: return messages +_ALLOWED_MESSAGE_FIELDS = { + "role", "content", "name", "tool_calls", "tool_call_id", + "refusal", "function_call", # OpenAI standard fields +} + + def remove_metadata(messages: list[dict]) -> list[dict]: """ - Remove _metadata field from messages. - This should be called just before sending messages to the LLM. + Strip messages down to only standard OpenAI fields before sending to LLM. 
+ + Strict providers like Groq reject ANY unknown field (chat_id, _metadata, + _llm_content, _user_metadata, detected_attachments, etc.) and also + reject null values for optional fields like tool_calls. """ for msg in messages: - if "_metadata" in msg: - del msg["_metadata"] + # Remove non-standard fields + extra_keys = [k for k in msg if k not in _ALLOWED_MESSAGE_FIELDS] + for k in extra_keys: + del msg[k] + # Remove fields with None/null values (Groq rejects "tool_calls": null) + null_keys = [k for k in ("tool_calls", "tool_call_id", "name", "function_call", "refusal") + if k in msg and msg[k] is None] + for k in null_keys: + del msg[k] return messages @@ -821,7 +941,7 @@ def process_messages_for_hook_func(messages: list[dict]) -> list[dict]: async def openai_embedding( texts: list[str], model: str = "text-embedding-3-large" ) -> list[list[float]]: - """Get embeddings using litellm (with proxy support). + """Get embeddings (with proxy support). Args: texts: List of texts to embed @@ -830,19 +950,19 @@ async def openai_embedding( Returns: List of embedding vectors """ - from .llm_providers import get_litellm_proxy_kwargs + from .llm_providers import get_proxy_kwargs + from .adapters import get_adapter - litellm = import_litellm() + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") - # litellm.aembedding returns EmbeddingResponse with .data[].embedding - response = await litellm.aembedding( + return await adapter.aembedding( model=model, input=texts, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ) - return [d["embedding"] for d in response.data] - def remove_hidden_fields(content: dict) -> dict: """Remove hidden fields from dict content. 
@@ -1039,7 +1159,7 @@ def _safe_token_counter( ) -> int: """Token counter with fallback for unsupported models.""" try: - from litellm.utils import token_counter + from pantheon.utils.provider_registry import token_counter return token_counter(model=model, messages=messages or [], tools=tools) except Exception: @@ -1160,7 +1280,7 @@ def collect_message_stats_lightweight( # ========== 3. Max tokens ========== try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info model_info = get_model_info(model) meta["max_tokens"] = model_info.get("max_input_tokens", 200000) except Exception: @@ -1178,7 +1298,7 @@ def count_tokens_in_messages( Separates system prompt (first system message) and tools definition from other roles. """ try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info total_tokens = 0 tokens_by_role = {} diff --git a/pantheon/utils/llm_catalog.json b/pantheon/utils/llm_catalog.json new file mode 100644 index 00000000..d9383868 --- /dev/null +++ b/pantheon/utils/llm_catalog.json @@ -0,0 +1,1327 @@ +{ + "version": 1, + "providers": { + "openai": { + "display_name": "OpenAI", + "sdk": "openai", + "base_url": "https://api.openai.com/v1", + "api_key_env": "OPENAI_API_KEY", + "openai_compatible": true, + "models": { + "gpt-5.4-pro": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 5.0, + "output_cost_per_million": 20.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.4": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + 
"supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.4-mini": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.2-pro": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 5.0, + "output_cost_per_million": 20.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.2": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.2-codex": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": 
true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5-mini": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5-nano": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0.1, + "output_cost_per_million": 0.4, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-4.1-mini": { + "max_input_tokens": 1000000, + "max_output_tokens": 32768, + "input_cost_per_million": 0.4, + "output_cost_per_million": 1.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + 
"supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "o3-pro": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 20.0, + "output_cost_per_million": 80.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "o4-mini": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 1.1, + "output_cost_per_million": 4.4, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-image-1": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "gpt-image-1.5": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "chatgpt-image-latest": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "dall-e-3": { + "mode": "image_generation", + "max_input_tokens": 0, + 
"max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "text-embedding-3-large": { + "mode": "embedding", + "max_input_tokens": 8191, + "max_output_tokens": 0, + "input_cost_per_million": 0.13, + "output_cost_per_million": 0, + "supports_vision": false, + "supports_function_calling": false + }, + "text-embedding-3-small": { + "mode": "embedding", + "max_input_tokens": 8191, + "max_output_tokens": 0, + "input_cost_per_million": 0.02, + "output_cost_per_million": 0, + "supports_vision": false, + "supports_function_calling": false + }, + "whisper-1": { + "mode": "audio_transcription", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": false, + "supports_function_calling": false + }, + "codex-mini-latest": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 1.5, + "output_cost_per_million": 6.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "anthropic": { + "display_name": "Anthropic", + "sdk": "anthropic", + "base_url": "https://api.anthropic.com", + "api_key_env": "ANTHROPIC_API_KEY", + "openai_compatible": false, + "models": { + "claude-opus-4-6": { + "max_input_tokens": 1000000, + "max_output_tokens": 32000, + "input_cost_per_million": 15.0, + "output_cost_per_million": 75.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + 
"supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-opus-4-5-20251101": { + "max_input_tokens": 200000, + "max_output_tokens": 32000, + "input_cost_per_million": 15.0, + "output_cost_per_million": 75.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-opus-4-20250514": { + "max_input_tokens": 200000, + "max_output_tokens": 32000, + "input_cost_per_million": 15.0, + "output_cost_per_million": 75.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-sonnet-4-6": { + "max_input_tokens": 1000000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.0, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-sonnet-4-5-20250929": { + "max_input_tokens": 200000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.0, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, 
+ "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-sonnet-4-20250514": { + "max_input_tokens": 200000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.0, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-haiku-4-5": { + "max_input_tokens": 200000, + "max_output_tokens": 8192, + "input_cost_per_million": 0.8, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": true + } + } + }, + "gemini": { + "display_name": "Google Gemini", + "sdk": "google-genai", + "base_url": "https://generativelanguage.googleapis.com", + "api_key_env": "GEMINI_API_KEY", + "openai_compatible": false, + "models": { + "gemini-3.1-pro-preview": { + "max_input_tokens": 2000000, + "max_output_tokens": 65536, + "input_cost_per_million": 2.5, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-3-pro-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 2.5, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + 
"supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-3-flash-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-2.5-pro": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 1.25, + "output_cost_per_million": 10.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-2.5-flash": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-2.5-flash-lite": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.075, + "output_cost_per_million": 0.3, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + 
"supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-3-pro-image-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 2.5, + "output_cost_per_million": 15.0, + "mode": "multimodal_image", + "supports_vision": true, + "supports_function_calling": false + }, + "gemini-3.1-flash-image-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "mode": "multimodal_image", + "supports_vision": true, + "supports_function_calling": false + }, + "gemini-2.5-flash-image": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "mode": "multimodal_image", + "supports_vision": true, + "supports_function_calling": false + } + } + }, + "deepseek": { + "display_name": "DeepSeek", + "sdk": "openai", + "base_url": "https://api.deepseek.com/v1", + "api_key_env": "DEEPSEEK_API_KEY", + "openai_compatible": true, + "models": { + "deepseek-chat": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.27, + "output_cost_per_million": 1.1, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "deepseek-reasoner": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.55, + "output_cost_per_million": 2.19, + "supports_vision": false, + "supports_function_calling": false, + "supports_response_schema": false, + "supports_reasoning": true, + 
"supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + } + } + }, + "zai": { + "display_name": "Z.ai (Zhipu)", + "sdk": "openai", + "base_url": "https://open.bigmodel.cn/api/paas/v4", + "api_key_env": "ZAI_API_KEY", + "openai_compatible": true, + "models": { + "glm-5": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.6": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5v": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + 
"supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5-air": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.1, + "output_cost_per_million": 0.4, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5-flash": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.0, + "output_cost_per_million": 0.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "minimax": { + "display_name": "MiniMax", + "sdk": "openai", + "base_url": "https://api.minimax.io/v1", + "api_key_env": "MINIMAX_API_KEY", + "openai_compatible": true, + "models": { + "MiniMax-M2.7": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + 
"MiniMax-M2.5-highspeed": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.5": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.1-highspeed": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.1": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "moonshot": { + 
"display_name": "Moonshot (Kimi)", + "sdk": "openai", + "base_url": "https://api.moonshot.ai/v1", + "api_key_env": "MOONSHOT_API_KEY", + "openai_compatible": true, + "models": { + "kimi-k2.5": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "kimi-k2-0905-preview": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "qwen": { + "display_name": "Qwen (DashScope)", + "sdk": "openai", + "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "api_key_env": "DASHSCOPE_API_KEY", + "openai_compatible": true, + "models": { + "qwen3-235b-a22b": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 4.0, + "output_cost_per_million": 16.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen3-32b": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.7, + 
"output_cost_per_million": 2.8, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen3-30b-a3b": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.35, + "output_cost_per_million": 1.4, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-max": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-plus": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.8, + "output_cost_per_million": 2.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-turbo": { + "max_input_tokens": 1000000, + "max_output_tokens": 8192, + "input_cost_per_million": 0.3, + "output_cost_per_million": 0.6, + "supports_vision": false, + 
"supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-vl-max": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 3.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-vl-plus": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 1.0, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwq-plus": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "groq": { + "display_name": "Groq", + "sdk": "openai", + "base_url": "https://api.groq.com/openai/v1", + "api_key_env": "GROQ_API_KEY", + "openai_compatible": true, + "models": { + "openai/gpt-oss-120b": { + "max_input_tokens": 
131072, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.60, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "openai/gpt-oss-20b": { + "max_input_tokens": 131072, + "max_output_tokens": 65536, + "input_cost_per_million": 0.075, + "output_cost_per_million": 0.30, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "groq/compound": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.0, + "output_cost_per_million": 0.0, + "supports_vision": false, + "supports_function_calling": false, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "llama-3.3-70b-versatile": { + "max_input_tokens": 131072, + "max_output_tokens": 32768, + "input_cost_per_million": 0.59, + "output_cost_per_million": 0.79, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "meta-llama/llama-4-scout-17b-16e-instruct": { + "max_input_tokens": 
131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.11, + "output_cost_per_million": 0.34, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen/qwen3-32b": { + "max_input_tokens": 131072, + "max_output_tokens": 40960, + "input_cost_per_million": 0.29, + "output_cost_per_million": 0.59, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "llama-3.1-8b-instant": { + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_million": 0.05, + "output_cost_per_million": 0.08, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "openrouter": { + "display_name": "OpenRouter", + "sdk": "openai", + "base_url": "https://openrouter.ai/api/v1", + "api_key_env": "OPENROUTER_API_KEY", + "openai_compatible": true, + "models": { + "anthropic/claude-sonnet-4-6": { + "max_input_tokens": 1000000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.17, + "output_cost_per_million": 15.83, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, 
+ "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "google/gemini-2.5-flash": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.16, + "output_cost_per_million": 0.63, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "deepseek/deepseek-chat": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.28, + "output_cost_per_million": 1.16, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "meta-llama/llama-3.3-70b-instruct": { + "max_input_tokens": 131072, + "max_output_tokens": 32768, + "input_cost_per_million": 0.10, + "output_cost_per_million": 0.10, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "mistral": { + "display_name": "Mistral AI", + "sdk": "openai", + "base_url": "https://api.mistral.ai/v1", + "api_key_env": "MISTRAL_API_KEY", + "openai_compatible": true, + "models": { + "mistral-large-latest": { + "max_input_tokens": 262144, + "max_output_tokens": 262144, + "input_cost_per_million": 0.50, + "output_cost_per_million": 1.50, + 
"supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "mistral-medium-latest": { + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_million": 0.40, + "output_cost_per_million": 2.00, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "mistral-small-latest": { + "max_input_tokens": 262144, + "max_output_tokens": 262144, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.60, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "codestral-latest": { + "max_input_tokens": 256000, + "max_output_tokens": 256000, + "input_cost_per_million": 0.30, + "output_cost_per_million": 0.90, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "open-mistral-nemo": { + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_million": 0.02, + "output_cost_per_million": 0.04, + "supports_vision": 
false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + } + } + }, + "together_ai": { + "display_name": "Together AI", + "sdk": "openai", + "base_url": "https://api.together.xyz/v1", + "api_key_env": "TOGETHER_API_KEY", + "openai_compatible": true, + "models": { + "Qwen/Qwen3.5-397B-A17B": { + "max_input_tokens": 262144, + "max_output_tokens": 16384, + "input_cost_per_million": 0.60, + "output_cost_per_million": 3.60, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "deepseek-ai/DeepSeek-V3.1": { + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_million": 0.60, + "output_cost_per_million": 1.70, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "meta-llama/Llama-3.3-70B-Instruct-Turbo": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.88, + "output_cost_per_million": 0.88, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + 
"supports_assistant_prefill": false + }, + "deepseek-ai/DeepSeek-R1": { + "max_input_tokens": 163839, + "max_output_tokens": 16384, + "input_cost_per_million": 3.00, + "output_cost_per_million": 7.00, + "supports_vision": false, + "supports_function_calling": false, + "supports_response_schema": false, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + } + } +} diff --git a/pantheon/utils/llm_providers.py b/pantheon/utils/llm_providers.py index d9272cd4..c7b0ad6d 100644 --- a/pantheon/utils/llm_providers.py +++ b/pantheon/utils/llm_providers.py @@ -21,10 +21,14 @@ class ProviderType(Enum): - """Supported LLM providers""" + """Supported LLM providers. + + OPENAI: Direct OpenAI or OpenAI-compatible providers + LITELLM: Non-OpenAI providers (anthropic, gemini, etc.) — legacy name kept for compat + """ OPENAI = "openai" - LITELLM = "litellm" + LITELLM = "litellm" # Kept for backward compat; means "non-openai provider" @dataclass @@ -35,10 +39,10 @@ class ProviderConfig: model_name: str base_url: Optional[str] = None api_key: Optional[str] = None - force_litellm: bool = False + force_litellm: bool = False # Kept for backward compat -# OpenAI-compatible providers that litellm doesn't natively support. +# OpenAI-compatible providers that need custom base_url. # Maps provider prefix → (api_base_url, api_key_env_var) OPENAI_COMPATIBLE_PROVIDERS: dict[str, tuple[str, str]] = {} @@ -132,12 +136,16 @@ def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: def is_responses_api_model(config: ProviderConfig) -> bool: """Check if model should use the OpenAI Responses API instead of Chat Completions. - Currently triggers for OpenAI models with "codex" in the name (e.g. codex-mini-latest). + Triggers for: + - Models with "codex" in the name (e.g. 
codex-mini-latest) + - Pro models (gpt-5.x-pro, gpt-5.2-pro) which are Responses-only """ - return ( - config.provider_type == ProviderType.OPENAI - and "codex" in config.model_name.lower() - ) + name_lower = config.model_name.lower() + if config.provider_type != ProviderType.OPENAI: + return False + # Strip "openai/" prefix for matching + bare = name_lower.split("/")[-1] if "/" in name_lower else name_lower + return "codex" in bare or bare.endswith("-pro") def get_base_url(provider: ProviderType) -> Optional[str]: @@ -258,30 +266,42 @@ def _clean_message_fields(message: dict) -> None: message["tool_calls"] = None -def get_litellm_proxy_kwargs() -> dict: - """Get LiteLLM proxy kwargs for API calls. - - When LITELLM_PROXY_ENABLED=true, returns {"api_base": ..., "api_key": ...} - to route calls through the LiteLLM Proxy. Otherwise returns empty dict. +def get_proxy_kwargs() -> dict: + """Get proxy kwargs for API calls. - Usage: - proxy_kwargs = get_litellm_proxy_kwargs() - response = await litellm.aimage_generation(model=model, ..., **proxy_kwargs) - response = await litellm.acompletion(model=model, ..., **proxy_kwargs) + When LLM_PROXY_ENABLED=true (or LITELLM_PROXY_ENABLED for backward compat), + returns {"base_url": ..., "api_key": ...} to route calls through a proxy. + Otherwise returns empty dict. 
""" import os - proxy_enabled = os.environ.get("LITELLM_PROXY_ENABLED", "").lower() == "true" - proxy_url = os.environ.get("LITELLM_PROXY_URL") - proxy_key = os.environ.get("LITELLM_PROXY_KEY") + # Check new env vars first, fall back to legacy LITELLM_ prefix + proxy_enabled = ( + os.environ.get("LLM_PROXY_ENABLED", "").lower() == "true" + or os.environ.get("LITELLM_PROXY_ENABLED", "").lower() == "true" + ) + proxy_url = os.environ.get("LLM_PROXY_URL") or os.environ.get("LITELLM_PROXY_URL") + proxy_key = os.environ.get("LLM_PROXY_KEY") or os.environ.get("LITELLM_PROXY_KEY") if proxy_enabled and proxy_url and proxy_key: - logger.info(f"[LITELLM_PROXY] Routing through proxy | URL={proxy_url}") - return {"api_base": proxy_url, "api_key": proxy_key} + logger.info(f"[LLM_PROXY] Routing through proxy | URL={proxy_url}") + return {"base_url": proxy_url, "api_key": proxy_key} return {} +# Backward compatibility alias +def get_litellm_proxy_kwargs() -> dict: + """Backward-compatible alias for get_proxy_kwargs(). + + Returns keys in old format: {"api_base": ..., "api_key": ...} + """ + result = get_proxy_kwargs() + if result: + return {"api_base": result["base_url"], "api_key": result["api_key"]} + return {} + + def _extract_cost_and_usage(complete_resp: Any) -> tuple[float, dict]: """Calculate cost and extract usage from response. 
@@ -305,13 +325,12 @@ def _extract_cost_and_usage(complete_resp: Any) -> tuple[float, dict]: except Exception: pass - # Try to calculate cost (may fail for new/unmapped models) + # Calculate cost from catalog pricing try: - from litellm import completion_cost + from pantheon.utils.provider_registry import completion_cost cost = completion_cost(completion_response=complete_resp) or 0.0 except Exception as e: - # DEBUG level: this is expected for new models not yet in litellm's price map logger.debug(f"Cost calculation unavailable: {e}") # Fallback: estimate cost from usage if litellm failed but we have token counts @@ -450,7 +469,6 @@ async def call_llm_provider( Returns: Extracted and cleaned message dictionary """ - # Import here to avoid circular imports from .llm import ( acompletion_litellm, remove_metadata, diff --git a/pantheon/utils/misc.py b/pantheon/utils/misc.py index a464693a..b84cfc77 100644 --- a/pantheon/utils/misc.py +++ b/pantheon/utils/misc.py @@ -151,7 +151,11 @@ def desc_to_openai_dict( desc: Description, skip_params: List[str] = [], litellm_mode: bool = False, + relaxed_schema: bool = False, ) -> dict: + # Support both old and new parameter names + _relaxed = relaxed_schema or litellm_mode + # Filter inputs without modifying original desc.inputs filtered_inputs = [arg for arg in desc.inputs if arg.name not in skip_params] @@ -197,7 +201,7 @@ def desc_to_openai_dict( parameters[arg.name] = pdict - if litellm_mode: + if _relaxed: if arg.default is NotDef: required.append(arg.name) else: @@ -208,10 +212,10 @@ def desc_to_openai_dict( "function": { "name": desc.name, "description": tool_description, - "strict": not litellm_mode, + "strict": not _relaxed, }, } - if (not litellm_mode) or (len(parameters) > 0): + if (not _relaxed) or (len(parameters) > 0): func_dict["function"]["parameters"] = { "type": "object", "properties": parameters, diff --git a/pantheon/utils/model_selector.py b/pantheon/utils/model_selector.py index 8d8886a1..42eef115 100644 
--- a/pantheon/utils/model_selector.py +++ b/pantheon/utils/model_selector.py @@ -60,7 +60,7 @@ class CustomEndpointConfig: # Built-in defaults based on February 2026 flagship models # Users can override in settings.json -DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot"] +DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot", "qwen", "groq", "mistral", "together_ai", "openrouter"] # Quality levels map to MODEL LISTS (not single models) for fallback chains # Models within each level are ordered by preference @@ -137,9 +137,44 @@ class CustomEndpointConfig: "normal": ["moonshot/kimi-k2.5", "moonshot/kimi-k2-0905-preview"], "low": ["moonshot/kimi-k2.5", "moonshot/kimi-k2-0905-preview"], }, + # Qwen (DashScope): Qwen3/QwQ series + # https://help.aliyun.com/zh/model-studio/ + "qwen": { + "high": ["qwen/qwen3-235b-a22b", "qwen/qwen-max", "qwen/qwq-plus"], + "normal": ["qwen/qwen3-32b", "qwen/qwen-plus"], + "low": ["qwen/qwen3-30b-a3b", "qwen/qwen-turbo"], + }, + # Groq: Ultra-fast inference + # https://console.groq.com/docs/models + "groq": { + "high": ["groq/openai/gpt-oss-120b", "groq/llama-3.3-70b-versatile"], + "normal": ["groq/openai/gpt-oss-20b", "groq/qwen/qwen3-32b", "groq/meta-llama/llama-4-scout-17b-16e-instruct"], + "low": ["groq/llama-3.1-8b-instant"], + }, + # Mistral AI + # https://docs.mistral.ai/getting-started/models + "mistral": { + "high": ["mistral/mistral-large-latest", "mistral/mistral-medium-latest"], + "normal": ["mistral/mistral-small-latest", "mistral/codestral-latest"], + "low": ["mistral/open-mistral-nemo"], + }, + # Together AI: Open-source model hosting + # https://docs.together.ai/docs/serverless-models + "together_ai": { + "high": ["together_ai/Qwen/Qwen3.5-397B-A17B", "together_ai/deepseek-ai/DeepSeek-V3.1"], + "normal": ["together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"], + "low": 
["together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"], + }, + # OpenRouter: Multi-provider aggregator + # https://openrouter.ai/models + "openrouter": { + "high": ["openrouter/anthropic/claude-sonnet-4-6"], + "normal": ["openrouter/google/gemini-2.5-flash", "openrouter/deepseek/deepseek-chat"], + "low": ["openrouter/meta-llama/llama-3.3-70b-instruct"], + }, } -# Capability tags map to litellm's supports_* fields +# Capability tags map to catalog supports_* fields CAPABILITY_MAP = { "vision": "supports_vision", "reasoning": "supports_reasoning", @@ -180,6 +215,7 @@ class CustomEndpointConfig: "minimax": "MINIMAX_API_KEY", "zai": "ZAI_API_KEY", "moonshot": "MOONSHOT_API_KEY", + "qwen": "DASHSCOPE_API_KEY", } # ============ Image Generation Model Defaults ============ @@ -315,7 +351,7 @@ def _get_provider_models(self, provider: str) -> dict[str, list[str]]: Returns: Dict mapping quality levels to model lists """ - # Custom endpoints don't have predefined model lists in litellm + # Custom endpoints don't have predefined model lists # They use environment-specified models instead if provider in CUSTOM_ENDPOINT_ENVS: return {} @@ -331,11 +367,11 @@ def _get_provider_models(self, provider: str) -> dict[str, list[str]]: merged = {**default_config, **user_config} return merged - # No configuration - auto-generate from litellm + # No configuration - auto-generate from catalog return self._auto_generate_provider_config(provider) def _auto_generate_provider_config(self, provider: str) -> dict[str, list[str]]: - """Auto-generate provider config from litellm (sorted by price). + """Auto-generate provider config from catalog (sorted by price). Used when provider has API key but no configuration. 
@@ -345,28 +381,25 @@ def _auto_generate_provider_config(self, provider: str) -> dict[str, list[str]]: Returns: Dict mapping quality levels to model lists """ - try: - from litellm import models_by_provider - from litellm.utils import get_model_info - except ImportError: - logger.warning("litellm not available for auto-generation") - return {} + from pantheon.utils.provider_registry import models_by_provider as get_models, get_model_info logger.warning( - f"Provider '{provider}' not configured. Auto-generating from litellm. " + f"Provider '{provider}' not configured. Auto-generating from catalog. " f"Consider adding it to settings.json models.provider_models for better control." ) - if provider not in models_by_provider: - logger.warning(f"Provider '{provider}' not found in litellm") + all_models = get_models(provider) + if not all_models: + logger.warning(f"Provider '{provider}' not found in catalog") return {} # Collect chat models with prices models_with_prices: list[tuple[str, float]] = [] - for model in models_by_provider[provider]: + for model in all_models: try: info = get_model_info(model) - if info.get("mode") == "chat": + mode = info.get("mode", "chat") + if mode in ("chat", None): input_cost = info.get("input_cost_per_token", 0) or 0 models_with_prices.append((model, input_cost)) except Exception: @@ -411,13 +444,12 @@ def _check_model_capability(self, model: str, capability: str) -> bool: return False try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info info = get_model_info(model) - litellm_field = CAPABILITY_MAP[capability] - return bool(info.get(litellm_field)) + field = CAPABILITY_MAP[capability] + return bool(info.get(field)) except Exception: - # If we can't check, assume it doesn't support return False def resolve_model(self, tag: str) -> list[str]: diff --git a/pantheon/utils/provider_registry.py b/pantheon/utils/provider_registry.py new file mode 100644 index 00000000..99ac0ed8 --- 
/dev/null +++ b/pantheon/utils/provider_registry.py @@ -0,0 +1,268 @@ +""" +Provider registry — loads the catalog and exposes model metadata helpers. + +Replaces litellm.utils.get_model_info, litellm.completion_cost, +litellm.utils.token_counter, and litellm.models_by_provider. +""" + +import json +from functools import lru_cache +from pathlib import Path +from typing import Any + +from .log import logger + +# ============ Catalog Loading ============ + +_CATALOG_PATH = Path(__file__).parent / "llm_catalog.json" + +# Default metadata for unknown models +_DEFAULT_MODEL_INFO = { + "max_input_tokens": 200_000, + "max_output_tokens": 32_000, + "input_cost_per_million": 1.0, + "output_cost_per_million": 5.0, + "supports_vision": False, + "supports_function_calling": True, + "supports_response_schema": False, + "supports_reasoning": False, + "supports_audio_input": False, + "supports_audio_output": False, + "supports_web_search": False, + "supports_pdf_input": False, + "supports_computer_use": False, + "supports_assistant_prefill": False, +} + + +@lru_cache(maxsize=1) +def load_catalog() -> dict: + """Load and cache the provider catalog from llm_catalog.json.""" + try: + with open(_CATALOG_PATH, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.warning(f"Failed to load LLM catalog: {e}") + return {"version": 1, "providers": {}} + + +def reload_catalog() -> dict: + """Force-reload the catalog (clears cache). For testing.""" + load_catalog.cache_clear() + return load_catalog() + + +# ============ Provider Resolution ============ + + +def _parse_model_string(model: str) -> tuple[str | None, str]: + """Parse 'provider/model_name' into (provider, model_name). + + Returns (None, model) if no provider prefix. 
+ """ + if "/" in model: + provider, model_name = model.split("/", 1) + return provider.lower(), model_name + return None, model + + +def find_provider_for_model(model: str) -> tuple[str, str, dict]: + """Given a model string, return (provider_key, model_name, provider_config). + + Tries: + 1. Explicit prefix: 'anthropic/claude-sonnet-4-6' → provider='anthropic' + 2. Search all providers for a matching model name + + Returns ('unknown', model, {}) if not found. + """ + catalog = load_catalog() + providers = catalog.get("providers", {}) + + # 1. Explicit prefix + prefix, model_name = _parse_model_string(model) + if prefix and prefix in providers: + return prefix, model_name, providers[prefix] + + # 2. Search all providers for bare model name + for pkey, pconfig in providers.items(): + if model_name in pconfig.get("models", {}): + return pkey, model_name, pconfig + + # 3. Not found — return with empty config + return prefix or "unknown", model_name, {} + + +def get_provider_config(provider: str) -> dict: + """Get provider configuration from catalog.""" + catalog = load_catalog() + return catalog.get("providers", {}).get(provider, {}) + + +# ============ Model Metadata ============ + + +def get_model_info(model: str) -> dict: + """Get model metadata from the catalog. + + Drop-in replacement for litellm.utils.get_model_info(). + + Args: + model: Model string, e.g. 'anthropic/claude-sonnet-4-6' or 'gpt-5.4' + + Returns: + Dict with max_input_tokens, max_output_tokens, pricing, supports_*, etc. + Returns defaults for unknown models. 
+ """ + provider_key, model_name, provider_config = find_provider_for_model(model) + models = provider_config.get("models", {}) + + if model_name in models: + info = {**_DEFAULT_MODEL_INFO, **models[model_name]} + # Ensure per-token fields exist for backward compat + if "input_cost_per_token" not in info: + info["input_cost_per_token"] = info.get("input_cost_per_million", 1.0) / 1_000_000 + if "output_cost_per_token" not in info: + info["output_cost_per_token"] = info.get("output_cost_per_million", 5.0) / 1_000_000 + return info + + logger.debug(f"Model '{model}' not found in catalog, using defaults") + info = dict(_DEFAULT_MODEL_INFO) + info["input_cost_per_token"] = info["input_cost_per_million"] / 1_000_000 + info["output_cost_per_token"] = info["output_cost_per_million"] / 1_000_000 + return info + + +# ============ Cost Calculation ============ + + +def completion_cost( + completion_response: Any = None, + model: str | None = None, + prompt_tokens: int = 0, + completion_tokens: int = 0, +) -> float: + """Calculate completion cost from response or explicit token counts. + + Drop-in replacement for litellm.completion_cost(). 
+ """ + # Extract from response object if provided + if completion_response is not None: + usage = getattr(completion_response, "usage", None) + if usage: + prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(usage, "completion_tokens", 0) or 0 + + # Try to get model from response + if model is None: + model = getattr(completion_response, "model", None) or "" + + if not model: + # Fallback pricing: $1/1M input, $5/1M output + return (prompt_tokens * 1.0 + completion_tokens * 5.0) / 1_000_000 + + info = get_model_info(model) + input_cost = info.get("input_cost_per_token", 1.0 / 1_000_000) + output_cost = info.get("output_cost_per_token", 5.0 / 1_000_000) + + return prompt_tokens * input_cost + completion_tokens * output_cost + + +# ============ Model Listing ============ + + +def models_by_provider(provider: str) -> list[str]: + """List all model names for a provider. + + Drop-in replacement for litellm.models_by_provider[provider]. + """ + catalog = load_catalog() + provider_config = catalog.get("providers", {}).get(provider, {}) + models = provider_config.get("models", {}) + + # Return as 'provider/model_name' format + return [f"{provider}/{name}" for name in models] + + +# ============ Token Counting ============ + + +def token_counter( + model: str, + messages: list[dict] | None = None, + tools: list[dict] | None = None, +) -> int: + """Count tokens for messages and tools. + + Uses tiktoken when available, falls back to heuristic estimation. 
+ """ + total = 0 + + # Try tiktoken first (works for OpenAI models) + try: + import tiktoken + + # Map model to encoding + try: + encoding = tiktoken.encoding_for_model(model.split("/")[-1]) + except KeyError: + encoding = tiktoken.get_encoding("cl100k_base") + + for msg in messages or []: + # Per-message overhead + total += 4 # role + content framing + content = msg.get("content", "") + if isinstance(content, str): + total += len(encoding.encode(content)) + elif isinstance(content, list): + for part in content: + if isinstance(part, dict): + text = part.get("text", "") + if text: + total += len(encoding.encode(text)) + # Image tokens: rough estimate + if part.get("type") == "image_url": + total += 765 # ~average image token cost + + if tools: + total += len(encoding.encode(json.dumps(tools))) + + return total + + except (ImportError, Exception): + pass + + # Fallback: heuristic estimation + for msg in messages or []: + total += 4 + content = msg.get("content", "") + if isinstance(content, str): + total += _heuristic_token_count(content) + elif isinstance(content, list): + for part in content: + if isinstance(part, dict) and "text" in part: + total += _heuristic_token_count(part["text"]) + + if tools: + total += _heuristic_token_count(json.dumps(tools)) + + return total + + +def _heuristic_token_count(text: str) -> int: + """Estimate token count with language-aware heuristics.""" + if not text: + return 0 + + cjk_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff' + or '\u3040' <= c <= '\u30ff' + or '\uac00' <= c <= '\ud7af') + ascii_chars = sum(1 for c in text if c.isascii()) + other_chars = len(text) - cjk_chars - ascii_chars + + tokens = ( + cjk_chars * 0.6 + # CJK: ~1.7 chars per token + ascii_chars * 0.25 + # ASCII: ~4 chars per token + other_chars * 0.5 # Other: ~2 chars per token + ) + + return max(1, int(tokens)) diff --git a/pyproject.toml b/pyproject.toml index 2b4a4bbe..3af124ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,11 @@ 
dependencies = [ "executor-engine>=0.3.3", "fire>=0.7.0", "funcdesc>=0.1.8", - "litellm>=1.81.3", + "anthropic>=0.40.0", + "google-genai>=1.0.0", "loguru>=0.7.3", "openai>=2.0.0", + "tiktoken>=0.7.0", "pillow>=10.4.0", "rich>=14.0.0", "rich-pyfiglet", @@ -61,6 +63,16 @@ dependencies = [ "tree-sitter-javascript>=0.24.0", # Server "nats-server-bin>=2.10.0", + # Claw (multi-channel gateway) + "slack-sdk", + "slack-bolt", + "python-telegram-bot>=21.3", + "discord.py>=2.5", + "lark-oapi", + "requests>=2.0", + "cryptography>=42.0.0", + "websocket-client>=1.8", + "qrcode", ] dynamic = ["version"] @@ -95,23 +107,6 @@ dev = [ "pytest-asyncio>=0.25.0", "pytest-timeout>=2.3.1", ] -claw = [ - "slack-sdk", - "slack-bolt", - "python-telegram-bot>=21.3", - "discord.py>=2.5", - "lark-oapi", - "requests>=2.0", - "pillow>=9.0", - "nbformat>=5.0", - "cryptography>=42.0.0", - "websocket-client>=1.8", - "qrcode" -] -slack = [ - "slack-sdk", - "slack-bolt", -] r = [ # R language support for notebooks (requires R installed on system) "rpy2>=3.5.0", diff --git a/tests/test_model_selector.py b/tests/test_model_selector.py index 7d5b8392..9a9ff31c 100644 --- a/tests/test_model_selector.py +++ b/tests/test_model_selector.py @@ -288,8 +288,10 @@ def test_auto_generate_for_unknown_provider(self, mock_settings): """Test auto-generation for provider not in defaults.""" selector = ModelSelector(mock_settings) - # Mock litellm - imports are inside the method so patch at litellm level - mock_models_by_provider = {"custom_provider": ["model1", "model2", "model3"]} + # Mock provider_registry functions + mock_get_models = MagicMock( + return_value=["custom_provider/model1", "custom_provider/model2", "custom_provider/model3"] + ) mock_model_info = MagicMock( return_value={ "mode": "chat", @@ -299,11 +301,11 @@ def test_auto_generate_for_unknown_provider(self, mock_settings): with ( patch( - "litellm.models_by_provider", - mock_models_by_provider, + "pantheon.utils.provider_registry.models_by_provider", 
+ mock_get_models, ), patch( - "litellm.utils.get_model_info", + "pantheon.utils.provider_registry.get_model_info", mock_model_info, ), ): diff --git a/tests/test_provider_adapters.py b/tests/test_provider_adapters.py new file mode 100644 index 00000000..6c68a122 --- /dev/null +++ b/tests/test_provider_adapters.py @@ -0,0 +1,237 @@ +""" +Integration tests for provider adapters — verifies every model in DEFAULT_PROVIDER_MODELS works. + +Requires API keys in .env file. +""" + +import os +import sys +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +# Load .env +_env_path = os.path.join(os.path.dirname(__file__), "..", ".env") +if os.path.exists(_env_path): + with open(_env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip()) + +from pantheon.utils.provider_registry import ( + load_catalog, + find_provider_for_model, + get_model_info, + completion_cost, + models_by_provider, + token_counter, +) +from pantheon.utils.adapters import get_adapter +from pantheon.utils.llm import stream_chunk_builder +from pantheon.utils.model_selector import DEFAULT_PROVIDER_MODELS, PROVIDER_API_KEYS +from pantheon.utils.llm_providers import is_responses_api_model, detect_provider + + +# ============ provider_registry unit tests ============ + + +class TestProviderRegistry: + + def test_load_catalog(self): + cat = load_catalog() + assert cat["version"] == 1 + assert len(cat["providers"]) >= 8 + + def test_find_provider_with_prefix(self): + p, m, c = find_provider_for_model("anthropic/claude-sonnet-4-6") + assert p == "anthropic" + assert m == "claude-sonnet-4-6" + assert c["sdk"] == "anthropic" + + def test_find_provider_openai_compat(self): + p, m, c = find_provider_for_model("deepseek/deepseek-chat") + assert p == "deepseek" + assert c["sdk"] == "openai" + + def test_find_provider_qwen(self): + p, m, c = 
find_provider_for_model("qwen/qwen3-235b-a22b") + assert p == "qwen" + assert c["api_key_env"] == "DASHSCOPE_API_KEY" + + def test_find_provider_unknown(self): + p, m, c = find_provider_for_model("unknown/some-model") + assert p == "unknown" + assert c == {} + + def test_get_model_info_known(self): + info = get_model_info("anthropic/claude-opus-4-6") + assert info["max_input_tokens"] == 1_000_000 + assert info["supports_vision"] is True + + def test_get_model_info_unknown_returns_defaults(self): + info = get_model_info("fake/nonexistent-model") + assert info["max_input_tokens"] == 200_000 + + def test_completion_cost(self): + cost = completion_cost(model="openai/gpt-5.4", prompt_tokens=1_000_000, completion_tokens=100_000) + assert abs(cost - 2.8) < 0.01 + + def test_models_by_provider(self): + models = models_by_provider("anthropic") + assert len(models) == 7 + + def test_models_by_provider_qwen(self): + models = models_by_provider("qwen") + assert len(models) == 9 + + def test_token_counter_basic(self): + count = token_counter(model="gpt-4", messages=[{"role": "user", "content": "Hello"}]) + assert count > 0 + + def test_all_default_models_in_catalog(self): + """Every model in DEFAULT_PROVIDER_MODELS should exist in the catalog.""" + cat = load_catalog() + all_catalog_models = set() + for prov, cfg in cat["providers"].items(): + for m in cfg.get("models", {}): + all_catalog_models.add(f"{prov}/{m}") + + missing = [] + for provider, levels in DEFAULT_PROVIDER_MODELS.items(): + for level, models in levels.items(): + for model in models: + if model not in all_catalog_models: + missing.append(model) + assert missing == [], f"Models in selector but not in catalog: {missing}" + + +# ============ stream_chunk_builder unit tests ============ + + +class TestStreamChunkBuilder: + + def test_text_chunks(self): + chunks = [ + {"choices": [{"index": 0, "delta": {"role": "assistant", "content": "Hello"}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": 
{"content": " world"}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}, + {"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, "choices": []}, + ] + resp = stream_chunk_builder(chunks) + msg = resp.choices[0].message.model_dump() + assert msg["content"] == "Hello world" + assert resp.usage.prompt_tokens == 10 + + def test_tool_call_chunks(self): + chunks = [ + {"choices": [{"index": 0, "delta": {"role": "assistant", "tool_calls": [ + {"index": 0, "id": "call_1", "type": "function", "function": {"name": "test", "arguments": '{"a":'}} + ]}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {"tool_calls": [ + {"index": 0, "function": {"arguments": ' 1}'}} + ]}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {}, "finish_reason": "tool_calls"}]}, + {"usage": {"prompt_tokens": 20, "completion_tokens": 10, "total_tokens": 30}, "choices": []}, + ] + resp = stream_chunk_builder(chunks) + msg = resp.choices[0].message.model_dump() + assert msg["tool_calls"][0]["function"]["arguments"] == '{"a": 1}' + + def test_empty_chunks(self): + resp = stream_chunk_builder([]) + msg = resp.choices[0].message.model_dump() + assert msg["content"] is None + + +# ============ Real API: test every model in DEFAULT_PROVIDER_MODELS ============ + +SIMPLE_MESSAGES = [{"role": "user", "content": "Say 'hello' and nothing else."}] + + +def _has_key(provider: str) -> bool: + env_var = PROVIDER_API_KEYS.get(provider, "") + return bool(os.environ.get(env_var, "")) + + +def _get_all_models(): + """Collect unique (provider, model) pairs from DEFAULT_PROVIDER_MODELS, excluding qwen (no valid key).""" + seen = set() + result = [] + for provider, levels in DEFAULT_PROVIDER_MODELS.items(): + if provider == "qwen": + continue # Skip: no valid API key available + for level, models in levels.items(): + for model in models: + if model not in seen: + seen.add(model) + result.append((provider, model)) + return 
result + + +ALL_MODELS = _get_all_models() + + +@pytest.mark.parametrize("provider,model", ALL_MODELS, ids=[m for _, m in ALL_MODELS]) +@pytest.mark.asyncio +async def test_model_completion(provider, model): + """Test that each model in DEFAULT_PROVIDER_MODELS can complete a simple prompt. + + Automatically detects whether to use Chat Completions or Responses API. + """ + env_var = PROVIDER_API_KEYS.get(provider, "") + api_key = os.environ.get(env_var, "") + if not api_key: + pytest.skip(f"{env_var} not set") + + provider_key, model_name, provider_config = find_provider_for_model(model) + sdk_type = provider_config.get("sdk", "openai") + base_url = provider_config.get("base_url") + + # Check if this model needs Responses API + config = detect_provider(model, force_litellm=False) + uses_responses_api = is_responses_api_model(config) + + adapter = get_adapter("openai" if uses_responses_api else sdk_type) + + if uses_responses_api: + # Responses API path + bare_model = model_name.split("/")[-1] if "/" in model_name else model_name + msg = await adapter.acompletion_responses( + model=bare_model, + messages=SIMPLE_MESSAGES, + base_url=base_url, + api_key=api_key, + max_output_tokens=2048, + ) + content = msg.get("content") or "" + assert len(content.strip()) > 0, f"{model}: got empty content, full msg={msg}" + print(f" [{provider}] {model} (responses): {content[:80]!r}") + else: + # Chat Completions path + info = get_model_info(model) + is_reasoning = info.get("supports_reasoning", False) + + extra_kwargs = {} + if sdk_type == "anthropic": + extra_kwargs["max_tokens"] = 1024 if is_reasoning else 128 + elif sdk_type == "openai" and provider_key == "openai": + extra_kwargs["max_completion_tokens"] = 2048 + elif sdk_type != "google-genai": + extra_kwargs["max_tokens"] = 1024 if is_reasoning else 128 + + chunks = await adapter.acompletion( + model=model_name, + messages=SIMPLE_MESSAGES, + base_url=base_url, + api_key=api_key, + num_retries=2, + **extra_kwargs, + ) + resp = 
stream_chunk_builder(chunks) + msg = resp.choices[0].message.model_dump() + + content = msg.get("content") or "" + assert len(content.strip()) > 0, f"{model}: got empty content, full msg={msg}" + print(f" [{provider}] {model}: {content[:80]!r}") diff --git a/tests/test_scfm_router_real_queries.py b/tests/test_scfm_router_real_queries.py index 54f59aa1..a191731e 100644 --- a/tests/test_scfm_router_real_queries.py +++ b/tests/test_scfm_router_real_queries.py @@ -95,17 +95,19 @@ def has_api_key_for_model(model: str) -> bool: async def create_real_call_agent(): """ - Create a real _call_agent function using LiteLLM. + Create a real _call_agent function using provider adapters. Returns an async function compatible with the router's _call_agent interface. """ - import litellm + from pantheon.utils.adapters import get_adapter + from pantheon.utils.provider_registry import find_provider_for_model + from pantheon.utils.llm import stream_chunk_builder model = get_test_model() async def _call_agent(messages, system_prompt=None, model_override=None, **kwargs): """ - Call LLM via LiteLLM. + Call LLM via provider adapter. 
Args: messages: List of message dicts with 'role' and 'content' @@ -125,11 +127,18 @@ async def _call_agent(messages, system_prompt=None, model_override=None, **kwarg full_messages.extend(messages) try: - response = await litellm.acompletion( - model=actual_model, + provider_key, model_name, provider_config = find_provider_for_model(actual_model) + adapter = get_adapter(provider_config.get("sdk", "openai")) + import os + api_key = os.environ.get(provider_config.get("api_key_env", ""), "") + chunks = await adapter.acompletion( + model=model_name, messages=full_messages, - temperature=0.0, # Deterministic for testing + base_url=provider_config.get("base_url"), + api_key=api_key, + temperature=0.0, ) + response = stream_chunk_builder(chunks) return { "success": True, "response": response.choices[0].message.content, From a033ac2b3ae3a00ade39541c3a0e384e635b116c Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 00:09:45 -0700 Subject: [PATCH 02/13] refactor: remove all litellm naming from codebase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up all remaining litellm references in variable names, function names, enum values, parameters, comments, and documentation: - ProviderType.LITELLM → ProviderType.NATIVE - force_litellm parameter → relaxed_schema (Agent, detect_provider) - acompletion_litellm() → acompletion() - litellm_mode parameter → removed (only relaxed_schema remains) - _convert_functions(litellm_mode=) → _convert_functions(relaxed_schema=) - get_litellm_proxy_kwargs() backward-compat alias deleted - litellm_model variable → resolved_model - All comments and docstrings updated - Documentation updated (agent.rst, utils.rst, models.rst, etc.) - Test names updated (test_agent_force_litellm → test_agent_relaxed_schema) Only remaining "LITELLM" references are env var names in get_proxy_kwargs() for backward compatibility (LITELLM_PROXY_ENABLED/URL/KEY). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/source/api/agent.rst | 4 +- docs/source/api/utils.rst | 2 +- docs/source/concepts.md | 2 +- docs/source/configuration/models.rst | 8 +-- docs/source/configuration/settings.rst | 2 +- docs/source/getting-started/installation.rst | 4 +- docs/source/interfaces/api/agent.rst | 2 +- docs/source/toolsets/image_generation.rst | 2 +- pantheon/agent.py | 18 ++--- pantheon/chatroom/start.py | 2 +- pantheon/factory/templates/.env.example | 4 +- pantheon/internal/compression/compressor.py | 2 +- pantheon/providers.py | 4 +- pantheon/settings.py | 6 +- pantheon/toolsets/image/image_gen.py | 2 +- pantheon/utils/adapters/base.py | 2 +- pantheon/utils/llm.py | 6 +- pantheon/utils/llm_providers.py | 72 ++++++++------------ pantheon/utils/log.py | 2 +- pantheon/utils/misc.py | 4 +- pantheon/utils/provider_registry.py | 10 +-- tests/test_agent.py | 4 +- tests/test_provider_adapters.py | 2 +- tests/test_responses_api.py | 14 ++-- tests/test_scfm_router_real_queries.py | 6 +- 25 files changed, 86 insertions(+), 100 deletions(-) diff --git a/docs/source/api/agent.rst b/docs/source/api/agent.rst index 74812fe7..9d7233aa 100644 --- a/docs/source/api/agent.rst +++ b/docs/source/api/agent.rst @@ -51,9 +51,9 @@ Constructor Parameters * - tool_timeout - int - Tool execution timeout in seconds (default: 600) - * - force_litellm + * - relaxed_schema - bool - - Force use of litellm backend (default: False) + - Use relaxed (non-strict) tool schema mode (default: False) * - max_tool_content_length - int | None - Maximum length for tool outputs (default: 100000) diff --git a/docs/source/api/utils.rst b/docs/source/api/utils.rst index e8ce128e..6019ad0f 100644 --- a/docs/source/api/utils.rst +++ b/docs/source/api/utils.rst @@ -58,7 +58,7 @@ Common Functions from pantheon.utils.llm import ( acompletion_openai, - acompletion_litellm, # adapter-based completion + acompletion, # adapter-based completion process_messages_for_model, 
remove_hidden_fields ) diff --git a/docs/source/concepts.md b/docs/source/concepts.md index cb48b2b5..60efa1d7 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -43,7 +43,7 @@ Every agent has instructions that define its behavior and personality. These ins Agents become powerful through tools - functions that extend their abilities beyond pure conversation. Tools allow agents to interact with external systems, perform calculations, access databases, browse the web, execute code, and much more. The tool system is extensible, allowing you to add custom capabilities tailored to your specific needs. #### Model Selection -Agents can use any LLM supported by LiteLLM. Model selection can be configured at the agent level or globally through settings. Fallback chains allow graceful degradation when primary models are unavailable. +Agents can use any LLM from the supported providers (OpenAI, Anthropic, Gemini, DeepSeek, and more). Model selection can be configured at the agent level or globally through settings. Fallback chains allow graceful degradation when primary models are unavailable. --- diff --git a/docs/source/configuration/models.rst b/docs/source/configuration/models.rst index 89b582ed..b52f1920 100644 --- a/docs/source/configuration/models.rst +++ b/docs/source/configuration/models.rst @@ -6,7 +6,7 @@ Configure which LLM models your agents use. Overview -------- -Pantheon uses `LiteLLM `_ as its unified LLM interface, providing access to **100+ LLM providers** through a consistent API. This means any model supported by LiteLLM works with Pantheon. +Pantheon provides a unified LLM interface with native SDK adapters, giving access to **many LLM providers** through a consistent API. Key features: @@ -119,7 +119,7 @@ Provider priority (configurable in settings.json): Supported Providers ------------------- -Pantheon supports all LiteLLM providers. Here are the most common ones: +Pantheon supports many LLM providers. 
Here are the most common ones: Major Cloud Providers ~~~~~~~~~~~~~~~~~~~~~ @@ -238,7 +238,7 @@ Chinese Providers .. note:: - For the complete list of 100+ supported providers, see the `LiteLLM Providers Documentation `_. + For additional providers, see the Pantheon documentation or configure custom endpoints. Model Format ------------ @@ -428,7 +428,7 @@ Or add to your ``.env`` / ``~/.pantheon/.env`` file: **Priority rules:** -- **Base URL**: ``OPENAI_API_BASE`` / ``LITELLM_API_BASE`` (provider-specific) > ``LLM_API_BASE`` (universal) +- **Base URL**: ``OPENAI_API_BASE`` (provider-specific) > ``LLM_API_BASE`` (universal) - **API Key (unified proxy mode)**: When ``LLM_API_BASE`` is set, ``LLM_API_KEY`` takes priority over provider-specific keys (e.g. ``OPENAI_API_KEY``). This ensures all requests to the proxy use the correct credentials. - **API Key (normal mode)**: When no ``LLM_API_BASE`` is set, provider-specific keys (e.g. ``OPENAI_API_KEY``) take priority over ``LLM_API_KEY``. diff --git a/docs/source/configuration/settings.rst b/docs/source/configuration/settings.rst index 62e0c61f..22f68a2b 100644 --- a/docs/source/configuration/settings.rst +++ b/docs/source/configuration/settings.rst @@ -61,7 +61,7 @@ See :doc:`models` for full details on custom API endpoints and priority rules. Models ~~~~~~ -Pantheon uses LiteLLM and supports smart model selection with quality tags. See :doc:`models` for full details. +Pantheon supports smart model selection with quality tags. See :doc:`models` for full details. .. code-block:: json diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index d4398f31..1b963f61 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -131,7 +131,7 @@ Core Dependencies Core dependencies are automatically installed: -- **LiteLLM** - Unified LLM API access (OpenAI, Anthropic, etc.) 
+- **Provider Adapters** - Unified LLM API access (OpenAI, Anthropic, etc.) - **Rich** - Terminal UI and formatting - **prompt-toolkit** - Interactive REPL - **NATS** - Distributed messaging @@ -155,7 +155,7 @@ Set up your LLM provider API keys: # Anthropic Claude export ANTHROPIC_API_KEY="your-anthropic-key" - # Or use other providers supported by LiteLLM + # Or use other supported providers export GEMINI_API_KEY="your-gemini-key" You can also create a ``.env`` file in your project directory: diff --git a/docs/source/interfaces/api/agent.rst b/docs/source/interfaces/api/agent.rst index 92355a6c..f239d045 100644 --- a/docs/source/interfaces/api/agent.rst +++ b/docs/source/interfaces/api/agent.rst @@ -246,7 +246,7 @@ Use provider/model format for exact model selection: agent = Agent(model="anthropic/claude-opus-4-5-20251101") agent = Agent(model="anthropic/claude-sonnet-4-5-20250929") - # Other providers (via LiteLLM) + # Other providers (via native SDK adapters) agent = Agent(model="gemini/gemini-3-pro-preview") agent = Agent(model="deepseek/deepseek-chat") agent = Agent(model="mistral/mistral-large") diff --git a/docs/source/toolsets/image_generation.rst b/docs/source/toolsets/image_generation.rst index a3440d57..de3342c6 100644 --- a/docs/source/toolsets/image_generation.rst +++ b/docs/source/toolsets/image_generation.rst @@ -115,7 +115,7 @@ Supported Models - ``dall-e-3`` - ``dall-e-2`` -- Any model supported by LiteLLM's ``aimage_generation`` API +- Any model supported by the provider adapter's ``aimage_generation`` API Model Selection --------------- diff --git a/pantheon/agent.py b/pantheon/agent.py index 1652ceb5..7ab4d8de 100644 --- a/pantheon/agent.py +++ b/pantheon/agent.py @@ -484,7 +484,7 @@ class Agent: memory: The memory to use for the agent. If not provided, a new memory will be created. tool_timeout: The timeout for the tool. (default: from settings.endpoint.local_toolset_timeout, or 3600s) - force_litellm: Whether to force using LiteLLM. 
(default: False) + relaxed_schema: Use relaxed (non-strict) tool schema mode. (default: False) max_tool_content_length: The maximum length of the tool content. (default: 100000) description: The description of the agent. (default: None) think_tool: Whether to enable the think tool for structured reasoning. (default: False) @@ -502,7 +502,7 @@ def __init__( use_memory: bool = True, memory: "Memory | None" = None, tool_timeout: int | None = None, - force_litellm: bool = False, + relaxed_schema: bool = False, max_tool_content_length: int | None = None, description: str | None = None, think_tool: bool = False, @@ -556,7 +556,7 @@ def __init__( # Input queue for run_loop() — messages/notifications enter here self.input_queue: asyncio.Queue = asyncio.Queue() self._loop_running: bool = False - self.force_litellm = force_litellm + self.relaxed_schema = relaxed_schema self.icon = icon # Provider management (MCP, ToolSet, etc.) @@ -913,7 +913,7 @@ async def get_tools_for_llm(self) -> list[dict]: """ # 1. Get tools from _base_functions (Agent's own tools - no prefix) base_tools = self._convert_functions( - litellm_mode=self.force_litellm, allow_transfer=True + relaxed_schema=self.relaxed_schema, allow_transfer=True ) # 2. 
Get tools from providers (dynamic retrieval - uses provider caching) @@ -1137,7 +1137,7 @@ async def call_tool( # ===== Legacy MCP method (deprecated, kept for backward compatibility) ===== def _convert_functions( - self, litellm_mode: bool, allow_transfer: bool + self, relaxed_schema: bool, allow_transfer: bool ) -> list[dict]: """Convert function to the format that the model can understand.""" functions = [] @@ -1160,7 +1160,7 @@ def _convert_functions( func_dict = desc_to_openai_dict( desc, skip_params=skip_params, - litellm_mode=litellm_mode, + relaxed_schema=relaxed_schema, ) functions.append(func_dict) @@ -1387,7 +1387,7 @@ async def _acompletion( messages = process_messages_for_model(messages, model) # Step 2: Detect provider and get configuration - provider_config = detect_provider(model, self.force_litellm) + provider_config = detect_provider(model, self.relaxed_schema) # Step 3: Get base URL and API key from environment if available # Skip if detect_provider already set them (e.g. OpenAI-compatible providers) @@ -1550,9 +1550,9 @@ async def _acompletion_with_models( For each model, transient errors (overloaded, rate-limit, 5xx) are retried with exponential backoff. Non-transient errors skip directly - to the next model. LiteLLM's ``num_retries`` still handles initial + to the next model. The adapter's ``num_retries`` still handles initial connection-level retries; this layer covers mid-stream failures that - LiteLLM cannot retry on its own. + the adapter cannot retry on its own. """ # --- Read retry settings (with sensible defaults) --- from .settings import get_settings diff --git a/pantheon/chatroom/start.py b/pantheon/chatroom/start.py index 24391444..34c897cc 100644 --- a/pantheon/chatroom/start.py +++ b/pantheon/chatroom/start.py @@ -250,7 +250,7 @@ async def start_services( - .env file: OPENAI_API_KEY=sk-... - settings.json api_keys section - Use LiteLLM Proxy mode for secure API key handling (LITELLM_PROXY_ENABLED environment variable). 
+ Use LLM Proxy mode for secure API key handling (LLM_PROXY_ENABLED environment variable). """ # DIAGNOSTIC: Log startup parameters for debugging logger.debug(f"[DIAGNOSTIC] start_services() called with auto_start_nats={auto_start_nats}, auto_ui={auto_ui}") diff --git a/pantheon/factory/templates/.env.example b/pantheon/factory/templates/.env.example index 8445902e..325dfdc0 100644 --- a/pantheon/factory/templates/.env.example +++ b/pantheon/factory/templates/.env.example @@ -25,8 +25,8 @@ # Advanced Configuration (Optional) # ======================================== -# Custom LiteLLM endpoint -#LITELLM_BASE_URL=https://your-litellm-proxy.com +# Custom LLM proxy endpoint +#LLM_PROXY_URL=https://your-llm-proxy.com # Debug mode #DEBUG=false diff --git a/pantheon/internal/compression/compressor.py b/pantheon/internal/compression/compressor.py index e7a9b042..efb37a22 100644 --- a/pantheon/internal/compression/compressor.py +++ b/pantheon/internal/compression/compressor.py @@ -336,7 +336,7 @@ def _count_existing_compressions(self, messages: list[dict]) -> int: return sum(1 for msg in messages if msg.get("role") == "compression") def _estimate_tokens(self, messages: list[dict]) -> int: - """Estimate token count for messages using litellm/tiktoken when available.""" + """Estimate token count for messages using tiktoken when available.""" from pantheon.utils.llm import _safe_token_counter return max(1, _safe_token_counter(model=self.model, messages=messages)) diff --git a/pantheon/providers.py b/pantheon/providers.py index fa1ec2b0..630a4473 100644 --- a/pantheon/providers.py +++ b/pantheon/providers.py @@ -414,7 +414,7 @@ async def list_tools(self) -> list[ToolInfo]: # Generate OpenAI format schema using desc_to_openai_dict oai_dict = desc_to_openai_dict( - desc, skip_params=[], litellm_mode=True + desc, skip_params=[], relaxed_schema=True ) # Extract the "function" part (without "type": "function") @@ -545,7 +545,7 @@ async def list_tools(self) -> list[ToolInfo]: # 
Generate OpenAI format schema using desc_to_openai_dict oai_dict = desc_to_openai_dict( - desc, skip_params=[], litellm_mode=True + desc, skip_params=[], relaxed_schema=True ) # Extract the "function" part (without "type": "function") diff --git a/pantheon/settings.py b/pantheon/settings.py index 2194155d..c434c55a 100644 --- a/pantheon/settings.py +++ b/pantheon/settings.py @@ -166,7 +166,7 @@ def __init__(self, work_dir: Optional[Path] = None, env_override: bool = False): - .env file: OPENAI_API_KEY=sk-... - settings.json api_keys section - Use LiteLLM Proxy mode for secure API key handling (LITELLM_PROXY_ENABLED environment variable). + Use LLM Proxy mode for secure API key handling (LLM_PROXY_ENABLED environment variable). """ from .constant import PROJECT_ROOT @@ -835,8 +835,8 @@ def get_settings( API keys should be set via environment variables, .env file, or settings.json api_keys section. - For secure API key handling, use LiteLLM Proxy mode by setting - LITELLM_PROXY_ENABLED environment variable. + For secure API key handling, use LLM Proxy mode by setting + LLM_PROXY_ENABLED environment variable. """ global _settings diff --git a/pantheon/toolsets/image/image_gen.py b/pantheon/toolsets/image/image_gen.py index 82c5f372..eb2cf844 100644 --- a/pantheon/toolsets/image/image_gen.py +++ b/pantheon/toolsets/image/image_gen.py @@ -211,7 +211,7 @@ async def _multimodal_image_gen( """Multimodal image generation (Gemini Nano Banana series). Uses chat completion API with modalities parameter to generate images. - This approach works through LiteLLM Proxy and supports image generation. + This approach works through the LLM Proxy and supports image generation. 
Supported models: - gemini-3-pro-image-preview (Nano Banana Pro, up to 4K) diff --git a/pantheon/utils/adapters/base.py b/pantheon/utils/adapters/base.py index 2a2a2d14..aca797eb 100644 --- a/pantheon/utils/adapters/base.py +++ b/pantheon/utils/adapters/base.py @@ -7,7 +7,7 @@ # ============ Unified Exception Types ============ -# These replace litellm.exceptions.* and are caught in agent.py _is_retryable_error() +# Unified exception types caught in agent.py _is_retryable_error() class LLMError(Exception): diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 53cfa315..17301b74 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -395,7 +395,7 @@ def stream_chunk_builder(chunks: list[dict]) -> Any: Aggregates content deltas, tool_call deltas, and usage from collected chunks into a SimpleNamespace that mimics the shape of a chat completion response. - Replaces litellm.stream_chunk_builder(). + Replaces the stream_chunk_builder from external dependencies. """ from types import SimpleNamespace @@ -523,7 +523,7 @@ def message_model_dump(): return resp -async def acompletion_litellm( +async def acompletion( messages: list[dict], model: str, tools: list[dict] | None = None, @@ -1328,7 +1328,7 @@ def count_tokens_in_messages( # 2. Count tokens for tools definition if tools: - # litellm token_counter handles tools definition specifically + # token_counter handles tools definition specifically tools_definition_tokens = _safe_token_counter(model=model, tools=tools) total_tokens += tools_definition_tokens diff --git a/pantheon/utils/llm_providers.py b/pantheon/utils/llm_providers.py index c7b0ad6d..e225831c 100644 --- a/pantheon/utils/llm_providers.py +++ b/pantheon/utils/llm_providers.py @@ -24,11 +24,11 @@ class ProviderType(Enum): """Supported LLM providers. OPENAI: Direct OpenAI or OpenAI-compatible providers - LITELLM: Non-OpenAI providers (anthropic, gemini, etc.) 
— legacy name kept for compat + NATIVE: Non-OpenAI providers using native SDKs (anthropic, gemini, etc.) """ OPENAI = "openai" - LITELLM = "litellm" # Kept for backward compat; means "non-openai provider" + NATIVE = "native" @dataclass @@ -39,7 +39,7 @@ class ProviderConfig: model_name: str base_url: Optional[str] = None api_key: Optional[str] = None - force_litellm: bool = False # Kept for backward compat + relaxed_schema: bool = False # OpenAI-compatible providers that need custom base_url. @@ -50,18 +50,18 @@ class ProviderConfig: # ============ Provider Detection ============ -def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: +def detect_provider(model: str, relaxed_schema: bool) -> ProviderConfig: """Detect provider from model string. Model format: - - "gpt-4" → OpenAI (via LiteLLM) - - "provider/model" → LiteLLM (handles zhipu, anthropic, etc. natively) + - "gpt-4" → OpenAI provider + - "provider/model" → Native SDK (handles anthropic, gemini, etc.) - "custom_anthropic/model" → OpenAI-compatible with CUSTOM_ANTHROPIC_* env vars - "custom_openai/model" → OpenAI-compatible with CUSTOM_OPENAI_* env vars Args: model: Model identifier string - force_litellm: Force using LiteLLM backend + relaxed_schema: Use relaxed (non-strict) tool schema mode Returns: ProviderConfig with detected provider and model name @@ -82,21 +82,21 @@ def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: base_url = os.environ.get(config.api_base_env, "") api_key = os.environ.get(config.api_key_env, "") - # Determine the litellm model format based on endpoint type - # LiteLLM needs a provider prefix to route correctly. + # Determine the resolved model format based on endpoint type. + # A provider prefix is needed to route correctly. # Explicitly passed api_key in call_llm_provider overrides env vars. 
if "anthropic" in provider_lower: - litellm_model = f"anthropic/{model_name}" + resolved_model = f"anthropic/{model_name}" else: - litellm_model = f"openai/{model_name}" + resolved_model = f"openai/{model_name}" - logger.debug(f"Using custom endpoint '{provider_lower}' with base_url={base_url}, litellm_model={litellm_model}") + logger.debug(f"Using custom endpoint '{provider_lower}' with base_url={base_url}, resolved_model={resolved_model}") return ProviderConfig( provider_type=ProviderType.OPENAI, - model_name=litellm_model, + model_name=resolved_model, base_url=base_url or None, api_key=api_key or None, - force_litellm=force_litellm, + relaxed_schema=relaxed_schema, ) if "/" in model: @@ -113,23 +113,23 @@ def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: elif provider_lower == "openai": provider_type = ProviderType.OPENAI else: - # All other prefixed models go through LiteLLM (zhipu, anthropic, etc.) - provider_type = ProviderType.LITELLM - model_name = model # Keep full model string for LiteLLM + # All other prefixed models use native SDK adapters (anthropic, gemini, etc.) + provider_type = ProviderType.NATIVE + model_name = model # Keep full model string for native adapter else: provider_type = ProviderType.OPENAI model_name = model - # Override with LiteLLM if forced - if force_litellm and provider_type != ProviderType.LITELLM: - provider_type = ProviderType.LITELLM + # Override with NATIVE if relaxed_schema is forced + if relaxed_schema and provider_type != ProviderType.NATIVE: + provider_type = ProviderType.NATIVE return ProviderConfig( provider_type=provider_type, model_name=model_name, base_url=base_url, api_key=api_key or None, - force_litellm=force_litellm, + relaxed_schema=relaxed_schema, ) @@ -290,23 +290,11 @@ def get_proxy_kwargs() -> dict: return {} -# Backward compatibility alias -def get_litellm_proxy_kwargs() -> dict: - """Backward-compatible alias for get_proxy_kwargs(). 
- - Returns keys in old format: {"api_base": ..., "api_key": ...} - """ - result = get_proxy_kwargs() - if result: - return {"api_base": result["base_url"], "api_key": result["api_key"]} - return {} - - def _extract_cost_and_usage(complete_resp: Any) -> tuple[float, dict]: """Calculate cost and extract usage from response. Cost and usage are extracted independently - cost calculation failures - (e.g., for new models not yet in litellm's price map) should not prevent + (e.g., for new models not yet in the price catalog) should not prevent usage data from being captured. """ cost = 0.0 @@ -333,7 +321,7 @@ def _extract_cost_and_usage(complete_resp: Any) -> tuple[float, dict]: except Exception as e: logger.debug(f"Cost calculation unavailable: {e}") - # Fallback: estimate cost from usage if litellm failed but we have token counts + # Fallback: estimate cost from usage if catalog lookup failed but we have token counts if cost == 0.0 and usage_dict: input_tokens = usage_dict.get("prompt_tokens", 0) output_tokens = usage_dict.get("completion_tokens", 0) @@ -470,7 +458,7 @@ async def call_llm_provider( Extracted and cleaned message dictionary """ from .llm import ( - acompletion_litellm, + acompletion, remove_metadata, ) @@ -530,7 +518,7 @@ async def call_llm_provider( ) if config.provider_type == ProviderType.OPENAI: - # LiteLLM requires explicit provider prefixes for models it cannot auto-detect. + # Provider adapters require explicit provider prefixes for models they cannot auto-detect. # Ensure OpenAI models include the provider namespace to avoid BadRequestError. 
model_name = config.model_name @@ -540,7 +528,7 @@ async def call_llm_provider( logger.debug( f"[CALL_LLM_PROVIDER] Using OpenAI provider with model={model_name}, base_url={config.base_url}" ) - complete_resp = await acompletion_litellm( + complete_resp = await acompletion( messages=clean_messages, model=model_name, tools=tools, @@ -552,11 +540,11 @@ async def call_llm_provider( ) error_prefix = "OpenAI" - else: # LITELLM + else: # NATIVE logger.debug( - f"[CALL_LLM_PROVIDER] Using LiteLLM provider with model={config.model_name}" + f"[CALL_LLM_PROVIDER] Using native provider with model={config.model_name}" ) - complete_resp = await acompletion_litellm( + complete_resp = await acompletion( messages=clean_messages, model=config.model_name, tools=tools, @@ -566,7 +554,7 @@ async def call_llm_provider( api_key=config.api_key, model_params=model_params, ) - error_prefix = "LiteLLM" + error_prefix = "Native" # Extract and clean message return extract_message_from_response(complete_resp, error_prefix) diff --git a/pantheon/utils/log.py b/pantheon/utils/log.py index 4737c054..24313d19 100644 --- a/pantheon/utils/log.py +++ b/pantheon/utils/log.py @@ -158,7 +158,7 @@ def setup_file_logging( # Warning Suppression # ============================================================================= -# Suppress aiohttp "Unclosed client session" warnings from litellm. +# Suppress aiohttp "Unclosed client session" warnings. # These warnings are harmless - the OS cleans up connections on process exit. 
warnings.filterwarnings("ignore", message="Unclosed client session", category=ResourceWarning) warnings.filterwarnings("ignore", message="Unclosed connector", category=ResourceWarning) diff --git a/pantheon/utils/misc.py b/pantheon/utils/misc.py index b84cfc77..0fe99a3f 100644 --- a/pantheon/utils/misc.py +++ b/pantheon/utils/misc.py @@ -150,11 +150,9 @@ def _strip_docstring_args(docstring: str | None) -> str: def desc_to_openai_dict( desc: Description, skip_params: List[str] = [], - litellm_mode: bool = False, relaxed_schema: bool = False, ) -> dict: - # Support both old and new parameter names - _relaxed = relaxed_schema or litellm_mode + _relaxed = relaxed_schema # Filter inputs without modifying original desc.inputs filtered_inputs = [arg for arg in desc.inputs if arg.name not in skip_params] diff --git a/pantheon/utils/provider_registry.py b/pantheon/utils/provider_registry.py index 99ac0ed8..5d92596b 100644 --- a/pantheon/utils/provider_registry.py +++ b/pantheon/utils/provider_registry.py @@ -1,8 +1,8 @@ """ Provider registry — loads the catalog and exposes model metadata helpers. -Replaces litellm.utils.get_model_info, litellm.completion_cost, -litellm.utils.token_counter, and litellm.models_by_provider. +Provides get_model_info, completion_cost, token_counter, +and models_by_provider from the local LLM catalog. """ import json @@ -104,7 +104,7 @@ def get_provider_config(provider: str) -> dict: def get_model_info(model: str) -> dict: """Get model metadata from the catalog. - Drop-in replacement for litellm.utils.get_model_info(). + Retrieves model metadata from the local catalog. Args: model: Model string, e.g. 'anthropic/claude-sonnet-4-6' or 'gpt-5.4' @@ -143,7 +143,7 @@ def completion_cost( ) -> float: """Calculate completion cost from response or explicit token counts. - Drop-in replacement for litellm.completion_cost(). + Calculates completion cost from the local catalog pricing. 
""" # Extract from response object if provided if completion_response is not None: @@ -173,7 +173,7 @@ def completion_cost( def models_by_provider(provider: str) -> list[str]: """List all model names for a provider. - Drop-in replacement for litellm.models_by_provider[provider]. + Lists all model names for a given provider from the catalog. """ catalog = load_catalog() provider_config = catalog.get("providers", {}).get(provider, {}) diff --git a/tests/test_agent.py b/tests/test_agent.py index e2d9641c..34f61d1a 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -228,11 +228,11 @@ def transfer_to_classic_literature_fan(): assert resp.to_agent == classic_literature_fan.name -async def test_agent_force_litellm(): +async def test_agent_relaxed_schema(): agent = Agent( name="test", instructions="", - force_litellm=True, + relaxed_schema=True, ) resp = await agent.run("What is the weather in Palo Alto?") diff --git a/tests/test_provider_adapters.py b/tests/test_provider_adapters.py index 6c68a122..8046ab00 100644 --- a/tests/test_provider_adapters.py +++ b/tests/test_provider_adapters.py @@ -190,7 +190,7 @@ async def test_model_completion(provider, model): base_url = provider_config.get("base_url") # Check if this model needs Responses API - config = detect_provider(model, force_litellm=False) + config = detect_provider(model, relaxed_schema=False) uses_responses_api = is_responses_api_model(config) adapter = get_adapter("openai" if uses_responses_api else sdk_type) diff --git a/tests/test_responses_api.py b/tests/test_responses_api.py index 4ef4b8f1..240ad4ac 100644 --- a/tests/test_responses_api.py +++ b/tests/test_responses_api.py @@ -40,13 +40,13 @@ def test_non_codex_openai(self): config = ProviderConfig(provider_type=ProviderType.OPENAI, model_name="gpt-4o") assert is_responses_api_model(config) is False - def test_codex_model_litellm_provider(self): - """Codex model but via LiteLLM provider should NOT use Responses API.""" - config = 
ProviderConfig(provider_type=ProviderType.LITELLM, model_name="codex-mini-latest") + def test_codex_model_native_provider(self): + """Codex model but via native provider should NOT use Responses API.""" + config = ProviderConfig(provider_type=ProviderType.NATIVE, model_name="codex-mini-latest") assert is_responses_api_model(config) is False - def test_non_codex_litellm(self): - config = ProviderConfig(provider_type=ProviderType.LITELLM, model_name="anthropic/claude-3-opus") + def test_non_codex_native(self): + config = ProviderConfig(provider_type=ProviderType.NATIVE, model_name="anthropic/claude-3-opus") assert is_responses_api_model(config) is False def test_o1_model_not_codex(self): @@ -442,7 +442,7 @@ async def test_routing_through_call_llm_provider(self): detect_provider, ) - config = detect_provider(CODEX_MODEL, force_litellm=False) + config = detect_provider(CODEX_MODEL, relaxed_schema=False) assert is_responses_api_model(config) is True result = await call_llm_provider( @@ -466,7 +466,7 @@ async def test_non_codex_does_not_use_responses_api(self): def detect_provider_for_test(model: str) -> ProviderConfig: from pantheon.utils.llm_providers import detect_provider - return detect_provider(model, force_litellm=False) + return detect_provider(model, relaxed_schema=False) # ============ Agent.run() End-to-End Tests ============ diff --git a/tests/test_scfm_router_real_queries.py b/tests/test_scfm_router_real_queries.py index a191731e..87731389 100644 --- a/tests/test_scfm_router_real_queries.py +++ b/tests/test_scfm_router_real_queries.py @@ -60,7 +60,7 @@ def get_test_model() -> str: """ Get LLM model for testing from environment. 
- Supports LiteLLM model strings: + Supports provider-prefixed model strings: - OpenAI: "gpt-4o-mini", "gpt-4o" - Anthropic: "anthropic/claude-sonnet-4-20250514", "anthropic/claude-haiku-3-5-20241022" - Gemini: "gemini/gemini-1.5-flash" @@ -648,12 +648,12 @@ def test_query_does_not_contain_model_name(self, query, data_profile, constraint @pytest.mark.live_llm class TestLiveModelSelection: """ - Live LLM tests using real API calls via LiteLLM. + Live LLM tests using real API calls via provider adapters. Supports multiple providers through environment configuration: Environment Variables: - - SCFM_TEST_MODEL: LiteLLM model string (default: "gpt-4o-mini") + - SCFM_TEST_MODEL: Model string (default: "gpt-4o-mini") - Provider API keys: OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY, etc. Example usage: From 478ba835d94bc65eea39e111e0d1962b03136361 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 00:36:24 -0700 Subject: [PATCH 03/13] fix: capture thinking/reasoning content for Anthropic and Gemini adapters Anthropic: thinking_delta events now written into collected_chunks (previously only sent via process_chunk callback, lost in stream_chunk_builder) Gemini: add include_thoughts=True to ThinkingConfig, capture thought=True parts as reasoning_content chunks (previously thinking parts were ignored) Both adapters now emit reasoning_content in the standard delta format, compatible with stream_chunk_builder's reasoning_content accumulation. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/utils/adapters/anthropic_adapter.py | 14 +++++++ pantheon/utils/adapters/gemini_adapter.py | 40 ++++++++++++++++++-- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/pantheon/utils/adapters/anthropic_adapter.py b/pantheon/utils/adapters/anthropic_adapter.py index 9d1368fa..4824a597 100644 --- a/pantheon/utils/adapters/anthropic_adapter.py +++ b/pantheon/utils/adapters/anthropic_adapter.py @@ -397,6 +397,20 @@ async def acompletion( elif delta_obj.type == "thinking_delta": thinking_text = delta_obj.thinking + + # Write into chunks so stream_chunk_builder captures it + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": thinking_text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + if process_chunk: await run_func(process_chunk, { "role": "assistant", diff --git a/pantheon/utils/adapters/gemini_adapter.py b/pantheon/utils/adapters/gemini_adapter.py index 0831ecfa..f3702077 100644 --- a/pantheon/utils/adapters/gemini_adapter.py +++ b/pantheon/utils/adapters/gemini_adapter.py @@ -223,11 +223,19 @@ async def acompletion( if modalities: config_kwargs["response_modalities"] = modalities - # Reasoning effort → thinking config + # Reasoning / thinking config reasoning_effort = kwargs.pop("reasoning_effort", None) - if reasoning_effort: + thinking = kwargs.pop("thinking", None) + if thinking and isinstance(thinking, dict): + budget = thinking.get("budget_tokens", -1) config_kwargs["thinking_config"] = types.ThinkingConfig( - thinking_budget=-1 # auto + thinking_budget=budget, + include_thoughts=True, + ) + elif reasoning_effort: + config_kwargs["thinking_config"] = types.ThinkingConfig( + thinking_budget=-1, # auto + include_thoughts=True, ) config = types.GenerateContentConfig(**config_kwargs) @@ -251,11 +259,16 @@ async def acompletion( text = "" tool_calls_data = [] + thinking_text = "" + if response.candidates: for 
candidate in response.candidates: if candidate.content and candidate.content.parts: for part in candidate.content.parts: - if hasattr(part, "text") and part.text: + if getattr(part, "thought", False) and part.text: + # Thinking/reasoning part + thinking_text += part.text + elif hasattr(part, "text") and part.text: text += part.text elif hasattr(part, "function_call") and part.function_call: fc = part.function_call @@ -296,6 +309,25 @@ async def acompletion( "content": text, }) + if thinking_text: + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": thinking_text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk: + await run_func(process_chunk, { + "role": "assistant", + "reasoning_content": thinking_text, + }) + if tool_calls_data: chunk_dict = { "choices": [{ From 9b0279990ad6fde3369efebbdc045d6db579b014 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 00:53:27 -0700 Subject: [PATCH 04/13] fix: capture Groq reasoning field in stream_chunk_builder Groq gpt-oss models use 'reasoning' (not 'reasoning_content') for thinking output. stream_chunk_builder now accumulates both field names. OpenAI gpt-5 does not expose reasoning content at all (by design). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/utils/llm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 17301b74..7928e2e0 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -426,9 +426,13 @@ def stream_chunk_builder(chunks: list[dict]) -> Any: if "content" in delta and delta["content"]: full_content += delta["content"] - # Accumulate reasoning_content (OpenAI/Zhipu/Kimi reasoning models) + # Accumulate reasoning (various field names across providers) + # - reasoning_content: DeepSeek, Zhipu, Kimi, Anthropic adapter, Gemini adapter + # - reasoning: Groq gpt-oss models if "reasoning_content" in delta and delta["reasoning_content"]: full_reasoning += delta["reasoning_content"] + elif "reasoning" in delta and delta["reasoning"]: + full_reasoning += delta["reasoning"] # Accumulate role if "role" in delta and delta["role"]: From 6d44f78c0639fb2e00ab1639277bb5c2e7a91b5d Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 01:45:12 -0700 Subject: [PATCH 05/13] feat: add Codex OAuth support (ChatGPT backend-api) New OAuth infrastructure for browser-based authentication: - pantheon/utils/oauth/codex.py: CodexOAuthManager with login(), refresh(), import_from_codex_cli(), and persistent token storage (~/.pantheon/oauth/) - OAuth 2.0 Authorization Code + PKCE flow, local callback server - Auto-refresh expired tokens, import from Codex CLI (~/.codex/auth.json) New Codex adapter: - pantheon/utils/adapters/codex_adapter.py: calls chatgpt.com/backend-api using Responses API format with OAuth bearer tokens - Handles SSE streaming, tool calls, usage extraction Integration: - llm_catalog.json: new "codex" provider with sdk="codex", auth_mode="oauth" - acompletion(): detects codex provider, auto-fetches OAuth token - call_llm_provider(): routes codex/ models to dedicated adapter - Models: gpt-5.4, gpt-5.4-mini, gpt-5.2-codex, gpt-5, o4-mini (free via OAuth) Usage: # 
Import from Codex CLI (if installed) from pantheon.utils.oauth import CodexOAuthManager CodexOAuthManager().import_from_codex_cli() # Or browser login CodexOAuthManager().login() # Then use codex/ prefix await acompletion(model="codex/gpt-5.4-mini", messages=[...]) Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/utils/adapters/__init__.py | 3 + pantheon/utils/adapters/codex_adapter.py | 276 +++++++++++++++ pantheon/utils/llm.py | 23 ++ pantheon/utils/llm_catalog.json | 60 ++++ pantheon/utils/llm_providers.py | 16 +- pantheon/utils/oauth/__init__.py | 10 + pantheon/utils/oauth/codex.py | 414 +++++++++++++++++++++++ 7 files changed, 801 insertions(+), 1 deletion(-) create mode 100644 pantheon/utils/adapters/codex_adapter.py create mode 100644 pantheon/utils/oauth/__init__.py create mode 100644 pantheon/utils/oauth/codex.py diff --git a/pantheon/utils/adapters/__init__.py b/pantheon/utils/adapters/__init__.py index 502a33ff..fa3a45ff 100644 --- a/pantheon/utils/adapters/__init__.py +++ b/pantheon/utils/adapters/__init__.py @@ -36,6 +36,9 @@ def get_adapter(sdk_type: str) -> BaseAdapter: elif sdk_type == "google-genai": from .gemini_adapter import GeminiAdapter return GeminiAdapter() + elif sdk_type == "codex": + from .codex_adapter import CodexAdapter + return CodexAdapter() else: # Default to OpenAI adapter for unknown SDK types # (many providers are OpenAI-compatible) diff --git a/pantheon/utils/adapters/codex_adapter.py b/pantheon/utils/adapters/codex_adapter.py new file mode 100644 index 00000000..f69d8d22 --- /dev/null +++ b/pantheon/utils/adapters/codex_adapter.py @@ -0,0 +1,276 @@ +""" +Codex adapter — calls OpenAI ChatGPT backend-api via OAuth tokens. + +Uses the Responses API format at https://chatgpt.com/backend-api/codex/responses. +Requires OAuth tokens from CodexOAuthManager. 
+""" + +import json +import time +import platform +from typing import Any, Callable + +from ..log import logger +from ..misc import run_func +from .base import ( + BaseAdapter, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + +CODEX_BASE_URL = "https://chatgpt.com/backend-api" + + +def _build_headers(access_token: str, account_id: str | None = None) -> dict: + """Build request headers for Codex backend-api.""" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json", + "Accept": "text/event-stream", + "OpenAI-Beta": "responses=experimental", + "originator": "pi", + "User-Agent": f"pi ({platform.system()} {platform.release()}; {platform.machine()})", + } + if account_id: + headers["chatgpt-account-id"] = account_id + return headers + + +def _convert_messages_to_responses_input(messages: list[dict]) -> tuple[str | None, list[dict]]: + """Convert Chat Completions messages to Responses API input format.""" + instructions = None + input_items = [] + + for msg in messages: + role = msg.get("role") + content = msg.get("content") + + if role == "system": + if instructions is None: + instructions = content + else: + input_items.append({"role": "developer", "content": content}) + elif role == "user": + input_items.append({"role": "user", "content": content}) + elif role == "assistant": + if content: + input_items.append({"role": "assistant", "content": content}) + for tc in msg.get("tool_calls") or []: + func = tc.get("function", {}) + input_items.append({ + "type": "function_call", + "call_id": tc["id"], + "name": func.get("name", ""), + "arguments": func.get("arguments", ""), + }) + elif role == "tool": + input_items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": content or "", + }) + + return instructions, input_items + + +def _convert_tools(tools: list[dict] | None) -> list[dict] | None: + """Convert Chat Completions tool format to 
Responses API format.""" + if not tools: + return None + converted = [] + for tool in tools: + func = tool.get("function", {}) + item = {"type": "function", "name": func.get("name", "")} + if "description" in func: + item["description"] = func["description"] + if "parameters" in func: + item["parameters"] = func["parameters"] + if "strict" in func: + item["strict"] = func["strict"] + converted.append(item) + return converted + + +class CodexAdapter(BaseAdapter): + """Adapter for OpenAI Codex via ChatGPT backend-api OAuth.""" + + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, # This is the OAuth access_token + num_retries: int = 3, + **kwargs, + ): + """Call Codex backend-api with Responses API format. + + api_key should be the OAuth access_token. + kwargs may contain 'account_id' for the chatgpt-account-id header. 
+ """ + import httpx + + access_token = api_key + if not access_token: + raise APIConnectionError("No Codex OAuth access token provided") + + account_id = kwargs.pop("account_id", None) + headers = _build_headers(access_token, account_id) + endpoint = f"{base_url or CODEX_BASE_URL}/codex/responses" + + # Convert messages + instructions, input_items = _convert_messages_to_responses_input(messages) + converted_tools = _convert_tools(tools) + + # Build request body + body: dict[str, Any] = { + "model": model, + "input": input_items, + "instructions": instructions or "You are a helpful assistant.", + "stream": True, + "store": False, + "parallel_tool_calls": True, + "include": ["reasoning.encrypted_content"], + } + if converted_tools: + body["tools"] = converted_tools + if response_format: + body["text"] = response_format + + # Map model_params (Codex backend-api has limited parameter support) + kwargs.pop("max_tokens", None) + kwargs.pop("max_completion_tokens", None) + kwargs.pop("max_output_tokens", None) + reasoning_effort = kwargs.pop("reasoning_effort", None) + if reasoning_effort: + body["reasoning"] = {"effort": reasoning_effort} + + # Stream response + text_parts = [] + tool_calls_by_id = {} + _item_to_call = {} + usage_dict = {} + cost = 0.0 + + try: + stream_start_time = time.time() + first_chunk_time = None + + async with httpx.AsyncClient(timeout=120) as client: + async with client.stream("POST", endpoint, headers=headers, json=body) as resp: + if resp.status_code == 401: + raise APIConnectionError("Codex OAuth token expired or invalid (401)") + elif resp.status_code == 429: + raise RateLimitError(f"Codex rate limited (429)") + elif resp.status_code >= 500: + raise ServiceUnavailableError(f"Codex server error ({resp.status_code})") + elif resp.status_code >= 400: + body_text = "" + async for chunk in resp.aiter_text(): + body_text += chunk + raise APIConnectionError(f"Codex error {resp.status_code}: {body_text[:300]}") + + async for line in 
resp.aiter_lines(): + if not line.startswith("data: "): + continue + data_str = line[6:] + if data_str == "[DONE]": + break + + try: + event = json.loads(data_str) + except json.JSONDecodeError: + continue + + event_type = event.get("type", "") + + if event_type == "response.output_text.delta": + delta_text = event.get("delta", "") + text_parts.append(delta_text) + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + if process_chunk: + await run_func(process_chunk, {"content": delta_text, "role": "assistant"}) + + elif event_type == "response.output_item.added": + item = event.get("item", {}) + if item.get("type") == "function_call": + call_id = item.get("call_id", "") + item_id = item.get("id", "") + _item_to_call[item_id] = call_id + tool_calls_by_id[call_id] = { + "name": item.get("name", ""), + "arguments": "", + } + + elif event_type == "response.function_call_arguments.done": + item_id = event.get("item_id", "") + call_id = _item_to_call.get(item_id, "") + if call_id and call_id in tool_calls_by_id: + tool_calls_by_id[call_id]["arguments"] = event.get("arguments", "") + if event.get("name"): + tool_calls_by_id[call_id]["name"] = event["name"] + + elif event_type == "response.completed": + if process_chunk: + await run_func(process_chunk, {"stop": True}) + # Extract usage + resp_obj = event.get("response", {}) + usage = resp_obj.get("usage", {}) + if usage: + input_tokens = usage.get("input_tokens", 0) + output_tokens = usage.get("output_tokens", 0) + usage_dict = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + } + + elif event_type == "response.failed": + error_info = event.get("response", {}).get("error", {}) + raise RuntimeError(f"Codex call failed: {error_info}") + + total_time = time.time() - stream_start_time + logger.info(f"✅ Codex stream completed: 
{total_time:.3f}s [{model}]") + + except (APIConnectionError, RateLimitError, ServiceUnavailableError): + raise + except Exception as e: + err_str = str(e).lower() + if "401" in err_str or "unauthorized" in err_str: + raise APIConnectionError(f"Codex OAuth token invalid: {e}") from e + elif "429" in err_str or "rate" in err_str: + raise RateLimitError(str(e)) from e + raise + + # Build output message (same format as acompletion_responses in llm.py) + aggregated_text = "".join(text_parts) if text_parts else None + final_tool_calls = None + if tool_calls_by_id: + final_tool_calls = [ + {"id": cid, "type": "function", "function": {"name": info["name"], "arguments": info["arguments"]}} + for cid, info in tool_calls_by_id.items() + ] + + # Cost estimation from catalog + try: + from ..provider_registry import completion_cost as calc_cost + cost = calc_cost(model=model, **usage_dict) if usage_dict else 0.0 + except Exception: + pass + + return { + "role": "assistant", + "content": aggregated_text, + "tool_calls": final_tool_calls, + "_metadata": {"_debug_cost": cost, "_debug_usage": usage_dict}, + } diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 7928e2e0..965d6c3d 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -570,6 +570,18 @@ async def acompletion( effective_api_key = proxy_kwargs.get("api_key") sdk_type = "openai" # proxy exposes OpenAI-compatible API effective_model = model # pass full model string to proxy + elif sdk_type == "codex": + # Codex OAuth: get access token from OAuth manager + from .oauth import CodexOAuthManager + oauth = CodexOAuthManager() + effective_api_key = oauth.get_access_token(auto_refresh=True) + if not effective_api_key: + raise RuntimeError( + "Codex OAuth not authenticated. 
Run CodexOAuthManager().login() " + "or import tokens from Codex CLI with CodexOAuthManager().import_from_codex_cli()" + ) + effective_base_url = provider_config.get("base_url") + effective_model = model_name else: effective_base_url = base_url or provider_config.get("base_url") effective_api_key = api_key @@ -585,6 +597,13 @@ async def acompletion( # ========== Prepare adapter kwargs ========== adapter_kwargs = dict(model_params or {}) + # Codex OAuth: pass account_id for chatgpt-account-id header + if sdk_type == "codex": + from .oauth import CodexOAuthManager + account_id = CodexOAuthManager().get_account_id() + if account_id: + adapter_kwargs["account_id"] = account_id + # Kimi Coding API gates access by User-Agent header if "kimi-for-coding" in model: adapter_kwargs.setdefault("extra_headers", {}) @@ -618,6 +637,10 @@ async def acompletion( raise # ========== Build complete response ========== + # Codex adapter returns a message dict directly (not chunks) + if sdk_type == "codex" and isinstance(collected_chunks, dict): + return collected_chunks # Already a normalized message dict + complete_resp = stream_chunk_builder(collected_chunks) # Calculate and attach cost information diff --git a/pantheon/utils/llm_catalog.json b/pantheon/utils/llm_catalog.json index d9383868..19827320 100644 --- a/pantheon/utils/llm_catalog.json +++ b/pantheon/utils/llm_catalog.json @@ -1322,6 +1322,66 @@ "supports_assistant_prefill": false } } + }, + "codex": { + "display_name": "Codex (OAuth)", + "sdk": "codex", + "base_url": "https://chatgpt.com/backend-api", + "api_key_env": "", + "openai_compatible": false, + "auth_mode": "oauth", + "models": { + "gpt-5.4": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "gpt-5.4-mini": { + "max_input_tokens": 1000000, + 
"max_output_tokens": 64000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "gpt-5.2-codex": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "gpt-5": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "o4-mini": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + } + } } } } diff --git a/pantheon/utils/llm_providers.py b/pantheon/utils/llm_providers.py index e225831c..868868cf 100644 --- a/pantheon/utils/llm_providers.py +++ b/pantheon/utils/llm_providers.py @@ -495,7 +495,21 @@ async def call_llm_provider( clean_messages = remove_metadata(clean_messages) # Call appropriate provider - # Route codex models through the OpenAI Responses API + # Route Codex OAuth models through their dedicated adapter + if "codex/" in config.model_name.lower() or config.model_name.startswith("codex/"): + from .llm import acompletion + logger.debug(f"[CALL_LLM_PROVIDER] Using Codex OAuth for model={config.model_name}") + # acompletion handles codex specially — returns message dict directly + return await acompletion( + messages=clean_messages, + model=config.model_name, + tools=tools, + response_format=response_format, + process_chunk=process_chunk, + model_params=model_params, + ) + + # Route codex/pro models through 
the OpenAI Responses API if is_responses_api_model(config): from .llm import acompletion_responses diff --git a/pantheon/utils/oauth/__init__.py b/pantheon/utils/oauth/__init__.py new file mode 100644 index 00000000..31afe3b8 --- /dev/null +++ b/pantheon/utils/oauth/__init__.py @@ -0,0 +1,10 @@ +""" +OAuth support for LLM providers. + +Currently supports: +- Codex (OpenAI ChatGPT backend-api) via browser-based OAuth 2.0 + PKCE +""" + +from .codex import CodexOAuthManager, CodexOAuthError + +__all__ = ["CodexOAuthManager", "CodexOAuthError"] diff --git a/pantheon/utils/oauth/codex.py b/pantheon/utils/oauth/codex.py new file mode 100644 index 00000000..d06838e8 --- /dev/null +++ b/pantheon/utils/oauth/codex.py @@ -0,0 +1,414 @@ +""" +OpenAI Codex OAuth — browser-based login to ChatGPT backend-api. + +Implements OAuth 2.0 Authorization Code flow with PKCE. +Tokens are stored in ~/.pantheon/oauth/codex.json. +Supports importing tokens from Codex CLI (~/.codex/auth.json). +""" + +from __future__ import annotations + +import base64 +import hashlib +import json +import os +import secrets +import threading +import time +import webbrowser +from datetime import datetime, timezone +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any, Callable, Optional +from urllib.parse import parse_qs, urlencode, urlparse + +import httpx + +from ..log import logger + +# ============ Constants ============ + +AUTH_ISSUER = "https://auth.openai.com" +CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann" +ORIGINATOR = "pi" +CALLBACK_PORT = 1455 +SCOPE = "openid profile email offline_access" +CODEX_BASE_URL = "https://chatgpt.com/backend-api" + +# Auth storage +AUTH_DIR = Path.home() / ".pantheon" / "oauth" +AUTH_FILE = AUTH_DIR / "codex.json" +CODEX_CLI_AUTH = Path.home() / ".codex" / "auth.json" + + +class CodexOAuthError(RuntimeError): + """Raised when Codex OAuth login or refresh fails.""" + + +# ============ Utility Functions 
============ + + +def _utc_now() -> str: + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + + +def _b64url(data: bytes) -> str: + return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=") + + +def _pkce_pair() -> tuple[str, str]: + """Generate PKCE verifier and challenge pair.""" + verifier = _b64url(secrets.token_bytes(32)) + challenge = _b64url(hashlib.sha256(verifier.encode("utf-8")).digest()) + return verifier, challenge + + +def _decode_jwt_payload(token: str) -> dict[str, Any]: + """Decode JWT payload without verification (for reading claims).""" + parts = (token or "").split(".") + if len(parts) != 3 or not parts[1]: + return {} + payload = parts[1] + payload += "=" * (-len(payload) % 4) + try: + decoded = base64.urlsafe_b64decode(payload.encode("ascii")) + data = json.loads(decoded.decode("utf-8")) + except Exception: + return {} + return data if isinstance(data, dict) else {} + + +def _jwt_org_context(token: str) -> dict[str, str]: + """Extract org/account/project from JWT claims.""" + payload = _decode_jwt_payload(token) + nested = payload.get("https://api.openai.com/auth") + claims = nested if isinstance(nested, dict) else {} + context = {} + for key in ("organization_id", "project_id", "chatgpt_account_id"): + value = str(claims.get(key) or "").strip() + if value: + context[key] = value + return context + + +def _token_expired(token: str, skew_seconds: int = 300) -> bool: + """Check if JWT access_token is expired (with skew).""" + payload = _decode_jwt_payload(token) + exp = payload.get("exp") + if not isinstance(exp, (int, float)): + return True + return time.time() >= (float(exp) - skew_seconds) + + +# ============ Token Exchange ============ + + +def _exchange_code(code: str, redirect_uri: str, code_verifier: str) -> dict[str, str]: + """Exchange authorization code for tokens.""" + resp = httpx.post( + f"{AUTH_ISSUER}/oauth/token", + data={ + "grant_type": "authorization_code", + "code": code, + "redirect_uri": 
redirect_uri, + "client_id": CLIENT_ID, + "code_verifier": code_verifier, + }, + timeout=30, + ) + if not resp.is_success: + raise CodexOAuthError(f"Token exchange failed: HTTP {resp.status_code} {resp.text[:300]}") + data = resp.json() + if not all(data.get(k) for k in ("id_token", "access_token", "refresh_token")): + raise CodexOAuthError("Token exchange returned incomplete credentials") + return { + "id_token": str(data["id_token"]), + "access_token": str(data["access_token"]), + "refresh_token": str(data["refresh_token"]), + } + + +def _refresh_tokens(refresh_token: str) -> dict[str, str]: + """Refresh access token using refresh token.""" + resp = httpx.post( + f"{AUTH_ISSUER}/oauth/token", + data={ + "client_id": CLIENT_ID, + "grant_type": "refresh_token", + "refresh_token": refresh_token, + }, + timeout=30, + ) + if not resp.is_success: + raise CodexOAuthError(f"Token refresh failed: HTTP {resp.status_code} {resp.text[:300]}") + data = resp.json() + access_token = str(data.get("access_token") or "").strip() + id_token = str(data.get("id_token") or "").strip() + next_refresh = str(data.get("refresh_token") or refresh_token).strip() + if not access_token or not id_token: + raise CodexOAuthError("Token refresh returned incomplete credentials") + return { + "id_token": id_token, + "access_token": access_token, + "refresh_token": next_refresh, + } + + +# ============ Callback Server ============ + + +class _CallbackHandler(BaseHTTPRequestHandler): + server_version = "PantheonOAuth/1.0" + + def do_GET(self): + parsed = urlparse(self.path) + if parsed.path != "/auth/callback": + self.send_error(404) + return + params = {k: v[-1] for k, v in parse_qs(parsed.query).items() if v} + self.server.result = params + self.server.event.set() + body = ( + "

<html><body><h1>OAuth complete</h1>" + "<p>You can close this window and return to Pantheon.</p></body></html>
" + ).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, fmt, *args): + return # Suppress HTTP server logs + + +# ============ OAuth Manager ============ + + +class CodexOAuthManager: + """Manage Codex OAuth tokens — login, refresh, import, and storage.""" + + def __init__(self, auth_file: Path | None = None): + self.auth_file = auth_file or AUTH_FILE + + # ---- Storage ---- + + def _load(self) -> dict[str, Any]: + if self.auth_file.exists(): + try: + return json.loads(self.auth_file.read_text()) + except Exception: + pass + return {} + + def _save(self, auth: dict[str, Any]) -> dict[str, Any]: + self.auth_file.parent.mkdir(parents=True, exist_ok=True) + self.auth_file.write_text(json.dumps(auth, indent=2)) + os.chmod(self.auth_file, 0o600) + return auth + + # ---- Token Access ---- + + def get_tokens(self) -> dict[str, str]: + """Get stored tokens dict.""" + return self._load().get("tokens", {}) + + def get_access_token(self, auto_refresh: bool = True) -> str | None: + """Get a valid access token, refreshing if needed.""" + tokens = self.get_tokens() + access_token = tokens.get("access_token", "") + refresh_token = tokens.get("refresh_token", "") + + if not access_token: + return None + + if auto_refresh and _token_expired(access_token) and refresh_token: + logger.info("[Codex OAuth] Access token expired, refreshing...") + try: + self.refresh() + tokens = self.get_tokens() + access_token = tokens.get("access_token", "") + except Exception as e: + logger.warning(f"[Codex OAuth] Refresh failed: {e}") + return None + + return access_token if access_token and not _token_expired(access_token) else None + + def get_account_id(self) -> str | None: + """Get ChatGPT account_id for API calls.""" + return self.get_tokens().get("account_id") or None + + def is_authenticated(self) -> bool: + """Check 
if we have a valid (or refreshable) token.""" + tokens = self.get_tokens() + access_token = tokens.get("access_token", "") + refresh_token = tokens.get("refresh_token", "") + if access_token and not _token_expired(access_token): + return True + return bool(refresh_token) + + # ---- Login Flow ---- + + def login( + self, + *, + open_browser: bool = True, + timeout_seconds: int = 300, + ) -> dict[str, Any]: + """Start browser-based OAuth login flow. + + Opens browser to OpenAI auth page. User logs in, callback + redirects to local server. Returns auth record with tokens. + """ + verifier, challenge = _pkce_pair() + state = _b64url(secrets.token_bytes(24)) + + event = threading.Event() + server = self._create_server(event) + _, port = server.server_address + redirect_uri = f"http://localhost:{port}/auth/callback" + + auth_url = ( + f"{AUTH_ISSUER}/oauth/authorize?" + + urlencode({ + "response_type": "code", + "client_id": CLIENT_ID, + "redirect_uri": redirect_uri, + "scope": SCOPE, + "code_challenge": challenge, + "code_challenge_method": "S256", + "id_token_add_organizations": "true", + "codex_cli_simplified_flow": "true", + "state": state, + "originator": ORIGINATOR, + }) + ) + + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + + try: + logger.info(f"[Codex OAuth] Opening browser for login...") + logger.info(f"[Codex OAuth] Auth URL: {auth_url}") + if open_browser: + webbrowser.open(auth_url) + + if not event.wait(timeout_seconds): + raise CodexOAuthError("Timed out waiting for OAuth callback") + + params = getattr(server, "result", {}) or {} + finally: + server.shutdown() + server.server_close() + thread.join(timeout=2) + + # Validate callback + if params.get("state") != state: + raise CodexOAuthError("OAuth callback state mismatch") + if params.get("error"): + raise CodexOAuthError(f"OAuth failed: {params.get('error_description', params['error'])}") + + code = str(params.get("code", "")).strip() + if not code: + raise 
CodexOAuthError("OAuth callback missing authorization code") + + # Exchange code for tokens + tokens = _exchange_code(code, redirect_uri, verifier) + claims = _jwt_org_context(tokens["id_token"]) + + auth = { + "provider": "codex", + "tokens": { + **tokens, + "account_id": claims.get("chatgpt_account_id"), + "organization_id": claims.get("organization_id"), + "project_id": claims.get("project_id"), + }, + "last_refresh": _utc_now(), + } + + logger.info("[Codex OAuth] Login successful") + return self._save(auth) + + # ---- Refresh ---- + + def refresh(self) -> dict[str, Any]: + """Refresh the access token using the stored refresh token.""" + auth = self._load() + tokens = auth.get("tokens", {}) + refresh_token = tokens.get("refresh_token", "") + if not refresh_token: + raise CodexOAuthError("No refresh token available") + + refreshed = _refresh_tokens(refresh_token) + claims = _jwt_org_context(refreshed["id_token"]) + + auth["tokens"] = { + **refreshed, + "account_id": claims.get("chatgpt_account_id"), + "organization_id": claims.get("organization_id"), + "project_id": claims.get("project_id"), + } + auth["last_refresh"] = _utc_now() + + logger.info("[Codex OAuth] Token refreshed successfully") + return self._save(auth) + + # ---- Import from Codex CLI ---- + + def import_from_codex_cli(self) -> dict[str, Any] | None: + """Import tokens from Codex CLI auth file (~/.codex/auth.json).""" + if not CODEX_CLI_AUTH.exists(): + logger.info(f"[Codex OAuth] Codex CLI auth not found at {CODEX_CLI_AUTH}") + return None + + try: + codex_data = json.loads(CODEX_CLI_AUTH.read_text()) + except Exception as e: + logger.warning(f"[Codex OAuth] Failed to read Codex CLI auth: {e}") + return None + + tokens = codex_data.get("tokens", {}) + access_token = tokens.get("access_token", "") + refresh_token = tokens.get("refresh_token", "") + + if not access_token and not refresh_token: + logger.info("[Codex OAuth] Codex CLI auth has no tokens") + return None + + # If token is expired, 
refresh it + if refresh_token and (not access_token or _token_expired(access_token)): + logger.info("[Codex OAuth] Codex CLI token expired, refreshing...") + refreshed = _refresh_tokens(refresh_token) + tokens = refreshed + + claims = _jwt_org_context(tokens.get("id_token", "") or tokens.get("access_token", "")) + + auth = { + "provider": "codex", + "tokens": { + **tokens, + "account_id": claims.get("chatgpt_account_id"), + "organization_id": claims.get("organization_id"), + "project_id": claims.get("project_id"), + }, + "last_refresh": _utc_now(), + "source": str(CODEX_CLI_AUTH), + } + + logger.info("[Codex OAuth] Imported tokens from Codex CLI") + return self._save(auth) + + # ---- Internal ---- + + @staticmethod + def _create_server(event: threading.Event) -> ThreadingHTTPServer: + for port in (CALLBACK_PORT, 0): + try: + server = ThreadingHTTPServer(("127.0.0.1", port), _CallbackHandler) + server.event = event + server.result = {} + return server + except OSError: + continue + raise CodexOAuthError("Could not start local OAuth callback server") From 2025c4e6024da3134e0cb593d4ac51480afb2f67 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 13:44:02 -0700 Subject: [PATCH 06/13] feat: add Codex OAuth CLI commands, NATS RPC, and model selector integration CLI commands (pantheon-chatroom oauth): - oauth status: check auth status - oauth login: browser-based OAuth login - oauth import: import from Codex CLI (~/.codex/auth.json) - oauth logout: remove stored tokens NATS RPC tools for frontend: - oauth_status(): returns all OAuth provider statuses - oauth_login(provider): start browser-based login - oauth_import(provider): import from native CLI Model selector: - Detects codex as available provider when OAuth tokens exist - Added codex to DEFAULT_PROVIDER_MODELS and PROVIDER_API_KEYS - codex/ models appear in list_available_models() when authenticated acompletion(): - Routes codex/ models through OAuth token + CodexAdapter - Passes account_id for 
chatgpt-account-id header - Returns message dict directly (no stream_chunk_builder) Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/chatroom/__main__.py | 71 ++++++++++++++++++++- pantheon/chatroom/room.py | 104 +++++++++++++++++++++++++++++++ pantheon/utils/model_selector.py | 17 ++++- 3 files changed, 190 insertions(+), 2 deletions(-) diff --git a/pantheon/chatroom/__main__.py b/pantheon/chatroom/__main__.py index ae05b79f..0911e675 100644 --- a/pantheon/chatroom/__main__.py +++ b/pantheon/chatroom/__main__.py @@ -17,6 +17,75 @@ from pantheon.repl.setup_wizard import check_and_run_setup +def oauth(action: str = "status", provider: str = "codex"): + """Manage OAuth authentication for LLM providers. + + Args: + action: One of 'login', 'import', 'status', 'logout' + provider: OAuth provider name (default: 'codex') + + Examples: + pantheon-chatroom oauth login # Browser-based login + pantheon-chatroom oauth import # Import from Codex CLI + pantheon-chatroom oauth status # Check auth status + pantheon-chatroom oauth logout # Remove stored tokens + """ + if provider != "codex": + print(f"Unsupported OAuth provider: {provider}") + print("Supported providers: codex") + return + + from pantheon.utils.oauth import CodexOAuthManager, CodexOAuthError + mgr = CodexOAuthManager() + + if action == "status": + if mgr.is_authenticated(): + account_id = mgr.get_account_id() + print(f"Codex OAuth: authenticated") + print(f" Account ID: {account_id}") + print(f" Auth file: {mgr.auth_file}") + print(f" Use model prefix: codex/gpt-5.4-mini, codex/gpt-5, etc.") + else: + print(f"Codex OAuth: not authenticated") + print(f" Run: pantheon-chatroom oauth login") + print(f" Or: pantheon-chatroom oauth import (if Codex CLI is installed)") + + elif action == "login": + print("Starting Codex OAuth login...") + print("A browser window will open. 
Please log in with your OpenAI account.") + try: + mgr.login(open_browser=True, timeout_seconds=300) + print(f"\nLogin successful!") + print(f" Account ID: {mgr.get_account_id()}") + print(f" You can now use codex/ models (e.g., codex/gpt-5.4-mini)") + except CodexOAuthError as e: + print(f"\nLogin failed: {e}") + except KeyboardInterrupt: + print("\nLogin cancelled.") + + elif action == "import": + print("Importing from Codex CLI (~/.codex/auth.json)...") + result = mgr.import_from_codex_cli() + if result: + print(f"Import successful!") + print(f" Account ID: {mgr.get_account_id()}") + else: + print(f"Import failed. Make sure Codex CLI is installed and authenticated.") + print(f" Install: npx @anthropic-ai/codex") + print(f" Or use: pantheon-chatroom oauth login") + + elif action == "logout": + if mgr.auth_file.exists(): + mgr.auth_file.unlink() + print("Codex OAuth tokens removed.") + else: + print("No Codex OAuth tokens found.") + + else: + print(f"Unknown action: {action}") + print("Actions: login, import, status, logout") + + if __name__ == "__main__": # Check for API keys and run setup wizard if none found check_and_run_setup() @@ -29,6 +98,6 @@ if len(sys.argv) == 1 or (len(sys.argv) > 1 and sys.argv[1].startswith("-")): sys.argv.insert(1, "start") fire.Fire( - {"start": start_services}, + {"start": start_services, "oauth": oauth}, name="pantheon-chatroom", ) diff --git a/pantheon/chatroom/room.py b/pantheon/chatroom/room.py index ef0287e9..7e7921d4 100644 --- a/pantheon/chatroom/room.py +++ b/pantheon/chatroom/room.py @@ -2289,3 +2289,107 @@ async def check_api_keys(self) -> dict: has_any_key = any(v["configured"] for v in keys.values()) return {"keys": keys, "has_any_key": has_any_key} + + # ============ OAuth Management ============ + + @tool + async def oauth_status(self) -> dict: + """Get OAuth authentication status for all supported providers. + + Returns: + Dict with provider statuses including authentication state and account info. 
+ """ + from pantheon.utils.oauth import CodexOAuthManager + + codex = CodexOAuthManager() + codex_authenticated = codex.is_authenticated() + codex_account_id = codex.get_account_id() if codex_authenticated else None + + return { + "providers": { + "codex": { + "authenticated": codex_authenticated, + "account_id": codex_account_id, + "description": "OpenAI Codex (ChatGPT backend-api, free with ChatGPT Plus)", + "supports_browser_login": True, + "supports_import": True, + }, + }, + } + + @tool + async def oauth_login(self, provider: str = "codex") -> dict: + """Start browser-based OAuth login flow. + + Opens the system browser for the user to authenticate. + Token is saved automatically after successful login. + + Args: + provider: OAuth provider name (currently only 'codex' supported) + + Returns: + Dict with success status and account info. + """ + if provider != "codex": + return {"success": False, "error": f"Unsupported OAuth provider: {provider}"} + + from pantheon.utils.oauth import CodexOAuthManager, CodexOAuthError + + try: + mgr = CodexOAuthManager() + mgr.login(open_browser=True, timeout_seconds=300) + + # Reload settings so model selector detects new provider + from pantheon.utils.model_selector import reset_model_selector + reset_model_selector() + + return { + "success": True, + "provider": "codex", + "account_id": mgr.get_account_id(), + "message": "Codex OAuth login successful. You can now use codex/ models.", + } + except CodexOAuthError as e: + return {"success": False, "error": str(e)} + except Exception as e: + logger.error(f"OAuth login failed: {e}") + return {"success": False, "error": str(e)} + + @tool + async def oauth_import(self, provider: str = "codex") -> dict: + """Import OAuth tokens from native CLI tools. + + For Codex: imports from ~/.codex/auth.json (Codex CLI). + + Args: + provider: OAuth provider name (currently only 'codex' supported) + + Returns: + Dict with success status. 
+ """ + if provider != "codex": + return {"success": False, "error": f"Unsupported OAuth provider: {provider}"} + + from pantheon.utils.oauth import CodexOAuthManager + + try: + mgr = CodexOAuthManager() + result = mgr.import_from_codex_cli() + + if result: + from pantheon.utils.model_selector import reset_model_selector + reset_model_selector() + return { + "success": True, + "provider": "codex", + "account_id": mgr.get_account_id(), + "message": "Imported Codex CLI tokens successfully.", + } + else: + return { + "success": False, + "error": "No Codex CLI auth found (~/.codex/auth.json). Install Codex CLI or use browser login.", + } + except Exception as e: + logger.error(f"OAuth import failed: {e}") + return {"success": False, "error": str(e)} diff --git a/pantheon/utils/model_selector.py b/pantheon/utils/model_selector.py index 42eef115..5afef22b 100644 --- a/pantheon/utils/model_selector.py +++ b/pantheon/utils/model_selector.py @@ -60,7 +60,7 @@ class CustomEndpointConfig: # Built-in defaults based on February 2026 flagship models # Users can override in settings.json -DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot", "qwen", "groq", "mistral", "together_ai", "openrouter"] +DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot", "qwen", "groq", "mistral", "together_ai", "openrouter", "codex"] # Quality levels map to MODEL LISTS (not single models) for fallback chains # Models within each level are ordered by preference @@ -165,6 +165,12 @@ class CustomEndpointConfig: "normal": ["together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"], "low": ["together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"], }, + # Codex: OpenAI via ChatGPT OAuth (free with ChatGPT Plus) + "codex": { + "high": ["codex/gpt-5.4", "codex/gpt-5.2-codex"], + "normal": ["codex/gpt-5.4-mini", "codex/gpt-5"], + "low": ["codex/gpt-5.4-mini", "codex/o4-mini"], + }, # OpenRouter: Multi-provider 
aggregator # https://openrouter.ai/models "openrouter": { @@ -216,6 +222,7 @@ class CustomEndpointConfig: "zai": "ZAI_API_KEY", "moonshot": "MOONSHOT_API_KEY", "qwen": "DASHSCOPE_API_KEY", + "codex": "", # OAuth-based, no env var key } # ============ Image Generation Model Defaults ============ @@ -271,6 +278,14 @@ def _get_available_providers(self) -> set[str]: if os.environ.get(config.api_key_env, ""): self._available_providers.add(provider_key) + # Check OAuth providers (e.g., Codex) + try: + from pantheon.utils.oauth import CodexOAuthManager + if CodexOAuthManager().is_authenticated(): + self._available_providers.add("codex") + except Exception: + pass + # Universal proxy: LLM_API_KEY makes openai provider available # (most third-party proxies are OpenAI-compatible) # Note: LLM_API_BASE is deprecated, warn user to use custom endpoints instead From 82f260368aa3ead31d01f911631ee69ee6f7a0fc Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 15:14:54 -0700 Subject: [PATCH 07/13] fix: don't eagerly refresh on Codex CLI import OpenAI refresh_tokens are single-use. If Codex CLI already used the refresh_token, our refresh attempt fails with "refresh_token_reused". Now import_from_codex_cli() copies tokens as-is without refreshing. get_access_token() handles lazy refresh when actually needed. Only attempt refresh if there's no access_token at all. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/utils/oauth/codex.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pantheon/utils/oauth/codex.py b/pantheon/utils/oauth/codex.py index d06838e8..07d95ead 100644 --- a/pantheon/utils/oauth/codex.py +++ b/pantheon/utils/oauth/codex.py @@ -376,11 +376,18 @@ def import_from_codex_cli(self) -> dict[str, Any] | None: logger.info("[Codex OAuth] Codex CLI auth has no tokens") return None - # If token is expired, refresh it - if refresh_token and (not access_token or _token_expired(access_token)): - logger.info("[Codex OAuth] Codex CLI token expired, refreshing...") - refreshed = _refresh_tokens(refresh_token) - tokens = refreshed + # Don't refresh here — OpenAI refresh_tokens are single-use. + # If Codex CLI already used it, refreshing would fail with "refresh_token_reused". + # Just import as-is; get_access_token() will refresh lazily when needed. + if not access_token and refresh_token: + # No access_token at all — must refresh to get one + try: + logger.info("[Codex OAuth] No access_token, attempting refresh...") + refreshed = _refresh_tokens(refresh_token) + tokens = refreshed + except CodexOAuthError as e: + logger.warning(f"[Codex OAuth] Refresh failed (token may be reused): {e}") + # Still import what we have — the token may work or login will be needed claims = _jwt_org_context(tokens.get("id_token", "") or tokens.get("access_token", "")) From 8e353e0c43dfb3216ffd68d752e8a6641de9d6a2 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Tue, 31 Mar 2026 15:18:04 -0700 Subject: [PATCH 08/13] fix: only show 'Import from Codex CLI' when ~/.codex/auth.json exists oauth_status() now returns supports_import=true only when Codex CLI auth file is detected. Frontend hides the import button otherwise. Also renamed button to "Import from Codex CLI" for clarity. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/chatroom/room.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pantheon/chatroom/room.py b/pantheon/chatroom/room.py index 7e7921d4..208e05b6 100644 --- a/pantheon/chatroom/room.py +++ b/pantheon/chatroom/room.py @@ -2300,10 +2300,12 @@ async def oauth_status(self) -> dict: Dict with provider statuses including authentication state and account info. """ from pantheon.utils.oauth import CodexOAuthManager + from pantheon.utils.oauth.codex import CODEX_CLI_AUTH codex = CodexOAuthManager() codex_authenticated = codex.is_authenticated() codex_account_id = codex.get_account_id() if codex_authenticated else None + cli_available = CODEX_CLI_AUTH.exists() return { "providers": { @@ -2312,7 +2314,7 @@ async def oauth_status(self) -> dict: "account_id": codex_account_id, "description": "OpenAI Codex (ChatGPT backend-api, free with ChatGPT Plus)", "supports_browser_login": True, - "supports_import": True, + "supports_import": cli_available, }, }, } From 076c8b6291bd93fb232dd4e68241785964ad108e Mon Sep 17 00:00:00 2001 From: Nanguage Date: Thu, 2 Apr 2026 00:31:33 -0700 Subject: [PATCH 09/13] feat: add Ollama as auto-detected local provider Ollama is detected automatically when running at localhost:11434. No API key or manual configuration needed. - llm_catalog.json: new "ollama" provider with local=true, sdk=openai - model_selector.py: _detect_ollama() pings /api/tags to check availability, _list_ollama_models() fetches model names (cached 30s), _get_provider_models() returns dynamic ollama model list - llm.py: auto-fills dummy api_key="ollama" for local providers Models appear in the UI model selector as ollama/model-name. Usage: just run `ollama serve` and models show up automatically. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/utils/llm.py | 3 ++ pantheon/utils/llm_catalog.json | 9 ++++++ pantheon/utils/model_selector.py | 54 +++++++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 965d6c3d..22a38715 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -590,6 +590,9 @@ async def acompletion( api_key_env = provider_config.get("api_key_env", "") if api_key_env: effective_api_key = os.environ.get(api_key_env, "") + # Local providers (Ollama) don't need a real API key + if not effective_api_key and provider_config.get("local"): + effective_api_key = "ollama" effective_model = model_name # use bare model name with native SDK adapter = get_adapter(sdk_type) diff --git a/pantheon/utils/llm_catalog.json b/pantheon/utils/llm_catalog.json index 19827320..9a3c0baa 100644 --- a/pantheon/utils/llm_catalog.json +++ b/pantheon/utils/llm_catalog.json @@ -1382,6 +1382,15 @@ "supports_reasoning": true } } + }, + "ollama": { + "display_name": "Ollama (Local)", + "sdk": "openai", + "base_url": "http://localhost:11434/v1", + "api_key_env": "", + "openai_compatible": true, + "local": true, + "models": {} } } } diff --git a/pantheon/utils/model_selector.py b/pantheon/utils/model_selector.py index 5afef22b..33d4288d 100644 --- a/pantheon/utils/model_selector.py +++ b/pantheon/utils/model_selector.py @@ -56,11 +56,46 @@ class CustomEndpointConfig: # Sentinel object for negative cache (better than empty string) _NOT_FOUND = object() +# ============ Local Provider Detection ============ + +_ollama_cache: dict | None = None +_ollama_cache_time: float = 0 + + +def _detect_ollama(base_url: str = "http://localhost:11434") -> bool: + """Check if Ollama is running locally.""" + try: + import httpx + resp = httpx.get(f"{base_url}/api/tags", timeout=2) + return resp.is_success + except Exception: + return False + + +def _list_ollama_models(base_url: str = 
"http://localhost:11434") -> list[str]: + """List available models from local Ollama instance (cached 30s).""" + import time + global _ollama_cache, _ollama_cache_time + if _ollama_cache is not None and time.time() - _ollama_cache_time < 30: + return _ollama_cache + + try: + import httpx + resp = httpx.get(f"{base_url}/api/tags", timeout=5) + if resp.is_success: + models = [m["name"] for m in resp.json().get("models", [])] + _ollama_cache = models + _ollama_cache_time = time.time() + return models + except Exception: + pass + return [] + # ============ Default Configuration ============ # Built-in defaults based on February 2026 flagship models # Users can override in settings.json -DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot", "qwen", "groq", "mistral", "together_ai", "openrouter", "codex"] +DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot", "qwen", "groq", "mistral", "together_ai", "openrouter", "codex", "ollama"] # Quality levels map to MODEL LISTS (not single models) for fallback chains # Models within each level are ordered by preference @@ -223,6 +258,7 @@ class CustomEndpointConfig: "moonshot": "MOONSHOT_API_KEY", "qwen": "DASHSCOPE_API_KEY", "codex": "", # OAuth-based, no env var key + "ollama": "", # Local, no env var key — detected by _detect_ollama() } # ============ Image Generation Model Defaults ============ @@ -286,6 +322,14 @@ def _get_available_providers(self) -> set[str]: except Exception: pass + # Check local Ollama + try: + from pantheon.utils.model_selector import _detect_ollama + if _detect_ollama(): + self._available_providers.add("ollama") + except Exception: + pass + # Universal proxy: LLM_API_KEY makes openai provider available # (most third-party proxies are OpenAI-compatible) # Note: LLM_API_BASE is deprecated, warn user to use custom endpoints instead @@ -371,6 +415,14 @@ def _get_provider_models(self, provider: str) -> 
dict[str, list[str]]: if provider in CUSTOM_ENDPOINT_ENVS: return {} + # Ollama: dynamically list local models + if provider == "ollama": + models = _list_ollama_models() + if models: + prefixed = [f"ollama/{m}" for m in models] + return {"high": prefixed, "normal": prefixed, "low": prefixed} + return {} + # Try user configuration first user_config = self.settings.get(f"models.provider_models.{provider}", {}) From 9ab7b0b06d983b42256279686058c1a3fd4d56d8 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Thu, 2 Apr 2026 13:24:35 -0700 Subject: [PATCH 10/13] fix: propagate LLM errors to frontend via NATS chat_finished event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a chat fails (e.g. OAuth token expired, model error), the error was silently swallowed — frontend just saw the model stop responding. Now chat_finished event includes status="error" and metadata.message when thread.response indicates failure. Frontend ChatManager shows the error as an assistant message in the chat. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/chatroom/room.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/pantheon/chatroom/room.py b/pantheon/chatroom/room.py index 208e05b6..37efdd7e 100644 --- a/pantheon/chatroom/room.py +++ b/pantheon/chatroom/room.py @@ -1609,7 +1609,20 @@ async def team_getter(): # Publish chat finished message if NATS streaming enabled if self._nats_adapter is not None: - await self._nats_adapter.publish_chat_finished(chat_id) + resp = thread.response or {} + if resp.get("success") is False: + # Send error to frontend so it can display to user + error_msg = resp.get("message", "Unknown error") + await self._nats_adapter.publish( + chat_id, "chat_finished", + { + "type": "chat_finished", + "status": "error", + "metadata": {"message": error_msg}, + }, + ) + else: + await self._nats_adapter.publish_chat_finished(chat_id) return thread.response except asyncio.CancelledError: @@ -1939,6 +1952,9 @@ async def list_available_models(self) -> dict: from pantheon.utils.model_selector import get_model_selector selector = get_model_selector() + # Clear provider cache so dynamic providers (Ollama, OAuth) are re-detected + selector._available_providers = None + selector._detected_provider = None return selector.list_available_models() except Exception as e: logger.error(f"Error listing available models: {e}") @@ -2395,3 +2411,21 @@ async def oauth_import(self, provider: str = "codex") -> dict: except Exception as e: logger.error(f"OAuth import failed: {e}") return {"success": False, "error": str(e)} + + @tool + async def ollama_status(self, url: str = "http://localhost:11434") -> dict: + """Check Ollama server status and list available models. + + Args: + url: Ollama server URL (default: http://localhost:11434) + + Returns: + Dict with running status, model list, and URL. 
+ """ + try: + from pantheon.utils.model_selector import _detect_ollama, _list_ollama_models + running = _detect_ollama(url) + models = _list_ollama_models(url) if running else [] + return {"running": running, "models": models, "url": url} + except Exception as e: + return {"running": False, "models": [], "url": url} From 59b7b29fec62e1190639461b83b8202834b05d16 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Thu, 2 Apr 2026 15:57:44 -0700 Subject: [PATCH 11/13] fix: oauth_status verifies token instead of just checking file exists Previously is_authenticated() returned true if refresh_token existed in the file, even if both access_token and refresh_token were expired/reused. Now oauth_status() calls get_access_token(auto_refresh=True) to actually verify the token works before reporting "Connected". Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/chatroom/room.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pantheon/chatroom/room.py b/pantheon/chatroom/room.py index 37efdd7e..e5230e65 100644 --- a/pantheon/chatroom/room.py +++ b/pantheon/chatroom/room.py @@ -2319,7 +2319,9 @@ async def oauth_status(self) -> dict: from pantheon.utils.oauth.codex import CODEX_CLI_AUTH codex = CodexOAuthManager() - codex_authenticated = codex.is_authenticated() + # Actually verify the token works (auto_refresh=True will try to refresh if expired) + access_token = codex.get_access_token(auto_refresh=True) + codex_authenticated = access_token is not None codex_account_id = codex.get_account_id() if codex_authenticated else None cli_available = CODEX_CLI_AUTH.exists() From 2e7b306ab34a981d833fbd7fdf53a28816441042 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Thu, 2 Apr 2026 16:04:04 -0700 Subject: [PATCH 12/13] fix: user-friendly Codex OAuth error messages with [OAUTH_REQUIRED] tag Improved error messages for Codex OAuth failures to be user-friendly and include [OAUTH_REQUIRED] tag for frontend to detect and show actionable UI (settings button). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- pantheon/utils/adapters/codex_adapter.py | 5 ++++- pantheon/utils/llm.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pantheon/utils/adapters/codex_adapter.py b/pantheon/utils/adapters/codex_adapter.py index f69d8d22..d0986beb 100644 --- a/pantheon/utils/adapters/codex_adapter.py +++ b/pantheon/utils/adapters/codex_adapter.py @@ -166,7 +166,10 @@ async def acompletion( async with httpx.AsyncClient(timeout=120) as client: async with client.stream("POST", endpoint, headers=headers, json=body) as resp: if resp.status_code == 401: - raise APIConnectionError("Codex OAuth token expired or invalid (401)") + raise APIConnectionError( + "[OAUTH_REQUIRED] Codex OAuth token expired. " + "Please re-login in Settings → API Keys → OAuth." + ) elif resp.status_code == 429: raise RateLimitError(f"Codex rate limited (429)") elif resp.status_code >= 500: diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 22a38715..3c766105 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -577,8 +577,8 @@ async def acompletion( effective_api_key = oauth.get_access_token(auto_refresh=True) if not effective_api_key: raise RuntimeError( - "Codex OAuth not authenticated. Run CodexOAuthManager().login() " - "or import tokens from Codex CLI with CodexOAuthManager().import_from_codex_cli()" + "[OAUTH_REQUIRED] Codex OAuth session expired or not configured. " + "Please re-login in Settings → API Keys → OAuth." ) effective_base_url = provider_config.get("base_url") effective_model = model_name From b3bdc5af2a8f9ffd9921da949db60ee881b41637 Mon Sep 17 00:00:00 2001 From: Nanguage Date: Fri, 3 Apr 2026 15:38:30 -0700 Subject: [PATCH 13/13] fix: remove non-existent 'slack' extra from CI test workflow The test workflow referenced --extra slack but pyproject.toml has no slack optional-dependency group (slack-sdk/slack-bolt are in main deps). This caused all CI jobs to fail with "Extra slack is not defined". 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7293fd06..9eaf4160 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -49,7 +49,7 @@ jobs: uses: astral-sh/setup-uv@v2 - name: Install project dependencies - run: uv sync --extra dev --extra knowledge --extra slack + run: uv sync --extra dev --extra knowledge - name: Run all tests (core, NATS, and API-dependent) env: