diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7293fd06..9eaf4160 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -49,7 +49,7 @@ jobs: uses: astral-sh/setup-uv@v2 - name: Install project dependencies - run: uv sync --extra dev --extra knowledge --extra slack + run: uv sync --extra dev --extra knowledge - name: Run all tests (core, NATS, and API-dependent) env: diff --git a/build_backend.spec b/build_backend.spec index 27bdedd9..8a8b66de 100644 --- a/build_backend.spec +++ b/build_backend.spec @@ -26,7 +26,7 @@ datas += copy_metadata('traitlets') datas += copy_metadata('pyzmq') datas += copy_metadata('tornado') datas += copy_metadata('nest_asyncio') -datas += collect_data_files('litellm', includes=['**/*.json']) +datas += collect_data_files('pantheon', subdir='utils', includes=['llm_catalog.json']) datas += collect_data_files('tiktoken_ext', includes=['**/*.py']) # fakeredis: model/_command_info.py loads os.path.join(dirname(__file__), '..', 'commands.json') # PyInstaller must include the JSON so the relative path resolves at runtime. 
@@ -65,9 +65,10 @@ a = Analysis( 'pantheon.toolsets.rag', 'pantheon.toolsets.scfm', 'nats', - 'litellm', 'openai', 'anthropic', + 'google.genai', + 'tiktoken', 'fastmcp', 'fastmcp.server', 'fastmcp.client', @@ -137,6 +138,7 @@ exe = EXE( pyz, a.scripts, [], + exclude_binaries=True, name='pantheon-backend-exe', debug=False, bootloader_ignore_signals=False, diff --git a/docs/source/api/agent.rst b/docs/source/api/agent.rst index 74812fe7..9d7233aa 100644 --- a/docs/source/api/agent.rst +++ b/docs/source/api/agent.rst @@ -51,9 +51,9 @@ Constructor Parameters * - tool_timeout - int - Tool execution timeout in seconds (default: 600) - * - force_litellm + * - relaxed_schema - bool - - Force use of litellm backend (default: False) + - Use relaxed (non-strict) tool schema mode (default: False) * - max_tool_content_length - int | None - Maximum length for tool outputs (default: 100000) diff --git a/docs/source/api/utils.rst b/docs/source/api/utils.rst index 567e3f93..6019ad0f 100644 --- a/docs/source/api/utils.rst +++ b/docs/source/api/utils.rst @@ -58,7 +58,7 @@ Common Functions from pantheon.utils.llm import ( acompletion_openai, - acompletion_litellm, + acompletion, # adapter-based completion process_messages_for_model, remove_hidden_fields ) diff --git a/docs/source/concepts.md b/docs/source/concepts.md index cb48b2b5..60efa1d7 100644 --- a/docs/source/concepts.md +++ b/docs/source/concepts.md @@ -43,7 +43,7 @@ Every agent has instructions that define its behavior and personality. These ins Agents become powerful through tools - functions that extend their abilities beyond pure conversation. Tools allow agents to interact with external systems, perform calculations, access databases, browse the web, execute code, and much more. The tool system is extensible, allowing you to add custom capabilities tailored to your specific needs. #### Model Selection -Agents can use any LLM supported by LiteLLM. 
Model selection can be configured at the agent level or globally through settings. Fallback chains allow graceful degradation when primary models are unavailable. +Agents can use any LLM from the supported providers (OpenAI, Anthropic, Gemini, DeepSeek, and more). Model selection can be configured at the agent level or globally through settings. Fallback chains allow graceful degradation when primary models are unavailable. --- diff --git a/docs/source/configuration/models.rst b/docs/source/configuration/models.rst index 89b582ed..b52f1920 100644 --- a/docs/source/configuration/models.rst +++ b/docs/source/configuration/models.rst @@ -6,7 +6,7 @@ Configure which LLM models your agents use. Overview -------- -Pantheon uses `LiteLLM `_ as its unified LLM interface, providing access to **100+ LLM providers** through a consistent API. This means any model supported by LiteLLM works with Pantheon. +Pantheon provides a unified LLM interface with native SDK adapters, giving access to **many LLM providers** through a consistent API. Key features: @@ -119,7 +119,7 @@ Provider priority (configurable in settings.json): Supported Providers ------------------- -Pantheon supports all LiteLLM providers. Here are the most common ones: +Pantheon supports many LLM providers. Here are the most common ones: Major Cloud Providers ~~~~~~~~~~~~~~~~~~~~~ @@ -238,7 +238,7 @@ Chinese Providers .. note:: - For the complete list of 100+ supported providers, see the `LiteLLM Providers Documentation `_. + For additional providers, see the Pantheon documentation or configure custom endpoints. 
Model Format ------------ @@ -428,7 +428,7 @@ Or add to your ``.env`` / ``~/.pantheon/.env`` file: **Priority rules:** -- **Base URL**: ``OPENAI_API_BASE`` / ``LITELLM_API_BASE`` (provider-specific) > ``LLM_API_BASE`` (universal) +- **Base URL**: ``OPENAI_API_BASE`` (provider-specific) > ``LLM_API_BASE`` (universal) - **API Key (unified proxy mode)**: When ``LLM_API_BASE`` is set, ``LLM_API_KEY`` takes priority over provider-specific keys (e.g. ``OPENAI_API_KEY``). This ensures all requests to the proxy use the correct credentials. - **API Key (normal mode)**: When no ``LLM_API_BASE`` is set, provider-specific keys (e.g. ``OPENAI_API_KEY``) take priority over ``LLM_API_KEY``. diff --git a/docs/source/configuration/settings.rst b/docs/source/configuration/settings.rst index 62e0c61f..22f68a2b 100644 --- a/docs/source/configuration/settings.rst +++ b/docs/source/configuration/settings.rst @@ -61,7 +61,7 @@ See :doc:`models` for full details on custom API endpoints and priority rules. Models ~~~~~~ -Pantheon uses LiteLLM and supports smart model selection with quality tags. See :doc:`models` for full details. +Pantheon supports smart model selection with quality tags. See :doc:`models` for full details. .. code-block:: json diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index d4398f31..1b963f61 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -131,7 +131,7 @@ Core Dependencies Core dependencies are automatically installed: -- **LiteLLM** - Unified LLM API access (OpenAI, Anthropic, etc.) +- **Provider Adapters** - Unified LLM API access (OpenAI, Anthropic, etc.) 
- **Rich** - Terminal UI and formatting - **prompt-toolkit** - Interactive REPL - **NATS** - Distributed messaging @@ -155,7 +155,7 @@ Set up your LLM provider API keys: # Anthropic Claude export ANTHROPIC_API_KEY="your-anthropic-key" - # Or use other providers supported by LiteLLM + # Or use other supported providers export GEMINI_API_KEY="your-gemini-key" You can also create a ``.env`` file in your project directory: diff --git a/docs/source/interfaces/api/agent.rst b/docs/source/interfaces/api/agent.rst index 92355a6c..f239d045 100644 --- a/docs/source/interfaces/api/agent.rst +++ b/docs/source/interfaces/api/agent.rst @@ -246,7 +246,7 @@ Use provider/model format for exact model selection: agent = Agent(model="anthropic/claude-opus-4-5-20251101") agent = Agent(model="anthropic/claude-sonnet-4-5-20250929") - # Other providers (via LiteLLM) + # Other providers (via native SDK adapters) agent = Agent(model="gemini/gemini-3-pro-preview") agent = Agent(model="deepseek/deepseek-chat") agent = Agent(model="mistral/mistral-large") diff --git a/docs/source/toolsets/image_generation.rst b/docs/source/toolsets/image_generation.rst index a3440d57..de3342c6 100644 --- a/docs/source/toolsets/image_generation.rst +++ b/docs/source/toolsets/image_generation.rst @@ -115,7 +115,7 @@ Supported Models - ``dall-e-3`` - ``dall-e-2`` -- Any model supported by LiteLLM's ``aimage_generation`` API +- Any model supported by the provider adapter's ``aimage_generation`` API Model Selection --------------- diff --git a/pantheon/__init__.py b/pantheon/__init__.py index b489187a..379b5037 100644 --- a/pantheon/__init__.py +++ b/pantheon/__init__.py @@ -21,12 +21,6 @@ except ImportError: pass -# Suppress litellm debug output via env vars (avoid importing litellm at startup, -# it costs ~1.5s. The actual suppress_debug_info/set_verbose flags are set in -# utils/llm.py:import_litellm() the first time litellm is actually used.) 
-os.environ.setdefault("LITELLM_LOG", "ERROR") -# Suppress CLIENT_IP_ENCRYPTION_KEY warning by setting a default value -os.environ.setdefault("CLIENT_IP_ENCRYPTION_KEY", "pantheon-default-key") # Suppress MCP SDK INFO logs ("Processing request of type...") that pollute CLI output import logging diff --git a/pantheon/agent.py b/pantheon/agent.py index bd574b50..cd9b58d6 100644 --- a/pantheon/agent.py +++ b/pantheon/agent.py @@ -386,18 +386,15 @@ def __init__(self, partial_message: dict | None = None): def _is_retryable_error(error: Exception) -> bool: """Determine if an LLM API error is transient and worth retrying.""" - try: - from litellm.exceptions import ( - ServiceUnavailableError, - InternalServerError, - RateLimitError, - APIConnectionError, - ) - if isinstance(error, (ServiceUnavailableError, InternalServerError, - RateLimitError, APIConnectionError)): - return True - except ImportError: - pass + from pantheon.utils.adapters.base import ( + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, + ) + if isinstance(error, (ServiceUnavailableError, InternalServerError, + RateLimitError, APIConnectionError)): + return True # Fallback: string matching for common transient error indicators error_str = str(error).lower() return any(kw in error_str for kw in ( @@ -487,7 +484,7 @@ class Agent: memory: The memory to use for the agent. If not provided, a new memory will be created. tool_timeout: The timeout for the tool. (default: from settings.endpoint.local_toolset_timeout, or 3600s) - force_litellm: Whether to force using LiteLLM. (default: False) + relaxed_schema: Use relaxed (non-strict) tool schema mode. (default: False) max_tool_content_length: The maximum length of the tool content. (default: 100000) description: The description of the agent. (default: None) think_tool: Whether to enable the think tool for structured reasoning. 
(default: False) @@ -505,7 +502,7 @@ def __init__( use_memory: bool = True, memory: "Memory | None" = None, tool_timeout: int | None = None, - force_litellm: bool = False, + relaxed_schema: bool = False, max_tool_content_length: int | None = None, description: str | None = None, think_tool: bool = False, @@ -559,7 +556,7 @@ def __init__( # Input queue for run_loop() — messages/notifications enter here self.input_queue: asyncio.Queue = asyncio.Queue() self._loop_running: bool = False - self.force_litellm = force_litellm + self.relaxed_schema = relaxed_schema self.icon = icon # Provider management (MCP, ToolSet, etc.) @@ -871,7 +868,7 @@ async def get_tools_for_llm(self) -> list[dict]: """ # 1. Get tools from _base_functions (Agent's own tools - no prefix) base_tools = self._convert_functions( - litellm_mode=self.force_litellm, allow_transfer=True + relaxed_schema=self.relaxed_schema, allow_transfer=True ) # 2. Get tools from providers (dynamic retrieval - uses provider caching) @@ -1130,7 +1127,7 @@ async def call_tool( # ===== Legacy MCP method (deprecated, kept for backward compatibility) ===== def _convert_functions( - self, litellm_mode: bool, allow_transfer: bool + self, relaxed_schema: bool, allow_transfer: bool ) -> list[dict]: """Convert function to the format that the model can understand.""" functions = [] @@ -1153,7 +1150,7 @@ def _convert_functions( func_dict = desc_to_openai_dict( desc, skip_params=skip_params, - litellm_mode=litellm_mode, + relaxed_schema=relaxed_schema, ) functions.append(func_dict) @@ -1416,7 +1413,7 @@ async def _acompletion( messages = process_messages_for_model(messages, model) # Step 2: Detect provider and get configuration - provider_config = detect_provider(model, self.force_litellm) + provider_config = detect_provider(model, self.relaxed_schema) # Step 3: Get base URL and API key from environment if available # Skip if detect_provider already set them (e.g. 
OpenAI-compatible providers) @@ -1505,6 +1502,9 @@ async def _acompletion( model_params=model_params, ) + if message is None: + message = {"role": "assistant", "content": "Error: Empty response from model."} + # Step 8: Add metadata to message end_timestamp = time.time() total_time = tracker.end("total") @@ -1576,9 +1576,9 @@ async def _acompletion_with_models( For each model, transient errors (overloaded, rate-limit, 5xx) are retried with exponential backoff. Non-transient errors skip directly - to the next model. LiteLLM's ``num_retries`` still handles initial + to the next model. The adapter's ``num_retries`` still handles initial connection-level retries; this layer covers mid-stream failures that - LiteLLM cannot retry on its own. + the adapter cannot retry on its own. """ # --- Read retry settings (with sensible defaults) --- from .settings import get_settings @@ -1629,6 +1629,8 @@ async def _acompletion_with_models( raise except Exception as e: last_error = e + import traceback + logger.error(f"[Agent:{self.name}] Full traceback:\n{traceback.format_exc()}") if _is_retryable_error(e) and attempt < max_retries: delay = min(base_delay * (2 ** attempt), max_delay) diff --git a/pantheon/chatroom/__main__.py b/pantheon/chatroom/__main__.py index ae05b79f..0911e675 100644 --- a/pantheon/chatroom/__main__.py +++ b/pantheon/chatroom/__main__.py @@ -17,6 +17,75 @@ from pantheon.repl.setup_wizard import check_and_run_setup +def oauth(action: str = "status", provider: str = "codex"): + """Manage OAuth authentication for LLM providers. 
+ + Args: + action: One of 'login', 'import', 'status', 'logout' + provider: OAuth provider name (default: 'codex') + + Examples: + pantheon-chatroom oauth login # Browser-based login + pantheon-chatroom oauth import # Import from Codex CLI + pantheon-chatroom oauth status # Check auth status + pantheon-chatroom oauth logout # Remove stored tokens + """ + if provider != "codex": + print(f"Unsupported OAuth provider: {provider}") + print("Supported providers: codex") + return + + from pantheon.utils.oauth import CodexOAuthManager, CodexOAuthError + mgr = CodexOAuthManager() + + if action == "status": + if mgr.is_authenticated(): + account_id = mgr.get_account_id() + print(f"Codex OAuth: authenticated") + print(f" Account ID: {account_id}") + print(f" Auth file: {mgr.auth_file}") + print(f" Use model prefix: codex/gpt-5.4-mini, codex/gpt-5, etc.") + else: + print(f"Codex OAuth: not authenticated") + print(f" Run: pantheon-chatroom oauth login") + print(f" Or: pantheon-chatroom oauth import (if Codex CLI is installed)") + + elif action == "login": + print("Starting Codex OAuth login...") + print("A browser window will open. Please log in with your OpenAI account.") + try: + mgr.login(open_browser=True, timeout_seconds=300) + print(f"\nLogin successful!") + print(f" Account ID: {mgr.get_account_id()}") + print(f" You can now use codex/ models (e.g., codex/gpt-5.4-mini)") + except CodexOAuthError as e: + print(f"\nLogin failed: {e}") + except KeyboardInterrupt: + print("\nLogin cancelled.") + + elif action == "import": + print("Importing from Codex CLI (~/.codex/auth.json)...") + result = mgr.import_from_codex_cli() + if result: + print(f"Import successful!") + print(f" Account ID: {mgr.get_account_id()}") + else: + print(f"Import failed. 
Make sure Codex CLI is installed and authenticated.") + print(f"  Install: npx @openai/codex") + print(f"  Or use: pantheon-chatroom oauth login") + + elif action == "logout": + if mgr.auth_file.exists(): + mgr.auth_file.unlink() + print("Codex OAuth tokens removed.") + else: + print("No Codex OAuth tokens found.") + + else: + print(f"Unknown action: {action}") + print("Actions: login, import, status, logout") + + if __name__ == "__main__": # Check for API keys and run setup wizard if none found check_and_run_setup() @@ -29,6 +98,6 @@ if len(sys.argv) == 1 or (len(sys.argv) > 1 and sys.argv[1].startswith("-")): sys.argv.insert(1, "start") fire.Fire( - {"start": start_services}, + {"start": start_services, "oauth": oauth}, name="pantheon-chatroom", ) diff --git a/pantheon/chatroom/room.py b/pantheon/chatroom/room.py index 488d5466..2affb52d 100644 --- a/pantheon/chatroom/room.py +++ b/pantheon/chatroom/room.py @@ -1624,7 +1624,20 @@ async def team_getter(): # Publish chat finished message if NATS streaming enabled if self._nats_adapter is not None: - await self._nats_adapter.publish_chat_finished(chat_id) + resp = thread.response or {} + if resp.get("success") is False: + # Send error to frontend so it can display to user + error_msg = resp.get("message", "Unknown error") + await self._nats_adapter.publish( + chat_id, "chat_finished", + { + "type": "chat_finished", + "status": "error", + "metadata": {"message": error_msg}, + }, + ) + else: + await self._nats_adapter.publish_chat_finished(chat_id) return thread.response except asyncio.CancelledError: @@ -1672,9 +1685,9 @@ async def speech_to_text(self, bytes_data): bytes_data: The bytes data of the audio (bytes, base64 string, or list).
""" try: - import litellm import base64 - from pantheon.utils.llm_providers import get_litellm_proxy_kwargs + from pantheon.utils.llm_providers import get_proxy_kwargs + from pantheon.utils.adapters import get_adapter logger.info(f"[STT] Received bytes_data type={type(bytes_data).__name__}, " f"len={len(bytes_data) if hasattr(bytes_data, '__len__') else 'N/A'}") @@ -1706,12 +1719,15 @@ async def speech_to_text(self, bytes_data): audio_file = io.BytesIO(bytes_data) audio_file.name = "audio.webm" - logger.info("[STT] Calling litellm.atranscription...") + logger.info("[STT] Calling transcription adapter...") + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") response = await asyncio.wait_for( - litellm.atranscription( + adapter.atranscription( model=self.speech_to_text_model, file=audio_file, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ), timeout=30, ) @@ -1951,6 +1967,9 @@ async def list_available_models(self) -> dict: from pantheon.utils.model_selector import get_model_selector selector = get_model_selector() + # Clear provider cache so dynamic providers (Ollama, OAuth) are re-detected + selector._available_providers = None + selector._detected_provider = None return selector.list_available_models() except Exception as e: logger.error(f"Error listing available models: {e}") @@ -2329,3 +2348,129 @@ async def check_api_keys(self) -> dict: has_any_key = any(v["configured"] for v in keys.values()) return {"keys": keys, "has_any_key": has_any_key} + + # ============ OAuth Management ============ + + @tool + async def oauth_status(self) -> dict: + """Get OAuth authentication status for all supported providers. + + Returns: + Dict with provider statuses including authentication state and account info. 
+ """ + from pantheon.utils.oauth import CodexOAuthManager + from pantheon.utils.oauth.codex import CODEX_CLI_AUTH + + codex = CodexOAuthManager() + # Actually verify the token works (auto_refresh=True will try to refresh if expired) + access_token = codex.get_access_token(auto_refresh=True) + codex_authenticated = access_token is not None + codex_account_id = codex.get_account_id() if codex_authenticated else None + cli_available = CODEX_CLI_AUTH.exists() + + return { + "providers": { + "codex": { + "authenticated": codex_authenticated, + "account_id": codex_account_id, + "description": "OpenAI Codex (ChatGPT backend-api, free with ChatGPT Plus)", + "supports_browser_login": True, + "supports_import": cli_available, + }, + }, + } + + @tool + async def oauth_login(self, provider: str = "codex") -> dict: + """Start browser-based OAuth login flow. + + Opens the system browser for the user to authenticate. + Token is saved automatically after successful login. + + Args: + provider: OAuth provider name (currently only 'codex' supported) + + Returns: + Dict with success status and account info. + """ + if provider != "codex": + return {"success": False, "error": f"Unsupported OAuth provider: {provider}"} + + from pantheon.utils.oauth import CodexOAuthManager, CodexOAuthError + + try: + mgr = CodexOAuthManager() + mgr.login(open_browser=True, timeout_seconds=300) + + # Reload settings so model selector detects new provider + from pantheon.utils.model_selector import reset_model_selector + reset_model_selector() + + return { + "success": True, + "provider": "codex", + "account_id": mgr.get_account_id(), + "message": "Codex OAuth login successful. 
You can now use codex/ models.", + } + except CodexOAuthError as e: + return {"success": False, "error": str(e)} + except Exception as e: + logger.error(f"OAuth login failed: {e}") + return {"success": False, "error": str(e)} + + @tool + async def oauth_import(self, provider: str = "codex") -> dict: + """Import OAuth tokens from native CLI tools. + + For Codex: imports from ~/.codex/auth.json (Codex CLI). + + Args: + provider: OAuth provider name (currently only 'codex' supported) + + Returns: + Dict with success status. + """ + if provider != "codex": + return {"success": False, "error": f"Unsupported OAuth provider: {provider}"} + + from pantheon.utils.oauth import CodexOAuthManager + + try: + mgr = CodexOAuthManager() + result = mgr.import_from_codex_cli() + + if result: + from pantheon.utils.model_selector import reset_model_selector + reset_model_selector() + return { + "success": True, + "provider": "codex", + "account_id": mgr.get_account_id(), + "message": "Imported Codex CLI tokens successfully.", + } + else: + return { + "success": False, + "error": "No Codex CLI auth found (~/.codex/auth.json). Install Codex CLI or use browser login.", + } + except Exception as e: + logger.error(f"OAuth import failed: {e}") + return {"success": False, "error": str(e)} + + @tool + async def ollama_status(self, url: str = "http://localhost:11434") -> dict: + """Check Ollama server status and list available models. + + Args: + url: Ollama server URL (default: http://localhost:11434) + + Returns: + Dict with running status, model list, and URL. 
+ """ + try: + from pantheon.utils.model_selector import _detect_ollama, _list_ollama_models + running = _detect_ollama(url) + models = _list_ollama_models(url) if running else [] + return {"running": running, "models": models, "url": url} + except Exception as e: + return {"running": False, "models": [], "url": url} diff --git a/pantheon/chatroom/start.py b/pantheon/chatroom/start.py index 24391444..34c897cc 100644 --- a/pantheon/chatroom/start.py +++ b/pantheon/chatroom/start.py @@ -250,7 +250,7 @@ async def start_services( - .env file: OPENAI_API_KEY=sk-... - settings.json api_keys section - Use LiteLLM Proxy mode for secure API key handling (LITELLM_PROXY_ENABLED environment variable). + Use LLM Proxy mode for secure API key handling (LLM_PROXY_ENABLED environment variable). """ # DIAGNOSTIC: Log startup parameters for debugging logger.debug(f"[DIAGNOSTIC] start_services() called with auto_start_nats={auto_start_nats}, auto_ui={auto_ui}") diff --git a/pantheon/factory/templates/.env.example b/pantheon/factory/templates/.env.example index 8445902e..325dfdc0 100644 --- a/pantheon/factory/templates/.env.example +++ b/pantheon/factory/templates/.env.example @@ -25,8 +25,8 @@ # Advanced Configuration (Optional) # ======================================== -# Custom LiteLLM endpoint -#LITELLM_BASE_URL=https://your-litellm-proxy.com +# Custom LLM proxy endpoint +#LLM_PROXY_URL=https://your-llm-proxy.com # Debug mode #DEBUG=false diff --git a/pantheon/internal/compression/compressor.py b/pantheon/internal/compression/compressor.py index 1bb63cc4..efb37a22 100644 --- a/pantheon/internal/compression/compressor.py +++ b/pantheon/internal/compression/compressor.py @@ -120,7 +120,7 @@ def should_compress(self, messages: list[dict], model: str | None = None) -> boo # Fallback: try to fetch from model info if available if model: try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info info = get_model_info(model) max_tokens = 
(info.get("max_input_tokens") or 0) + ( @@ -336,7 +336,7 @@ def _count_existing_compressions(self, messages: list[dict]) -> int: return sum(1 for msg in messages if msg.get("role") == "compression") def _estimate_tokens(self, messages: list[dict]) -> int: - """Estimate token count for messages using litellm/tiktoken when available.""" + """Estimate token count for messages using tiktoken when available.""" from pantheon.utils.llm import _safe_token_counter return max(1, _safe_token_counter(model=self.model, messages=messages)) diff --git a/pantheon/providers.py b/pantheon/providers.py index fa1ec2b0..630a4473 100644 --- a/pantheon/providers.py +++ b/pantheon/providers.py @@ -414,7 +414,7 @@ async def list_tools(self) -> list[ToolInfo]: # Generate OpenAI format schema using desc_to_openai_dict oai_dict = desc_to_openai_dict( - desc, skip_params=[], litellm_mode=True + desc, skip_params=[], relaxed_schema=True ) # Extract the "function" part (without "type": "function") @@ -545,7 +545,7 @@ async def list_tools(self) -> list[ToolInfo]: # Generate OpenAI format schema using desc_to_openai_dict oai_dict = desc_to_openai_dict( - desc, skip_params=[], litellm_mode=True + desc, skip_params=[], relaxed_schema=True ) # Extract the "function" part (without "type": "function") diff --git a/pantheon/repl/__init__.py b/pantheon/repl/__init__.py index 7d29cf44..f6197b08 100644 --- a/pantheon/repl/__init__.py +++ b/pantheon/repl/__init__.py @@ -1,8 +1,3 @@ -import os - -# Prevent litellm from making blocking network calls to GitHub on startup -os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True") - from .core import Repl __all__ = ["Repl"] \ No newline at end of file diff --git a/pantheon/repl/__main__.py b/pantheon/repl/__main__.py index 0523aef3..eea7b35d 100644 --- a/pantheon/repl/__main__.py +++ b/pantheon/repl/__main__.py @@ -15,7 +15,7 @@ import warnings from pathlib import Path -# Warning filters and litellm config are already set in pantheon/__init__.py +# Warning 
filters are already set in pantheon/__init__.py # which runs before this __main__.py import fire @@ -159,14 +159,6 @@ def start( ) -async def _update_litellm_cost_map(): - """Background task to update litellm model cost map. - - Delegates to the shared utility in pantheon.utils.llm. - """ - from pantheon.utils.llm import update_litellm_cost_map - await update_litellm_cost_map() - async def _start_async( template: str = None, @@ -311,9 +303,6 @@ def filter(self, record: logging.LogRecord) -> bool: # Disable logging unless explicitly set to DEBUG disable_logging = quiet and log_level != "DEBUG" - # Start background task to update litellm cost map (non-blocking) - asyncio.create_task(_update_litellm_cost_map()) - await repl.run(message=initial_input, disable_logging=disable_logging, log_level=log_level) diff --git a/pantheon/settings.py b/pantheon/settings.py index 2194155d..c434c55a 100644 --- a/pantheon/settings.py +++ b/pantheon/settings.py @@ -166,7 +166,7 @@ def __init__(self, work_dir: Optional[Path] = None, env_override: bool = False): - .env file: OPENAI_API_KEY=sk-... - settings.json api_keys section - Use LiteLLM Proxy mode for secure API key handling (LITELLM_PROXY_ENABLED environment variable). + Use LLM Proxy mode for secure API key handling (LLM_PROXY_ENABLED environment variable). """ from .constant import PROJECT_ROOT @@ -835,8 +835,8 @@ def get_settings( API keys should be set via environment variables, .env file, or settings.json api_keys section. - For secure API key handling, use LiteLLM Proxy mode by setting - LITELLM_PROXY_ENABLED environment variable. + For secure API key handling, use LLM Proxy mode by setting + LLM_PROXY_ENABLED environment variable. """ global _settings diff --git a/pantheon/toolsets/image/image_gen.py b/pantheon/toolsets/image/image_gen.py index 7996d11c..eb2cf844 100644 --- a/pantheon/toolsets/image/image_gen.py +++ b/pantheon/toolsets/image/image_gen.py @@ -6,18 +6,13 @@ and native image editing models (OpenAI gpt-image). 
""" -import litellm - -# Suppress litellm debug output (Provider List message) -litellm.suppress_debug_info = True -litellm.set_verbose = False from pantheon.toolset import ToolSet, tool from pantheon.utils.vision import ( ImageStore, get_image_store, expand_image_references_for_llm, ) -from pantheon.utils.llm_providers import get_litellm_proxy_kwargs +from pantheon.utils.llm_providers import get_proxy_kwargs # Multimodal models that support image input + output via acompletion API # Gemini Nano Banana series: Pro / Nano Banana 2 / Nano Banana first-gen @@ -91,16 +86,16 @@ def _get_chat_id(self) -> str: return "default" def _extract_cost_from_response(self, response) -> float: - """Extract cost from LiteLLM response. - + """Extract cost from API response. + Args: - response: LiteLLM response object from acompletion or aimage_generation - + response: Response object from acompletion or aimage_generation + Returns: Cost in USD, or 0.0 if calculation fails """ try: - from litellm import completion_cost + from pantheon.utils.provider_registry import completion_cost cost = completion_cost(completion_response=response) or 0.0 from pantheon.utils.log import logger logger.debug(f"Image generation cost: ${cost:.6f}") @@ -171,12 +166,17 @@ async def _text_input_image_gen( model: str, ) -> dict: """Text-only image generation (DALL-E, Imagen).""" - response = await litellm.aimage_generation( + from pantheon.utils.adapters import get_adapter + + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") + response = await adapter.aimage_generation( model=model, prompt=prompt, size="1024x1024", n=1, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ) # Extract cost from response @@ -211,7 +211,7 @@ async def _multimodal_image_gen( """Multimodal image generation (Gemini Nano Banana series). Uses chat completion API with modalities parameter to generate images. 
- This approach works through LiteLLM Proxy and supports image generation. + This approach works through the LLM Proxy and supports image generation. Supported models: - gemini-3-pro-image-preview (Nano Banana Pro, up to 4K) @@ -232,12 +232,26 @@ async def _multimodal_image_gen( self.image_store.process_message_images(messages[0], chat_id) messages = expand_image_references_for_llm(messages) - response = await litellm.acompletion( - model=model, + from pantheon.utils.adapters import get_adapter + from pantheon.utils.provider_registry import find_provider_for_model + + proxy_kwargs = get_proxy_kwargs() + provider_key, model_name, provider_config = find_provider_for_model(model) + sdk_type = provider_config.get("sdk", "openai") + if proxy_kwargs: + sdk_type = "openai" + adapter = get_adapter(sdk_type) + + collected_chunks = await adapter.acompletion( + model=model_name if not proxy_kwargs else model, messages=messages, - modalities=["text", "image"], # Enable image generation output - **get_litellm_proxy_kwargs(), # Use proxy for real API keys + stream=True, + base_url=proxy_kwargs.get("base_url") or provider_config.get("base_url"), + api_key=proxy_kwargs.get("api_key"), + modalities=["text", "image"], ) + from pantheon.utils.llm import stream_chunk_builder + response = stream_chunk_builder(collected_chunks) # Extract cost from response cost = self._extract_cost_from_response(response) @@ -283,13 +297,18 @@ async def _image_edit_gen( resolved = self.image_store.normalize_local_path(path) resolved_paths.append(resolved) - response = await litellm.aimage_edit( + from pantheon.utils.adapters import get_adapter + + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") + response = await adapter.aimage_edit( model=model, image=resolved_paths, prompt=prompt, size="1024x1024", n=1, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ) # Extract cost from response diff --git 
a/pantheon/toolsets/knowledge/knowledge_manager.py b/pantheon/toolsets/knowledge/knowledge_manager.py index 509b0bca..848e1473 100644 --- a/pantheon/toolsets/knowledge/knowledge_manager.py +++ b/pantheon/toolsets/knowledge/knowledge_manager.py @@ -87,7 +87,7 @@ async def run_setup(self): def _create_llm(): from llama_index.llms.openai import OpenAI from pantheon.settings import get_settings - from pantheon.utils.llm_providers import get_litellm_proxy_kwargs + from pantheon.utils.llm_providers import get_proxy_kwargs settings = get_settings() llm_kwargs = { @@ -99,10 +99,10 @@ def _create_llm(): if api_base: llm_kwargs["api_base"] = api_base - # Use LiteLLM proxy if enabled (overrides api_base/api_key) - proxy_kwargs = get_litellm_proxy_kwargs() + # Use proxy if enabled (overrides api_base/api_key) + proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: - llm_kwargs["api_base"] = proxy_kwargs["api_base"] + llm_kwargs["api_base"] = proxy_kwargs["base_url"] llm_kwargs["api_key"] = proxy_kwargs["api_key"] return OpenAI(**llm_kwargs) diff --git a/pantheon/utils/adapters/__init__.py b/pantheon/utils/adapters/__init__.py new file mode 100644 index 00000000..fa3a45ff --- /dev/null +++ b/pantheon/utils/adapters/__init__.py @@ -0,0 +1,57 @@ +""" +LLM Provider Adapters — unified async interface for different SDK types. + +Each adapter wraps a specific SDK (openai, anthropic, google-genai) and +exposes a common interface: acompletion, aembedding, aimage_generation, etc. +""" + +from functools import lru_cache + +from .base import ( + BaseAdapter, + LLMError, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + + +@lru_cache(maxsize=8) +def get_adapter(sdk_type: str) -> BaseAdapter: + """Get an adapter singleton for the given SDK type. 
+ + Args: + sdk_type: One of 'openai', 'anthropic', 'google-genai' + + Returns: + BaseAdapter instance + """ + if sdk_type == "openai": + from .openai_adapter import OpenAIAdapter + return OpenAIAdapter() + elif sdk_type == "anthropic": + from .anthropic_adapter import AnthropicAdapter + return AnthropicAdapter() + elif sdk_type == "google-genai": + from .gemini_adapter import GeminiAdapter + return GeminiAdapter() + elif sdk_type == "codex": + from .codex_adapter import CodexAdapter + return CodexAdapter() + else: + # Default to OpenAI adapter for unknown SDK types + # (many providers are OpenAI-compatible) + from .openai_adapter import OpenAIAdapter + return OpenAIAdapter() + + +__all__ = [ + "get_adapter", + "BaseAdapter", + "LLMError", + "ServiceUnavailableError", + "InternalServerError", + "RateLimitError", + "APIConnectionError", +] diff --git a/pantheon/utils/adapters/anthropic_adapter.py b/pantheon/utils/adapters/anthropic_adapter.py new file mode 100644 index 00000000..4824a597 --- /dev/null +++ b/pantheon/utils/adapters/anthropic_adapter.py @@ -0,0 +1,468 @@ +""" +Anthropic adapter — handles Claude models via the native Anthropic SDK. + +Converts between OpenAI message format (used internally by PantheonOS) +and Anthropic's native format, and normalizes streaming events. 
+""" + +import json +import time +from typing import Any, Callable + +from ..log import logger +from ..misc import run_func +from .base import ( + BaseAdapter, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + + +def _wrap_anthropic_error(e: Exception) -> Exception: + """Convert anthropic SDK exceptions to unified exception types.""" + try: + import anthropic as anthropic_mod + + if isinstance(e, anthropic_mod.RateLimitError): + return RateLimitError(str(e)) + elif isinstance(e, anthropic_mod.APIConnectionError): + return APIConnectionError(str(e)) + elif isinstance(e, anthropic_mod.InternalServerError): + return InternalServerError(str(e)) + elif isinstance(e, anthropic_mod.APIStatusError): + status = getattr(e, "status_code", 0) + if status == 503: + return ServiceUnavailableError(str(e)) + elif status == 429: + return RateLimitError(str(e)) + elif status >= 500: + return InternalServerError(str(e)) + except ImportError: + pass + return e + + +# ============ Message Format Conversion ============ + + +def _convert_messages_to_anthropic(messages: list[dict]) -> tuple[str | None, list[dict]]: + """Convert OpenAI-format messages to Anthropic format. 
+ + Key differences: + - System messages become top-level `system` parameter + - tool_calls in assistant messages become tool_use content blocks + - tool role messages become tool_result content blocks in user messages + + Returns: + (system_prompt, converted_messages) + """ + system_prompt = None + converted = [] + pending_tool_results = [] + + for msg in messages: + role = msg.get("role") + content = msg.get("content") + + if role == "system": + # First system message becomes the system parameter + if system_prompt is None: + system_prompt = content if isinstance(content, str) else str(content) + else: + # Additional system messages become user messages + converted.append({ + "role": "user", + "content": f"[System]: {content}" + }) + continue + + if role == "tool": + # Accumulate tool results to attach to next user message + pending_tool_results.append({ + "type": "tool_result", + "tool_use_id": msg.get("tool_call_id", ""), + "content": content or "", + }) + continue + + if role == "user": + # Flush pending tool results first + if pending_tool_results: + # Tool results must be in a user message + result_content = list(pending_tool_results) + if content: + if isinstance(content, str): + result_content.append({"type": "text", "text": content}) + elif isinstance(content, list): + result_content.extend(content) + converted.append({"role": "user", "content": result_content}) + pending_tool_results = [] + else: + converted.append({"role": "user", "content": content}) + continue + + if role == "assistant": + # Flush any pending tool results as a separate user message + if pending_tool_results: + converted.append({"role": "user", "content": list(pending_tool_results)}) + pending_tool_results = [] + + # Build content blocks for assistant + content_blocks = [] + + # Text content + if content: + if isinstance(content, str): + content_blocks.append({"type": "text", "text": content}) + elif isinstance(content, list): + content_blocks.extend(content) + + # Tool calls → 
tool_use blocks + tool_calls = msg.get("tool_calls") + if tool_calls: + for tc in tool_calls: + func = tc.get("function", {}) + # Parse arguments from JSON string + try: + input_data = json.loads(func.get("arguments", "{}")) + except (json.JSONDecodeError, TypeError): + input_data = {} + + content_blocks.append({ + "type": "tool_use", + "id": tc.get("id", ""), + "name": func.get("name", ""), + "input": input_data, + }) + + if content_blocks: + converted.append({"role": "assistant", "content": content_blocks}) + elif not content and not tool_calls: + # Empty assistant message — skip + pass + continue + + # Flush remaining tool results + if pending_tool_results: + converted.append({"role": "user", "content": list(pending_tool_results)}) + + # Anthropic requires alternating user/assistant messages + # Merge consecutive same-role messages + merged = [] + for msg in converted: + if merged and merged[-1]["role"] == msg["role"]: + prev = merged[-1]["content"] + curr = msg["content"] + # Normalize both to lists + if isinstance(prev, str): + prev = [{"type": "text", "text": prev}] + elif not isinstance(prev, list): + prev = [{"type": "text", "text": str(prev)}] + if isinstance(curr, str): + curr = [{"type": "text", "text": curr}] + elif not isinstance(curr, list): + curr = [{"type": "text", "text": str(curr)}] + merged[-1]["content"] = prev + curr + else: + merged.append(msg) + + return system_prompt, merged + + +def _convert_tools_to_anthropic(tools: list[dict] | None) -> list[dict] | None: + """Convert OpenAI tool format to Anthropic tool format. 
+ + From: {"type": "function", "function": {"name": ..., "description": ..., "parameters": {...}}} + To: {"name": ..., "description": ..., "input_schema": {...}} + """ + if not tools: + return None + + converted = [] + for tool in tools: + func = tool.get("function", {}) + anthropic_tool = { + "name": func.get("name", ""), + } + if "description" in func: + anthropic_tool["description"] = func["description"] + if "parameters" in func: + anthropic_tool["input_schema"] = func["parameters"] + else: + anthropic_tool["input_schema"] = {"type": "object", "properties": {}} + converted.append(anthropic_tool) + + return converted + + +# ============ Adapter ============ + + +class AnthropicAdapter(BaseAdapter): + """Adapter for Anthropic Claude API.""" + + def _make_client( + self, + base_url: str | None = None, + api_key: str | None = None, + ): + from anthropic import AsyncAnthropic + + kwargs = {} + if base_url: + kwargs["base_url"] = base_url + if api_key: + kwargs["api_key"] = api_key + return AsyncAnthropic(**kwargs) + + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + num_retries: int = 3, + **kwargs, + ): + """Streaming chat completion using the Anthropic SDK. + + Converts OpenAI messages to Anthropic format, streams events, + normalizes them to OpenAI-compatible chunk dicts, and returns + collected chunks. 
+ """ + client = self._make_client(base_url, api_key) + + # Convert messages and tools + system_prompt, anthropic_messages = _convert_messages_to_anthropic(messages) + anthropic_tools = _convert_tools_to_anthropic(tools) + + # Build call kwargs (stream() method implies streaming, don't pass stream=True) + call_kwargs = { + "model": model, + "messages": anthropic_messages, + "max_tokens": kwargs.pop("max_tokens", None) or kwargs.pop("max_output_tokens", 8192), + } + + if system_prompt: + call_kwargs["system"] = system_prompt + + if anthropic_tools: + call_kwargs["tools"] = anthropic_tools + + # Handle thinking parameter + thinking = kwargs.pop("thinking", None) + reasoning_effort = kwargs.pop("reasoning_effort", None) + if thinking: + call_kwargs["thinking"] = thinking + elif reasoning_effort: + # Map reasoning_effort to Anthropic thinking + call_kwargs["thinking"] = {"type": "enabled", "budget_tokens": 10000} + + # Temperature + temperature = kwargs.pop("temperature", None) + if temperature is not None: + call_kwargs["temperature"] = temperature + + # Top-p + top_p = kwargs.pop("top_p", None) + if top_p is not None: + call_kwargs["top_p"] = top_p + + # Extra headers + extra_headers = kwargs.pop("extra_headers", None) + + try: + stream_start_time = time.time() + first_chunk_time = None + chunk_count = 0 + collected_chunks = [] + + # Track state for building OpenAI-compatible chunks + current_text = "" + current_tool_calls = [] + tool_call_index = -1 + tool_call_json_accum = "" + usage_info = {} + + async with client.messages.stream( + **call_kwargs, + extra_headers=extra_headers or {}, + ) as stream_resp: + async for event in stream_resp: + event_type = event.type + + if event_type == "message_start": + # Extract initial usage + msg = getattr(event, "message", None) + if msg and hasattr(msg, "usage"): + usage_info["prompt_tokens"] = getattr(msg.usage, "input_tokens", 0) + + elif event_type == "content_block_start": + block = event.content_block + if block.type == 
"tool_use": + tool_call_index += 1 + tool_call_json_accum = "" + current_tool_calls.append({ + "index": tool_call_index, + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": "", + }, + }) + # Emit initial chunk with id and name + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [{ + "index": tool_call_index, + "id": block.id, + "type": "function", + "function": { + "name": block.name, + "arguments": "", + }, + }], + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + elif event_type == "content_block_delta": + delta_obj = event.delta + + if delta_obj.type == "text_delta": + text = delta_obj.text + current_text += text + + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + + # Build OpenAI-compatible chunk + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk: + chunk_count += 1 + await run_func(process_chunk, { + "role": "assistant", + "content": text, + }) + + elif delta_obj.type == "input_json_delta": + # Accumulate tool call arguments + partial = delta_obj.partial_json + tool_call_json_accum += partial + if current_tool_calls: + current_tool_calls[-1]["function"]["arguments"] += partial + + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [{ + "index": tool_call_index, + "function": { + "arguments": partial, + }, + }], + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + elif delta_obj.type == "thinking_delta": + thinking_text = delta_obj.thinking + + # Write into chunks so stream_chunk_builder captures it + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + 
"reasoning_content": thinking_text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk: + await run_func(process_chunk, { + "role": "assistant", + "reasoning_content": thinking_text, + }) + + elif event_type == "message_delta": + delta = event.delta + stop_reason = getattr(delta, "stop_reason", None) + + # Extract usage from message_delta + usage = getattr(event, "usage", None) + if usage: + usage_info["completion_tokens"] = getattr(usage, "output_tokens", 0) + + # Map Anthropic stop reasons to OpenAI finish reasons + finish_reason = None + if stop_reason == "end_turn": + finish_reason = "stop" + elif stop_reason == "tool_use": + finish_reason = "tool_calls" + elif stop_reason == "max_tokens": + finish_reason = "length" + + if finish_reason: + chunk_dict = { + "choices": [{ + "index": 0, + "delta": {}, + "finish_reason": finish_reason, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk and finish_reason == "stop": + await run_func(process_chunk, {"stop": True}) + + elif event_type == "message_stop": + pass + + # Add usage chunk at the end (OpenAI stream_options style) + total_tokens = usage_info.get("prompt_tokens", 0) + usage_info.get("completion_tokens", 0) + usage_info["total_tokens"] = total_tokens + collected_chunks.append({ + "usage": usage_info, + "choices": [], + }) + + total_time = time.time() - stream_start_time + logger.info(f"✅ Stream completed: {total_time:.3f}s, {chunk_count} chunks [{model}]") + + return collected_chunks + + except Exception as e: + raise _wrap_anthropic_error(e) from e diff --git a/pantheon/utils/adapters/base.py b/pantheon/utils/adapters/base.py new file mode 100644 index 00000000..aca797eb --- /dev/null +++ b/pantheon/utils/adapters/base.py @@ -0,0 +1,158 @@ +""" +Base adapter — ABC for all provider adapters + unified exception types. 
+""" + +from abc import ABC, abstractmethod +from typing import Any, AsyncIterator, Callable + + +# ============ Unified Exception Types ============ +# Unified exception types caught in agent.py _is_retryable_error() + + +class LLMError(Exception): + """Base exception for LLM provider errors.""" + pass + + +class ServiceUnavailableError(LLMError): + """Provider service is temporarily unavailable (503).""" + pass + + +class InternalServerError(LLMError): + """Provider encountered an internal error (500).""" + pass + + +class RateLimitError(LLMError): + """Request was rate-limited (429).""" + pass + + +class APIConnectionError(LLMError): + """Failed to connect to the provider API.""" + pass + + +# ============ Base Adapter ============ + + +class BaseAdapter(ABC): + """Abstract base class for LLM provider adapters. + + Each adapter wraps a specific SDK and normalizes responses to + a common format compatible with the existing codebase. + + Streaming responses yield dicts with OpenAI-compatible chunk format. + Complete responses are SimpleNamespace objects with .choices and .usage. + """ + + @abstractmethod + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + num_retries: int = 3, + **kwargs, + ) -> AsyncIterator: + """Async chat completion with streaming. 
+ + Args: + model: Model name (without provider prefix) + messages: Chat messages in OpenAI format + tools: Tool definitions in OpenAI format + response_format: Response format specification + stream: Whether to stream (always True for now) + process_chunk: Callback for processing stream chunks + base_url: Override API base URL + api_key: Override API key + num_retries: Number of retries on transient errors + **kwargs: Additional provider-specific parameters + + Yields: + Stream chunks (provider-specific format, collected by caller) + + Returns: + The async iterator of chunks + """ + ... + + async def aembedding( + self, + *, + model: str, + input: list[str], + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> list[list[float]]: + """Generate embeddings. + + Returns: + List of embedding vectors + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support embeddings" + ) + + async def aimage_generation( + self, + *, + model: str, + prompt: str, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Generate images from text prompt. + + Returns: + Provider-specific image response + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support image generation" + ) + + async def aimage_edit( + self, + *, + model: str, + image: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Edit an image. + + Returns: + Provider-specific image response + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support image editing" + ) + + async def atranscription( + self, + *, + model: str, + file: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Transcribe audio to text. 
+ + Returns: + Transcription response + """ + raise NotImplementedError( + f"{self.__class__.__name__} does not support transcription" + ) diff --git a/pantheon/utils/adapters/codex_adapter.py b/pantheon/utils/adapters/codex_adapter.py new file mode 100644 index 00000000..d0986beb --- /dev/null +++ b/pantheon/utils/adapters/codex_adapter.py @@ -0,0 +1,279 @@ +""" +Codex adapter — calls OpenAI ChatGPT backend-api via OAuth tokens. + +Uses the Responses API format at https://chatgpt.com/backend-api/codex/responses. +Requires OAuth tokens from CodexOAuthManager. +""" + +import json +import time +import platform +from typing import Any, Callable + +from ..log import logger +from ..misc import run_func +from .base import ( + BaseAdapter, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + +CODEX_BASE_URL = "https://chatgpt.com/backend-api" + + +def _build_headers(access_token: str, account_id: str | None = None) -> dict: + """Build request headers for Codex backend-api.""" + headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json", + "Accept": "text/event-stream", + "OpenAI-Beta": "responses=experimental", + "originator": "pi", + "User-Agent": f"pi ({platform.system()} {platform.release()}; {platform.machine()})", + } + if account_id: + headers["chatgpt-account-id"] = account_id + return headers + + +def _convert_messages_to_responses_input(messages: list[dict]) -> tuple[str | None, list[dict]]: + """Convert Chat Completions messages to Responses API input format.""" + instructions = None + input_items = [] + + for msg in messages: + role = msg.get("role") + content = msg.get("content") + + if role == "system": + if instructions is None: + instructions = content + else: + input_items.append({"role": "developer", "content": content}) + elif role == "user": + input_items.append({"role": "user", "content": content}) + elif role == "assistant": + if content: + input_items.append({"role": 
"assistant", "content": content}) + for tc in msg.get("tool_calls") or []: + func = tc.get("function", {}) + input_items.append({ + "type": "function_call", + "call_id": tc["id"], + "name": func.get("name", ""), + "arguments": func.get("arguments", ""), + }) + elif role == "tool": + input_items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": content or "", + }) + + return instructions, input_items + + +def _convert_tools(tools: list[dict] | None) -> list[dict] | None: + """Convert Chat Completions tool format to Responses API format.""" + if not tools: + return None + converted = [] + for tool in tools: + func = tool.get("function", {}) + item = {"type": "function", "name": func.get("name", "")} + if "description" in func: + item["description"] = func["description"] + if "parameters" in func: + item["parameters"] = func["parameters"] + if "strict" in func: + item["strict"] = func["strict"] + converted.append(item) + return converted + + +class CodexAdapter(BaseAdapter): + """Adapter for OpenAI Codex via ChatGPT backend-api OAuth.""" + + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, # This is the OAuth access_token + num_retries: int = 3, + **kwargs, + ): + """Call Codex backend-api with Responses API format. + + api_key should be the OAuth access_token. + kwargs may contain 'account_id' for the chatgpt-account-id header. 
+ """ + import httpx + + access_token = api_key + if not access_token: + raise APIConnectionError("No Codex OAuth access token provided") + + account_id = kwargs.pop("account_id", None) + headers = _build_headers(access_token, account_id) + endpoint = f"{base_url or CODEX_BASE_URL}/codex/responses" + + # Convert messages + instructions, input_items = _convert_messages_to_responses_input(messages) + converted_tools = _convert_tools(tools) + + # Build request body + body: dict[str, Any] = { + "model": model, + "input": input_items, + "instructions": instructions or "You are a helpful assistant.", + "stream": True, + "store": False, + "parallel_tool_calls": True, + "include": ["reasoning.encrypted_content"], + } + if converted_tools: + body["tools"] = converted_tools + if response_format: + body["text"] = response_format + + # Map model_params (Codex backend-api has limited parameter support) + kwargs.pop("max_tokens", None) + kwargs.pop("max_completion_tokens", None) + kwargs.pop("max_output_tokens", None) + reasoning_effort = kwargs.pop("reasoning_effort", None) + if reasoning_effort: + body["reasoning"] = {"effort": reasoning_effort} + + # Stream response + text_parts = [] + tool_calls_by_id = {} + _item_to_call = {} + usage_dict = {} + cost = 0.0 + + try: + stream_start_time = time.time() + first_chunk_time = None + + async with httpx.AsyncClient(timeout=120) as client: + async with client.stream("POST", endpoint, headers=headers, json=body) as resp: + if resp.status_code == 401: + raise APIConnectionError( + "[OAUTH_REQUIRED] Codex OAuth token expired. " + "Please re-login in Settings → API Keys → OAuth." 
+ ) + elif resp.status_code == 429: + raise RateLimitError(f"Codex rate limited (429)") + elif resp.status_code >= 500: + raise ServiceUnavailableError(f"Codex server error ({resp.status_code})") + elif resp.status_code >= 400: + body_text = "" + async for chunk in resp.aiter_text(): + body_text += chunk + raise APIConnectionError(f"Codex error {resp.status_code}: {body_text[:300]}") + + async for line in resp.aiter_lines(): + if not line.startswith("data: "): + continue + data_str = line[6:] + if data_str == "[DONE]": + break + + try: + event = json.loads(data_str) + except json.JSONDecodeError: + continue + + event_type = event.get("type", "") + + if event_type == "response.output_text.delta": + delta_text = event.get("delta", "") + text_parts.append(delta_text) + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + if process_chunk: + await run_func(process_chunk, {"content": delta_text, "role": "assistant"}) + + elif event_type == "response.output_item.added": + item = event.get("item", {}) + if item.get("type") == "function_call": + call_id = item.get("call_id", "") + item_id = item.get("id", "") + _item_to_call[item_id] = call_id + tool_calls_by_id[call_id] = { + "name": item.get("name", ""), + "arguments": "", + } + + elif event_type == "response.function_call_arguments.done": + item_id = event.get("item_id", "") + call_id = _item_to_call.get(item_id, "") + if call_id and call_id in tool_calls_by_id: + tool_calls_by_id[call_id]["arguments"] = event.get("arguments", "") + if event.get("name"): + tool_calls_by_id[call_id]["name"] = event["name"] + + elif event_type == "response.completed": + if process_chunk: + await run_func(process_chunk, {"stop": True}) + # Extract usage + resp_obj = event.get("response", {}) + usage = resp_obj.get("usage", {}) + if usage: + input_tokens = usage.get("input_tokens", 0) + output_tokens = 
usage.get("output_tokens", 0) + usage_dict = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + } + + elif event_type == "response.failed": + error_info = event.get("response", {}).get("error", {}) + raise RuntimeError(f"Codex call failed: {error_info}") + + total_time = time.time() - stream_start_time + logger.info(f"✅ Codex stream completed: {total_time:.3f}s [{model}]") + + except (APIConnectionError, RateLimitError, ServiceUnavailableError): + raise + except Exception as e: + err_str = str(e).lower() + if "401" in err_str or "unauthorized" in err_str: + raise APIConnectionError(f"Codex OAuth token invalid: {e}") from e + elif "429" in err_str or "rate" in err_str: + raise RateLimitError(str(e)) from e + raise + + # Build output message (same format as acompletion_responses in llm.py) + aggregated_text = "".join(text_parts) if text_parts else None + final_tool_calls = None + if tool_calls_by_id: + final_tool_calls = [ + {"id": cid, "type": "function", "function": {"name": info["name"], "arguments": info["arguments"]}} + for cid, info in tool_calls_by_id.items() + ] + + # Cost estimation from catalog + try: + from ..provider_registry import completion_cost as calc_cost + cost = calc_cost(model=model, **usage_dict) if usage_dict else 0.0 + except Exception: + pass + + return { + "role": "assistant", + "content": aggregated_text, + "tool_calls": final_tool_calls, + "_metadata": {"_debug_cost": cost, "_debug_usage": usage_dict}, + } diff --git a/pantheon/utils/adapters/gemini_adapter.py b/pantheon/utils/adapters/gemini_adapter.py new file mode 100644 index 00000000..f3702077 --- /dev/null +++ b/pantheon/utils/adapters/gemini_adapter.py @@ -0,0 +1,402 @@ +""" +Gemini adapter — handles Google Gemini models via the google-genai SDK. + +Converts between OpenAI message format and Gemini's native format, +and normalizes streaming events to OpenAI-compatible chunk dicts. 
+""" + +import json +import time +from typing import Any, Callable + +from ..log import logger +from ..misc import run_func +from .base import ( + BaseAdapter, + ServiceUnavailableError, + InternalServerError, + RateLimitError, + APIConnectionError, +) + + +def _wrap_gemini_error(e: Exception) -> Exception: + """Convert Gemini SDK exceptions to unified exception types.""" + error_str = str(e).lower() + if "429" in error_str or "resource exhausted" in error_str or "rate" in error_str: + return RateLimitError(str(e)) + elif "503" in error_str or "unavailable" in error_str: + return ServiceUnavailableError(str(e)) + elif "500" in error_str or "internal" in error_str: + return InternalServerError(str(e)) + elif "connect" in error_str or "timeout" in error_str: + return APIConnectionError(str(e)) + return e + + +# ============ Message Format Conversion ============ + + +def _convert_messages_to_gemini(messages: list[dict]) -> tuple[str | None, list[dict]]: + """Convert OpenAI-format messages to Gemini format. 
+ + Returns: + (system_instruction, gemini_contents) + """ + system_instruction = None + contents = [] + + for msg in messages: + role = msg.get("role") + content = msg.get("content") + + if role == "system": + if system_instruction is None: + system_instruction = content if isinstance(content, str) else str(content) + else: + # Additional system messages as user context + contents.append({ + "role": "user", + "parts": [{"text": f"[System]: {content}"}], + }) + continue + + if role == "user": + parts = [] + if isinstance(content, str): + parts.append({"text": content}) + elif isinstance(content, list): + for item in content: + if isinstance(item, dict): + if item.get("type") == "text": + parts.append({"text": item["text"]}) + elif item.get("type") == "image_url": + # Pass image URLs through + url = item.get("image_url", {}).get("url", "") + if url.startswith("data:"): + # Base64 inline data + parts.append({"inline_data": {"mime_type": "image/png", "data": url.split(",", 1)[-1]}}) + else: + parts.append({"text": f"[Image: {url}]"}) + contents.append({"role": "user", "parts": parts}) + continue + + if role == "assistant": + parts = [] + if content: + if isinstance(content, str): + parts.append({"text": content}) + + # Tool calls → function_call parts + tool_calls = msg.get("tool_calls") + if tool_calls: + for tc in tool_calls: + func = tc.get("function", {}) + try: + args = json.loads(func.get("arguments", "{}")) + except (json.JSONDecodeError, TypeError): + args = {} + parts.append({ + "function_call": { + "name": func.get("name", ""), + "args": args, + } + }) + + if parts: + contents.append({"role": "model", "parts": parts}) + continue + + if role == "tool": + # Tool results → function_response parts + tool_call_id = msg.get("tool_call_id", "") + # Try to find tool name from previous assistant message + tool_name = msg.get("name", tool_call_id) + try: + result = json.loads(content) if isinstance(content, str) else content + except (json.JSONDecodeError, TypeError): 
+ result = {"result": content} + + contents.append({ + "role": "user", + "parts": [{ + "function_response": { + "name": tool_name, + "response": result if isinstance(result, dict) else {"result": str(result)}, + } + }], + }) + continue + + return system_instruction, contents + + +def _convert_tools_to_gemini(tools: list[dict] | None) -> list[dict] | None: + """Convert OpenAI tool format to Gemini function declarations. + + From: {"type": "function", "function": {"name": ..., "description": ..., "parameters": {...}}} + To: {"name": ..., "description": ..., "parameters": {...}} + """ + if not tools: + return None + + declarations = [] + for tool in tools: + func = tool.get("function", {}) + decl = {"name": func.get("name", "")} + if "description" in func: + decl["description"] = func["description"] + if "parameters" in func: + params = dict(func["parameters"]) + # Gemini doesn't support 'strict' or 'additionalProperties' at top level + params.pop("strict", None) + params.pop("additionalProperties", None) + decl["parameters"] = params + declarations.append(decl) + + return declarations + + +# ============ Adapter ============ + + +class GeminiAdapter(BaseAdapter): + """Adapter for Google Gemini API via google-genai SDK.""" + + def _make_client(self, api_key: str | None = None): + """Create a google-genai client.""" + from google import genai + + import os + key = api_key or os.environ.get("GEMINI_API_KEY", "") + return genai.Client(api_key=key) + + async def acompletion( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + stream: bool = True, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + num_retries: int = 3, + **kwargs, + ): + """Streaming chat completion using the Google GenAI SDK. + + Returns collected chunks in OpenAI-compatible format. 
+ """ + from google.genai import types + + client = self._make_client(api_key) + + # Convert messages and tools + system_instruction, gemini_contents = _convert_messages_to_gemini(messages) + gemini_tools = _convert_tools_to_gemini(tools) + + # Build config + config_kwargs = {} + + # System instruction + if system_instruction: + config_kwargs["system_instruction"] = system_instruction + + # Tools + if gemini_tools: + config_kwargs["tools"] = [types.Tool(function_declarations=gemini_tools)] + + # Temperature + temperature = kwargs.pop("temperature", None) + if temperature is not None: + config_kwargs["temperature"] = temperature + + # Max output tokens + max_tokens = kwargs.pop("max_tokens", None) or kwargs.pop("max_output_tokens", None) + if max_tokens: + config_kwargs["max_output_tokens"] = max_tokens + + # Response modalities (for multimodal image generation) + modalities = kwargs.pop("modalities", None) + if modalities: + config_kwargs["response_modalities"] = modalities + + # Reasoning / thinking config + reasoning_effort = kwargs.pop("reasoning_effort", None) + thinking = kwargs.pop("thinking", None) + if thinking and isinstance(thinking, dict): + budget = thinking.get("budget_tokens", -1) + config_kwargs["thinking_config"] = types.ThinkingConfig( + thinking_budget=budget, + include_thoughts=True, + ) + elif reasoning_effort: + config_kwargs["thinking_config"] = types.ThinkingConfig( + thinking_budget=-1, # auto + include_thoughts=True, + ) + + config = types.GenerateContentConfig(**config_kwargs) + + try: + stream_start_time = time.time() + first_chunk_time = None + chunk_count = 0 + collected_chunks = [] + full_text = "" + prompt_tokens = 0 + completion_tokens = 0 + + stream_iter = await client.aio.models.generate_content_stream( + model=model, + contents=gemini_contents, + config=config, + ) + async for response in stream_iter: + # Extract text from response candidates + text = "" + tool_calls_data = [] + + thinking_text = "" + + if response.candidates: + 
for candidate in response.candidates: + if candidate.content and candidate.content.parts: + for part in candidate.content.parts: + if getattr(part, "thought", False) and part.text: + # Thinking/reasoning part + thinking_text += part.text + elif hasattr(part, "text") and part.text: + text += part.text + elif hasattr(part, "function_call") and part.function_call: + fc = part.function_call + tool_calls_data.append({ + "index": len(tool_calls_data), + "id": f"call_{fc.name}_{len(tool_calls_data)}", + "type": "function", + "function": { + "name": fc.name, + "arguments": json.dumps(dict(fc.args)) if fc.args else "{}", + }, + }) + + if text: + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + + full_text += text + + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "content": text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk: + chunk_count += 1 + await run_func(process_chunk, { + "role": "assistant", + "content": text, + }) + + if thinking_text: + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": thinking_text, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + if process_chunk: + await run_func(process_chunk, { + "role": "assistant", + "reasoning_content": thinking_text, + }) + + if tool_calls_data: + chunk_dict = { + "choices": [{ + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": tool_calls_data, + }, + "finish_reason": None, + }], + } + collected_chunks.append(chunk_dict) + + # Extract usage if available + if hasattr(response, "usage_metadata") and response.usage_metadata: + um = response.usage_metadata + prompt_tokens = getattr(um, "prompt_token_count", 0) or 0 + completion_tokens = getattr(um, "candidates_token_count", 0) or 0 + + # Add finish chunk + 
collected_chunks.append({ + "choices": [{ + "index": 0, + "delta": {}, + "finish_reason": "stop", + }], + }) + + if process_chunk: + await run_func(process_chunk, {"stop": True}) + + # Add usage chunk + total_tokens = prompt_tokens + completion_tokens + collected_chunks.append({ + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + }, + "choices": [], + }) + + total_time = time.time() - stream_start_time + logger.info(f"✅ Stream completed: {total_time:.3f}s, {chunk_count} chunks [{model}]") + + return collected_chunks + + except Exception as e: + raise _wrap_gemini_error(e) from e + + async def aembedding( + self, + *, + model: str, + input: list[str], + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> list[list[float]]: + """Generate embeddings using Gemini API.""" + client = self._make_client(api_key) + try: + results = [] + for text in input: + response = await client.aio.models.embed_content( + model=model, + contents=text, + ) + results.append(response.embedding) + return results + except Exception as e: + raise _wrap_gemini_error(e) from e diff --git a/pantheon/utils/adapters/openai_adapter.py b/pantheon/utils/adapters/openai_adapter.py new file mode 100644 index 00000000..5d93c6a9 --- /dev/null +++ b/pantheon/utils/adapters/openai_adapter.py @@ -0,0 +1,439 @@ +""" +OpenAI adapter — handles OpenAI and all OpenAI-compatible providers. + +Covers: openai, deepseek, moonshot, minimax, zai (zhipu), and any +provider with openai_compatible=true in the catalog. 
+"""
+
+import os
+import time
+from typing import Any, Callable
+
+from openai import NOT_GIVEN, AsyncOpenAI
+
+from ..log import logger
+from ..misc import run_func
+from .base import (
+    BaseAdapter,
+    ServiceUnavailableError,
+    InternalServerError,
+    RateLimitError,
+    APIConnectionError,
+)
+
+
+def _wrap_openai_error(e: Exception) -> Exception:
+    """Convert openai SDK exceptions to unified exception types."""
+    import openai as openai_mod
+
+    if isinstance(e, openai_mod.RateLimitError):
+        return RateLimitError(str(e))
+    elif isinstance(e, openai_mod.APIConnectionError):
+        return APIConnectionError(str(e))
+    elif isinstance(e, openai_mod.InternalServerError):
+        return InternalServerError(str(e))
+    elif isinstance(e, openai_mod.APIStatusError):
+        status = getattr(e, "status_code", 0)
+        if status == 503:
+            return ServiceUnavailableError(str(e))
+        elif status == 429:
+            return RateLimitError(str(e))
+        elif status >= 500:
+            return InternalServerError(str(e))
+    return e
+
+
+class OpenAIAdapter(BaseAdapter):
+    """Adapter for OpenAI and OpenAI-compatible APIs."""
+
+    def _make_client(
+        self,
+        base_url: str | None = None,
+        api_key: str | None = None,
+    ) -> AsyncOpenAI:
+        """Create an AsyncOpenAI client with optional overrides."""
+        kwargs = {}
+        if base_url:
+            kwargs["base_url"] = base_url
+        if api_key:
+            kwargs["api_key"] = api_key
+        return AsyncOpenAI(**kwargs)
+
+    async def acompletion(
+        self,
+        *,
+        model: str,
+        messages: list[dict],
+        tools: list[dict] | None = None,
+        response_format: Any | None = None,
+        stream: bool = True,
+        process_chunk: Callable | None = None,
+        base_url: str | None = None,
+        api_key: str | None = None,
+        num_retries: int = 3,
+        **kwargs,
+    ):
+        """Streaming chat completion using the OpenAI SDK.
+
+        Returns a list of collected chunk dicts.
+        The caller is responsible for assembling chunks (via stream_chunk_builder).
+ """ + client = self._make_client(base_url, api_key) + + _tools = tools or NOT_GIVEN + _pcall = (tools is not None) or NOT_GIVEN + + # Build call kwargs + call_kwargs = { + "model": model, + "messages": messages, + "tools": _tools, + "stream": True, + "stream_options": {"include_usage": True}, + } + + if response_format: + call_kwargs["response_format"] = response_format + + # reasoning models (o1, o3, o4 series) don't support parallel_tool_calls + if not model.startswith("o"): + call_kwargs["parallel_tool_calls"] = _pcall + + # Merge extra kwargs (reasoning_effort, temperature, etc.) + call_kwargs.update(kwargs) + + retry_count = num_retries + while retry_count > 0: + try: + stream_start_time = time.time() + first_chunk_time = None + chunk_count = 0 + + response = await client.chat.completions.create(**call_kwargs) + + collected_chunks = [] + try: + async for chunk in response: + chunk_dict = chunk.model_dump() + collected_chunks.append(chunk_dict) + + if first_chunk_time is None: + first_chunk_time = time.time() + ttfb = first_chunk_time - stream_start_time + logger.info(f"⚡ First chunk received: {ttfb:.3f}s (TTFB) [{model}]") + + if ( + process_chunk + and chunk.choices + and len(chunk.choices) > 0 + ): + choice = chunk.choices[0] + if hasattr(choice, "delta") and choice.delta: + delta = choice.delta.model_dump() + chunk_count += 1 + await run_func(process_chunk, delta) + if hasattr(choice, "finish_reason") and choice.finish_reason == "stop": + await run_func(process_chunk, {"stop": True}) + except Exception as stream_err: + # Some providers (e.g. Groq) validate tool calls server-side + # and abort the stream mid-way with errors like: + # - "tool call validation failed: attempted to call tool X not in request.tools" + # - "Failed to parse tool call arguments as JSON" + # If we already collected text chunks, return them as a partial response + # instead of crashing the entire request. 
+ err_str = str(stream_err).lower() + is_tool_error = "tool call" in err_str or "tool_call" in err_str + if is_tool_error and collected_chunks: + logger.warning( + f"⚠ Stream interrupted by tool call error, " + f"returning {len(collected_chunks)} partial chunks [{model}]: {stream_err}" + ) + # Strip tool_call deltas from partial chunks — they are incomplete + # and will cause downstream errors. Only keep text content. + cleaned_chunks = [] + for c in collected_chunks: + choices = c.get("choices", []) + if choices: + delta = choices[0].get("delta", {}) + # Remove tool_calls from delta, keep only text content + delta.pop("tool_calls", None) + cleaned_chunks.append(c) + # Add a stop chunk + cleaned_chunks.append({ + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + }) + collected_chunks = cleaned_chunks + if process_chunk: + await run_func(process_chunk, {"stop": True}) + else: + raise + + total_time = time.time() - stream_start_time + logger.info(f"✅ Stream completed: {total_time:.3f}s, {chunk_count} chunks [{model}]") + return collected_chunks + + except Exception as e: + wrapped = _wrap_openai_error(e) + if isinstance(wrapped, APIConnectionError): + retry_count -= 1 + logger.warning(f"Connection error, retrying ({num_retries - retry_count}/{num_retries}): {e}") + if retry_count <= 0: + raise wrapped from e + else: + raise wrapped from e + + # Should not reach here, but just in case + raise APIConnectionError(f"Failed after {num_retries} retries") + + async def acompletion_responses( + self, + *, + model: str, + messages: list[dict], + tools: list[dict] | None = None, + response_format: Any | None = None, + process_chunk: Callable | None = None, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> dict: + """Call OpenAI Responses API with streaming. + + Used for models that require the Responses API (gpt-5.x-pro, codex, etc.). + Returns a normalized message dict (not chunks). 
+ """ + client = self._make_client(base_url, api_key) + + # Convert messages to Responses API format + instructions = None + input_items = [] + for msg in messages: + role = msg.get("role") + content = msg.get("content") + if role == "system": + if instructions is None: + instructions = content + else: + input_items.append({"role": "developer", "content": content}) + elif role == "user": + input_items.append({"role": "user", "content": content}) + elif role == "assistant": + if content: + input_items.append({"role": "assistant", "content": content}) + for tc in msg.get("tool_calls") or []: + func = tc.get("function", {}) + input_items.append({ + "type": "function_call", + "call_id": tc["id"], + "name": func.get("name", ""), + "arguments": func.get("arguments", ""), + }) + elif role == "tool": + input_items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": content or "", + }) + + # Convert tools + converted_tools = None + if tools: + converted_tools = [] + for tool in tools: + func = tool.get("function", {}) + item = {"type": "function", "name": func.get("name", "")} + if "description" in func: + item["description"] = func["description"] + if "parameters" in func: + item["parameters"] = func["parameters"] + if "strict" in func: + item["strict"] = func["strict"] + converted_tools.append(item) + + # Build kwargs + call_kwargs = {"model": model, "input": input_items, "stream": True} + if instructions is not None: + call_kwargs["instructions"] = instructions + if converted_tools is not None: + call_kwargs["tools"] = converted_tools + if response_format is not None: + call_kwargs["text"] = response_format + + # Map model_params + if kwargs.get("max_tokens"): + call_kwargs["max_output_tokens"] = kwargs.pop("max_tokens") + if kwargs.get("max_completion_tokens"): + call_kwargs["max_output_tokens"] = kwargs.pop("max_completion_tokens") + if kwargs.get("max_output_tokens"): + call_kwargs["max_output_tokens"] = 
kwargs.pop("max_output_tokens") + reasoning_effort = kwargs.pop("reasoning_effort", None) + if reasoning_effort: + call_kwargs["reasoning"] = {"effort": reasoning_effort} + + # Stream + text_parts = [] + tool_calls_by_id = {} + _item_to_call = {} + response_obj = None + + try: + stream = await client.responses.create(**call_kwargs) + async for event in stream: + event_type = event.type + + if event_type == "response.output_text.delta": + text_parts.append(event.delta) + if process_chunk: + await run_func(process_chunk, {"content": event.delta, "role": "assistant"}) + + elif event_type == "response.output_item.added": + item = event.item + if getattr(item, "type", None) == "function_call": + call_id = getattr(item, "call_id", "") or "" + item_id = getattr(item, "id", "") or "" + _item_to_call[item_id] = call_id + tool_calls_by_id[call_id] = { + "name": getattr(item, "name", "") or "", + "arguments": "", + } + + elif event_type == "response.function_call_arguments.done": + item_id = getattr(event, "item_id", "") or "" + call_id = _item_to_call.get(item_id, "") + if call_id and call_id in tool_calls_by_id: + tool_calls_by_id[call_id]["arguments"] = event.arguments + if event.name: + tool_calls_by_id[call_id]["name"] = event.name + + elif event_type == "response.completed": + response_obj = event.response + if process_chunk: + await run_func(process_chunk, {"stop": True}) + + elif event_type == "response.failed": + error_msg = "" + if hasattr(event, "response") and hasattr(event.response, "error"): + error_msg = str(event.response.error) + raise RuntimeError(f"Responses API call failed: {error_msg}") + + except Exception as e: + wrapped = _wrap_openai_error(e) + if wrapped is not e: + raise wrapped from e + raise + + # Build output + aggregated_text = "".join(text_parts) if text_parts else None + final_tool_calls = None + if tool_calls_by_id: + final_tool_calls = [ + {"id": cid, "type": "function", "function": {"name": info["name"], "arguments": info["arguments"]}} + 
for cid, info in tool_calls_by_id.items() + ] + + # Cost + cost = 0.0 + usage_dict = {} + if response_obj and hasattr(response_obj, "usage") and response_obj.usage: + usage = response_obj.usage + input_tokens = getattr(usage, "input_tokens", 0) or 0 + output_tokens = getattr(usage, "output_tokens", 0) or 0 + usage_dict = { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + } + try: + from ..provider_registry import completion_cost as calc_cost + cost = calc_cost(model=model, prompt_tokens=input_tokens, completion_tokens=output_tokens) or 0.0 + except Exception: + pass + if cost == 0.0 and (input_tokens or output_tokens): + cost = (input_tokens * 1.0 + output_tokens * 5.0) / 1_000_000 + + return { + "role": "assistant", + "content": aggregated_text, + "tool_calls": final_tool_calls, + "_metadata": {"_debug_cost": cost, "_debug_usage": usage_dict}, + } + + async def aembedding( + self, + *, + model: str, + input: list[str], + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> list[list[float]]: + """Generate embeddings using OpenAI API.""" + client = self._make_client(base_url, api_key) + try: + response = await client.embeddings.create(model=model, input=input) + return [d.embedding for d in response.data] + except Exception as e: + raise _wrap_openai_error(e) from e + + async def aimage_generation( + self, + *, + model: str, + prompt: str, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Generate images using OpenAI API (DALL-E, gpt-image).""" + client = self._make_client(base_url, api_key) + try: + response = await client.images.generate( + model=model, + prompt=prompt, + **kwargs, + ) + return response + except Exception as e: + raise _wrap_openai_error(e) from e + + async def aimage_edit( + self, + *, + model: str, + image: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Edit 
images using OpenAI API.""" + client = self._make_client(base_url, api_key) + try: + response = await client.images.edit( + model=model, + image=image, + **kwargs, + ) + return response + except Exception as e: + raise _wrap_openai_error(e) from e + + async def atranscription( + self, + *, + model: str, + file: Any, + base_url: str | None = None, + api_key: str | None = None, + **kwargs, + ) -> Any: + """Transcribe audio using OpenAI Whisper API.""" + client = self._make_client(base_url, api_key) + try: + response = await client.audio.transcriptions.create( + model=model, + file=file, + **kwargs, + ) + return response + except Exception as e: + raise _wrap_openai_error(e) from e diff --git a/pantheon/utils/llm.py b/pantheon/utils/llm.py index 2d9c72c4..f6142094 100644 --- a/pantheon/utils/llm.py +++ b/pantheon/utils/llm.py @@ -1,7 +1,6 @@ import json import re import time -import warnings from contextlib import asynccontextmanager from copy import deepcopy from typing import Any, Callable @@ -241,13 +240,13 @@ async def acompletion_responses( Returns a normalised message dict compatible with ``extract_message_from_response``. 
""" from openai import AsyncOpenAI - from .llm_providers import get_litellm_proxy_kwargs + from .llm_providers import get_proxy_kwargs # ========== Build client ========== - proxy_kwargs = get_litellm_proxy_kwargs() + proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: client = AsyncOpenAI( - base_url=proxy_kwargs["api_base"], + base_url=proxy_kwargs["base_url"], api_key=proxy_kwargs["api_key"] ) elif base_url: @@ -371,7 +370,7 @@ async def acompletion_responses( "total_tokens": input_tokens + output_tokens, } try: - from litellm import completion_cost + from pantheon.utils.provider_registry import completion_cost cost = completion_cost(model=model, prompt_tokens=input_tokens, completion_tokens=output_tokens) or 0.0 except Exception: pass @@ -390,16 +389,145 @@ async def acompletion_responses( return message -def import_litellm(): - warnings.filterwarnings("ignore") - import litellm +def stream_chunk_builder(chunks: list[dict]) -> Any: + """Assemble streaming chunks into a complete response object. - litellm.suppress_debug_info = True - litellm.set_verbose = False - return litellm + Aggregates content deltas, tool_call deltas, and usage from collected chunks + into a SimpleNamespace that mimics the shape of a chat completion response. + Replaces the stream_chunk_builder from external dependencies. 
+ """ + from types import SimpleNamespace + + full_content = "" + full_reasoning = "" + tool_calls_map: dict[int, dict] = {} # index → tool_call dict + finish_reason = None + usage = {} + model = "" + role = "assistant" + + for chunk in chunks: + # Handle dict chunks (from adapters) + if isinstance(chunk, dict): + # Extract usage from usage-only chunks + if "usage" in chunk and chunk["usage"]: + usage = chunk["usage"] + if "model" in chunk: + model = chunk["model"] + + choices = chunk.get("choices", []) + if not choices: + continue + choice = choices[0] + delta = choice.get("delta", {}) + + # Accumulate content + if "content" in delta and delta["content"]: + full_content += delta["content"] + + # Accumulate reasoning (various field names across providers) + # - reasoning_content: DeepSeek, Zhipu, Kimi, Anthropic adapter, Gemini adapter + # - reasoning: Groq gpt-oss models + if "reasoning_content" in delta and delta["reasoning_content"]: + full_reasoning += delta["reasoning_content"] + elif "reasoning" in delta and delta["reasoning"]: + full_reasoning += delta["reasoning"] + + # Accumulate role + if "role" in delta and delta["role"]: + role = delta["role"] + + # Accumulate tool calls + if "tool_calls" in delta and delta["tool_calls"]: + for tc in delta["tool_calls"]: + idx = tc.get("index", 0) + if idx not in tool_calls_map: + tool_calls_map[idx] = { + "id": tc.get("id", ""), + "type": tc.get("type", "function"), + "function": { + "name": tc.get("function", {}).get("name", ""), + "arguments": "", + }, + } + else: + # Merge + if tc.get("id"): + tool_calls_map[idx]["id"] = tc["id"] + func = tc.get("function", {}) + if func.get("name"): + tool_calls_map[idx]["function"]["name"] = func["name"] + + # Always append arguments + args = tc.get("function", {}).get("arguments", "") + if args: + tool_calls_map[idx]["function"]["arguments"] += args + + # Track finish reason + fr = choice.get("finish_reason") + if fr: + finish_reason = fr + + else: + # Handle object-style chunks 
(from OpenAI SDK directly) + if hasattr(chunk, "model_dump"): + chunk_dict = chunk.model_dump() + # Recursively process as dict + result = stream_chunk_builder([chunk_dict]) + return result + + # Build final tool_calls list + final_tool_calls = None + if tool_calls_map: + final_tool_calls = [tool_calls_map[i] for i in sorted(tool_calls_map.keys())] + + # Build message + # For reasoning models that put everything in reasoning_content with no content, + # fall back to reasoning_content so the response isn't empty + effective_content = full_content or None + if not effective_content and full_reasoning: + effective_content = full_reasoning + + message = SimpleNamespace( + role=role, + content=effective_content, + tool_calls=final_tool_calls, + reasoning_content=full_reasoning or None, + ) -async def acompletion_litellm( + def message_model_dump(): + d = {"role": message.role, "content": message.content, "tool_calls": message.tool_calls} + if message.reasoning_content: + d["reasoning_content"] = message.reasoning_content + return d + message.model_dump = message_model_dump + + # Build choice + choice = SimpleNamespace( + message=message, + finish_reason=finish_reason, + ) + + # Build usage + usage_ns = SimpleNamespace( + prompt_tokens=usage.get("prompt_tokens", 0), + completion_tokens=usage.get("completion_tokens", 0), + total_tokens=usage.get("total_tokens", 0), + ) + + # Build response + resp = SimpleNamespace( + choices=[choice], + model=model, + usage=usage_ns, + _hidden_params={}, + ) + + return resp + + +async def acompletion( messages: list[dict], model: str, tools: list[dict] | None = None, @@ -410,117 +538,123 @@ async def acompletion_litellm( model_params: dict | None = None, num_retries: int = 3, ): - """Call LLM via LiteLLM Proxy (preferred) or traditional API keys (fallback) + """Call LLM via provider adapters. Two modes of operation: 1. 
PROXY MODE (Hub-launched agents): - - LITELLM_PROXY_ENABLED=true with LITELLM_PROXY_URL and LITELLM_PROXY_KEY + - LLM_PROXY_ENABLED=true with LLM_PROXY_URL and LLM_PROXY_KEY - Uses virtual key for authentication to Proxy - Real API keys are hidden in Proxy, not in Pod environment - - Fake API keys in environment are for detect_available_provider() only 2. STANDALONE MODE (agents running independently): - - LITELLM_PROXY_ENABLED not set or false + - LLM_PROXY_ENABLED not set or false - Falls back to reading real API keys from environment variables - - Suitable for local development and standalone agent operation + - Uses native SDK adapters (openai, anthropic, google-genai) """ - from pantheon.settings import get_settings - from .llm_providers import get_litellm_proxy_kwargs - - litellm = import_litellm() - logger.debug(f"[LITELLM.ACOMPLETION] Starting LLM call | Model={model}") + from .llm_providers import get_proxy_kwargs + from .provider_registry import find_provider_for_model, get_provider_config, completion_cost + from .adapters import get_adapter - settings = get_settings() - - # ========== Prepare LiteLLM Parameters ========== - kwargs = { - "model": model, - "messages": messages, - "tools": tools, - "response_format": response_format, - "stream": True, - "stream_options": {"include_usage": True}, - "num_retries": num_retries, - } + logger.debug(f"[ACOMPLETION] Starting LLM call | Model={model}") - if model_params: - kwargs.update(**model_params) + # ========== Resolve provider and adapter ========== + provider_key, model_name, provider_config = find_provider_for_model(model) + sdk_type = provider_config.get("sdk", "openai") # ========== Mode Detection & Configuration ========== - proxy_kwargs = get_litellm_proxy_kwargs() + proxy_kwargs = get_proxy_kwargs() if proxy_kwargs: - kwargs.update(proxy_kwargs) + # Proxy mode: all calls go through OpenAI-compatible proxy + effective_base_url = proxy_kwargs.get("base_url") + effective_api_key = 
proxy_kwargs.get("api_key") + sdk_type = "openai" # proxy exposes OpenAI-compatible API + effective_model = model # pass full model string to proxy + elif sdk_type == "codex": + # Codex OAuth: get access token from OAuth manager + from .oauth import CodexOAuthManager + oauth = CodexOAuthManager() + effective_api_key = oauth.get_access_token(auto_refresh=True) + if not effective_api_key: + raise RuntimeError( + "[OAUTH_REQUIRED] Codex OAuth session expired or not configured. " + "Please re-login in Settings → API Keys → OAuth." + ) + effective_base_url = provider_config.get("base_url") + effective_model = model_name else: - if base_url: - kwargs["api_base"] = base_url - if api_key: - kwargs["api_key"] = api_key + effective_base_url = base_url or provider_config.get("base_url") + effective_api_key = api_key + if not effective_api_key: + import os + api_key_env = provider_config.get("api_key_env", "") + if api_key_env: + effective_api_key = os.environ.get(api_key_env, "") + # Local providers (Ollama) don't need a real API key + if not effective_api_key and provider_config.get("local"): + effective_api_key = "ollama" + effective_model = model_name # use bare model name with native SDK + + adapter = get_adapter(sdk_type) + + # ========== Prepare adapter kwargs ========== + adapter_kwargs = dict(model_params or {}) + + # Codex OAuth: pass account_id for chatgpt-account-id header + if sdk_type == "codex": + from .oauth import CodexOAuthManager + account_id = CodexOAuthManager().get_account_id() + if account_id: + adapter_kwargs["account_id"] = account_id # Kimi Coding API gates access by User-Agent header if "kimi-for-coding" in model: - kwargs.setdefault("extra_headers", {}) - kwargs["extra_headers"].setdefault("User-Agent", "claude-code/0.1.0") + adapter_kwargs.setdefault("extra_headers", {}) + adapter_kwargs["extra_headers"].setdefault("User-Agent", "claude-code/0.1.0") # ========== Execute Call ========== + from pantheon.agent import StopRunning + try: - logger.debug( 
- f"[LITELLM.ACOMPLETION] Calling litellm.acompletion with model={model}" + logger.debug(f"[ACOMPLETION] Calling {sdk_type} adapter for model={effective_model}") + collected_chunks = await adapter.acompletion( + model=effective_model, + messages=messages, + tools=tools, + response_format=response_format, + stream=True, + process_chunk=process_chunk, + base_url=effective_base_url, + api_key=effective_api_key, + num_retries=num_retries, + **adapter_kwargs, ) - response = await litellm.acompletion(**kwargs) - logger.debug(f"[LITELLM.ACOMPLETION] ✓ LiteLLM call succeeded for model={model}") + logger.debug(f"[ACOMPLETION] ✓ Call succeeded for model={effective_model}") + except StopRunning: + raise except Exception as e: logger.error( - f"[LITELLM.ACOMPLETION] ✗ LiteLLM call failed | " - f"Model={model} | Error={type(e).__name__}: {str(e)[:200]}" + f"[ACOMPLETION] ✗ Call failed | " + f"Model={effective_model} | Error={type(e).__name__}: {str(e)[:200]}" ) raise - # ========== Stream Processing & Cost Calculation ========== - from pantheon.agent import StopRunning + # ========== Build complete response ========== + # Codex adapter returns a message dict directly (not chunks) + if sdk_type == "codex" and isinstance(collected_chunks, dict): + return collected_chunks # Already a normalized message dict - collected_chunks = [] - try: - async for chunk in response: - collected_chunks.append(chunk) - if ( - process_chunk - and hasattr(chunk, "choices") - and chunk.choices - and len(chunk.choices) > 0 - ): - choice = chunk.choices[0] - if hasattr(choice, "delta"): - delta = choice.delta.model_dump() - # LiteLLM provides unified reasoning_content field - await run_func(process_chunk, delta) - if hasattr(choice, "finish_reason") and choice.finish_reason == "stop": - await run_func(process_chunk, {"stop": True}) - except StopRunning: - # Build partial message from chunks collected so far - partial_msg = None - if collected_chunks: - try: - partial_resp = 
litellm.stream_chunk_builder(collected_chunks) - if partial_resp and hasattr(partial_resp, "choices") and partial_resp.choices: - partial_msg = partial_resp.choices[0].message.model_dump() - partial_msg.setdefault("role", "assistant") - except Exception: - pass - raise StopRunning(partial_message=partial_msg) - - complete_resp = litellm.stream_chunk_builder(collected_chunks) + complete_resp = stream_chunk_builder(collected_chunks) # Calculate and attach cost information try: - cost = litellm.completion_cost(completion_response=complete_resp) + cost = completion_cost(completion_response=complete_resp) if cost and cost > 0: - # Store cost in a way that count_tokens_in_messages can access if not hasattr(complete_resp, "_hidden_params"): complete_resp._hidden_params = {} complete_resp._hidden_params["response_cost"] = cost except Exception: - pass # Silently ignore cost calculation errors + pass return complete_resp @@ -761,14 +895,30 @@ def remove_ui_fields(messages: list[dict]) -> list[dict]: return messages +_ALLOWED_MESSAGE_FIELDS = { + "role", "content", "name", "tool_calls", "tool_call_id", + "refusal", "function_call", # OpenAI standard fields +} + + def remove_metadata(messages: list[dict]) -> list[dict]: """ - Remove _metadata field from messages. - This should be called just before sending messages to the LLM. + Strip messages down to only standard OpenAI fields before sending to LLM. + + Strict providers like Groq reject ANY unknown field (chat_id, _metadata, + _llm_content, _user_metadata, detected_attachments, etc.) and also + reject null values for optional fields like tool_calls. 
""" for msg in messages: - if "_metadata" in msg: - del msg["_metadata"] + # Remove non-standard fields + extra_keys = [k for k in msg if k not in _ALLOWED_MESSAGE_FIELDS] + for k in extra_keys: + del msg[k] + # Remove fields with None/null values (Groq rejects "tool_calls": null) + null_keys = [k for k in ("tool_calls", "tool_call_id", "name", "function_call", "refusal") + if k in msg and msg[k] is None] + for k in null_keys: + del msg[k] return messages @@ -821,7 +971,7 @@ def process_messages_for_hook_func(messages: list[dict]) -> list[dict]: async def openai_embedding( texts: list[str], model: str = "text-embedding-3-large" ) -> list[list[float]]: - """Get embeddings using litellm (with proxy support). + """Get embeddings (with proxy support). Args: texts: List of texts to embed @@ -830,19 +980,19 @@ async def openai_embedding( Returns: List of embedding vectors """ - from .llm_providers import get_litellm_proxy_kwargs + from .llm_providers import get_proxy_kwargs + from .adapters import get_adapter - litellm = import_litellm() + proxy_kwargs = get_proxy_kwargs() + adapter = get_adapter("openai") - # litellm.aembedding returns EmbeddingResponse with .data[].embedding - response = await litellm.aembedding( + return await adapter.aembedding( model=model, input=texts, - **get_litellm_proxy_kwargs(), + base_url=proxy_kwargs.get("base_url"), + api_key=proxy_kwargs.get("api_key"), ) - return [d["embedding"] for d in response.data] - def remove_hidden_fields(content: dict) -> dict: """Remove hidden fields from dict content. @@ -1086,7 +1236,7 @@ def _safe_token_counter( ) -> int: """Token counter with fallback for unsupported models.""" try: - from litellm.utils import token_counter + from pantheon.utils.provider_registry import token_counter return token_counter(model=model, messages=messages or [], tools=tools) except Exception: @@ -1207,7 +1357,7 @@ def collect_message_stats_lightweight( # ========== 3. 
Max tokens ========== try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info model_info = get_model_info(model) meta["max_tokens"] = model_info.get("max_input_tokens", 200000) except Exception: @@ -1225,7 +1375,7 @@ def count_tokens_in_messages( Separates system prompt (first system message) and tools definition from other roles. """ try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info total_tokens = 0 tokens_by_role = {} @@ -1255,7 +1405,7 @@ def count_tokens_in_messages( # 2. Count tokens for tools definition if tools: - # litellm token_counter handles tools definition specifically + # token_counter handles tools definition specifically tools_definition_tokens = _safe_token_counter(model=model, tools=tools) total_tokens += tools_definition_tokens diff --git a/pantheon/utils/llm_catalog.json b/pantheon/utils/llm_catalog.json new file mode 100644 index 00000000..9a3c0baa --- /dev/null +++ b/pantheon/utils/llm_catalog.json @@ -0,0 +1,1396 @@ +{ + "version": 1, + "providers": { + "openai": { + "display_name": "OpenAI", + "sdk": "openai", + "base_url": "https://api.openai.com/v1", + "api_key_env": "OPENAI_API_KEY", + "openai_compatible": true, + "models": { + "gpt-5.4-pro": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 5.0, + "output_cost_per_million": 20.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.4": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + 
"supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.4-mini": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.2-pro": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 5.0, + "output_cost_per_million": 20.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.2": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5.2-codex": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, 
+ "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5-mini": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-5-nano": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0.1, + "output_cost_per_million": 0.4, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-4.1-mini": { + "max_input_tokens": 1000000, + "max_output_tokens": 32768, + "input_cost_per_million": 0.4, + "output_cost_per_million": 1.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + 
"supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "o3-pro": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 20.0, + "output_cost_per_million": 80.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "o4-mini": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 1.1, + "output_cost_per_million": 4.4, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gpt-image-1": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "gpt-image-1.5": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "chatgpt-image-latest": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "dall-e-3": { + "mode": "image_generation", + "max_input_tokens": 0, + "max_output_tokens": 0, + 
"input_cost_per_million": 0, + "output_cost_per_million": 0, + "cost_per_image": 0.04, + "supports_vision": false, + "supports_function_calling": false + }, + "text-embedding-3-large": { + "mode": "embedding", + "max_input_tokens": 8191, + "max_output_tokens": 0, + "input_cost_per_million": 0.13, + "output_cost_per_million": 0, + "supports_vision": false, + "supports_function_calling": false + }, + "text-embedding-3-small": { + "mode": "embedding", + "max_input_tokens": 8191, + "max_output_tokens": 0, + "input_cost_per_million": 0.02, + "output_cost_per_million": 0, + "supports_vision": false, + "supports_function_calling": false + }, + "whisper-1": { + "mode": "audio_transcription", + "max_input_tokens": 0, + "max_output_tokens": 0, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": false, + "supports_function_calling": false + }, + "codex-mini-latest": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 1.5, + "output_cost_per_million": 6.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "anthropic": { + "display_name": "Anthropic", + "sdk": "anthropic", + "base_url": "https://api.anthropic.com", + "api_key_env": "ANTHROPIC_API_KEY", + "openai_compatible": false, + "models": { + "claude-opus-4-6": { + "max_input_tokens": 1000000, + "max_output_tokens": 32000, + "input_cost_per_million": 15.0, + "output_cost_per_million": 75.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + 
"supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-opus-4-5-20251101": { + "max_input_tokens": 200000, + "max_output_tokens": 32000, + "input_cost_per_million": 15.0, + "output_cost_per_million": 75.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-opus-4-20250514": { + "max_input_tokens": 200000, + "max_output_tokens": 32000, + "input_cost_per_million": 15.0, + "output_cost_per_million": 75.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-sonnet-4-6": { + "max_input_tokens": 1000000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.0, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-sonnet-4-5-20250929": { + "max_input_tokens": 200000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.0, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": 
true, + "supports_assistant_prefill": true + }, + "claude-sonnet-4-20250514": { + "max_input_tokens": 200000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.0, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": true, + "supports_assistant_prefill": true + }, + "claude-haiku-4-5": { + "max_input_tokens": 200000, + "max_output_tokens": 8192, + "input_cost_per_million": 0.8, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": true + } + } + }, + "gemini": { + "display_name": "Google Gemini", + "sdk": "google-genai", + "base_url": "https://generativelanguage.googleapis.com", + "api_key_env": "GEMINI_API_KEY", + "openai_compatible": false, + "models": { + "gemini-3.1-pro-preview": { + "max_input_tokens": 2000000, + "max_output_tokens": 65536, + "input_cost_per_million": 2.5, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-3-pro-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 2.5, + "output_cost_per_million": 15.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + 
"supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-3-flash-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-2.5-pro": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 1.25, + "output_cost_per_million": 10.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-2.5-flash": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-2.5-flash-lite": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.075, + "output_cost_per_million": 0.3, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + 
"supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "gemini-3-pro-image-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 2.5, + "output_cost_per_million": 15.0, + "mode": "multimodal_image", + "supports_vision": true, + "supports_function_calling": false + }, + "gemini-3.1-flash-image-preview": { + "max_input_tokens": 1000000, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "mode": "multimodal_image", + "supports_vision": true, + "supports_function_calling": false + }, + "gemini-2.5-flash-image": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.6, + "mode": "multimodal_image", + "supports_vision": true, + "supports_function_calling": false + } + } + }, + "deepseek": { + "display_name": "DeepSeek", + "sdk": "openai", + "base_url": "https://api.deepseek.com/v1", + "api_key_env": "DEEPSEEK_API_KEY", + "openai_compatible": true, + "models": { + "deepseek-chat": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.27, + "output_cost_per_million": 1.1, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "deepseek-reasoner": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.55, + "output_cost_per_million": 2.19, + "supports_vision": false, + "supports_function_calling": false, + "supports_response_schema": false, + "supports_reasoning": true, + "supports_audio_input": false, + 
"supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + } + } + }, + "zai": { + "display_name": "Z.ai (Zhipu)", + "sdk": "openai", + "base_url": "https://open.bigmodel.cn/api/paas/v4", + "api_key_env": "ZAI_API_KEY", + "openai_compatible": true, + "models": { + "glm-5": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.6": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5v": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + 
"supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5-air": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.1, + "output_cost_per_million": 0.4, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "glm-4.5-flash": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.0, + "output_cost_per_million": 0.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "minimax": { + "display_name": "MiniMax", + "sdk": "openai", + "base_url": "https://api.minimax.io/v1", + "api_key_env": "MINIMAX_API_KEY", + "openai_compatible": true, + "models": { + "MiniMax-M2.7": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.5-highspeed": { + 
"max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.5": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.5, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.1-highspeed": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "MiniMax-M2.1": { + "max_input_tokens": 1000000, + "max_output_tokens": 131072, + "input_cost_per_million": 0.3, + "output_cost_per_million": 1.2, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "moonshot": { + "display_name": "Moonshot (Kimi)", + 
"sdk": "openai", + "base_url": "https://api.moonshot.ai/v1", + "api_key_env": "MOONSHOT_API_KEY", + "openai_compatible": true, + "models": { + "kimi-k2.5": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "kimi-k2-0905-preview": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "qwen": { + "display_name": "Qwen (DashScope)", + "sdk": "openai", + "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1", + "api_key_env": "DASHSCOPE_API_KEY", + "openai_compatible": true, + "models": { + "qwen3-235b-a22b": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 4.0, + "output_cost_per_million": 16.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen3-32b": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.7, + "output_cost_per_million": 2.8, + "supports_vision": false, + 
"supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen3-30b-a3b": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.35, + "output_cost_per_million": 1.4, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-max": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 2.0, + "output_cost_per_million": 8.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-plus": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.8, + "output_cost_per_million": 2.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-turbo": { + "max_input_tokens": 1000000, + "max_output_tokens": 8192, + "input_cost_per_million": 0.3, + "output_cost_per_million": 0.6, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": 
true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-vl-max": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 3.0, + "output_cost_per_million": 8.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen-vl-plus": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 1.0, + "output_cost_per_million": 2.0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwq-plus": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 1.0, + "output_cost_per_million": 4.0, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "groq": { + "display_name": "Groq", + "sdk": "openai", + "base_url": "https://api.groq.com/openai/v1", + "api_key_env": "GROQ_API_KEY", + "openai_compatible": true, + "models": { + "openai/gpt-oss-120b": { + "max_input_tokens": 131072, + "max_output_tokens": 65536, + "input_cost_per_million": 
0.15, + "output_cost_per_million": 0.60, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "openai/gpt-oss-20b": { + "max_input_tokens": 131072, + "max_output_tokens": 65536, + "input_cost_per_million": 0.075, + "output_cost_per_million": 0.30, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "groq/compound": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.0, + "output_cost_per_million": 0.0, + "supports_vision": false, + "supports_function_calling": false, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": true, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "llama-3.3-70b-versatile": { + "max_input_tokens": 131072, + "max_output_tokens": 32768, + "input_cost_per_million": 0.59, + "output_cost_per_million": 0.79, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "meta-llama/llama-4-scout-17b-16e-instruct": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 
0.11, + "output_cost_per_million": 0.34, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "qwen/qwen3-32b": { + "max_input_tokens": 131072, + "max_output_tokens": 40960, + "input_cost_per_million": 0.29, + "output_cost_per_million": 0.59, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "llama-3.1-8b-instant": { + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_million": 0.05, + "output_cost_per_million": 0.08, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "openrouter": { + "display_name": "OpenRouter", + "sdk": "openai", + "base_url": "https://openrouter.ai/api/v1", + "api_key_env": "OPENROUTER_API_KEY", + "openai_compatible": true, + "models": { + "anthropic/claude-sonnet-4-6": { + "max_input_tokens": 1000000, + "max_output_tokens": 16000, + "input_cost_per_million": 3.17, + "output_cost_per_million": 15.83, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + 
"supports_computer_use": false, + "supports_assistant_prefill": false + }, + "google/gemini-2.5-flash": { + "max_input_tokens": 1048576, + "max_output_tokens": 65536, + "input_cost_per_million": 0.16, + "output_cost_per_million": 0.63, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": true, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": true, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "deepseek/deepseek-chat": { + "max_input_tokens": 131072, + "max_output_tokens": 8192, + "input_cost_per_million": 0.28, + "output_cost_per_million": 1.16, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "meta-llama/llama-3.3-70b-instruct": { + "max_input_tokens": 131072, + "max_output_tokens": 32768, + "input_cost_per_million": 0.10, + "output_cost_per_million": 0.10, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "mistral": { + "display_name": "Mistral AI", + "sdk": "openai", + "base_url": "https://api.mistral.ai/v1", + "api_key_env": "MISTRAL_API_KEY", + "openai_compatible": true, + "models": { + "mistral-large-latest": { + "max_input_tokens": 262144, + "max_output_tokens": 262144, + "input_cost_per_million": 0.50, + "output_cost_per_million": 1.50, + "supports_vision": true, + "supports_function_calling": true, + 
"supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "mistral-medium-latest": { + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_million": 0.40, + "output_cost_per_million": 2.00, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "mistral-small-latest": { + "max_input_tokens": 262144, + "max_output_tokens": 262144, + "input_cost_per_million": 0.15, + "output_cost_per_million": 0.60, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "codestral-latest": { + "max_input_tokens": 256000, + "max_output_tokens": 256000, + "input_cost_per_million": 0.30, + "output_cost_per_million": 0.90, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + }, + "open-mistral-nemo": { + "max_input_tokens": 131072, + "max_output_tokens": 131072, + "input_cost_per_million": 0.02, + "output_cost_per_million": 0.04, + "supports_vision": false, + "supports_function_calling": true, + 
"supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": true + } + } + }, + "together_ai": { + "display_name": "Together AI", + "sdk": "openai", + "base_url": "https://api.together.xyz/v1", + "api_key_env": "TOGETHER_API_KEY", + "openai_compatible": true, + "models": { + "Qwen/Qwen3.5-397B-A17B": { + "max_input_tokens": 262144, + "max_output_tokens": 16384, + "input_cost_per_million": 0.60, + "output_cost_per_million": 3.60, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "deepseek-ai/DeepSeek-V3.1": { + "max_input_tokens": 128000, + "max_output_tokens": 16384, + "input_cost_per_million": 0.60, + "output_cost_per_million": 1.70, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + "meta-llama/Llama-3.3-70B-Instruct-Turbo": { + "max_input_tokens": 131072, + "max_output_tokens": 16384, + "input_cost_per_million": 0.88, + "output_cost_per_million": 0.88, + "supports_vision": false, + "supports_function_calling": true, + "supports_response_schema": false, + "supports_reasoning": false, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + }, + 
"deepseek-ai/DeepSeek-R1": { + "max_input_tokens": 163839, + "max_output_tokens": 16384, + "input_cost_per_million": 3.00, + "output_cost_per_million": 7.00, + "supports_vision": false, + "supports_function_calling": false, + "supports_response_schema": false, + "supports_reasoning": true, + "supports_audio_input": false, + "supports_audio_output": false, + "supports_web_search": false, + "supports_pdf_input": false, + "supports_computer_use": false, + "supports_assistant_prefill": false + } + } + }, + "codex": { + "display_name": "Codex (OAuth)", + "sdk": "codex", + "base_url": "https://chatgpt.com/backend-api", + "api_key_env": "", + "openai_compatible": false, + "auth_mode": "oauth", + "models": { + "gpt-5.4": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "gpt-5.4-mini": { + "max_input_tokens": 1000000, + "max_output_tokens": 64000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "gpt-5.2-codex": { + "max_input_tokens": 1000000, + "max_output_tokens": 100000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "gpt-5": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + "supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + }, + "o4-mini": { + "max_input_tokens": 200000, + "max_output_tokens": 100000, + "input_cost_per_million": 0, + "output_cost_per_million": 0, + "supports_vision": true, + 
"supports_function_calling": true, + "supports_response_schema": true, + "supports_reasoning": true + } + } + }, + "ollama": { + "display_name": "Ollama (Local)", + "sdk": "openai", + "base_url": "http://localhost:11434/v1", + "api_key_env": "", + "openai_compatible": true, + "local": true, + "models": {} + } + } +} diff --git a/pantheon/utils/llm_providers.py b/pantheon/utils/llm_providers.py index d9272cd4..868868cf 100644 --- a/pantheon/utils/llm_providers.py +++ b/pantheon/utils/llm_providers.py @@ -21,10 +21,14 @@ class ProviderType(Enum): - """Supported LLM providers""" + """Supported LLM providers. + + OPENAI: Direct OpenAI or OpenAI-compatible providers + NATIVE: Non-OpenAI providers using native SDKs (anthropic, gemini, etc.) + """ OPENAI = "openai" - LITELLM = "litellm" + NATIVE = "native" @dataclass @@ -35,10 +39,10 @@ class ProviderConfig: model_name: str base_url: Optional[str] = None api_key: Optional[str] = None - force_litellm: bool = False + relaxed_schema: bool = False -# OpenAI-compatible providers that litellm doesn't natively support. +# OpenAI-compatible providers that need custom base_url. # Maps provider prefix → (api_base_url, api_key_env_var) OPENAI_COMPATIBLE_PROVIDERS: dict[str, tuple[str, str]] = {} @@ -46,18 +50,18 @@ class ProviderConfig: # ============ Provider Detection ============ -def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: +def detect_provider(model: str, relaxed_schema: bool) -> ProviderConfig: """Detect provider from model string. Model format: - - "gpt-4" → OpenAI (via LiteLLM) - - "provider/model" → LiteLLM (handles zhipu, anthropic, etc. natively) + - "gpt-4" → OpenAI provider + - "provider/model" → Native SDK (handles anthropic, gemini, etc.) 
- "custom_anthropic/model" → OpenAI-compatible with CUSTOM_ANTHROPIC_* env vars - "custom_openai/model" → OpenAI-compatible with CUSTOM_OPENAI_* env vars Args: model: Model identifier string - force_litellm: Force using LiteLLM backend + relaxed_schema: Use relaxed (non-strict) tool schema mode Returns: ProviderConfig with detected provider and model name @@ -78,21 +82,21 @@ def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: base_url = os.environ.get(config.api_base_env, "") api_key = os.environ.get(config.api_key_env, "") - # Determine the litellm model format based on endpoint type - # LiteLLM needs a provider prefix to route correctly. + # Determine the resolved model format based on endpoint type. + # A provider prefix is needed to route correctly. # Explicitly passed api_key in call_llm_provider overrides env vars. if "anthropic" in provider_lower: - litellm_model = f"anthropic/{model_name}" + resolved_model = f"anthropic/{model_name}" else: - litellm_model = f"openai/{model_name}" + resolved_model = f"openai/{model_name}" - logger.debug(f"Using custom endpoint '{provider_lower}' with base_url={base_url}, litellm_model={litellm_model}") + logger.debug(f"Using custom endpoint '{provider_lower}' with base_url={base_url}, resolved_model={resolved_model}") return ProviderConfig( provider_type=ProviderType.OPENAI, - model_name=litellm_model, + model_name=resolved_model, base_url=base_url or None, api_key=api_key or None, - force_litellm=force_litellm, + relaxed_schema=relaxed_schema, ) if "/" in model: @@ -109,35 +113,39 @@ def detect_provider(model: str, force_litellm: bool) -> ProviderConfig: elif provider_lower == "openai": provider_type = ProviderType.OPENAI else: - # All other prefixed models go through LiteLLM (zhipu, anthropic, etc.) - provider_type = ProviderType.LITELLM - model_name = model # Keep full model string for LiteLLM + # All other prefixed models use native SDK adapters (anthropic, gemini, etc.) 
+ provider_type = ProviderType.NATIVE + model_name = model # Keep full model string for native adapter else: provider_type = ProviderType.OPENAI model_name = model - # Override with LiteLLM if forced - if force_litellm and provider_type != ProviderType.LITELLM: - provider_type = ProviderType.LITELLM + # Override with NATIVE if relaxed_schema is forced + if relaxed_schema and provider_type != ProviderType.NATIVE: + provider_type = ProviderType.NATIVE return ProviderConfig( provider_type=provider_type, model_name=model_name, base_url=base_url, api_key=api_key or None, - force_litellm=force_litellm, + relaxed_schema=relaxed_schema, ) def is_responses_api_model(config: ProviderConfig) -> bool: """Check if model should use the OpenAI Responses API instead of Chat Completions. - Currently triggers for OpenAI models with "codex" in the name (e.g. codex-mini-latest). + Triggers for: + - Models with "codex" in the name (e.g. codex-mini-latest) + - Pro models (gpt-5.x-pro, gpt-5.2-pro) which are Responses-only """ - return ( - config.provider_type == ProviderType.OPENAI - and "codex" in config.model_name.lower() - ) + name_lower = config.model_name.lower() + if config.provider_type != ProviderType.OPENAI: + return False + # Strip "openai/" prefix for matching + bare = name_lower.split("/")[-1] if "/" in name_lower else name_lower + return "codex" in bare or bare.endswith("-pro") def get_base_url(provider: ProviderType) -> Optional[str]: @@ -258,26 +266,26 @@ def _clean_message_fields(message: dict) -> None: message["tool_calls"] = None -def get_litellm_proxy_kwargs() -> dict: - """Get LiteLLM proxy kwargs for API calls. - - When LITELLM_PROXY_ENABLED=true, returns {"api_base": ..., "api_key": ...} - to route calls through the LiteLLM Proxy. Otherwise returns empty dict. +def get_proxy_kwargs() -> dict: + """Get proxy kwargs for API calls. 
- Usage: - proxy_kwargs = get_litellm_proxy_kwargs() - response = await litellm.aimage_generation(model=model, ..., **proxy_kwargs) - response = await litellm.acompletion(model=model, ..., **proxy_kwargs) + When LLM_PROXY_ENABLED=true (or LITELLM_PROXY_ENABLED for backward compat), + returns {"base_url": ..., "api_key": ...} to route calls through a proxy. + Otherwise returns empty dict. """ import os - proxy_enabled = os.environ.get("LITELLM_PROXY_ENABLED", "").lower() == "true" - proxy_url = os.environ.get("LITELLM_PROXY_URL") - proxy_key = os.environ.get("LITELLM_PROXY_KEY") + # Check new env vars first, fall back to legacy LITELLM_ prefix + proxy_enabled = ( + os.environ.get("LLM_PROXY_ENABLED", "").lower() == "true" + or os.environ.get("LITELLM_PROXY_ENABLED", "").lower() == "true" + ) + proxy_url = os.environ.get("LLM_PROXY_URL") or os.environ.get("LITELLM_PROXY_URL") + proxy_key = os.environ.get("LLM_PROXY_KEY") or os.environ.get("LITELLM_PROXY_KEY") if proxy_enabled and proxy_url and proxy_key: - logger.info(f"[LITELLM_PROXY] Routing through proxy | URL={proxy_url}") - return {"api_base": proxy_url, "api_key": proxy_key} + logger.info(f"[LLM_PROXY] Routing through proxy | URL={proxy_url}") + return {"base_url": proxy_url, "api_key": proxy_key} return {} @@ -286,7 +294,7 @@ def _extract_cost_and_usage(complete_resp: Any) -> tuple[float, dict]: """Calculate cost and extract usage from response. Cost and usage are extracted independently - cost calculation failures - (e.g., for new models not yet in litellm's price map) should not prevent + (e.g., for new models not yet in the price catalog) should not prevent usage data from being captured. 
""" cost = 0.0 @@ -305,16 +313,15 @@ def _extract_cost_and_usage(complete_resp: Any) -> tuple[float, dict]: except Exception: pass - # Try to calculate cost (may fail for new/unmapped models) + # Calculate cost from catalog pricing try: - from litellm import completion_cost + from pantheon.utils.provider_registry import completion_cost cost = completion_cost(completion_response=complete_resp) or 0.0 except Exception as e: - # DEBUG level: this is expected for new models not yet in litellm's price map logger.debug(f"Cost calculation unavailable: {e}") - # Fallback: estimate cost from usage if litellm failed but we have token counts + # Fallback: estimate cost from usage if catalog lookup failed but we have token counts if cost == 0.0 and usage_dict: input_tokens = usage_dict.get("prompt_tokens", 0) output_tokens = usage_dict.get("completion_tokens", 0) @@ -450,9 +457,8 @@ async def call_llm_provider( Returns: Extracted and cleaned message dictionary """ - # Import here to avoid circular imports from .llm import ( - acompletion_litellm, + acompletion, remove_metadata, ) @@ -489,7 +495,21 @@ async def call_llm_provider( clean_messages = remove_metadata(clean_messages) # Call appropriate provider - # Route codex models through the OpenAI Responses API + # Route Codex OAuth models through their dedicated adapter + if "codex/" in config.model_name.lower() or config.model_name.startswith("codex/"): + from .llm import acompletion + logger.debug(f"[CALL_LLM_PROVIDER] Using Codex OAuth for model={config.model_name}") + # acompletion handles codex specially — returns message dict directly + return await acompletion( + messages=clean_messages, + model=config.model_name, + tools=tools, + response_format=response_format, + process_chunk=process_chunk, + model_params=model_params, + ) + + # Route codex/pro models through the OpenAI Responses API if is_responses_api_model(config): from .llm import acompletion_responses @@ -512,7 +532,7 @@ async def call_llm_provider( ) if 
config.provider_type == ProviderType.OPENAI: - # LiteLLM requires explicit provider prefixes for models it cannot auto-detect. + # Provider adapters require explicit provider prefixes for models they cannot auto-detect. # Ensure OpenAI models include the provider namespace to avoid BadRequestError. model_name = config.model_name @@ -522,7 +542,7 @@ async def call_llm_provider( logger.debug( f"[CALL_LLM_PROVIDER] Using OpenAI provider with model={model_name}, base_url={config.base_url}" ) - complete_resp = await acompletion_litellm( + complete_resp = await acompletion( messages=clean_messages, model=model_name, tools=tools, @@ -534,11 +554,11 @@ async def call_llm_provider( ) error_prefix = "OpenAI" - else: # LITELLM + else: # NATIVE logger.debug( - f"[CALL_LLM_PROVIDER] Using LiteLLM provider with model={config.model_name}" + f"[CALL_LLM_PROVIDER] Using native provider with model={config.model_name}" ) - complete_resp = await acompletion_litellm( + complete_resp = await acompletion( messages=clean_messages, model=config.model_name, tools=tools, @@ -548,7 +568,7 @@ async def call_llm_provider( api_key=config.api_key, model_params=model_params, ) - error_prefix = "LiteLLM" + error_prefix = "Native" # Extract and clean message return extract_message_from_response(complete_resp, error_prefix) diff --git a/pantheon/utils/log.py b/pantheon/utils/log.py index 4737c054..24313d19 100644 --- a/pantheon/utils/log.py +++ b/pantheon/utils/log.py @@ -158,7 +158,7 @@ def setup_file_logging( # Warning Suppression # ============================================================================= -# Suppress aiohttp "Unclosed client session" warnings from litellm. +# Suppress aiohttp "Unclosed client session" warnings. # These warnings are harmless - the OS cleans up connections on process exit. 
warnings.filterwarnings("ignore", message="Unclosed client session", category=ResourceWarning) warnings.filterwarnings("ignore", message="Unclosed connector", category=ResourceWarning) diff --git a/pantheon/utils/misc.py b/pantheon/utils/misc.py index a464693a..0fe99a3f 100644 --- a/pantheon/utils/misc.py +++ b/pantheon/utils/misc.py @@ -150,8 +150,10 @@ def _strip_docstring_args(docstring: str | None) -> str: def desc_to_openai_dict( desc: Description, skip_params: List[str] = [], - litellm_mode: bool = False, + relaxed_schema: bool = False, ) -> dict: + _relaxed = relaxed_schema + # Filter inputs without modifying original desc.inputs filtered_inputs = [arg for arg in desc.inputs if arg.name not in skip_params] @@ -197,7 +199,7 @@ def desc_to_openai_dict( parameters[arg.name] = pdict - if litellm_mode: + if _relaxed: if arg.default is NotDef: required.append(arg.name) else: @@ -208,10 +210,10 @@ def desc_to_openai_dict( "function": { "name": desc.name, "description": tool_description, - "strict": not litellm_mode, + "strict": not _relaxed, }, } - if (not litellm_mode) or (len(parameters) > 0): + if (not _relaxed) or (len(parameters) > 0): func_dict["function"]["parameters"] = { "type": "object", "properties": parameters, diff --git a/pantheon/utils/model_selector.py b/pantheon/utils/model_selector.py index 817ddc7d..ab0d50bb 100644 --- a/pantheon/utils/model_selector.py +++ b/pantheon/utils/model_selector.py @@ -56,11 +56,46 @@ class CustomEndpointConfig: # Sentinel object for negative cache (better than empty string) _NOT_FOUND = object() +# ============ Local Provider Detection ============ + +_ollama_cache: dict | None = None +_ollama_cache_time: float = 0 + + +def _detect_ollama(base_url: str = "http://localhost:11434") -> bool: + """Check if Ollama is running locally.""" + try: + import httpx + resp = httpx.get(f"{base_url}/api/tags", timeout=2) + return resp.is_success + except Exception: + return False + + +def _list_ollama_models(base_url: str = 
"http://localhost:11434") -> list[str]: + """List available models from local Ollama instance (cached 30s).""" + import time + global _ollama_cache, _ollama_cache_time + if _ollama_cache is not None and time.time() - _ollama_cache_time < 30: + return _ollama_cache + + try: + import httpx + resp = httpx.get(f"{base_url}/api/tags", timeout=5) + if resp.is_success: + models = [m["name"] for m in resp.json().get("models", [])] + _ollama_cache = models + _ollama_cache_time = time.time() + return models + except Exception: + pass + return [] + # ============ Default Configuration ============ # Built-in defaults based on February 2026 flagship models # Users can override in settings.json -DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot"] +DEFAULT_PROVIDER_PRIORITY = ["openai", "anthropic", "gemini", "zai", "deepseek", "minimax", "moonshot", "qwen", "groq", "mistral", "together_ai", "openrouter", "codex", "ollama"] # Quality levels map to MODEL LISTS (not single models) for fallback chains # Models within each level are ordered by preference @@ -137,9 +172,50 @@ class CustomEndpointConfig: "normal": ["moonshot/kimi-k2.5", "moonshot/kimi-k2-0905-preview"], "low": ["moonshot/kimi-k2.5", "moonshot/kimi-k2-0905-preview"], }, + # Qwen (DashScope): Qwen3/QwQ series + # https://help.aliyun.com/zh/model-studio/ + "qwen": { + "high": ["qwen/qwen3-235b-a22b", "qwen/qwen-max", "qwen/qwq-plus"], + "normal": ["qwen/qwen3-32b", "qwen/qwen-plus"], + "low": ["qwen/qwen3-30b-a3b", "qwen/qwen-turbo"], + }, + # Groq: Ultra-fast inference + # https://console.groq.com/docs/models + "groq": { + "high": ["groq/openai/gpt-oss-120b", "groq/llama-3.3-70b-versatile"], + "normal": ["groq/openai/gpt-oss-20b", "groq/qwen/qwen3-32b", "groq/meta-llama/llama-4-scout-17b-16e-instruct"], + "low": ["groq/llama-3.1-8b-instant"], + }, + # Mistral AI + # https://docs.mistral.ai/getting-started/models + "mistral": { + "high": ["mistral/mistral-large-latest", 
"mistral/mistral-medium-latest"], + "normal": ["mistral/mistral-small-latest", "mistral/codestral-latest"], + "low": ["mistral/open-mistral-nemo"], + }, + # Together AI: Open-source model hosting + # https://docs.together.ai/docs/serverless-models + "together_ai": { + "high": ["together_ai/Qwen/Qwen3.5-397B-A17B", "together_ai/deepseek-ai/DeepSeek-V3.1"], + "normal": ["together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"], + "low": ["together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo"], + }, + # Codex: OpenAI via ChatGPT OAuth (free with ChatGPT Plus) + "codex": { + "high": ["codex/gpt-5.4", "codex/gpt-5.2-codex"], + "normal": ["codex/gpt-5.4-mini", "codex/gpt-5"], + "low": ["codex/gpt-5.4-mini", "codex/o4-mini"], + }, + # OpenRouter: Multi-provider aggregator + # https://openrouter.ai/models + "openrouter": { + "high": ["openrouter/anthropic/claude-sonnet-4-6"], + "normal": ["openrouter/google/gemini-2.5-flash", "openrouter/deepseek/deepseek-chat"], + "low": ["openrouter/meta-llama/llama-3.3-70b-instruct"], + }, } -# Capability tags map to litellm's supports_* fields +# Capability tags map to catalog supports_* fields CAPABILITY_MAP = { "vision": "supports_vision", "reasoning": "supports_reasoning", @@ -180,6 +256,9 @@ class CustomEndpointConfig: "minimax": "MINIMAX_API_KEY", "zai": "ZAI_API_KEY", "moonshot": "MOONSHOT_API_KEY", + "qwen": "DASHSCOPE_API_KEY", + "codex": "", # OAuth-based, no env var key + "ollama": "", # Local, no env var key — detected by _detect_ollama() } # ============ Image Generation Model Defaults ============ @@ -235,6 +314,22 @@ def _get_available_providers(self) -> set[str]: if os.environ.get(config.api_key_env, ""): self._available_providers.add(provider_key) + # Check OAuth providers (e.g., Codex) + try: + from pantheon.utils.oauth import CodexOAuthManager + if CodexOAuthManager().is_authenticated(): + self._available_providers.add("codex") + except Exception: + pass + + # Check local Ollama + try: + from pantheon.utils.model_selector 
import _detect_ollama + if _detect_ollama(): + self._available_providers.add("ollama") + except Exception: + pass + # Universal proxy: LLM_API_KEY makes openai provider available # (most third-party proxies are OpenAI-compatible) # Note: LLM_API_BASE is deprecated, warn user to use custom endpoints instead @@ -315,11 +410,19 @@ def _get_provider_models(self, provider: str) -> dict[str, list[str]]: Returns: Dict mapping quality levels to model lists """ - # Custom endpoints don't have predefined model lists in litellm + # Custom endpoints don't have predefined model lists # They use environment-specified models instead if provider in CUSTOM_ENDPOINT_ENVS: return {} + # Ollama: dynamically list local models + if provider == "ollama": + models = _list_ollama_models() + if models: + prefixed = [f"ollama/{m}" for m in models] + return {"high": prefixed, "normal": prefixed, "low": prefixed} + return {} + # Try user configuration first user_config = self.settings.get(f"models.provider_models.{provider}", {}) @@ -331,11 +434,11 @@ def _get_provider_models(self, provider: str) -> dict[str, list[str]]: merged = {**default_config, **user_config} return merged - # No configuration - auto-generate from litellm + # No configuration - auto-generate from catalog return self._auto_generate_provider_config(provider) def _auto_generate_provider_config(self, provider: str) -> dict[str, list[str]]: - """Auto-generate provider config from litellm (sorted by price). + """Auto-generate provider config from catalog (sorted by price). Used when provider has API key but no configuration. 
@@ -345,28 +448,25 @@ def _auto_generate_provider_config(self, provider: str) -> dict[str, list[str]]: Returns: Dict mapping quality levels to model lists """ - try: - from litellm import models_by_provider - from litellm.utils import get_model_info - except ImportError: - logger.warning("litellm not available for auto-generation") - return {} + from pantheon.utils.provider_registry import models_by_provider as get_models, get_model_info logger.warning( - f"Provider '{provider}' not configured. Auto-generating from litellm. " + f"Provider '{provider}' not configured. Auto-generating from catalog. " f"Consider adding it to settings.json models.provider_models for better control." ) - if provider not in models_by_provider: - logger.warning(f"Provider '{provider}' not found in litellm") + all_models = get_models(provider) + if not all_models: + logger.warning(f"Provider '{provider}' not found in catalog") return {} # Collect chat models with prices models_with_prices: list[tuple[str, float]] = [] - for model in models_by_provider[provider]: + for model in all_models: try: info = get_model_info(model) - if info.get("mode") == "chat": + mode = info.get("mode", "chat") + if mode in ("chat", None): input_cost = info.get("input_cost_per_token", 0) or 0 models_with_prices.append((model, input_cost)) except Exception: @@ -411,13 +511,12 @@ def _check_model_capability(self, model: str, capability: str) -> bool: return False try: - from litellm.utils import get_model_info + from pantheon.utils.provider_registry import get_model_info info = get_model_info(model) - litellm_field = CAPABILITY_MAP[capability] - return bool(info.get(litellm_field)) + field = CAPABILITY_MAP[capability] + return bool(info.get(field)) except Exception: - # If we can't check, assume it doesn't support return False def resolve_model(self, tag: str) -> list[str]: diff --git a/pantheon/utils/oauth/__init__.py b/pantheon/utils/oauth/__init__.py new file mode 100644 index 00000000..31afe3b8 --- /dev/null 
+++ b/pantheon/utils/oauth/__init__.py @@ -0,0 +1,10 @@ +""" +OAuth support for LLM providers. + +Currently supports: +- Codex (OpenAI ChatGPT backend-api) via browser-based OAuth 2.0 + PKCE +""" + +from .codex import CodexOAuthManager, CodexOAuthError + +__all__ = ["CodexOAuthManager", "CodexOAuthError"] diff --git a/pantheon/utils/oauth/codex.py b/pantheon/utils/oauth/codex.py new file mode 100644 index 00000000..07d95ead --- /dev/null +++ b/pantheon/utils/oauth/codex.py @@ -0,0 +1,421 @@ +""" +OpenAI Codex OAuth — browser-based login to ChatGPT backend-api. + +Implements OAuth 2.0 Authorization Code flow with PKCE. +Tokens are stored in ~/.pantheon/oauth/codex.json. +Supports importing tokens from Codex CLI (~/.codex/auth.json). +""" + +from __future__ import annotations + +import base64 +import hashlib +import json +import os +import secrets +import threading +import time +import webbrowser +from datetime import datetime, timezone +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from typing import Any, Callable, Optional +from urllib.parse import parse_qs, urlencode, urlparse + +import httpx + +from ..log import logger + +# ============ Constants ============ + +AUTH_ISSUER = "https://auth.openai.com" +CLIENT_ID = "app_EMoamEEZ73f0CkXaXp7hrann" +ORIGINATOR = "pi" +CALLBACK_PORT = 1455 +SCOPE = "openid profile email offline_access" +CODEX_BASE_URL = "https://chatgpt.com/backend-api" + +# Auth storage +AUTH_DIR = Path.home() / ".pantheon" / "oauth" +AUTH_FILE = AUTH_DIR / "codex.json" +CODEX_CLI_AUTH = Path.home() / ".codex" / "auth.json" + + +class CodexOAuthError(RuntimeError): + """Raised when Codex OAuth login or refresh fails.""" + + +# ============ Utility Functions ============ + + +def _utc_now() -> str: + return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z") + + +def _b64url(data: bytes) -> str: + return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=") + + +def _pkce_pair() -> 
tuple[str, str]: + """Generate PKCE verifier and challenge pair.""" + verifier = _b64url(secrets.token_bytes(32)) + challenge = _b64url(hashlib.sha256(verifier.encode("utf-8")).digest()) + return verifier, challenge + + +def _decode_jwt_payload(token: str) -> dict[str, Any]: + """Decode JWT payload without verification (for reading claims).""" + parts = (token or "").split(".") + if len(parts) != 3 or not parts[1]: + return {} + payload = parts[1] + payload += "=" * (-len(payload) % 4) + try: + decoded = base64.urlsafe_b64decode(payload.encode("ascii")) + data = json.loads(decoded.decode("utf-8")) + except Exception: + return {} + return data if isinstance(data, dict) else {} + + +def _jwt_org_context(token: str) -> dict[str, str]: + """Extract org/account/project from JWT claims.""" + payload = _decode_jwt_payload(token) + nested = payload.get("https://api.openai.com/auth") + claims = nested if isinstance(nested, dict) else {} + context = {} + for key in ("organization_id", "project_id", "chatgpt_account_id"): + value = str(claims.get(key) or "").strip() + if value: + context[key] = value + return context + + +def _token_expired(token: str, skew_seconds: int = 300) -> bool: + """Check if JWT access_token is expired (with skew).""" + payload = _decode_jwt_payload(token) + exp = payload.get("exp") + if not isinstance(exp, (int, float)): + return True + return time.time() >= (float(exp) - skew_seconds) + + +# ============ Token Exchange ============ + + +def _exchange_code(code: str, redirect_uri: str, code_verifier: str) -> dict[str, str]: + """Exchange authorization code for tokens.""" + resp = httpx.post( + f"{AUTH_ISSUER}/oauth/token", + data={ + "grant_type": "authorization_code", + "code": code, + "redirect_uri": redirect_uri, + "client_id": CLIENT_ID, + "code_verifier": code_verifier, + }, + timeout=30, + ) + if not resp.is_success: + raise CodexOAuthError(f"Token exchange failed: HTTP {resp.status_code} {resp.text[:300]}") + data = resp.json() + if not 
all(data.get(k) for k in ("id_token", "access_token", "refresh_token")): + raise CodexOAuthError("Token exchange returned incomplete credentials") + return { + "id_token": str(data["id_token"]), + "access_token": str(data["access_token"]), + "refresh_token": str(data["refresh_token"]), + } + + +def _refresh_tokens(refresh_token: str) -> dict[str, str]: + """Refresh access token using refresh token.""" + resp = httpx.post( + f"{AUTH_ISSUER}/oauth/token", + data={ + "client_id": CLIENT_ID, + "grant_type": "refresh_token", + "refresh_token": refresh_token, + }, + timeout=30, + ) + if not resp.is_success: + raise CodexOAuthError(f"Token refresh failed: HTTP {resp.status_code} {resp.text[:300]}") + data = resp.json() + access_token = str(data.get("access_token") or "").strip() + id_token = str(data.get("id_token") or "").strip() + next_refresh = str(data.get("refresh_token") or refresh_token).strip() + if not access_token or not id_token: + raise CodexOAuthError("Token refresh returned incomplete credentials") + return { + "id_token": id_token, + "access_token": access_token, + "refresh_token": next_refresh, + } + + +# ============ Callback Server ============ + + +class _CallbackHandler(BaseHTTPRequestHandler): + server_version = "PantheonOAuth/1.0" + + def do_GET(self): + parsed = urlparse(self.path) + if parsed.path != "/auth/callback": + self.send_error(404) + return + params = {k: v[-1] for k, v in parse_qs(parsed.query).items() if v} + self.server.result = params + self.server.event.set() + body = ( + "

OAuth complete

" + "

You can close this window and return to Pantheon.

" + ).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def log_message(self, fmt, *args): + return # Suppress HTTP server logs + + +# ============ OAuth Manager ============ + + +class CodexOAuthManager: + """Manage Codex OAuth tokens — login, refresh, import, and storage.""" + + def __init__(self, auth_file: Path | None = None): + self.auth_file = auth_file or AUTH_FILE + + # ---- Storage ---- + + def _load(self) -> dict[str, Any]: + if self.auth_file.exists(): + try: + return json.loads(self.auth_file.read_text()) + except Exception: + pass + return {} + + def _save(self, auth: dict[str, Any]) -> dict[str, Any]: + self.auth_file.parent.mkdir(parents=True, exist_ok=True) + self.auth_file.write_text(json.dumps(auth, indent=2)) + os.chmod(self.auth_file, 0o600) + return auth + + # ---- Token Access ---- + + def get_tokens(self) -> dict[str, str]: + """Get stored tokens dict.""" + return self._load().get("tokens", {}) + + def get_access_token(self, auto_refresh: bool = True) -> str | None: + """Get a valid access token, refreshing if needed.""" + tokens = self.get_tokens() + access_token = tokens.get("access_token", "") + refresh_token = tokens.get("refresh_token", "") + + if not access_token: + return None + + if auto_refresh and _token_expired(access_token) and refresh_token: + logger.info("[Codex OAuth] Access token expired, refreshing...") + try: + self.refresh() + tokens = self.get_tokens() + access_token = tokens.get("access_token", "") + except Exception as e: + logger.warning(f"[Codex OAuth] Refresh failed: {e}") + return None + + return access_token if access_token and not _token_expired(access_token) else None + + def get_account_id(self) -> str | None: + """Get ChatGPT account_id for API calls.""" + return self.get_tokens().get("account_id") or None + + def is_authenticated(self) -> bool: + """Check 
if we have a valid (or refreshable) token.""" + tokens = self.get_tokens() + access_token = tokens.get("access_token", "") + refresh_token = tokens.get("refresh_token", "") + if access_token and not _token_expired(access_token): + return True + return bool(refresh_token) + + # ---- Login Flow ---- + + def login( + self, + *, + open_browser: bool = True, + timeout_seconds: int = 300, + ) -> dict[str, Any]: + """Start browser-based OAuth login flow. + + Opens browser to OpenAI auth page. User logs in, callback + redirects to local server. Returns auth record with tokens. + """ + verifier, challenge = _pkce_pair() + state = _b64url(secrets.token_bytes(24)) + + event = threading.Event() + server = self._create_server(event) + _, port = server.server_address + redirect_uri = f"http://localhost:{port}/auth/callback" + + auth_url = ( + f"{AUTH_ISSUER}/oauth/authorize?" + + urlencode({ + "response_type": "code", + "client_id": CLIENT_ID, + "redirect_uri": redirect_uri, + "scope": SCOPE, + "code_challenge": challenge, + "code_challenge_method": "S256", + "id_token_add_organizations": "true", + "codex_cli_simplified_flow": "true", + "state": state, + "originator": ORIGINATOR, + }) + ) + + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + + try: + logger.info(f"[Codex OAuth] Opening browser for login...") + logger.info(f"[Codex OAuth] Auth URL: {auth_url}") + if open_browser: + webbrowser.open(auth_url) + + if not event.wait(timeout_seconds): + raise CodexOAuthError("Timed out waiting for OAuth callback") + + params = getattr(server, "result", {}) or {} + finally: + server.shutdown() + server.server_close() + thread.join(timeout=2) + + # Validate callback + if params.get("state") != state: + raise CodexOAuthError("OAuth callback state mismatch") + if params.get("error"): + raise CodexOAuthError(f"OAuth failed: {params.get('error_description', params['error'])}") + + code = str(params.get("code", "")).strip() + if not code: + raise 
CodexOAuthError("OAuth callback missing authorization code") + + # Exchange code for tokens + tokens = _exchange_code(code, redirect_uri, verifier) + claims = _jwt_org_context(tokens["id_token"]) + + auth = { + "provider": "codex", + "tokens": { + **tokens, + "account_id": claims.get("chatgpt_account_id"), + "organization_id": claims.get("organization_id"), + "project_id": claims.get("project_id"), + }, + "last_refresh": _utc_now(), + } + + logger.info("[Codex OAuth] Login successful") + return self._save(auth) + + # ---- Refresh ---- + + def refresh(self) -> dict[str, Any]: + """Refresh the access token using the stored refresh token.""" + auth = self._load() + tokens = auth.get("tokens", {}) + refresh_token = tokens.get("refresh_token", "") + if not refresh_token: + raise CodexOAuthError("No refresh token available") + + refreshed = _refresh_tokens(refresh_token) + claims = _jwt_org_context(refreshed["id_token"]) + + auth["tokens"] = { + **refreshed, + "account_id": claims.get("chatgpt_account_id"), + "organization_id": claims.get("organization_id"), + "project_id": claims.get("project_id"), + } + auth["last_refresh"] = _utc_now() + + logger.info("[Codex OAuth] Token refreshed successfully") + return self._save(auth) + + # ---- Import from Codex CLI ---- + + def import_from_codex_cli(self) -> dict[str, Any] | None: + """Import tokens from Codex CLI auth file (~/.codex/auth.json).""" + if not CODEX_CLI_AUTH.exists(): + logger.info(f"[Codex OAuth] Codex CLI auth not found at {CODEX_CLI_AUTH}") + return None + + try: + codex_data = json.loads(CODEX_CLI_AUTH.read_text()) + except Exception as e: + logger.warning(f"[Codex OAuth] Failed to read Codex CLI auth: {e}") + return None + + tokens = codex_data.get("tokens", {}) + access_token = tokens.get("access_token", "") + refresh_token = tokens.get("refresh_token", "") + + if not access_token and not refresh_token: + logger.info("[Codex OAuth] Codex CLI auth has no tokens") + return None + + # Don't refresh here — OpenAI 
refresh_tokens are single-use. + # If Codex CLI already used it, refreshing would fail with "refresh_token_reused". + # Just import as-is; get_access_token() will refresh lazily when needed. + if not access_token and refresh_token: + # No access_token at all — must refresh to get one + try: + logger.info("[Codex OAuth] No access_token, attempting refresh...") + refreshed = _refresh_tokens(refresh_token) + tokens = refreshed + except CodexOAuthError as e: + logger.warning(f"[Codex OAuth] Refresh failed (token may be reused): {e}") + # Still import what we have — the token may work or login will be needed + + claims = _jwt_org_context(tokens.get("id_token", "") or tokens.get("access_token", "")) + + auth = { + "provider": "codex", + "tokens": { + **tokens, + "account_id": claims.get("chatgpt_account_id"), + "organization_id": claims.get("organization_id"), + "project_id": claims.get("project_id"), + }, + "last_refresh": _utc_now(), + "source": str(CODEX_CLI_AUTH), + } + + logger.info("[Codex OAuth] Imported tokens from Codex CLI") + return self._save(auth) + + # ---- Internal ---- + + @staticmethod + def _create_server(event: threading.Event) -> ThreadingHTTPServer: + for port in (CALLBACK_PORT, 0): + try: + server = ThreadingHTTPServer(("127.0.0.1", port), _CallbackHandler) + server.event = event + server.result = {} + return server + except OSError: + continue + raise CodexOAuthError("Could not start local OAuth callback server") diff --git a/pantheon/utils/provider_registry.py b/pantheon/utils/provider_registry.py new file mode 100644 index 00000000..5d92596b --- /dev/null +++ b/pantheon/utils/provider_registry.py @@ -0,0 +1,268 @@ +""" +Provider registry — loads the catalog and exposes model metadata helpers. + +Provides get_model_info, completion_cost, token_counter, +and models_by_provider from the local LLM catalog. 
+""" + +import json +from functools import lru_cache +from pathlib import Path +from typing import Any + +from .log import logger + +# ============ Catalog Loading ============ + +_CATALOG_PATH = Path(__file__).parent / "llm_catalog.json" + +# Default metadata for unknown models +_DEFAULT_MODEL_INFO = { + "max_input_tokens": 200_000, + "max_output_tokens": 32_000, + "input_cost_per_million": 1.0, + "output_cost_per_million": 5.0, + "supports_vision": False, + "supports_function_calling": True, + "supports_response_schema": False, + "supports_reasoning": False, + "supports_audio_input": False, + "supports_audio_output": False, + "supports_web_search": False, + "supports_pdf_input": False, + "supports_computer_use": False, + "supports_assistant_prefill": False, +} + + +@lru_cache(maxsize=1) +def load_catalog() -> dict: + """Load and cache the provider catalog from llm_catalog.json.""" + try: + with open(_CATALOG_PATH, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.warning(f"Failed to load LLM catalog: {e}") + return {"version": 1, "providers": {}} + + +def reload_catalog() -> dict: + """Force-reload the catalog (clears cache). For testing.""" + load_catalog.cache_clear() + return load_catalog() + + +# ============ Provider Resolution ============ + + +def _parse_model_string(model: str) -> tuple[str | None, str]: + """Parse 'provider/model_name' into (provider, model_name). + + Returns (None, model) if no provider prefix. + """ + if "/" in model: + provider, model_name = model.split("/", 1) + return provider.lower(), model_name + return None, model + + +def find_provider_for_model(model: str) -> tuple[str, str, dict]: + """Given a model string, return (provider_key, model_name, provider_config). + + Tries: + 1. Explicit prefix: 'anthropic/claude-sonnet-4-6' → provider='anthropic' + 2. Search all providers for a matching model name + + Returns ('unknown', model, {}) if not found. 
+ """ + catalog = load_catalog() + providers = catalog.get("providers", {}) + + # 1. Explicit prefix + prefix, model_name = _parse_model_string(model) + if prefix and prefix in providers: + return prefix, model_name, providers[prefix] + + # 2. Search all providers for bare model name + for pkey, pconfig in providers.items(): + if model_name in pconfig.get("models", {}): + return pkey, model_name, pconfig + + # 3. Not found — return with empty config + return prefix or "unknown", model_name, {} + + +def get_provider_config(provider: str) -> dict: + """Get provider configuration from catalog.""" + catalog = load_catalog() + return catalog.get("providers", {}).get(provider, {}) + + +# ============ Model Metadata ============ + + +def get_model_info(model: str) -> dict: + """Get model metadata from the catalog. + + Retrieves model metadata from the local catalog. + + Args: + model: Model string, e.g. 'anthropic/claude-sonnet-4-6' or 'gpt-5.4' + + Returns: + Dict with max_input_tokens, max_output_tokens, pricing, supports_*, etc. + Returns defaults for unknown models. 
+ """ + provider_key, model_name, provider_config = find_provider_for_model(model) + models = provider_config.get("models", {}) + + if model_name in models: + info = {**_DEFAULT_MODEL_INFO, **models[model_name]} + # Ensure per-token fields exist for backward compat + if "input_cost_per_token" not in info: + info["input_cost_per_token"] = info.get("input_cost_per_million", 1.0) / 1_000_000 + if "output_cost_per_token" not in info: + info["output_cost_per_token"] = info.get("output_cost_per_million", 5.0) / 1_000_000 + return info + + logger.debug(f"Model '{model}' not found in catalog, using defaults") + info = dict(_DEFAULT_MODEL_INFO) + info["input_cost_per_token"] = info["input_cost_per_million"] / 1_000_000 + info["output_cost_per_token"] = info["output_cost_per_million"] / 1_000_000 + return info + + +# ============ Cost Calculation ============ + + +def completion_cost( + completion_response: Any = None, + model: str | None = None, + prompt_tokens: int = 0, + completion_tokens: int = 0, +) -> float: + """Calculate completion cost from response or explicit token counts. + + Calculates completion cost from the local catalog pricing. 
+ """ + # Extract from response object if provided + if completion_response is not None: + usage = getattr(completion_response, "usage", None) + if usage: + prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0 + completion_tokens = getattr(usage, "completion_tokens", 0) or 0 + + # Try to get model from response + if model is None: + model = getattr(completion_response, "model", None) or "" + + if not model: + # Fallback pricing: $1/1M input, $5/1M output + return (prompt_tokens * 1.0 + completion_tokens * 5.0) / 1_000_000 + + info = get_model_info(model) + input_cost = info.get("input_cost_per_token", 1.0 / 1_000_000) + output_cost = info.get("output_cost_per_token", 5.0 / 1_000_000) + + return prompt_tokens * input_cost + completion_tokens * output_cost + + +# ============ Model Listing ============ + + +def models_by_provider(provider: str) -> list[str]: + """List all model names for a provider. + + Lists all model names for a given provider from the catalog. + """ + catalog = load_catalog() + provider_config = catalog.get("providers", {}).get(provider, {}) + models = provider_config.get("models", {}) + + # Return as 'provider/model_name' format + return [f"{provider}/{name}" for name in models] + + +# ============ Token Counting ============ + + +def token_counter( + model: str, + messages: list[dict] | None = None, + tools: list[dict] | None = None, +) -> int: + """Count tokens for messages and tools. + + Uses tiktoken when available, falls back to heuristic estimation. 
+ """ + total = 0 + + # Try tiktoken first (works for OpenAI models) + try: + import tiktoken + + # Map model to encoding + try: + encoding = tiktoken.encoding_for_model(model.split("/")[-1]) + except KeyError: + encoding = tiktoken.get_encoding("cl100k_base") + + for msg in messages or []: + # Per-message overhead + total += 4 # role + content framing + content = msg.get("content", "") + if isinstance(content, str): + total += len(encoding.encode(content)) + elif isinstance(content, list): + for part in content: + if isinstance(part, dict): + text = part.get("text", "") + if text: + total += len(encoding.encode(text)) + # Image tokens: rough estimate + if part.get("type") == "image_url": + total += 765 # ~average image token cost + + if tools: + total += len(encoding.encode(json.dumps(tools))) + + return total + + except (ImportError, Exception): + pass + + # Fallback: heuristic estimation + for msg in messages or []: + total += 4 + content = msg.get("content", "") + if isinstance(content, str): + total += _heuristic_token_count(content) + elif isinstance(content, list): + for part in content: + if isinstance(part, dict) and "text" in part: + total += _heuristic_token_count(part["text"]) + + if tools: + total += _heuristic_token_count(json.dumps(tools)) + + return total + + +def _heuristic_token_count(text: str) -> int: + """Estimate token count with language-aware heuristics.""" + if not text: + return 0 + + cjk_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff' + or '\u3040' <= c <= '\u30ff' + or '\uac00' <= c <= '\ud7af') + ascii_chars = sum(1 for c in text if c.isascii()) + other_chars = len(text) - cjk_chars - ascii_chars + + tokens = ( + cjk_chars * 0.6 + # CJK: ~1.7 chars per token + ascii_chars * 0.25 + # ASCII: ~4 chars per token + other_chars * 0.5 # Other: ~2 chars per token + ) + + return max(1, int(tokens)) diff --git a/pyproject.toml b/pyproject.toml index 2b4a4bbe..3af124ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,9 +30,11 @@ 
dependencies = [ "executor-engine>=0.3.3", "fire>=0.7.0", "funcdesc>=0.1.8", - "litellm>=1.81.3", + "anthropic>=0.40.0", + "google-genai>=1.0.0", "loguru>=0.7.3", "openai>=2.0.0", + "tiktoken>=0.7.0", "pillow>=10.4.0", "rich>=14.0.0", "rich-pyfiglet", @@ -61,6 +63,16 @@ dependencies = [ "tree-sitter-javascript>=0.24.0", # Server "nats-server-bin>=2.10.0", + # Claw (multi-channel gateway) + "slack-sdk", + "slack-bolt", + "python-telegram-bot>=21.3", + "discord.py>=2.5", + "lark-oapi", + "requests>=2.0", + "cryptography>=42.0.0", + "websocket-client>=1.8", + "qrcode", ] dynamic = ["version"] @@ -95,23 +107,6 @@ dev = [ "pytest-asyncio>=0.25.0", "pytest-timeout>=2.3.1", ] -claw = [ - "slack-sdk", - "slack-bolt", - "python-telegram-bot>=21.3", - "discord.py>=2.5", - "lark-oapi", - "requests>=2.0", - "pillow>=9.0", - "nbformat>=5.0", - "cryptography>=42.0.0", - "websocket-client>=1.8", - "qrcode" -] -slack = [ - "slack-sdk", - "slack-bolt", -] r = [ # R language support for notebooks (requires R installed on system) "rpy2>=3.5.0", diff --git a/tests/test_agent.py b/tests/test_agent.py index e2d9641c..34f61d1a 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -228,11 +228,11 @@ def transfer_to_classic_literature_fan(): assert resp.to_agent == classic_literature_fan.name -async def test_agent_force_litellm(): +async def test_agent_relaxed_schema(): agent = Agent( name="test", instructions="", - force_litellm=True, + relaxed_schema=True, ) resp = await agent.run("What is the weather in Palo Alto?") diff --git a/tests/test_model_selector.py b/tests/test_model_selector.py index 7d5b8392..9a9ff31c 100644 --- a/tests/test_model_selector.py +++ b/tests/test_model_selector.py @@ -288,8 +288,10 @@ def test_auto_generate_for_unknown_provider(self, mock_settings): """Test auto-generation for provider not in defaults.""" selector = ModelSelector(mock_settings) - # Mock litellm - imports are inside the method so patch at litellm level - mock_models_by_provider = 
{"custom_provider": ["model1", "model2", "model3"]} + # Mock provider_registry functions + mock_get_models = MagicMock( + return_value=["custom_provider/model1", "custom_provider/model2", "custom_provider/model3"] + ) mock_model_info = MagicMock( return_value={ "mode": "chat", @@ -299,11 +301,11 @@ def test_auto_generate_for_unknown_provider(self, mock_settings): with ( patch( - "litellm.models_by_provider", - mock_models_by_provider, + "pantheon.utils.provider_registry.models_by_provider", + mock_get_models, ), patch( - "litellm.utils.get_model_info", + "pantheon.utils.provider_registry.get_model_info", mock_model_info, ), ): diff --git a/tests/test_provider_adapters.py b/tests/test_provider_adapters.py new file mode 100644 index 00000000..8046ab00 --- /dev/null +++ b/tests/test_provider_adapters.py @@ -0,0 +1,237 @@ +""" +Integration tests for provider adapters — verifies every model in DEFAULT_PROVIDER_MODELS works. + +Requires API keys in .env file. +""" + +import os +import sys +import pytest + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +# Load .env +_env_path = os.path.join(os.path.dirname(__file__), "..", ".env") +if os.path.exists(_env_path): + with open(_env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip()) + +from pantheon.utils.provider_registry import ( + load_catalog, + find_provider_for_model, + get_model_info, + completion_cost, + models_by_provider, + token_counter, +) +from pantheon.utils.adapters import get_adapter +from pantheon.utils.llm import stream_chunk_builder +from pantheon.utils.model_selector import DEFAULT_PROVIDER_MODELS, PROVIDER_API_KEYS +from pantheon.utils.llm_providers import is_responses_api_model, detect_provider + + +# ============ provider_registry unit tests ============ + + +class TestProviderRegistry: + + def test_load_catalog(self): + cat = 
load_catalog() + assert cat["version"] == 1 + assert len(cat["providers"]) >= 8 + + def test_find_provider_with_prefix(self): + p, m, c = find_provider_for_model("anthropic/claude-sonnet-4-6") + assert p == "anthropic" + assert m == "claude-sonnet-4-6" + assert c["sdk"] == "anthropic" + + def test_find_provider_openai_compat(self): + p, m, c = find_provider_for_model("deepseek/deepseek-chat") + assert p == "deepseek" + assert c["sdk"] == "openai" + + def test_find_provider_qwen(self): + p, m, c = find_provider_for_model("qwen/qwen3-235b-a22b") + assert p == "qwen" + assert c["api_key_env"] == "DASHSCOPE_API_KEY" + + def test_find_provider_unknown(self): + p, m, c = find_provider_for_model("unknown/some-model") + assert p == "unknown" + assert c == {} + + def test_get_model_info_known(self): + info = get_model_info("anthropic/claude-opus-4-6") + assert info["max_input_tokens"] == 1_000_000 + assert info["supports_vision"] is True + + def test_get_model_info_unknown_returns_defaults(self): + info = get_model_info("fake/nonexistent-model") + assert info["max_input_tokens"] == 200_000 + + def test_completion_cost(self): + cost = completion_cost(model="openai/gpt-5.4", prompt_tokens=1_000_000, completion_tokens=100_000) + assert abs(cost - 2.8) < 0.01 + + def test_models_by_provider(self): + models = models_by_provider("anthropic") + assert len(models) == 7 + + def test_models_by_provider_qwen(self): + models = models_by_provider("qwen") + assert len(models) == 9 + + def test_token_counter_basic(self): + count = token_counter(model="gpt-4", messages=[{"role": "user", "content": "Hello"}]) + assert count > 0 + + def test_all_default_models_in_catalog(self): + """Every model in DEFAULT_PROVIDER_MODELS should exist in the catalog.""" + cat = load_catalog() + all_catalog_models = set() + for prov, cfg in cat["providers"].items(): + for m in cfg.get("models", {}): + all_catalog_models.add(f"{prov}/{m}") + + missing = [] + for provider, levels in 
DEFAULT_PROVIDER_MODELS.items(): + for level, models in levels.items(): + for model in models: + if model not in all_catalog_models: + missing.append(model) + assert missing == [], f"Models in selector but not in catalog: {missing}" + + +# ============ stream_chunk_builder unit tests ============ + + +class TestStreamChunkBuilder: + + def test_text_chunks(self): + chunks = [ + {"choices": [{"index": 0, "delta": {"role": "assistant", "content": "Hello"}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {"content": " world"}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]}, + {"usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}, "choices": []}, + ] + resp = stream_chunk_builder(chunks) + msg = resp.choices[0].message.model_dump() + assert msg["content"] == "Hello world" + assert resp.usage.prompt_tokens == 10 + + def test_tool_call_chunks(self): + chunks = [ + {"choices": [{"index": 0, "delta": {"role": "assistant", "tool_calls": [ + {"index": 0, "id": "call_1", "type": "function", "function": {"name": "test", "arguments": '{"a":'}} + ]}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {"tool_calls": [ + {"index": 0, "function": {"arguments": ' 1}'}} + ]}, "finish_reason": None}]}, + {"choices": [{"index": 0, "delta": {}, "finish_reason": "tool_calls"}]}, + {"usage": {"prompt_tokens": 20, "completion_tokens": 10, "total_tokens": 30}, "choices": []}, + ] + resp = stream_chunk_builder(chunks) + msg = resp.choices[0].message.model_dump() + assert msg["tool_calls"][0]["function"]["arguments"] == '{"a": 1}' + + def test_empty_chunks(self): + resp = stream_chunk_builder([]) + msg = resp.choices[0].message.model_dump() + assert msg["content"] is None + + +# ============ Real API: test every model in DEFAULT_PROVIDER_MODELS ============ + +SIMPLE_MESSAGES = [{"role": "user", "content": "Say 'hello' and nothing else."}] + + +def _has_key(provider: str) -> bool: + env_var = 
PROVIDER_API_KEYS.get(provider, "") + return bool(os.environ.get(env_var, "")) + + +def _get_all_models(): + """Collect unique (provider, model) pairs from DEFAULT_PROVIDER_MODELS, excluding qwen (no valid key).""" + seen = set() + result = [] + for provider, levels in DEFAULT_PROVIDER_MODELS.items(): + if provider == "qwen": + continue # Skip: no valid API key available + for level, models in levels.items(): + for model in models: + if model not in seen: + seen.add(model) + result.append((provider, model)) + return result + + +ALL_MODELS = _get_all_models() + + +@pytest.mark.parametrize("provider,model", ALL_MODELS, ids=[m for _, m in ALL_MODELS]) +@pytest.mark.asyncio +async def test_model_completion(provider, model): + """Test that each model in DEFAULT_PROVIDER_MODELS can complete a simple prompt. + + Automatically detects whether to use Chat Completions or Responses API. + """ + env_var = PROVIDER_API_KEYS.get(provider, "") + api_key = os.environ.get(env_var, "") + if not api_key: + pytest.skip(f"{env_var} not set") + + provider_key, model_name, provider_config = find_provider_for_model(model) + sdk_type = provider_config.get("sdk", "openai") + base_url = provider_config.get("base_url") + + # Check if this model needs Responses API + config = detect_provider(model, relaxed_schema=False) + uses_responses_api = is_responses_api_model(config) + + adapter = get_adapter("openai" if uses_responses_api else sdk_type) + + if uses_responses_api: + # Responses API path + bare_model = model_name.split("/")[-1] if "/" in model_name else model_name + msg = await adapter.acompletion_responses( + model=bare_model, + messages=SIMPLE_MESSAGES, + base_url=base_url, + api_key=api_key, + max_output_tokens=2048, + ) + content = msg.get("content") or "" + assert len(content.strip()) > 0, f"{model}: got empty content, full msg={msg}" + print(f" [{provider}] {model} (responses): {content[:80]!r}") + else: + # Chat Completions path + info = get_model_info(model) + is_reasoning = 
info.get("supports_reasoning", False) + + extra_kwargs = {} + if sdk_type == "anthropic": + extra_kwargs["max_tokens"] = 1024 if is_reasoning else 128 + elif sdk_type == "openai" and provider_key == "openai": + extra_kwargs["max_completion_tokens"] = 2048 + elif sdk_type != "google-genai": + extra_kwargs["max_tokens"] = 1024 if is_reasoning else 128 + + chunks = await adapter.acompletion( + model=model_name, + messages=SIMPLE_MESSAGES, + base_url=base_url, + api_key=api_key, + num_retries=2, + **extra_kwargs, + ) + resp = stream_chunk_builder(chunks) + msg = resp.choices[0].message.model_dump() + + content = msg.get("content") or "" + assert len(content.strip()) > 0, f"{model}: got empty content, full msg={msg}" + print(f" [{provider}] {model}: {content[:80]!r}") diff --git a/tests/test_responses_api.py b/tests/test_responses_api.py index 4ef4b8f1..240ad4ac 100644 --- a/tests/test_responses_api.py +++ b/tests/test_responses_api.py @@ -40,13 +40,13 @@ def test_non_codex_openai(self): config = ProviderConfig(provider_type=ProviderType.OPENAI, model_name="gpt-4o") assert is_responses_api_model(config) is False - def test_codex_model_litellm_provider(self): - """Codex model but via LiteLLM provider should NOT use Responses API.""" - config = ProviderConfig(provider_type=ProviderType.LITELLM, model_name="codex-mini-latest") + def test_codex_model_native_provider(self): + """Codex model but via native provider should NOT use Responses API.""" + config = ProviderConfig(provider_type=ProviderType.NATIVE, model_name="codex-mini-latest") assert is_responses_api_model(config) is False - def test_non_codex_litellm(self): - config = ProviderConfig(provider_type=ProviderType.LITELLM, model_name="anthropic/claude-3-opus") + def test_non_codex_native(self): + config = ProviderConfig(provider_type=ProviderType.NATIVE, model_name="anthropic/claude-3-opus") assert is_responses_api_model(config) is False def test_o1_model_not_codex(self): @@ -442,7 +442,7 @@ async def 
test_routing_through_call_llm_provider(self): detect_provider, ) - config = detect_provider(CODEX_MODEL, force_litellm=False) + config = detect_provider(CODEX_MODEL, relaxed_schema=False) assert is_responses_api_model(config) is True result = await call_llm_provider( @@ -466,7 +466,7 @@ async def test_non_codex_does_not_use_responses_api(self): def detect_provider_for_test(model: str) -> ProviderConfig: from pantheon.utils.llm_providers import detect_provider - return detect_provider(model, force_litellm=False) + return detect_provider(model, relaxed_schema=False) # ============ Agent.run() End-to-End Tests ============ diff --git a/tests/test_scfm_router_real_queries.py b/tests/test_scfm_router_real_queries.py index 54f59aa1..87731389 100644 --- a/tests/test_scfm_router_real_queries.py +++ b/tests/test_scfm_router_real_queries.py @@ -60,7 +60,7 @@ def get_test_model() -> str: """ Get LLM model for testing from environment. - Supports LiteLLM model strings: + Supports provider-prefixed model strings: - OpenAI: "gpt-4o-mini", "gpt-4o" - Anthropic: "anthropic/claude-sonnet-4-20250514", "anthropic/claude-haiku-3-5-20241022" - Gemini: "gemini/gemini-1.5-flash" @@ -95,17 +95,19 @@ def has_api_key_for_model(model: str) -> bool: async def create_real_call_agent(): """ - Create a real _call_agent function using LiteLLM. + Create a real _call_agent function using provider adapters. Returns an async function compatible with the router's _call_agent interface. """ - import litellm + from pantheon.utils.adapters import get_adapter + from pantheon.utils.provider_registry import find_provider_for_model + from pantheon.utils.llm import stream_chunk_builder model = get_test_model() async def _call_agent(messages, system_prompt=None, model_override=None, **kwargs): """ - Call LLM via LiteLLM. + Call LLM via provider adapter. 
Args: messages: List of message dicts with 'role' and 'content' @@ -125,11 +127,18 @@ async def _call_agent(messages, system_prompt=None, model_override=None, **kwarg full_messages.extend(messages) try: - response = await litellm.acompletion( - model=actual_model, + provider_key, model_name, provider_config = find_provider_for_model(actual_model) + adapter = get_adapter(provider_config.get("sdk", "openai")) + import os + api_key = os.environ.get(provider_config.get("api_key_env", ""), "") + chunks = await adapter.acompletion( + model=model_name, messages=full_messages, - temperature=0.0, # Deterministic for testing + base_url=provider_config.get("base_url"), + api_key=api_key, + temperature=0.0, ) + response = stream_chunk_builder(chunks) return { "success": True, "response": response.choices[0].message.content, @@ -639,12 +648,12 @@ def test_query_does_not_contain_model_name(self, query, data_profile, constraint @pytest.mark.live_llm class TestLiveModelSelection: """ - Live LLM tests using real API calls via LiteLLM. + Live LLM tests using real API calls via provider adapters. Supports multiple providers through environment configuration: Environment Variables: - - SCFM_TEST_MODEL: LiteLLM model string (default: "gpt-4o-mini") + - SCFM_TEST_MODEL: Model string (default: "gpt-4o-mini") - Provider API keys: OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY, etc. Example usage: