From a7de17b109a3c727acf2f52bb339b13263dc76d4 Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Tue, 10 Feb 2026 15:53:20 -0700 Subject: [PATCH 01/11] support custom endpoint --- .env.example | 4 +- llm_clients/__init__.py | 1 + llm_clients/config.py | 17 +++++ llm_clients/endpoint_llm.py | 135 ++++++++++++++++++++++++++++++++++++ llm_clients/llm_factory.py | 6 +- 5 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 llm_clients/endpoint_llm.py diff --git a/.env.example b/.env.example index cceb209e..87c84430 100644 --- a/.env.example +++ b/.env.example @@ -3,4 +3,6 @@ OPENAI_API_KEY=your_openai_api_key_here GOOGLE_API_KEY=your_google_api_key_here AZURE_API_KEY=your_azure_api_key_here AZURE_ENDPOINT=your_azure_endpoint_here -AZURE_API_VERSION=your_azure_api_version_here \ No newline at end of file +AZURE_API_VERSION=your_azure_api_version_here +ENDPOINT_URL=http://0.0.0.0:8000/api/chat +ENDPOINT_API_KEY=howdy \ No newline at end of file diff --git a/llm_clients/__init__.py b/llm_clients/__init__.py index f6a00aa8..dd6c10eb 100644 --- a/llm_clients/__init__.py +++ b/llm_clients/__init__.py @@ -6,6 +6,7 @@ - Gemini (gemini-*) - Azure (azure-*) - Ollama (ollama-*) + - Custom endpoint (endpoint, endpoint-*) """ from .config import Config diff --git a/llm_clients/config.py b/llm_clients/config.py index 5961eaca..3dc1772e 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -31,6 +31,10 @@ class Config: AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT") AZURE_API_VERSION = os.getenv("AZURE_API_VERSION") # Optional + # Custom endpoint (chat-only provider) + ENDPOINT_URL = os.getenv("ENDPOINT_URL", "http://0.0.0.0:8000") + ENDPOINT_API_KEY = os.getenv("ENDPOINT_API_KEY", "howdy") + @classmethod def get_claude_config(cls) -> Dict[str, Any]: """Get default Claude model name. 
@@ -80,3 +84,16 @@ def get_ollama_config(cls) -> Dict[str, Any]: "model": "llama3:8b", "base_url": "http://localhost:11434", # Default Ollama URL } + + @classmethod + def get_endpoint_config(cls) -> Dict[str, Any]: + """Get custom endpoint configuration. + + Returns base_url (no /api/chat path), api_key, and default model. + Runtime parameters can override via kwargs. + """ + return { + "base_url": cls.ENDPOINT_URL.rstrip("/"), + "api_key": cls.ENDPOINT_API_KEY, + "model": "phi4", + } diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py new file mode 100644 index 00000000..0b595596 --- /dev/null +++ b/llm_clients/endpoint_llm.py @@ -0,0 +1,135 @@ +import time +from datetime import datetime +from typing import Any, Dict, List, Optional + +import aiohttp + +from utils.conversation_utils import build_langchain_messages + +from .config import Config +from .llm_interface import LLMInterface, Role + + +class EndpointLLM(LLMInterface): + """Chat-only LLM that calls a custom POST /api/chat endpoint. + + The API manages conversation history server-side via conversation_id. + This implementation does not support structured output and cannot be used + as a judge. For judge operations, use Claude, OpenAI, Gemini, or Azure. 
+ """ + + def __init__( + self, + name: str, + role: Role, + system_prompt: Optional[str] = None, + model_name: Optional[str] = None, + base_url: Optional[str] = None, + api_key: Optional[str] = None, + **kwargs, + ): + super().__init__(name, role, system_prompt) + + cfg = Config.get_endpoint_config() + self._base_url = (base_url or cfg["base_url"]).rstrip("/") + self._api_key = api_key or cfg["api_key"] + + if model_name and model_name.lower().startswith("endpoint-"): + self._api_model = model_name[len("endpoint-") :].strip() or cfg["model"] + else: + self._api_model = cfg["model"] + self.model_name = model_name or "endpoint" + self.temperature = kwargs.pop("temperature", None) + self.max_tokens = kwargs.pop("max_tokens", None) + + def __getattr__(self, name): + """Delegate attribute access to the underlying llm object. + + This allows accessing attributes like temperature, max_tokens, etc. + directly on the LLM instance, which will be forwarded to the + underlying LangChain model (self.llm). + """ + # Check if self.llm exists by looking in __dict__ to avoid recursion + # Only delegate if self.llm exists and has the attribute + if "llm" in self.__dict__ and hasattr(self.llm, name): + return getattr(self.llm, name) + # If the attribute doesn't exist on self.llm, raise AttributeError + return getattr(self, name, None) + + async def generate_response( + self, + conversation_history: Optional[List[Dict[str, Any]]] = None, + ) -> str: + """Generate a response via POST /api/chat with server-side conversation_id. + + The API does not accept a system role; the system prompt is folded into + the first user message as \"System: ...\". 
+ """ + messages = build_langchain_messages(self.role, conversation_history) + last_message = messages[-1].text + + headers = { + "X-API-Key": self._api_key, + "Content-Type": "application/json", + } + body: Dict[str, Any] = { + "model": self._api_model, + "messages": [ + { + "role": "user", + "content": last_message, + }, + ], + "stream": False, + "conversation_id": self.conversation_id, + } + + try: + start_time = time.time() + async with aiohttp.ClientSession() as session: + async with session.post( + self._base_url, + headers=headers, + json=body, + ) as resp: + if resp.status != 200: + text = await resp.text() + raise RuntimeError( + f"Endpoint returned {resp.status}: {text[:500]}" + ) + resp_data = await resp.json() + end_time = time.time() + + msg_data = resp_data.get("message") or {} + msg_text: str = msg_data.get("content", "") + + self.last_response_metadata = { + "conversation_id": resp_data.get("conversation_id"), + "model": resp_data.get("model", self._api_model), + "provider": "endpoint", + "role": self.role.value, + "timestamp": datetime.now().isoformat(), + "response_time_seconds": round(end_time - start_time, 3), + "total_duration": resp_data.get("total_duration"), + "load_duration": resp_data.get("load_duration"), + "prompt_eval_count": resp_data.get("prompt_eval_count"), + "prompt_eval_duration": resp_data.get("prompt_eval_duration"), + "eval_count": resp_data.get("eval_count"), + "eval_duration": resp_data.get("eval_duration"), + } + self.ensure_conversation_id() + return msg_text + except Exception as e: + self.last_response_metadata = { + "model": self._api_model, + "provider": "endpoint", + "role": self.role.value, + "timestamp": datetime.now().isoformat(), + "error": str(e), + } + self.ensure_conversation_id() + return f"Error generating response: {str(e)}" + + def set_system_prompt(self, system_prompt: str) -> None: + """Set or update the system prompt.""" + self.system_prompt = system_prompt diff --git a/llm_clients/llm_factory.py 
b/llm_clients/llm_factory.py index 3ce0d57e..a60b0e4c 100644 --- a/llm_clients/llm_factory.py +++ b/llm_clients/llm_factory.py @@ -67,6 +67,10 @@ def create_llm( from .gemini_llm import GeminiLLM return GeminiLLM(name, role, system_prompt, model_name, **model_params) + elif "endpoint" in model_lower: + from .endpoint_llm import EndpointLLM + + return EndpointLLM(name, role, system_prompt, model_name, **model_params) else: raise ValueError(f"Unsupported model: {model_name}") @@ -111,7 +115,7 @@ def create_judge_llm( f"generation. Judge operations require models with structured " f"output support. Supported models: Claude (claude-*), " f"OpenAI (gpt-*), Gemini (gemini-*), Azure (azure-*). " - f"Not supported: Ollama models." + f"Not supported: Ollama (ollama-*), Endpoint (endpoint-*)." ) return llm From 650effd4145095782a0dc5bc6b7cbe0670d63da1 Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Tue, 10 Feb 2026 16:22:13 -0700 Subject: [PATCH 02/11] create conv id at init + update if server creates new --- llm_clients/endpoint_llm.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py index 0b595596..f170ca69 100644 --- a/llm_clients/endpoint_llm.py +++ b/llm_clients/endpoint_llm.py @@ -41,6 +41,8 @@ def __init__( self.model_name = model_name or "endpoint" self.temperature = kwargs.pop("temperature", None) self.max_tokens = kwargs.pop("max_tokens", None) + # ID to send on first request; API may use it or return its own. + self.ensure_conversation_id() def __getattr__(self, name): """Delegate attribute access to the underlying llm object. 
@@ -103,8 +105,9 @@ async def generate_response( msg_data = resp_data.get("message") or {} msg_text: str = msg_data.get("content", "") + server_conversation_id = resp_data.get("conversation_id") self.last_response_metadata = { - "conversation_id": resp_data.get("conversation_id"), + "conversation_id": server_conversation_id, "model": resp_data.get("model", self._api_model), "provider": "endpoint", "role": self.role.value, @@ -118,6 +121,12 @@ async def generate_response( "eval_duration": resp_data.get("eval_duration"), } self.ensure_conversation_id() + # Update conversation_id if API returned a different one + if ( + server_conversation_id is not None + and server_conversation_id != self.conversation_id + ): + self.conversation_id = server_conversation_id return msg_text except Exception as e: self.last_response_metadata = { From 6c0cd85daa089399036231a49d973389dae499a9 Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Tue, 10 Feb 2026 16:53:30 -0700 Subject: [PATCH 03/11] support simpler conv id from init + overwrite --- llm_clients/endpoint_llm.py | 14 ++++---------- llm_clients/llm_interface.py | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py index f170ca69..f01d4c57 100644 --- a/llm_clients/endpoint_llm.py +++ b/llm_clients/endpoint_llm.py @@ -41,8 +41,6 @@ def __init__( self.model_name = model_name or "endpoint" self.temperature = kwargs.pop("temperature", None) self.max_tokens = kwargs.pop("max_tokens", None) - # ID to send on first request; API may use it or return its own. - self.ensure_conversation_id() def __getattr__(self, name): """Delegate attribute access to the underlying llm object. 
@@ -120,13 +118,9 @@ async def generate_response( "eval_count": resp_data.get("eval_count"), "eval_duration": resp_data.get("eval_duration"), } - self.ensure_conversation_id() - # Update conversation_id if API returned a different one - if ( - server_conversation_id is not None - and server_conversation_id != self.conversation_id - ): - self.conversation_id = server_conversation_id + + self._update_conversation_id_from_metadata() + return msg_text except Exception as e: self.last_response_metadata = { @@ -136,7 +130,7 @@ async def generate_response( "timestamp": datetime.now().isoformat(), "error": str(e), } - self.ensure_conversation_id() + self._update_conversation_id_from_metadata() return f"Error generating response: {str(e)}" def set_system_prompt(self, system_prompt: str) -> None: diff --git a/llm_clients/llm_interface.py b/llm_clients/llm_interface.py index 683b1904..a2cc27fa 100644 --- a/llm_clients/llm_interface.py +++ b/llm_clients/llm_interface.py @@ -64,7 +64,7 @@ def _update_conversation_id_from_metadata(self) -> None: will overwrite self.conversation_id here. 
""" cid = (self._last_response_metadata or {}).get("conversation_id") - if cid is not None: + if cid is not None and cid != self.conversation_id and cid != "": self.conversation_id = cid @abstractmethod From 97ca23004f43ee5568a17f2b5d9e273f8825a4a4 Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Wed, 18 Feb 2026 17:14:25 -0700 Subject: [PATCH 04/11] catch endpoint up with latest implementations --- llm_clients/endpoint_llm.py | 164 ++++++++++++++++++----------- tests/unit/llm_clients/conftest.py | 16 +++ 2 files changed, 118 insertions(+), 62 deletions(-) diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py index f01d4c57..142f9074 100644 --- a/llm_clients/endpoint_llm.py +++ b/llm_clients/endpoint_llm.py @@ -1,5 +1,4 @@ import time -from datetime import datetime from typing import Any, Dict, List, Optional import aiohttp @@ -28,7 +27,15 @@ def __init__( api_key: Optional[str] = None, **kwargs, ): - super().__init__(name, role, system_prompt) + first_message = kwargs.pop("first_message", None) + start_prompt = kwargs.pop("start_prompt", None) + super().__init__( + name, + role, + system_prompt, + first_message=first_message, + start_prompt=start_prompt, + ) cfg = Config.get_endpoint_config() self._base_url = (base_url or cfg["base_url"]).rstrip("/") @@ -43,18 +50,70 @@ def __init__( self.max_tokens = kwargs.pop("max_tokens", None) def __getattr__(self, name): - """Delegate attribute access to the underlying llm object. + """Delegate to self.llm when present; else return self's attribute or None. - This allows accessing attributes like temperature, max_tokens, etc. - directly on the LLM instance, which will be forwarded to the - underlying LangChain model (self.llm). + Only uses __dict__ lookups to avoid recursion. Attributes like + temperature and max_tokens are on self; unknown names return None. 
""" - # Check if self.llm exists by looking in __dict__ to avoid recursion - # Only delegate if self.llm exists and has the attribute - if "llm" in self.__dict__ and hasattr(self.llm, name): - return getattr(self.llm, name) - # If the attribute doesn't exist on self.llm, raise AttributeError - return getattr(self, name, None) + if "llm" in self.__dict__ and hasattr(self.__dict__["llm"], name): + return getattr(self.__dict__["llm"], name) + if name in self.__dict__: + return self.__dict__[name] + return None + + async def start_conversation(self) -> str: + """Produce the first response: static first_message if set, else call API.""" + if self.first_message is not None: + self._set_response_metadata("endpoint", static_first_message=True) + return self.first_message + else: + # Example: call a dedicated start endpoint instead of /api/chat + # start_url = f"{self._base_url}/start_convo" + # turns = self.get_initial_prompt_turns() + # content = turns[0].get("response", "") if turns else "" # initial prompt + # resp_data = await self._ainvoke(start_url, content) + # msg_data = resp_data.get("message") or {} + # self._set_response_metadata("endpoint", + # conversation_id=resp_data.get("conversation_id"), ...) 
+ # self._update_conversation_id_from_metadata() + # return msg_data.get("content", "") + return await self.generate_response(self.get_initial_prompt_turns()) + + def _default_headers(self) -> Dict[str, str]: + """Default request headers (API key and content type).""" + return { + "X-API-Key": self._api_key, + "Content-Type": "application/json", + } + + def _build_body(self, content: str) -> Dict[str, Any]: + """Body: model, messages (user content), stream, conversation_id.""" + return { + "model": self._api_model, + "messages": [{"role": "user", "content": content}], + "stream": False, + "conversation_id": self.conversation_id, + } + + async def _ainvoke( + self, + url: str, + content: str, + *, + headers: Optional[Dict[str, str]] = None, + ) -> Dict[str, Any]: + """POST to url with body built from content; return parsed JSON. + Body: model, messages (single user message), stream=False, conversation_id. + Default headers when headers is None. Raises RuntimeError on non-200. + """ + req_headers = headers if headers is not None else self._default_headers() + body = self._build_body(content) + async with aiohttp.ClientSession() as session: + async with session.post(url, headers=req_headers, json=body) as resp: + if resp.status != 200: + text = await resp.text() + raise RuntimeError(f"Endpoint returned {resp.status}: {text[:500]}") + return await resp.json() async def generate_response( self, @@ -65,71 +124,52 @@ async def generate_response( The API does not accept a system role; the system prompt is folded into the first user message as \"System: ...\". 
""" + if not conversation_history or len(conversation_history) == 0: + return await self.start_conversation() + messages = build_langchain_messages(self.role, conversation_history) last_message = messages[-1].text - headers = { - "X-API-Key": self._api_key, - "Content-Type": "application/json", - } - body: Dict[str, Any] = { - "model": self._api_model, - "messages": [ - { - "role": "user", - "content": last_message, - }, - ], - "stream": False, - "conversation_id": self.conversation_id, - } - try: start_time = time.time() - async with aiohttp.ClientSession() as session: - async with session.post( - self._base_url, - headers=headers, - json=body, - ) as resp: - if resp.status != 200: - text = await resp.text() - raise RuntimeError( - f"Endpoint returned {resp.status}: {text[:500]}" - ) - resp_data = await resp.json() + resp_data = await self._ainvoke(self._base_url, last_message) end_time = time.time() msg_data = resp_data.get("message") or {} msg_text: str = msg_data.get("content", "") server_conversation_id = resp_data.get("conversation_id") - self.last_response_metadata = { - "conversation_id": server_conversation_id, - "model": resp_data.get("model", self._api_model), - "provider": "endpoint", - "role": self.role.value, - "timestamp": datetime.now().isoformat(), - "response_time_seconds": round(end_time - start_time, 3), - "total_duration": resp_data.get("total_duration"), - "load_duration": resp_data.get("load_duration"), - "prompt_eval_count": resp_data.get("prompt_eval_count"), - "prompt_eval_duration": resp_data.get("prompt_eval_duration"), - "eval_count": resp_data.get("eval_count"), - "eval_duration": resp_data.get("eval_duration"), - } - + usage = {} + if resp_data.get("prompt_eval_count") is not None: + usage["prompt_tokens"] = resp_data.get("prompt_eval_count", 0) + if resp_data.get("eval_count") is not None: + usage["completion_tokens"] = resp_data.get("eval_count", 0) + if usage: + usage.setdefault("prompt_tokens", 0) + 
usage.setdefault("completion_tokens", 0) + usage["total_tokens"] = ( + usage["prompt_tokens"] + usage["completion_tokens"] + ) + + self._set_response_metadata( + "endpoint", + model=resp_data.get("model", self._api_model), + response_id=msg_data.get("id"), + usage=usage, + conversation_id=server_conversation_id, + response_time_seconds=round(end_time - start_time, 3), + total_duration=resp_data.get("total_duration"), + load_duration=resp_data.get("load_duration"), + prompt_eval_count=resp_data.get("prompt_eval_count"), + prompt_eval_duration=resp_data.get("prompt_eval_duration"), + eval_count=resp_data.get("eval_count"), + eval_duration=resp_data.get("eval_duration"), + ) self._update_conversation_id_from_metadata() return msg_text except Exception as e: - self.last_response_metadata = { - "model": self._api_model, - "provider": "endpoint", - "role": self.role.value, - "timestamp": datetime.now().isoformat(), - "error": str(e), - } + self._set_response_metadata("endpoint", error=str(e)) self._update_conversation_id_from_metadata() return f"Error generating response: {str(e)}" diff --git a/tests/unit/llm_clients/conftest.py b/tests/unit/llm_clients/conftest.py index 63a031de..49e4fb7e 100644 --- a/tests/unit/llm_clients/conftest.py +++ b/tests/unit/llm_clients/conftest.py @@ -339,6 +339,22 @@ def mock_ollama_model(): yield mock +@pytest.fixture +def mock_endpoint_config(): + """Patch custom endpoint configuration for EndpointLLM tests.""" + from unittest.mock import patch + + with patch( + "llm_clients.endpoint_llm.Config.get_endpoint_config", + return_value={ + "base_url": "https://api.example.com/chat", + "api_key": "test-endpoint-key", + "model": "phi4", + }, + ): + yield + + # Note there is no need to mock the other LLM Client configs as Azure's is a bit complex @pytest.fixture def mock_azure_config(): From f3af354d809ab1bb0296ba643ec8d7b0ebf1a57b Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Wed, 18 Feb 2026 17:28:28 -0700 Subject: [PATCH 05/11] fulfill 
start_conversation if start_url exists --- llm_clients/config.py | 22 +- llm_clients/endpoint_llm.py | 95 ++++---- tests/unit/llm_clients/test_endpoint_llm.py | 240 ++++++++++++++++++++ 3 files changed, 308 insertions(+), 49 deletions(-) create mode 100644 tests/unit/llm_clients/test_endpoint_llm.py diff --git a/llm_clients/config.py b/llm_clients/config.py index 3dc1772e..e9d6ee89 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -32,8 +32,9 @@ class Config: AZURE_API_VERSION = os.getenv("AZURE_API_VERSION") # Optional # Custom endpoint (chat-only provider) - ENDPOINT_URL = os.getenv("ENDPOINT_URL", "http://0.0.0.0:8000") - ENDPOINT_API_KEY = os.getenv("ENDPOINT_API_KEY", "howdy") + ENDPOINT_API_KEY = os.getenv("ENDPOINT_API_KEY", None) + ENDPOINT_URL = os.getenv("ENDPOINT_URL", None) + ENDPOINT_START_URL = os.getenv("ENDPOINT_START_URL", None) @classmethod def get_claude_config(cls) -> Dict[str, Any]: @@ -91,9 +92,24 @@ def get_endpoint_config(cls) -> Dict[str, Any]: Returns base_url (no /api/chat path), api_key, and default model. Runtime parameters can override via kwargs. + Raises ValueError if ENDPOINT_API_KEY or ENDPOINT_URL are not set in the + environment; ENDPOINT_START_URL is optional (a warning is printed if unset). 
""" + missing = [] + if cls.ENDPOINT_API_KEY is None: + missing.append("ENDPOINT_API_KEY") + if cls.ENDPOINT_URL is None: + missing.append("ENDPOINT_URL") + if cls.ENDPOINT_START_URL is None: + print("ENDPOINT_START_URL is not set in the environment.") + if missing: + raise ValueError( + "Custom endpoint requires these environment variables: " + f"{', '.join(missing)}" + ) return { - "base_url": cls.ENDPOINT_URL.rstrip("/"), + "base_url": cls.ENDPOINT_URL, "api_key": cls.ENDPOINT_API_KEY, + "start_url": cls.ENDPOINT_START_URL, "model": "phi4", } diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py index 142f9074..6c5ff473 100644 --- a/llm_clients/endpoint_llm.py +++ b/llm_clients/endpoint_llm.py @@ -38,7 +38,8 @@ def __init__( ) cfg = Config.get_endpoint_config() - self._base_url = (base_url or cfg["base_url"]).rstrip("/") + self._base_url = base_url or cfg["base_url"] + self._start_url = cfg.get("start_url", None) self._api_key = api_key or cfg["api_key"] if model_name and model_name.lower().startswith("endpoint-"): @@ -62,21 +63,21 @@ def __getattr__(self, name): return None async def start_conversation(self) -> str: - """Produce the first response: static first_message if set, else call API.""" + """Produce the first conversational turn: + - static first_message if set, or + - API call to start_url if set, or + - API call to /api/chat with start_prompt if neither is set. 
+ """ if self.first_message is not None: self._set_response_metadata("endpoint", static_first_message=True) return self.first_message + elif self._start_url is not None: + start_time = time.time() + resp_data = await self._ainvoke(self._start_url, self.start_prompt) + return self._process_chat_response( + resp_data, round(time.time() - start_time, 3) + ) else: - # Example: call a dedicated start endpoint instead of /api/chat - # start_url = f"{self._base_url}/start_convo" - # turns = self.get_initial_prompt_turns() - # content = turns[0].get("response", "") if turns else "" # initial prompt - # resp_data = await self._ainvoke(start_url, content) - # msg_data = resp_data.get("message") or {} - # self._set_response_metadata("endpoint", - # conversation_id=resp_data.get("conversation_id"), ...) - # self._update_conversation_id_from_metadata() - # return msg_data.get("content", "") return await self.generate_response(self.get_initial_prompt_turns()) def _default_headers(self) -> Dict[str, str]: @@ -86,6 +87,40 @@ def _default_headers(self) -> Dict[str, str]: "Content-Type": "application/json", } + def _process_chat_response( + self, resp_data: Dict[str, Any], response_time_seconds: float + ) -> str: + """Extract message text from API response and set metadata. 
Return content.""" + msg_data = resp_data.get("message") or {} + msg_text: str = msg_data.get("content", "") + + usage = {} + if resp_data.get("prompt_eval_count") is not None: + usage["prompt_tokens"] = resp_data.get("prompt_eval_count", 0) + if resp_data.get("eval_count") is not None: + usage["completion_tokens"] = resp_data.get("eval_count", 0) + if usage: + usage.setdefault("prompt_tokens", 0) + usage.setdefault("completion_tokens", 0) + usage["total_tokens"] = usage["prompt_tokens"] + usage["completion_tokens"] + + self._set_response_metadata( + "endpoint", + model=resp_data.get("model", self._api_model), + response_id=msg_data.get("id"), + usage=usage, + conversation_id=resp_data.get("conversation_id"), + response_time_seconds=response_time_seconds, + total_duration=resp_data.get("total_duration"), + load_duration=resp_data.get("load_duration"), + prompt_eval_count=resp_data.get("prompt_eval_count"), + prompt_eval_duration=resp_data.get("prompt_eval_duration"), + eval_count=resp_data.get("eval_count"), + eval_duration=resp_data.get("eval_duration"), + ) + self._update_conversation_id_from_metadata() + return msg_text + def _build_body(self, content: str) -> Dict[str, Any]: """Body: model, messages (user content), stream, conversation_id.""" return { @@ -133,41 +168,9 @@ async def generate_response( try: start_time = time.time() resp_data = await self._ainvoke(self._base_url, last_message) - end_time = time.time() - - msg_data = resp_data.get("message") or {} - msg_text: str = msg_data.get("content", "") - - server_conversation_id = resp_data.get("conversation_id") - usage = {} - if resp_data.get("prompt_eval_count") is not None: - usage["prompt_tokens"] = resp_data.get("prompt_eval_count", 0) - if resp_data.get("eval_count") is not None: - usage["completion_tokens"] = resp_data.get("eval_count", 0) - if usage: - usage.setdefault("prompt_tokens", 0) - usage.setdefault("completion_tokens", 0) - usage["total_tokens"] = ( - usage["prompt_tokens"] + 
usage["completion_tokens"] - ) - - self._set_response_metadata( - "endpoint", - model=resp_data.get("model", self._api_model), - response_id=msg_data.get("id"), - usage=usage, - conversation_id=server_conversation_id, - response_time_seconds=round(end_time - start_time, 3), - total_duration=resp_data.get("total_duration"), - load_duration=resp_data.get("load_duration"), - prompt_eval_count=resp_data.get("prompt_eval_count"), - prompt_eval_duration=resp_data.get("prompt_eval_duration"), - eval_count=resp_data.get("eval_count"), - eval_duration=resp_data.get("eval_duration"), + return self._process_chat_response( + resp_data, round(time.time() - start_time, 3) ) - self._update_conversation_id_from_metadata() - - return msg_text except Exception as e: self._set_response_metadata("endpoint", error=str(e)) self._update_conversation_id_from_metadata() diff --git a/tests/unit/llm_clients/test_endpoint_llm.py b/tests/unit/llm_clients/test_endpoint_llm.py new file mode 100644 index 00000000..e71bcd08 --- /dev/null +++ b/tests/unit/llm_clients/test_endpoint_llm.py @@ -0,0 +1,240 @@ +"""Unit tests for EndpointLLM class.""" + +from contextlib import contextmanager +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from llm_clients import Role +from llm_clients.endpoint_llm import EndpointLLM +from llm_clients.llm_interface import DEFAULT_START_PROMPT + +from .test_base_llm import TestLLMBase +from .test_helpers import ( + assert_error_metadata, + assert_error_response, + assert_iso_timestamp, + assert_metadata_copy_behavior, + assert_metadata_structure, + assert_response_timing, +) + + +def _make_aiohttp_mock( + content: str = "Test response text", + conversation_id: str | None = "server-cid-1", + status: int = 200, +): + """Build mock aiohttp ClientSession/post/response for EndpointLLM.""" + resp_mock = MagicMock() + resp_mock.status = status + resp_mock.json = AsyncMock( + return_value={ + "message": {"content": content, "id": "msg-1"}, + 
"conversation_id": conversation_id, + "model": "phi4", + } + ) + resp_mock.text = AsyncMock(return_value="") + + post_cm = MagicMock() + post_cm.__aenter__ = AsyncMock(return_value=resp_mock) + post_cm.__aexit__ = AsyncMock(return_value=None) + + session_mock = MagicMock() + session_mock.post = MagicMock(return_value=post_cm) + + session_cm = MagicMock() + session_cm.__aenter__ = AsyncMock(return_value=session_mock) + session_cm.__aexit__ = AsyncMock(return_value=None) + + client_session_mock = MagicMock(return_value=session_cm) + return client_session_mock + + +@pytest.mark.unit +@pytest.mark.usefixtures("mock_endpoint_config") +class TestEndpointLLM(TestLLMBase): + """Unit tests for EndpointLLM. + + EndpointLLM implements LLMInterface only (no JudgeLLM); it uses aiohttp + instead of an underlying .llm, so some base tests are overridden. + """ + + def create_llm(self, role: Role, **kwargs): + if "name" not in kwargs: + kwargs["name"] = "test-endpoint" + return EndpointLLM(role=role, **kwargs) + + def get_provider_name(self) -> str: + return "endpoint" + + @contextmanager + def get_mock_patches(self): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession", + new_callable=lambda: _make_aiohttp_mock(), + ): + yield + + # ------------------------------------------------------------------------- + # Overrides: generate_response uses aiohttp, not llm.llm + # ------------------------------------------------------------------------- + + @pytest.mark.asyncio + async def test_generate_response_returns_llm_text( + self, mock_response_factory, mock_llm_factory, mock_system_message + ): + expected_text = "Test response text" + with self.get_mock_patches(): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession", + new_callable=lambda: _make_aiohttp_mock(content=expected_text), + ): + llm = self.create_llm(role=Role.PROVIDER, name="TestLLM") + response = await llm.generate_response( + conversation_history=mock_system_message + ) + assert response == 
expected_text + + @pytest.mark.asyncio + async def test_generate_response_updates_metadata( + self, mock_response_factory, mock_llm_factory, mock_system_message + ): + with self.get_mock_patches(): + llm = self.create_llm(role=Role.PROVIDER, name="TestLLM") + await llm.generate_response(conversation_history=mock_system_message) + metadata = assert_metadata_structure( + llm, + expected_provider=self.get_provider_name(), + expected_role=Role.PROVIDER, + ) + assert "timestamp" in metadata + assert_iso_timestamp(metadata["timestamp"]) + assert_response_timing(metadata) + + @pytest.mark.asyncio + async def test_generate_response_handles_errors( + self, mock_llm_factory, mock_system_message + ): + with self.get_mock_patches(): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession" + ) as mock_session_class: + session_cm = MagicMock() + session_cm.__aenter__ = AsyncMock(side_effect=Exception("API Error")) + session_cm.__aexit__ = AsyncMock(return_value=None) + mock_session_class.return_value = session_cm + + llm = self.create_llm(role=Role.PROVIDER, name="TestLLM") + response = await llm.generate_response( + conversation_history=mock_system_message + ) + + assert_error_response(response, "API Error") + assert_error_metadata( + llm, + expected_provider=self.get_provider_name(), + expected_error_substring="API Error", + ) + + # ------------------------------------------------------------------------- + # Endpoint-specific tests + # ------------------------------------------------------------------------- + + def test_init_passes_first_message_and_start_prompt_to_super(self): + with self.get_mock_patches(): + llm = EndpointLLM( + name="ep", + role=Role.PROVIDER, + first_message="Hello", + start_prompt="Custom start", + ) + assert llm.first_message == "Hello" + assert llm.start_prompt == "Custom start" + + def test_init_default_start_prompt(self): + with self.get_mock_patches(): + llm = EndpointLLM(name="ep", role=Role.PROVIDER) + assert llm.start_prompt == 
DEFAULT_START_PROMPT + + @pytest.mark.asyncio + async def test_start_conversation_returns_first_message_when_set(self): + with self.get_mock_patches(): + llm = EndpointLLM( + name="ep", + role=Role.PROVIDER, + first_message="Static first reply", + ) + out = await llm.start_conversation() + assert out == "Static first reply" + meta = llm.last_response_metadata + assert meta.get("static_first_message") is True + assert meta.get("provider") == "endpoint" + + @pytest.mark.asyncio + async def test_start_conversation_calls_api_when_no_first_message(self): + with self.get_mock_patches(): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession", + new_callable=lambda: _make_aiohttp_mock(content="First turn from API"), + ) as mock_session_class: + llm = EndpointLLM(name="ep", role=Role.PROVIDER) + out = await llm.start_conversation() + assert out == "First turn from API" + mock_session_class.return_value.__aenter__.return_value.post.assert_called_once() + + @pytest.mark.asyncio + async def test_generate_response_with_empty_conversation_history(self): + """Verify start_conversation / default start_prompt with empty history.""" + with self.get_mock_patches(): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession", + new_callable=lambda: _make_aiohttp_mock(content="Delegated first turn"), + ): + llm = EndpointLLM(name="ep", role=Role.PROVIDER) + out = await llm.generate_response(conversation_history=[]) + assert out == "Delegated first turn" + + @pytest.mark.asyncio + async def test_generate_response_none_history_delegates_to_start_conversation( + self, + ): + with self.get_mock_patches(): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession", + new_callable=lambda: _make_aiohttp_mock(content="Delegated from None"), + ): + llm = EndpointLLM(name="ep", role=Role.PROVIDER) + out = await llm.generate_response(conversation_history=None) + assert out == "Delegated from None" + + def test_set_system_prompt(self): + with self.get_mock_patches(): + llm = 
self.create_llm( + role=Role.PROVIDER, name="TestLLM", system_prompt="Initial" + ) + assert llm.system_prompt == "Initial" + llm.set_system_prompt("Updated") + assert llm.system_prompt == "Updated" + + def test_getattr_returns_none_for_unknown_attribute(self): + with self.get_mock_patches(): + llm = EndpointLLM(name="ep", role=Role.PROVIDER) + assert llm.nonexistent_attr is None + + def test_temperature_and_max_tokens_accessible_from_self(self): + with self.get_mock_patches(): + llm = EndpointLLM( + name="ep", + role=Role.PROVIDER, + temperature=0.3, + max_tokens=100, + ) + assert llm.temperature == 0.3 + assert llm.max_tokens == 100 + + def test_last_response_metadata_copy_returns_copy(self): + with self.get_mock_patches(): + llm = self.create_llm(role=Role.PROVIDER, name="TestLLM") + assert_metadata_copy_behavior(llm) From b2d0d347118a7262b1215421460b70c9e0e51f1a Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Thu, 19 Feb 2026 09:17:15 -0700 Subject: [PATCH 06/11] include ENDPOINT_START_URL --- .env.example | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 87c84430..b82879df 100644 --- a/.env.example +++ b/.env.example @@ -1,8 +1,13 @@ ANTHROPIC_API_KEY=your_anthropic_api_key_here + OPENAI_API_KEY=your_openai_api_key_here + GOOGLE_API_KEY=your_google_api_key_here + AZURE_API_KEY=your_azure_api_key_here AZURE_ENDPOINT=your_azure_endpoint_here AZURE_API_VERSION=your_azure_api_version_here + ENDPOINT_URL=http://0.0.0.0:8000/api/chat -ENDPOINT_API_KEY=howdy \ No newline at end of file +ENDPOINT_START_URL=http://0.0.0.0:8000/api/start_conversation +ENDPOINT_API_KEY=your_endpoint_api_key_here \ No newline at end of file From 2f104f8bd7ee8ac5d095fdc227e42810179fab2a Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Thu, 19 Feb 2026 09:23:34 -0700 Subject: [PATCH 07/11] test overwrite convo id --- tests/unit/llm_clients/test_endpoint_llm.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git 
a/tests/unit/llm_clients/test_endpoint_llm.py b/tests/unit/llm_clients/test_endpoint_llm.py index e71bcd08..7c149d2e 100644 --- a/tests/unit/llm_clients/test_endpoint_llm.py +++ b/tests/unit/llm_clients/test_endpoint_llm.py @@ -184,6 +184,25 @@ async def test_start_conversation_calls_api_when_no_first_message(self): assert out == "First turn from API" mock_session_class.return_value.__aenter__.return_value.post.assert_called_once() + @pytest.mark.asyncio + async def test_conversation_id_overwritten_when_endpoint_returns_different( + self, mock_system_message + ): + """Endpoint response conversation_id overwrites client-generated id.""" + client_cid = "client-generated-cid" + server_cid = "server-returned-cid" + with self.get_mock_patches(): + with patch( + "llm_clients.endpoint_llm.aiohttp.ClientSession", + new_callable=lambda: _make_aiohttp_mock( + content="OK", conversation_id=server_cid + ), + ): + llm = EndpointLLM(name="ep", role=Role.PROVIDER) + llm.conversation_id = client_cid + await llm.generate_response(conversation_history=mock_system_message) + assert llm.conversation_id == server_cid + @pytest.mark.asyncio async def test_generate_response_with_empty_conversation_history(self): """Verify start_conversation / default start_prompt with empty history.""" From 16ce00fb6f2046ca2a2be6d161ce379db5175697 Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Thu, 19 Feb 2026 09:30:26 -0700 Subject: [PATCH 08/11] add doc about why system msg is not used --- llm_clients/endpoint_llm.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py index 6c5ff473..aa80c1bd 100644 --- a/llm_clients/endpoint_llm.py +++ b/llm_clients/endpoint_llm.py @@ -15,6 +15,12 @@ class EndpointLLM(LLMInterface): The API manages conversation history server-side via conversation_id. This implementation does not support structured output and cannot be used as a judge. 
For judge operations, use Claude, OpenAI, Gemini, or Azure. + + System prompt: This class accepts system_prompt (from LLMInterface) for + interface consistency and as an example for subclasses. By default we do + not send it to the endpoint as custom APIs typically manage system context + themselves. To apply it (e.g. prefix first user message with + \"System: ...\"), override generate_response or _build_body in a subclass. """ def __init__( @@ -122,7 +128,9 @@ def _process_chat_response( return msg_text def _build_body(self, content: str) -> Dict[str, Any]: - """Body: model, messages (user content), stream, conversation_id.""" + """Body: model, messages (user content only), stream, conversation_id. + System prompt is not included; see class docstring. + """ return { "model": self._api_model, "messages": [{"role": "user", "content": content}], @@ -156,14 +164,14 @@ async def generate_response( ) -> str: """Generate a response via POST /api/chat with server-side conversation_id. - The API does not accept a system role; the system prompt is folded into - the first user message as \"System: ...\". + Only the latest user content is sent; self.system_prompt is not included + in the request (see class docstring for rationale). 
""" if not conversation_history or len(conversation_history) == 0: return await self.start_conversation() messages = build_langchain_messages(self.role, conversation_history) - last_message = messages[-1].text + last_message = messages[-1].text # no system_prompt in payload by design try: start_time = time.time() From f032f090d851d0013055e42d2a2452f4fe0f03ea Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Thu, 19 Feb 2026 10:22:32 -0700 Subject: [PATCH 09/11] clarify endpoint config expectations --- llm_clients/config.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llm_clients/config.py b/llm_clients/config.py index e9d6ee89..e930bc07 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -25,13 +25,15 @@ class Config: # API Keys ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY") + OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") - GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") # For Gemini + + GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") + AZURE_API_KEY = os.getenv("AZURE_API_KEY") AZURE_ENDPOINT = os.getenv("AZURE_ENDPOINT") - AZURE_API_VERSION = os.getenv("AZURE_API_VERSION") # Optional + AZURE_API_VERSION = os.getenv("AZURE_API_VERSION") - # Custom endpoint (chat-only provider) ENDPOINT_API_KEY = os.getenv("ENDPOINT_API_KEY", None) ENDPOINT_URL = os.getenv("ENDPOINT_URL", None) ENDPOINT_START_URL = os.getenv("ENDPOINT_START_URL", None) @@ -92,8 +94,9 @@ def get_endpoint_config(cls) -> Dict[str, Any]: Returns base_url (no /api/chat path), api_key, and default model. Runtime parameters can override via kwargs. - Raises ValueError if ENDPOINT_API_KEY, ENDPOINT_URL, or ENDPOINT_START_URL + Raises ValueError if ENDPOINT_API_KEY or ENDPOINT_URL are not set in the environment. + ENDPOINT_START_URL is optional and can be set to None. 
""" missing = [] if cls.ENDPOINT_API_KEY is None: From 64eb7bfb3522c5fa7ab3aa26256fa12647216a4c Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Thu, 19 Feb 2026 10:49:51 -0700 Subject: [PATCH 10/11] unset start_prompt if _start_url is present --- llm_clients/endpoint_llm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llm_clients/endpoint_llm.py b/llm_clients/endpoint_llm.py index aa80c1bd..4c0e519b 100644 --- a/llm_clients/endpoint_llm.py +++ b/llm_clients/endpoint_llm.py @@ -44,9 +44,14 @@ def __init__( ) cfg = Config.get_endpoint_config() + self._api_key = api_key or cfg["api_key"] self._base_url = base_url or cfg["base_url"] self._start_url = cfg.get("start_url", None) - self._api_key = api_key or cfg["api_key"] + + # NOTE: if start_url is set, we don't need to use the start_prompt + # unless the developer wants to utilize it + if self._start_url is not None: + self.start_prompt = None if model_name and model_name.lower().startswith("endpoint-"): self._api_model = model_name[len("endpoint-") :].strip() or cfg["model"] From cac1de6f3ccb81b78f01f110d962ad87a66376ee Mon Sep 17 00:00:00 2001 From: Josh Gieringer Date: Thu, 19 Feb 2026 11:03:47 -0700 Subject: [PATCH 11/11] add note about EndpointLLM --- README.md | 1 + docs/evaluating.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 9205822d..1da8971c 100644 --- a/README.md +++ b/README.md @@ -302,6 +302,7 @@ VERA-MH simulates realistic conversations between Large Language Models (LLMs) f - **`gemini_llm.py`**: Google Gemini implementation with structured output - **`azure_llm.py`**: Azure OpenAI and Azure AI Foundry implementation with structured output - **`ollama_llm.py`**: Ollama model implementation + - **`endpoint_llm.py`**: Example for using your own API as the provider agent (currently chat-only; see [evaluating.md](docs/evaluating.md)) - **`config.py`**: Configuration management for API keys and model settings - **`utils/`**: Utility functions 
and helpers
  - **`prompt_loader.py`**: Functions for loading prompt configurations
diff --git a/docs/evaluating.md b/docs/evaluating.md
index 9392170b..8bf0113e 100644
--- a/docs/evaluating.md
+++ b/docs/evaluating.md
@@ -3,6 +3,7 @@
 VERA-MH is ready to be used to evaluate any chat-based interface.
 [This](../llm_clients/llm_interface.py) Abstract Base Class (ABC) represents the interface to be implemented.
-Four concrete implementations of that class are provided for the APIs of ChatGPT, Claude, Gemini, Azure, and Llama (via Ollama).
+Five concrete implementations of that class are provided for the APIs of ChatGPT, Claude, Gemini, Azure, and Llama (via Ollama).
+For developers who wish to use their own API as the provider agent, [EndpointLLM](../llm_clients/endpoint_llm.py) serves as a working example (currently chat-only; no judge support).
 To test your service, you need to instantiate a concrete class and implement these key methods:
 - `start_conversation()`: Async method that returns the first conversational turn as a string. For raw LLM APIs you can call `generate_response(self.get_initial_prompt_turns())`; for service-based APIs you may call your own start endpoint (e.g. POST /start_conversation) and return the message.