1 change: 1 addition & 0 deletions .gitignore
@@ -203,6 +203,7 @@ cache
/workspace/
openapi.json
.client/
/.agent_server_llm_switch_demo/
# Local workspace files
.beads/*.db
.worktrees/
56 changes: 56 additions & 0 deletions docs/llm_switching.md
@@ -0,0 +1,56 @@
# Runtime LLM Switching (SDK + agent-server)

This repo supports **runtime LLM switching** without any “agent immutability” or
resume-time diff enforcement. The guiding principle is:

- An `Agent` is a *composition* of (effectively) immutable components (`LLM`,
`AgentContext`, etc.).
- The composition is **switchable**: at runtime the conversation can replace
components (currently the agent’s primary `LLM`) and persist the change.

## Persistence model (single rule)

Conversation snapshots (`base_state.json`) persist the agent’s LLM as:

- `{"profile_id": "<id>"}` when `LLM.profile_id` is present
- a full inline LLM payload when `LLM.profile_id` is absent

Snapshots are written with `context={"expose_secrets": True}` so inline LLMs can be
restored without “reconciliation” against a runtime agent.
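
For illustration, the persisted `agent.llm` value takes one of two shapes (the values
below are made up, not real configuration):

```python
# Sketch of the two shapes agent.llm can take inside base_state.json.
# Values are illustrative only.

# Shape 1: a profile reference -- only the profile id is stored.
llm_as_profile_reference = {"profile_id": "primary"}

# Shape 2: an inline payload -- the full LLM config is stored, including
# secrets, because snapshots are written with expose_secrets=True.
llm_as_inline_payload = {
    "usage_id": "agent",
    "model": "test-provider/original",
    "api_key": "test-key",
}
```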

## SDK API

Local conversations support two LLM update paths:

- `LocalConversation.switch_llm(profile_id: str)`:
loads `<profile_id>.json` from the registry’s profile dir and swaps the active
LLM for the agent’s `usage_id`.
- `LocalConversation.set_llm(llm: LLM)`:
replaces the active LLM instance for the agent’s `usage_id` (useful for remote
clients that can’t rely on server-side profile files).

Both are persisted immediately via the conversation’s base state snapshot.
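
A minimal sketch of both paths, assuming an existing `LocalConversation` named
`conversation` whose agent uses `usage_id="agent"` and a profile file `fallback.json`
in the registry's profile directory (the import path and constructor arguments are
assumptions based on the rest of this repo):

```python
# Illustrative sketch only; `conversation` is an existing LocalConversation.
from openhands.sdk import LLM  # import path is an assumption

# Path 1: switch by profile id (resolved from the registry's profile directory).
conversation.switch_llm("fallback")

# Path 2: replace the active LLM with a client-supplied instance.
conversation.set_llm(
    LLM(usage_id="agent", model="test-provider/alternate", api_key="test-key")
)

# Either call is snapshotted immediately into base_state.json.
```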

## agent-server API

For a running (or paused/idle) conversation:

- `POST /api/conversations/{conversation_id}/llm`
- `{"profile_id": "<id>"}`: switch via server-side profile loading
- `{"llm": {...}}`: set an inline LLM payload (client-supplied config)

There is also a convenience alias:

- `POST /api/conversations/{conversation_id}/llm/switch`
- `{"profile_id": "<id>"}`

## Remote clients (VS Code extension)

VS Code LLM Profiles are **local-only**. The recommended remote flow is:

1. Resolve `profileId` locally to an LLM configuration.
2. Start the conversation with an expanded `agent.llm` payload (no `profile_id`).
3. On profile changes, call `POST /api/conversations/{id}/llm` with
`{"llm": <expanded payload>}` so the server persists the new LLM.
4. On restore, the server’s persisted LLM is the source of truth; the client
can re-apply its selected profile before triggering a new run if desired.
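
A condensed sketch of steps 1-2, with placeholder profile contents, URL, and workspace
path; step 3 reuses the `/llm` endpoint shown above:

```python
import httpx

base_url = "http://127.0.0.1:8000"

# Step 1: resolve the locally selected profile to an expanded LLM config.
expanded_llm = {
    "usage_id": "agent",
    "model": "test-provider/original",
    "api_key": "test-key",
}

# Step 2: start the conversation with the expanded payload (no profile_id).
created = httpx.post(
    f"{base_url}/api/conversations",
    json={
        "agent": {"llm": expanded_llm, "tools": []},
        "workspace": {"working_dir": "/tmp/workspace"},
    },
    timeout=10.0,
)
created.raise_for_status()
conversation_id = created.json()["id"]

# Step 3: on later profile changes, POST the newly expanded payload to
# /api/conversations/{conversation_id}/llm as {"llm": {...}}.
```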
31 changes: 1 addition & 30 deletions examples/01_standalone_sdk/26_runtime_llm_switch.py
@@ -22,10 +22,7 @@
api_key = os.getenv("LLM_API_KEY")
assert api_key is not None, "LLM_API_KEY environment variable is not set."

# 2. Disable inline conversations so profile references are stored instead
os.environ.setdefault("OPENHANDS_INLINE_CONVERSATIONS", "false")

# 3. Profiles live under ~/.openhands/llm-profiles by default. We create two
# 2. Profiles live under ~/.openhands/llm-profiles by default. We create two
# variants that share the same usage_id so they can be swapped at runtime.
registry = LLMRegistry()
usage_id = "support-agent"
@@ -123,29 +120,3 @@
reloaded.run()

print("Reloaded run finished with profile:", reloaded.state.agent.llm.profile_id)

# ---------------------------------------------------------------------------
# Part 2: Inline persistence rejects runtime switching
# ---------------------------------------------------------------------------
# When OPENHANDS_INLINE_CONVERSATIONS is true the conversation persists full
# LLM payloads instead of profile references. Switching profiles would break
# the diff reconciliation step, so the SDK deliberately rejects it with a
# RuntimeError. We demonstrate that behaviour below.
os.environ["OPENHANDS_INLINE_CONVERSATIONS"] = "true"

inline_persistence_dir = Path("./.conversations_switch_demo_inline").resolve()
inline_agent = Agent(llm=registry.load_profile(base_profile_id), tools=[])
inline_conversation = Conversation(
agent=inline_agent,
workspace=str(workspace_dir),
persistence_dir=str(inline_persistence_dir),
conversation_id=uuid.uuid4(),
visualizer=None,
)

try:
inline_conversation.switch_llm(alt_profile_id)
except RuntimeError as exc:
print("Inline mode switch attempt rejected as expected:", exc)
else:
raise AssertionError("Inline mode should have rejected the LLM switch")
183 changes: 183 additions & 0 deletions examples/02_remote_agent_server/07_llm_switch_and_restore.py
@@ -0,0 +1,183 @@
"""Demonstrate agent-server LLM switching + persistence across restart.

This script:
1) Starts a local Python agent-server with a dedicated conversations directory.
2) Creates a conversation (without running it).
3) Switches the conversation's active LLM via `POST /api/conversations/{id}/llm`.
4) Restarts the agent-server and verifies the switched LLM persists on restore.

The switch uses an inline LLM payload, which is the recommended path for remote
clients whose "profiles" are local-only (e.g. the VS Code extension).
"""

from __future__ import annotations

import json
import os
import socket
import subprocess
import sys
import time
from pathlib import Path

import httpx


def _find_free_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.bind(("127.0.0.1", 0))
sock.listen(1)
return int(sock.getsockname()[1])


def _wait_for_health(base_url: str, timeout_seconds: float = 30.0) -> None:
deadline = time.time() + timeout_seconds
while time.time() < deadline:
try:
response = httpx.get(f"{base_url}/health", timeout=1.0)
if response.status_code == 200:
return
except Exception:
pass
time.sleep(0.25)
raise RuntimeError(
f"Timed out waiting for agent-server health at {base_url}/health"
)


def _start_agent_server(
*, conversations_path: Path
) -> tuple[subprocess.Popen[str], str]:
port = _find_free_port()
base_url = f"http://127.0.0.1:{port}"

env = {
**os.environ,
"PYTHONUNBUFFERED": "1",
"OH_ENABLE_VSCODE": "0",
"OH_ENABLE_VNC": "0",
"OH_PRELOAD_TOOLS": "0",
"SESSION_API_KEY": "",
"OH_CONVERSATIONS_PATH": str(conversations_path),
}

proc = subprocess.Popen(
[
sys.executable,
"-m",
"openhands.agent_server",
"--host",
"127.0.0.1",
"--port",
str(port),
],
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
)
try:
_wait_for_health(base_url)
except Exception:
try:
output = (proc.stdout.read() if proc.stdout else "") or ""
except Exception:
output = ""
proc.terminate()
raise RuntimeError(f"agent-server failed to start.\n\n{output}") from None

return proc, base_url


def _stop_agent_server(proc: subprocess.Popen[str]) -> None:
proc.terminate()
try:
proc.wait(timeout=5)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait(timeout=5)


def main() -> None:
root = Path(".agent_server_llm_switch_demo").resolve()
conversations_path = root / "conversations"
workspace_path = root / "workspace"
conversations_path.mkdir(parents=True, exist_ok=True)
workspace_path.mkdir(parents=True, exist_ok=True)

proc_1, base_1 = _start_agent_server(conversations_path=conversations_path)
conversation_id: str
try:
print("agent-server #1:", base_1)

create = httpx.post(
f"{base_1}/api/conversations",
json={
"agent": {
"llm": {
"usage_id": "agent",
"model": "test-provider/original",
"api_key": "test-key",
},
"tools": [],
},
"workspace": {"working_dir": str(workspace_path)},
},
timeout=10.0,
)
create.raise_for_status()
conversation_id = create.json()["id"]
print("conversation id:", conversation_id)

update = httpx.post(
f"{base_1}/api/conversations/{conversation_id}/llm",
json={
"llm": {
"usage_id": "ignored-by-server",
"model": "test-provider/alternate",
"api_key": "test-key-2",
}
},
timeout=10.0,
)
update.raise_for_status()

info = httpx.get(
f"{base_1}/api/conversations/{conversation_id}",
timeout=10.0,
)
info.raise_for_status()
current_model = info.json()["agent"]["llm"]["model"]
print("server #1 model:", current_model)
if current_model != "test-provider/alternate":
raise RuntimeError("LLM switch did not apply on server #1")
finally:
_stop_agent_server(proc_1)

proc_2, base_2 = _start_agent_server(conversations_path=conversations_path)
try:
print("agent-server #2:", base_2)
restored = httpx.get(
f"{base_2}/api/conversations/{conversation_id}",
timeout=10.0,
)
restored.raise_for_status()
restored_model = restored.json()["agent"]["llm"]["model"]
print("server #2 restored model:", restored_model)
if restored_model != "test-provider/alternate":
raise RuntimeError("LLM switch did not persist across restart")
finally:
_stop_agent_server(proc_2)

print("✓ LLM switch persisted across agent-server restart")

base_state = (
conversations_path / conversation_id.replace("-", "") / "base_state.json"
)
if base_state.exists():
payload = json.loads(base_state.read_text(encoding="utf-8"))
print("base_state.json agent.llm:", payload.get("agent", {}).get("llm"))


if __name__ == "__main__":
main()
@@ -21,6 +21,8 @@
SetSecurityAnalyzerRequest,
StartConversationRequest,
Success,
SwitchLLMProfileRequest,
UpdateConversationLLMRequest,
UpdateConversationRequest,
UpdateSecretsRequest,
)
@@ -254,6 +256,44 @@ async def set_conversation_security_analyzer(
return Success()


@conversation_router.post(
"/{conversation_id}/llm/switch",
responses={404: {"description": "Item not found"}},
)
async def switch_conversation_llm(
conversation_id: UUID,
request: SwitchLLMProfileRequest,
conversation_service: ConversationService = Depends(get_conversation_service),
) -> Success:
"""Switch the conversation's active agent LLM profile for future requests."""
event_service = await conversation_service.get_event_service(conversation_id)
if event_service is None:
raise HTTPException(status.HTTP_404_NOT_FOUND)
await event_service.switch_llm(request.profile_id)
return Success()


@conversation_router.post(
"/{conversation_id}/llm",
responses={404: {"description": "Item not found"}},
)
async def update_conversation_llm(
conversation_id: UUID,
request: UpdateConversationLLMRequest,
conversation_service: ConversationService = Depends(get_conversation_service),
) -> Success:
"""Update the conversation's active agent LLM for future requests."""
event_service = await conversation_service.get_event_service(conversation_id)
if event_service is None:
raise HTTPException(status.HTTP_404_NOT_FOUND)
if request.profile_id is not None:
await event_service.switch_llm(request.profile_id)
else:
assert request.llm is not None
await event_service.set_llm(request.llm)
return Success()
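
# --- Illustrative sketch (not part of this diff) ----------------------------
# The request models imported above are not shown in this hunk. Judging from
# how update_conversation_llm uses them, they are presumably Pydantic models
# along these lines (field names and types here are assumptions):
#
#     class SwitchLLMProfileRequest(BaseModel):
#         profile_id: str
#
#     class UpdateConversationLLMRequest(BaseModel):
#         profile_id: str | None = None
#         llm: LLM | None = None  # handler requires one; profile_id wins if both set
# -----------------------------------------------------------------------------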


@conversation_router.patch(
"/{conversation_id}", responses={404: {"description": "Item not found"}}
)
26 changes: 26 additions & 0 deletions openhands-agent-server/openhands/agent_server/event_service.py
@@ -543,6 +543,32 @@ async def set_security_analyzer(
None, self._conversation.set_security_analyzer, security_analyzer
)

async def _update_llm(self, update_fn, *args) -> None:
"""Apply an LLM update and re-wire telemetry for future completions."""
conversation = self._conversation
if conversation is None:
raise ValueError("inactive_service")
loop = asyncio.get_running_loop()
await loop.run_in_executor(None, update_fn, *args)
# The agent may now hold a new LLM instance; re-wire telemetry callbacks so
# clients continue receiving logs/stats for future completions.
self._setup_llm_log_streaming(conversation.agent)
self._setup_stats_streaming(conversation.agent)

async def switch_llm(self, profile_id: str) -> None:
"""Switch the conversation's active agent LLM to the given profile."""
conversation = self._conversation
if conversation is None:
raise ValueError("inactive_service")
await self._update_llm(conversation.switch_llm, profile_id)

async def set_llm(self, llm: LLM) -> None:
"""Replace the conversation's active agent LLM instance."""
conversation = self._conversation
if conversation is None:
raise ValueError("inactive_service")
await self._update_llm(conversation.set_llm, llm)

async def close(self):
await self._pub_sub.close()
if self._conversation: