diff --git a/README.md b/README.md index 2b0f956..3daf0db 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ The proxy supports the following OpenAI-compatible parameters in the `/v1/chat/c - **`temperature`** (number): Controls randomness (passed to the engine). - **`max_tokens`** (number): Limits the length of the generated response. - **`reasoning_effort`** (string): For models with reasoning capabilities (e.g., `low`, `medium`, `high`). +- **`tools` / `tool_choice`**: Standard OpenAI tool-calling fields used by agentic clients. +- **`browseros_mode`** (boolean): Optional strict mode toggle for BrowserOS-like agentic clients. When tools are provided, this mode is **enabled by default** unless you explicitly set `browseros_mode: false`. ## Quick Start @@ -70,6 +72,18 @@ curl -N -X POST http://localhost:8080/v1/chat/completions \ - **Port**: Set via `PORT` environment variable (defaults to 8080). - **Models**: The proxy automatically queries your local Codex installation for available model slugs. +### BrowserOS Configuration + +If your BrowserOS agent sends tool definitions but the model replies with text like _"I’m unable to control the browser from this environment."_, verify: + +- you are sending `tools` in your `/v1/chat/completions` request body (this auto-enables BrowserOS strict mode) +- optionally set `browseros_mode: true` explicitly for clarity +- keep sending `tool_choice` when your client supports it + +To disable strict BrowserOS behavior for non-agentic use-cases, set `browseros_mode: false`. + +This proxy mode is designed to use the proxy only for LLM/provider behavior while BrowserOS continues to execute the actual browser tools on its side. + ## Architecture This project uses a typed `CodexClient` that manages a persistent `codex app-server` background process. Communication happens over a high-speed JSON-RPC channel on `stdio`, ensuring that the model state remains warm and ready for immediate inference. 
diff --git a/src/codex-client.ts b/src/codex-client.ts index 2c95de6..6e400b9 100644 --- a/src/codex-client.ts +++ b/src/codex-client.ts @@ -3,6 +3,9 @@ import { getCodexBinaryPath, type Message, type CodexStreamEvent, + type ParsedToolCall, + parseToolCalls, + buildToolInstructions, } from "./codex"; // Official V2 Types @@ -118,17 +121,91 @@ export class CodexClient { async *chatCompletionStream( messages: Message[], - options: { model: string }, + options: { + model: string; + tools?: any[]; + tool_choice?: any; + browseros_mode?: boolean; + }, ): AsyncGenerator { - // Format full prompt - let fullPrompt = ""; + const hasTools = options.tools && options.tools.length > 0; + + // --- Extract system messages into baseInstructions --- + const systemParts: string[] = []; + const nonSystemMessages: Message[] = []; for (const msg of messages) { - const roleName = msg.role.toUpperCase(); - const content = - typeof msg.content === "string" - ? msg.content - : JSON.stringify(msg.content); - fullPrompt += `[${roleName}]\n${content}\n\n`; + if (msg.role === "system") { + const content = + typeof msg.content === "string" + ? msg.content + : JSON.stringify(msg.content); + systemParts.push(content); + } else { + nonSystemMessages.push(msg); + } + } + + let baseInstructions = systemParts.join("\n\n") || undefined; + + // --- If tools are provided, inject tool definitions into instructions --- + if (hasTools) { + const toolBlock = buildToolInstructions( + options.tools!, + options.tool_choice, + ); + baseInstructions = (baseInstructions || "") + toolBlock; + } + + if (hasTools && options.browseros_mode) { + const browserOSToolModeInstructions = + `\n\n## BrowserOS Tool Execution Mode\n\n` + + `You are running as a tool-calling planner inside BrowserOS. ` + + `You can and must control the browser by emitting tool calls. ` + + `Do not claim you cannot access or control the browser/environment. 
` + `If the request is actionable with available tools, your response MUST include at least one <tool_call> block. ` + `Prefer tool-call-only output for action steps. ` + `For shopping workflows, adding products to cart is permitted; avoid checkout/payment unless explicitly requested. ` + `If an action needs multiple steps, emit only the next required tool call(s) for the current step.\n`; + baseInstructions = (baseInstructions || "") + browserOSToolModeInstructions; + } + + // --- Format conversation messages into prompt --- + let fullPrompt = ""; + for (const msg of nonSystemMessages) { + if (msg.role === "tool") { + // Tool result message from BrowserOS + const toolCallId = (msg as any).tool_call_id || "unknown"; + const toolName = (msg as any).name || "unknown"; + const content = + typeof msg.content === "string" + ? msg.content + : JSON.stringify(msg.content); + fullPrompt += `[TOOL_RESULT] (tool_call_id: ${toolCallId}, name: ${toolName})\n${content}\n\n`; + } else if (msg.role === "assistant" && (msg as any).tool_calls) { + // Assistant message that contained tool calls (history from previous turns) + const toolCalls = (msg as any).tool_calls as any[]; + let assistantContent = ""; + if (msg.content) { + assistantContent += + typeof msg.content === "string" + ? msg.content + : JSON.stringify(msg.content); + assistantContent += "\n"; + } + for (const tc of toolCalls) { + if (tc.type === "function" && tc.function) { + assistantContent += `<tool_call>{"name": "${tc.function.name}", "arguments": ${tc.function.arguments}}</tool_call>\n`; + } + } + fullPrompt += `[ASSISTANT]\n${assistantContent}\n`; + } else { + const roleName = msg.role.toUpperCase(); + const content = + typeof msg.content === "string" + ?
msg.content + : JSON.stringify(msg.content); + fullPrompt += `[${roleName}]\n${content}\n\n`; + } } fullPrompt = (fullPrompt.trim() || "Please help me.") + "\n\n[ASSISTANT]\n"; @@ -137,6 +214,7 @@ export class CodexClient { cwd: process.cwd(), experimentalRawEvents: false, persistExtendedHistory: false, + ...(baseInstructions ? { baseInstructions } : {}), }; const startRes = (await this.request( @@ -166,10 +244,9 @@ export class CodexClient { input: input, cwd: process.cwd(), approvalPolicy: "never", - sandboxPolicy: { - type: "readOnly", - access: { type: "fullAccess" }, - }, + sandboxPolicy: hasTools + ? { type: "readOnly", access: { type: "fullAccess" } } + : { type: "dangerFullAccess" }, model: options.model, effort: "none" as any, summary: "none" as any, @@ -180,6 +257,7 @@ export class CodexClient { let turnDone = false; const eventQueue: CodexStreamEvent[] = []; let resolveNext: (() => void) | null = null; + let accumulatedText = ""; const cleanup = this.onEvent((event) => { if (event.type === "notification") { @@ -187,7 +265,12 @@ export class CodexClient { if (method === "item/agentMessage/delta") { const p = params as AgentMessageDeltaNotification; - eventQueue.push({ type: "message", text: p.delta }); + accumulatedText += p.delta; + if (!hasTools) { + // When no tools, stream text directly + eventQueue.push({ type: "message", text: p.delta }); + } + // When tools present, we buffer and parse at the end } else if ( method === "item/reasoning/textDelta" || method === "item/reasoning/summaryTextDelta" @@ -195,6 +278,29 @@ export class CodexClient { const p = params as ReasoningTextDeltaNotification; eventQueue.push({ type: "reasoning", text: p.delta }); } else if (method === "turn/completed") { + // If tools are present, check for tool calls in accumulated text + if (hasTools && accumulatedText) { + const toolCalls = parseToolCalls(accumulatedText); + if (toolCalls.length > 0) { + // Strip tool_call tags from text, emit remaining as content + const 
textWithoutToolCalls = accumulatedText + .replace(/[\s\S]*?<\/tool_call>/g, "") + .trim(); + if (textWithoutToolCalls) { + eventQueue.push({ + type: "message", + text: textWithoutToolCalls, + }); + } + eventQueue.push({ type: "tool_calls", calls: toolCalls }); + } else { + console.warn( + `[CodexClient] Tools provided but no tool calls parsed. Assistant preview: ${accumulatedText.slice(0, 300).replace(/\s+/g, " ")}`, + ); + // No tool calls found, emit as plain message + eventQueue.push({ type: "message", text: accumulatedText }); + } + } turnDone = true; } else if (method === "error") { const p = params as ErrorNotification; @@ -202,12 +308,62 @@ export class CodexClient { p.error?.message || (p as any).message || "Unknown error"; eventQueue.push({ type: "error", text: errMsg }); turnDone = true; + } else if (method === "commandExecution/requestApproval") { + // Auto-approve command executions for agentic behavior + const approvalId = params?.approvalId; + if (approvalId) { + console.log( + `[CodexClient] Auto-approving command execution: ${params?.command || "unknown"}`, + ); + this.request("commandExecution/sendApproval", { + approvalId, + decision: "accept", + }).catch(() => {}); + } + } else if (method === "fileChange/requestApproval") { + // Auto-approve file changes for agentic behavior + const approvalId = params?.approvalId; + if (approvalId) { + console.log(`[CodexClient] Auto-approving file change`); + this.request("fileChange/sendApproval", { + approvalId, + decision: "accept", + }).catch(() => {}); + } + } else if (method === "commandExecution/outputDelta") { + // Surface command output as message text + if (params?.delta) { + accumulatedText += params.delta; + if (!hasTools) { + eventQueue.push({ type: "message", text: params.delta }); + } + } } } else if (event.type === "agent_message_content_delta") { - eventQueue.push({ type: "message", text: event.delta }); + accumulatedText += event.delta; + if (!hasTools) { + eventQueue.push({ type: "message", 
text: event.delta }); + } } else if (event.type === "reasoning_content_delta") { eventQueue.push({ type: "reasoning", text: event.delta }); } else if (event.type === "task_complete") { + if (hasTools && accumulatedText) { + const toolCalls = parseToolCalls(accumulatedText); + if (toolCalls.length > 0) { + const textWithoutToolCalls = accumulatedText + .replace(/[\s\S]*?<\/tool_call>/g, "") + .trim(); + if (textWithoutToolCalls) { + eventQueue.push({ + type: "message", + text: textWithoutToolCalls, + }); + } + eventQueue.push({ type: "tool_calls", calls: toolCalls }); + } else { + eventQueue.push({ type: "message", text: accumulatedText }); + } + } turnDone = true; } diff --git a/src/codex.ts b/src/codex.ts index feeda15..bf74e12 100644 --- a/src/codex.ts +++ b/src/codex.ts @@ -36,6 +36,18 @@ export interface CodexOptions { max_tokens?: number; reasoning_effort?: string; signal?: AbortSignal; + tools?: any[]; + tool_choice?: any; + browseros_mode?: boolean; +} + +export interface ParsedToolCall { + id: string; + type: "function"; + function: { + name: string; + arguments: string; + }; } export async function execCodex( @@ -54,18 +66,154 @@ export async function execCodex( export type CodexStreamEvent = | { type: "reasoning"; text: string } | { type: "message"; text: string } - | { type: "error"; text: string }; + | { type: "error"; text: string } + | { type: "tool_calls"; calls: ParsedToolCall[] }; + +/** + * Parse ... blocks from model output text. + * Returns parsed tool calls, or empty array if none found. + */ +export function parseToolCalls(text: string): ParsedToolCall[] { + const calls: ParsedToolCall[] = []; + const seen = new Set(); + let callIndex = 0; + + const pushCall = (raw: any) => { + const name = raw?.name || raw?.toolName || raw?.function?.name || ""; + const argsRaw = + raw?.arguments ?? raw?.input ?? raw?.parameters ?? raw?.function?.arguments; + if (!name) return; + const args = + typeof argsRaw === "string" + ? 
argsRaw + : JSON.stringify(argsRaw ?? {}); + const key = `${name}::${args}`; + if (seen.has(key)) return; + seen.add(key); + calls.push({ + id: `call_${Date.now()}_${callIndex++}`, + type: "function", + function: { + name, + arguments: args, + }, + }); + }; + + // Format 1: explicit <tool_call> ... </tool_call> blocks. + const taggedRegex = /<tool_call>([\s\S]*?)<\/tool_call>/g; + let match; + while ((match = taggedRegex.exec(text)) !== null) { + try { + pushCall(JSON.parse(match[1].trim())); + } catch { + // Ignore malformed block. + } + } + + // Format 2: JSON fenced blocks that contain a single call, call list, or tool_calls. + const fencedJsonRegex = /```(?:json)?\s*([\s\S]*?)```/g; + while ((match = fencedJsonRegex.exec(text)) !== null) { + const candidate = match[1].trim(); + try { + const parsed = JSON.parse(candidate); + if (Array.isArray(parsed)) { + for (const item of parsed) pushCall(item); + } else if (parsed?.tool_calls && Array.isArray(parsed.tool_calls)) { + for (const item of parsed.tool_calls) pushCall(item); + } else { + pushCall(parsed); + } + } catch { + // Not valid JSON; ignore. + } + } + + // Format 3: whole response is a JSON object/array describing tool calls. + const trimmed = text.trim(); + if (trimmed.startsWith("{") || trimmed.startsWith("[")) { + try { + const parsed = JSON.parse(trimmed); + if (Array.isArray(parsed)) { + for (const item of parsed) pushCall(item); + } else if (parsed?.tool_calls && Array.isArray(parsed.tool_calls)) { + for (const item of parsed.tool_calls) pushCall(item); + } else { + pushCall(parsed); + } + } catch { + // Not parseable as JSON; ignore. + } + } + + return calls; +} + +/** + * Build a tool-use instruction block from an OpenAI-format tools array. + * Injected into the model's system instructions so it knows which tools + * are available and the expected output format.
+ */ +export function buildToolInstructions(tools: any[], tool_choice?: any): string { + let block = + `\n\n## Available Tools\n\n` + + `You are an agentic planner operating through external tools. ` + + `When tools are available, your next action MUST be emitted as tool calls, not prose refusals.\n\n` + + `Tool call output format (required):\n` + + `<tool_call>{"name": "tool_name", "arguments": {"param": "value"}}</tool_call>\n\n` + + `IMPORTANT RULES:\n` + + `- If a user request is actionable with provided tools, emit one or more <tool_call> blocks.\n` + + `- Do not say you cannot access the browser/environment when browser tools are provided.\n` + + `- Keep normal text minimal. Prefer tool-call-only responses for action steps.\n` + + `- After tool results are returned, emit the next tool call(s) needed to continue.\n` + + `- For commerce tasks, adding an item to cart is allowed; do not attempt checkout/payment unless user explicitly requests it.\n\n` + + `Here are the tools:\n\n`; + + for (const tool of tools) { + if (tool.type === "function" && tool.function) { + const fn = tool.function; + block += `### ${fn.name}\n`; + if (fn.description) block += `${fn.description}\n`; + if (fn.parameters) { + block += `Parameters: ${JSON.stringify(fn.parameters)}\n`; + } + block += `\n`; + } else if (tool?.name) { + // Support alternate tool schemas used by some providers/agents.
+ block += `### ${tool.name}\n`; + if (tool.description) block += `${tool.description}\n`; + if (tool.input_schema) { + block += `Parameters: ${JSON.stringify(tool.input_schema)}\n`; + } else if (tool.parameters) { + block += `Parameters: ${JSON.stringify(tool.parameters)}\n`; + } + block += `\n`; + } + } + + if (tool_choice && tool_choice !== "auto") { + if (typeof tool_choice === "object" && tool_choice.function?.name) { + block += `\nYou MUST use the tool "${tool_choice.function.name}" in your response.\n`; + } else if (tool_choice === "required") { + block += `\nYou MUST use at least one tool in your response.\n`; + } + } + + return block; +} export async function* execCodexStream( messages: Message[], options: CodexOptions = {}, ): AsyncGenerator { if (!options.model) { - // Default to a sane model if not provided options.model = "gpt-5.1"; } yield* codexClient.chatCompletionStream(messages, { model: options.model, + tools: options.tools, + tool_choice: options.tool_choice, + browseros_mode: options.browseros_mode, }); } diff --git a/src/index.ts b/src/index.ts index 3283717..d9d6d57 100644 --- a/src/index.ts +++ b/src/index.ts @@ -56,6 +56,13 @@ Bun.serve({ const temperature = body.temperature; const max_tokens = body.max_tokens; const reasoning_effort = body.reasoning_effort; + const tools = Array.isArray(body.tools) ? body.tools : undefined; + // Default to BrowserOS-style strict tool mode whenever tools are supplied, + // unless callers explicitly disable it with browseros_mode: false. + const browseros_mode = + tools && tools.length > 0 ? body.browseros_mode !== false : false; + const tool_choice = + body.tool_choice ?? (browseros_mode ? 
"required" : undefined); const stream = body.stream === true; @@ -64,6 +71,19 @@ Bun.serve({ if (body.messages) { console.log(`[Proxy] Messages count: ${body.messages.length}`); } + if (tools) { + console.log(`[Proxy] Tools count: ${tools.length}`); + } + if (tools && tools.length > 0) { + console.log( + `[Proxy] BrowserOS mode: ${browseros_mode ? "enabled" : "disabled"}`, + ); + if (body.browseros_mode === undefined && browseros_mode) { + console.log( + `[Proxy] BrowserOS mode auto-enabled because tools were provided`, + ); + } + } if (stream) { const responseId = `chatcmpl-${Date.now()}`; @@ -81,6 +101,9 @@ Bun.serve({ max_tokens, reasoning_effort, signal: req.signal, + tools, + tool_choice, + browseros_mode, })) { if (req.signal.aborted) break; @@ -110,6 +133,60 @@ Bun.serve({ controller.enqueue( encoder.encode(`data: ${JSON.stringify(payload)}\n\n`), ); + } else if (event.type === "tool_calls") { + // Emit tool_calls in OpenAI streaming delta format + const toolCallsDeltas = event.calls.map((tc, idx) => ({ + index: idx, + id: tc.id, + type: "function" as const, + function: { + name: tc.function.name, + arguments: tc.function.arguments, + }, + })); + const payload = { + id: responseId, + object: "chat.completion.chunk", + created: createdTime, + model: model, + choices: [ + { + index: 0, + delta: { + role: "assistant", + tool_calls: toolCallsDeltas, + }, + finish_reason: null, + }, + ], + }; + controller.enqueue( + encoder.encode(`data: ${JSON.stringify(payload)}\n\n`), + ); + // Emit finish with tool_calls reason + const finishPayload = { + id: responseId, + object: "chat.completion.chunk", + created: createdTime, + model: model, + choices: [ + { + index: 0, + delta: {}, + finish_reason: "tool_calls", + }, + ], + }; + controller.enqueue( + encoder.encode( + `data: ${JSON.stringify(finishPayload)}\n\n`, + ), + ); + controller.enqueue(encoder.encode(`data: [DONE]\n\n`)); + try { + controller.close(); + } catch {} + return; // Don't emit the normal stop sequence } 
else if (event.type === "message") { const payload = { id: responseId, @@ -197,6 +274,7 @@ Bun.serve({ `[Proxy] Executing codex internally via stream buffer for non-streaming request...`, ); let finalMessage = ""; + let finalToolCalls: any[] | null = null; try { for await (const event of execCodexStream(messages, { @@ -205,10 +283,15 @@ Bun.serve({ max_tokens, reasoning_effort, signal: req.signal, + tools, + tool_choice, + browseros_mode, })) { if (req.signal.aborted) break; if (event.type === "message") { finalMessage += event.text; + } else if (event.type === "tool_calls") { + finalToolCalls = event.calls; } else if (event.type === "error") { finalMessage = `[Error] ${event.text}`; break; @@ -219,7 +302,7 @@ Bun.serve({ finalMessage = "Internal Server Error during execution."; } - if (!finalMessage) { + if (!finalMessage && !finalToolCalls) { finalMessage = "No response received."; } @@ -227,6 +310,18 @@ Bun.serve({ const createdTime = Math.floor(Date.now() / 1000); // Format an OpenAI-like response object + const assistantMessage: any = { + role: "assistant", + content: finalToolCalls ? finalMessage || null : finalMessage, + }; + + let finishReason = "stop"; + + if (finalToolCalls && finalToolCalls.length > 0) { + assistantMessage.tool_calls = finalToolCalls; + finishReason = "tool_calls"; + } + const openAiResponse = { id: responseId, object: "chat.completion", @@ -235,11 +330,8 @@ Bun.serve({ choices: [ { index: 0, - message: { - role: "assistant", - content: finalMessage, - }, - finish_reason: "stop", + message: assistantMessage, + finish_reason: finishReason, }, ], usage: {