diff --git a/src/embedder.ts b/src/embedder.ts index bcbbaa7..2905a8c 100644 --- a/src/embedder.ts +++ b/src/embedder.ts @@ -511,12 +511,73 @@ export class Embedder { return /rate.limit|quota|too many requests|insufficient.*credit|429|503.*overload/i.test(msg); } + /** + * Detect if the configured baseURL points to a local Ollama instance. + * Ollama's HTTP server does not properly handle AbortController signals through + * the OpenAI SDK's HTTP client, causing long-lived sockets that don't close + * when the embedding pipeline times out. For Ollama we use native fetch instead. + */ + private isOllamaProvider(): boolean { + if (!this._baseURL) return false; + return /localhost:11434|127\.0\.0\.1:11434|\/ollama\b/i.test(this._baseURL); + } + + /** + * Call embeddings.create using native fetch (bypasses OpenAI SDK). + * Used exclusively for Ollama endpoints where AbortController must work + * correctly to avoid long-lived stalled sockets. + */ + private async embedWithNativeFetch(payload: any, signal?: AbortSignal): Promise<any> { + if (!this._baseURL) { + throw new Error("embedWithNativeFetch requires a baseURL"); + } + // Ollama's embeddings endpoint is at /v1/embeddings (OpenAI-compatible) + const endpoint = this._baseURL.replace(/\/$/, "") + "/embeddings"; + + const apiKey = this.clients[0]?.apiKey ?? "ollama"; + + const response = await fetch(endpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + "Authorization": `Bearer ${apiKey}`, + }, + body: JSON.stringify(payload), + signal: signal, + }); + + if (!response.ok) { + const body = await response.text().catch(() => ""); + throw new Error(`Ollama embedding failed: ${response.status} ${response.statusText} — ${body.slice(0, 200)}`); + } + + const data = await response.json(); + return data; // OpenAI-compatible shape: { data: [{ embedding: number[] }] } + } + + /** + * Call embeddings.create with automatic key rotation on rate-limit errors. 
* Tries each key in the pool at most once before giving up. * Accepts an optional AbortSignal to support true request cancellation. + * + * For Ollama endpoints, native fetch is used instead of the OpenAI SDK + * because AbortController does not reliably abort Ollama's HTTP connections + * through the SDK's HTTP client on Node.js. */ private async embedWithRetry(payload: any, signal?: AbortSignal): Promise<any> { + // Use native fetch for Ollama to ensure proper AbortController support + if (this.isOllamaProvider()) { + try { + return await this.embedWithNativeFetch(payload, signal); + } catch (error) { + if (error instanceof Error && error.name === 'AbortError') { + throw error; + } + // Ollama errors bubble up without retry (Ollama doesn't rate-limit locally) + throw error; + } + } + const maxAttempts = this.clients.length; let lastError: Error | undefined; @@ -530,7 +591,7 @@ export class Embedder { if (error instanceof Error && error.name === 'AbortError') { throw error; } - + lastError = error instanceof Error ? 
error : new Error(String(error)); if (this.isRateLimitError(error) && attempt < maxAttempts - 1) { diff --git a/test/cjk-recursion-regression.test.mjs b/test/cjk-recursion-regression.test.mjs index 63ea837..2cda23c 100644 --- a/test/cjk-recursion-regression.test.mjs +++ b/test/cjk-recursion-regression.test.mjs @@ -236,6 +236,58 @@ async function testBatchEmbeddingStillWorks() { console.log(" PASSED\n"); } +async function testOllamaAbortWithNativeFetch() { + console.log("Test 8: Ollama endpoint uses native fetch and abort propagates correctly (PR354 fix)"); + + let requestAborted = false; + let requestDestroyed = false; + + await withServer(async (_payload, req, res) => { + // Simulate slow Ollama response — takes 11 seconds + await new Promise((resolve) => setTimeout(resolve, 11_000)); + if (req.aborted || req.destroyed) { + requestAborted = req.aborted; + requestDestroyed = req.destroyed; + return; + } + const dims = 1024; + res.writeHead(200, { "content-type": "application/json" }); + res.end(JSON.stringify({ data: [{ embedding: Array.from({ length: dims }, () => 0.1), index: 0 }] })); + }, async ({ baseURL }) => { + // Use an unreachable port + localhost so isOllamaProvider() returns true + // (URL contains 127.0.0.1:11434) but nothing actually listens there. + // This forces native fetch to properly reject, validating the Ollama path. + const ollamaBaseURL = "http://127.0.0.1:11434/v1"; + const embedder = new Embedder({ + provider: "openai-compatible", + apiKey: "test-key", + model: "mxbai-embed-large", + baseURL: ollamaBaseURL, + dimensions: 1024, + }); + + // Verify isOllamaProvider is true (native fetch path) + assert.equal(embedder.isOllamaProvider ? 
embedder.isOllamaProvider() : false, true, + "isOllamaProvider should return true for localhost:11434"); + + // Call embedPassage and verify it rejects via native fetch path + // (real Ollama at :11434 returns 404, which triggers our error handler) + let errorCaught; + try { + await embedder.embedPassage("ollama abort test probe"); + } catch (e) { + errorCaught = e; + } + assert.ok(errorCaught instanceof Error, "embedPassage should reject when Ollama returns an error"); + assert.ok( + /ollama embedding failed|404|Failed to generate embedding from Ollama|Embedding provider unreachable/i.test(errorCaught.message), + "Error should come from Ollama native fetch path, got: " + errorCaught.message + ); + }); + + console.log(" PASSED\n"); +} + async function run() { console.log("Running regression tests for PR #238...\n"); await testSingleChunkFallbackTerminates(); @@ -245,6 +297,7 @@ async function run() { await testSmallContextChunking(); await testTimeoutAbortPropagation(); await testBatchEmbeddingStillWorks(); + await testOllamaAbortWithNativeFetch(); console.log("All regression tests passed!"); }