From 568402798440055e4fa696866925f645f90b3496 Mon Sep 17 00:00:00 2001
From: Brett
Date: Tue, 10 Feb 2026 17:42:03 -0600
Subject: [PATCH] feat(embed): skip oversized files with configurable size limit (#1)

* feat(embed): skip oversized files with configurable size limit

Add a file size limit (default 5MB) to `qmd embed` that skips files
exceeding the threshold. Configurable via QMD_MAX_EMBED_FILE_BYTES env var
or bypassed with --no-size-limit flag.

- Skip files over size limit with yellow warning during embed
- Show "Skipped" count separately from "Pending" in `qmd status`
- Add getEmbedBreakdown() query to distinguish actionable vs too-large
- Refactor vectorIndex() to options object pattern
- Validate env var with Math.floor for integer byte values
- Add 10 unit tests for getMaxEmbedFileBytes config parsing

Co-Authored-By: Claude Opus 4.6

* test(embed): add tests for file size limit feature

Cover getEmbedBreakdown SQL query (5 unit tests) and CLI behavior
(5 integration tests) for status display, embed skip, --no-size-limit
flag, and help text.

Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- src/cli.test.ts | 52 +++++++++++++++++++++++++++ src/embed-config.test.ts | 76 ++++++++++++++++++++++++++++++++++++++++ src/qmd.ts | 55 ++++++++++++++++++++++++----- src/store.test.ts | 57 ++++++++++++++++++++++++++++++ src/store.ts | 14 ++++++++ 5 files changed, 246 insertions(+), 8 deletions(-) create mode 100644 src/embed-config.test.ts diff --git a/src/cli.test.ts b/src/cli.test.ts index 4dc5de52..04edcb07 100644 --- a/src/cli.test.ts +++ b/src/cli.test.ts @@ -1197,3 +1197,55 @@ describe("mcp http daemon", () => { try { require("fs").unlinkSync(pidPath()); } catch {} }); }); + +describe("CLI Embed File Size Limit", () => { + test("status shows skipped count when files exceed size limit", async () => { + const env = await createIsolatedTestEnv("sizelimit"); + await runQmd(["collection", "add", "."], { ...env }); + const { stdout, exitCode } = await runQmd(["status"], { + ...env, + env: { QMD_MAX_EMBED_FILE_BYTES: "1" }, + }); + expect(exitCode).toBe(0); + expect(stdout).toContain("Skipped"); + expect(stdout).toContain("exceed"); + }); + + test("status shows no skipped line when files are under default limit", async () => { + const env = await createIsolatedTestEnv("sizelimit-default"); + await runQmd(["collection", "add", "."], { ...env }); + const { stdout, exitCode } = await runQmd(["status"], { ...env }); + expect(exitCode).toBe(0); + expect(stdout).not.toContain("Skipped"); + }); + + test("embed skips files exceeding size limit", async () => { + const env = await createIsolatedTestEnv("embed-skip"); + await runQmd(["collection", "add", "."], { ...env }); + const { stdout, stderr, exitCode } = await runQmd(["embed"], { + ...env, + env: { QMD_MAX_EMBED_FILE_BYTES: "1" }, + }); + expect(exitCode).toBe(0); + expect(stderr).toContain("Skipping"); + expect(stdout).toContain("skipped"); + expect(stdout).toContain("No non-empty documents to embed"); + }); + + test("embed --no-size-limit 
does not skip files", async () => { + const env = await createIsolatedTestEnv("embed-nolimit"); + await runQmd(["collection", "add", "."], { ...env }); + const { stderr } = await runQmd(["embed", "--no-size-limit"], { + ...env, + env: { QMD_MAX_EMBED_FILE_BYTES: "1" }, + }); + // With --no-size-limit, no files should be skipped (even with tiny env limit) + expect(stderr).not.toContain("Skipping"); + }); + + test("help text mentions --no-size-limit", async () => { + const { stdout, exitCode } = await runQmd(["--help"]); + expect(exitCode).toBe(0); + expect(stdout).toContain("--no-size-limit"); + }); +}); diff --git a/src/embed-config.test.ts b/src/embed-config.test.ts new file mode 100644 index 00000000..525b9256 --- /dev/null +++ b/src/embed-config.test.ts @@ -0,0 +1,76 @@ +/** + * embed-config.test.ts - Tests for embed configuration helpers + * + * Run with: bun test embed-config.test.ts + */ + +import { describe, test, expect, beforeEach, afterEach } from "bun:test"; +import { getMaxEmbedFileBytes } from "./qmd.js"; +import { DEFAULT_MAX_EMBED_FILE_BYTES } from "./store.js"; + +describe("getMaxEmbedFileBytes", () => { + let originalEnv: string | undefined; + + beforeEach(() => { + originalEnv = process.env.QMD_MAX_EMBED_FILE_BYTES; + delete process.env.QMD_MAX_EMBED_FILE_BYTES; + }); + + afterEach(() => { + if (originalEnv !== undefined) { + process.env.QMD_MAX_EMBED_FILE_BYTES = originalEnv; + } else { + delete process.env.QMD_MAX_EMBED_FILE_BYTES; + } + }); + + test("returns default when env var is unset", () => { + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + expect(getMaxEmbedFileBytes()).toBe(5 * 1024 * 1024); + }); + + test("respects valid numeric env var", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "1048576"; // 1MB + expect(getMaxEmbedFileBytes()).toBe(1048576); + }); + + test("respects large values", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "10485760"; // 10MB + expect(getMaxEmbedFileBytes()).toBe(10485760); + }); + 
+ test("floors fractional values to integer", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "1500.7"; + expect(getMaxEmbedFileBytes()).toBe(1500); + }); + + test("falls back to default for non-numeric string", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "abc"; + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + }); + + test("falls back to default for empty string", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = ""; + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + }); + + test("falls back to default for zero", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "0"; + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + }); + + test("falls back to default for negative value", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "-100"; + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + }); + + test("falls back to default for Infinity", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "Infinity"; + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + }); + + test("falls back to default for NaN", () => { + process.env.QMD_MAX_EMBED_FILE_BYTES = "NaN"; + expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES); + }); +}); diff --git a/src/qmd.ts b/src/qmd.ts index 8e47953d..aa22c05e 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -21,6 +21,7 @@ import { isDocid, matchFilesByGlob, getHashesNeedingEmbedding, + getEmbedBreakdown, getHashesForEmbedding, clearAllEmbeddings, insertEmbedding, @@ -62,6 +63,7 @@ import { DEFAULT_RERANK_MODEL, DEFAULT_GLOB, DEFAULT_MULTI_GET_MAX_BYTES, + DEFAULT_MAX_EMBED_FILE_BYTES, createStore, getDefaultDbPath, } from "./store.js"; @@ -269,7 +271,8 @@ function showStatus(): void { // Overall stats const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }; const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number }; - const needsEmbedding = 
/**
 * Resolves the maximum document size, in bytes, that `qmd embed` will process.
 *
 * The limit is read from the `QMD_MAX_EMBED_FILE_BYTES` environment variable:
 * - unset or empty: returns `DEFAULT_MAX_EMBED_FILE_BYTES` silently;
 * - a finite number that is at least 1 after flooring: returns the floored value;
 * - anything else (non-numeric, zero, negative, non-finite, or a fraction
 *   below 1): writes a warning to stderr and returns the default.
 *
 * @returns A positive integer byte limit.
 */
export function getMaxEmbedFileBytes(): number {
  const raw = process.env.QMD_MAX_EMBED_FILE_BYTES;
  if (!raw) return DEFAULT_MAX_EMBED_FILE_BYTES;

  // Floor BEFORE validating. Validating the unfloored value lets inputs such as
  // "0.5" through (0.5 > 0) and then floors them to 0 — a zero-byte limit that
  // would skip every non-empty document even though "0" itself is rejected.
  const limit = Math.floor(Number(raw));
  if (Number.isFinite(limit) && limit >= 1) {
    return limit;
  }

  process.stderr.write(
    `${c.yellow}Warning: Invalid QMD_MAX_EMBED_FILE_BYTES="${raw}", using default ${formatBytes(DEFAULT_MAX_EMBED_FILE_BYTES)}${c.reset}\n`
  );
  return DEFAULT_MAX_EMBED_FILE_BYTES;
}
Infinity : getMaxEmbedFileBytes(); + let skippedFiles = 0; + process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`); for (const item of hashesToEmbed) { - const encoder = new TextEncoder(); - const bodyBytes = encoder.encode(item.body).length; + const bodyBytes = Buffer.byteLength(item.body, 'utf8'); if (bodyBytes === 0) continue; // Skip empty + // Content size limit check + if (bodyBytes > maxEmbedSize) { + process.stderr.write( + `${c.yellow}Skipping ${item.path} (${formatBytes(bodyBytes)} exceeds ${formatBytes(maxEmbedSize)} limit)${c.reset}\n` + ); + skippedFiles++; + continue; + } + const title = extractTitle(item.body, item.path); const displayName = item.path; const chunks = await chunkDocumentByTokens(item.body); // Uses actual tokenizer @@ -1527,12 +1557,16 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = seq, pos: chunks[seq]!.pos, tokens: chunks[seq]!.tokens, - bytes: encoder.encode(chunks[seq]!.text).length, + bytes: Buffer.byteLength(chunks[seq]!.text, 'utf8'), displayName, }); } } + if (skippedFiles > 0) { + console.log(`${c.yellow}${skippedFiles} file(s) skipped (exceeded ${formatBytes(maxEmbedSize)} file size limit). 
Use --no-size-limit to include all files.${c.reset}`); + } + if (allChunks.length === 0) { console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`); closeDb(); @@ -1541,7 +1575,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0); const totalChunks = allChunks.length; - const totalDocs = hashesToEmbed.length; + const totalDocs = hashesToEmbed.length - skippedFiles; console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`); if (multiChunkDocs > 0) { @@ -2046,6 +2080,7 @@ function parseCLI() { mask: { type: "string" }, // glob pattern // Embed options force: { type: "boolean", short: "f" }, + "no-size-limit": { type: "boolean" }, // Update options pull: { type: "boolean" }, // git pull before update refresh: { type: "boolean" }, @@ -2116,7 +2151,7 @@ function showHelp(): void { console.log(" qmd multi-get [-l N] [--max-bytes N] - Get multiple docs by glob or comma-separated list"); console.log(" qmd status - Show index status and collections"); console.log(" qmd update [--pull] - Re-index all collections (--pull: git pull first)"); - console.log(" qmd embed [-f] - Create vector embeddings (800 tokens/chunk, 15% overlap)"); + console.log(" qmd embed [-f] [--no-size-limit] - Create vector embeddings (800 tokens/chunk, 15% overlap)"); console.log(" qmd cleanup - Remove cache and orphaned data, vacuum DB"); console.log(" qmd search - Full-text search (BM25)"); console.log(" qmd vsearch - Vector similarity search"); @@ -2147,6 +2182,10 @@ function showHelp(): void { console.log(" --max-bytes - Skip files larger than N bytes (default: 10240)"); console.log(" --json/--csv/--md/--xml/--files - Output format (same as search)"); console.log(""); + console.log("Embed options:"); + console.log(" -f, --force - Force re-index all embeddings"); + console.log(" --no-size-limit - Embed all 
files regardless of size (default limit: 5MB)"); + console.log(""); console.log("Models (auto-downloaded from HuggingFace):"); console.log(" Embedding: embeddinggemma-300M-Q8_0"); console.log(" Reranking: qwen3-reranker-0.6b-q8_0"); @@ -2333,7 +2372,7 @@ if (import.meta.main) { break; case "embed": - await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force); + await vectorIndex({ force: !!cli.values.force, noSizeLimit: !!cli.values["no-size-limit"] }); break; case "pull": { diff --git a/src/store.test.ts b/src/store.test.ts index d204d600..356977a3 100644 --- a/src/store.test.ts +++ b/src/store.test.ts @@ -37,6 +37,7 @@ import { isDocid, STRONG_SIGNAL_MIN_SCORE, STRONG_SIGNAL_MIN_GAP, + getEmbedBreakdown, type Store, type DocumentResult, type SearchResult, @@ -2625,3 +2626,59 @@ describe("isDocid", () => { expect(isDocid("abc123.md")).toBe(false); }); }); + +describe("getEmbedBreakdown", () => { + let store: Store; + + beforeEach(async () => { + store = await createTestStore(); + }); + + afterEach(async () => { + await cleanupTestDb(store); + }); + + test("all docs need embedding when under size limit", async () => { + await insertTestDocument(store.db, "col", { name: "a", body: "short" }); + await insertTestDocument(store.db, "col", { name: "b", body: "also short" }); + const result = getEmbedBreakdown(store.db, 1_000_000); + expect(result.needsEmbedding).toBe(2); + expect(result.tooLarge).toBe(0); + }); + + test("all docs too large when over size limit", async () => { + await insertTestDocument(store.db, "col", { name: "a", body: "some content here" }); + await insertTestDocument(store.db, "col", { name: "b", body: "more content here" }); + const result = getEmbedBreakdown(store.db, 1); // 1 byte limit + expect(result.needsEmbedding).toBe(0); + expect(result.tooLarge).toBe(2); + }); + + test("mixed sizes split correctly", async () => { + const small = "hi"; + const large = "x".repeat(500); + await insertTestDocument(store.db, "col", { name: "small", body: small 
/**
 * Counts active documents that have no embedding yet, split by content size.
 *
 * A document is considered un-embedded when it is active (`active = 1`) and has
 * no `content_vectors` row with `seq = 0` for its hash. Hashes are counted
 * once even when several documents share the same content.
 *
 * Size is measured in bytes via `LENGTH(CAST(doc AS BLOB))`. Plain
 * `LENGTH(doc)` on a TEXT value counts characters, which under-reports
 * multi-byte content and would disagree with a limit expressed in bytes.
 * NOTE(review): assumes `content.doc` is stored as TEXT in a UTF-8 database so
 * the blob length equals the UTF-8 byte length — confirm against the schema.
 *
 * @param db - Open database handle.
 * @param maxBytes - Size limit in bytes; content at or below it is actionable.
 * @returns `needsEmbedding`: un-embedded hashes within the limit;
 *          `tooLarge`: un-embedded hashes exceeding the limit.
 */
export function getEmbedBreakdown(db: Database, maxBytes: number): { needsEmbedding: number; tooLarge: number } {
  const row = db.prepare(`
    SELECT
      COUNT(DISTINCT CASE WHEN LENGTH(CAST(c.doc AS BLOB)) <= ? THEN d.hash END) as needs_embedding,
      COUNT(DISTINCT CASE WHEN LENGTH(CAST(c.doc AS BLOB)) > ? THEN d.hash END) as too_large
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
  `).get(maxBytes, maxBytes) as { needs_embedding: number; too_large: number };
  return { needsEmbedding: row.needs_embedding, tooLarge: row.too_large };
}