From 568402798440055e4fa696866925f645f90b3496 Mon Sep 17 00:00:00 2001
From: Brett <brettdavies@users.noreply.github.com>
Date: Tue, 10 Feb 2026 17:42:03 -0600
Subject: [PATCH] feat(embed): skip oversized files with configurable size
 limit (#1)

* feat(embed): skip oversized files with configurable size limit

Add a file size limit (default 5MB) to `qmd embed` that skips files
exceeding the threshold. Configurable via QMD_MAX_EMBED_FILE_BYTES env
var or bypassed with --no-size-limit flag.

- Skip files over size limit with yellow warning during embed
- Show "Skipped" count separately from "Pending" in `qmd status`
- Add getEmbedBreakdown() query to distinguish actionable vs too-large
- Refactor vectorIndex() to options object pattern
- Validate the env var and floor it to an integer byte count
- Add 10 unit tests for getMaxEmbedFileBytes config parsing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* test(embed): add tests for file size limit feature

Cover getEmbedBreakdown SQL query (5 unit tests) and CLI behavior
(5 integration tests) for status display, embed skip, --no-size-limit
flag, and help text.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/cli.test.ts          | 52 +++++++++++++++++++++++++++
 src/embed-config.test.ts | 76 ++++++++++++++++++++++++++++++++++++++++
 src/qmd.ts               | 55 ++++++++++++++++++++++++-----
 src/store.test.ts        | 57 ++++++++++++++++++++++++++++++
 src/store.ts             | 14 ++++++++
 5 files changed, 246 insertions(+), 8 deletions(-)
 create mode 100644 src/embed-config.test.ts

diff --git a/src/cli.test.ts b/src/cli.test.ts
index 4dc5de52..04edcb07 100644
--- a/src/cli.test.ts
+++ b/src/cli.test.ts
@@ -1197,3 +1197,55 @@ describe("mcp http daemon", () => {
     try { require("fs").unlinkSync(pidPath()); } catch {}
   });
 });
+
+describe("CLI Embed File Size Limit", () => { // End-to-end checks of the embed size limit through the qmd CLI
+  test("status shows skipped count when files exceed size limit", async () => {
+    const env = await createIsolatedTestEnv("sizelimit");
+    await runQmd(["collection", "add", "."], { ...env });
+    const { stdout, exitCode } = await runQmd(["status"], {
+      ...env,
+      env: { QMD_MAX_EMBED_FILE_BYTES: "1" }, // tiny 1-byte limit so indexed documents count as oversized
+    });
+    expect(exitCode).toBe(0);
+    expect(stdout).toContain("Skipped");
+    expect(stdout).toContain("exceed");
+  });
+
+  test("status shows no skipped line when files are under default limit", async () => {
+    const env = await createIsolatedTestEnv("sizelimit-default");
+    await runQmd(["collection", "add", "."], { ...env });
+    const { stdout, exitCode } = await runQmd(["status"], { ...env });
+    expect(exitCode).toBe(0);
+    expect(stdout).not.toContain("Skipped"); // status prints the "Skipped" line only when the too-large count is above zero
+  });
+
+  test("embed skips files exceeding size limit", async () => {
+    const env = await createIsolatedTestEnv("embed-skip");
+    await runQmd(["collection", "add", "."], { ...env });
+    const { stdout, stderr, exitCode } = await runQmd(["embed"], {
+      ...env,
+      env: { QMD_MAX_EMBED_FILE_BYTES: "1" }, // same 1-byte limit, this time applied by the embed command
+    });
+    expect(exitCode).toBe(0);
+    expect(stderr).toContain("Skipping"); // the per-file warning is written to stderr
+    expect(stdout).toContain("skipped"); // the summary line is printed to stdout
+    expect(stdout).toContain("No non-empty documents to embed"); // with every file skipped, no chunks remain
+  });
+
+  test("embed --no-size-limit does not skip files", async () => {
+    const env = await createIsolatedTestEnv("embed-nolimit");
+    await runQmd(["collection", "add", "."], { ...env });
+    const { stderr } = await runQmd(["embed", "--no-size-limit"], { // NOTE(review): exit code is not asserted here — confirm that is intentional
+      ...env,
+      env: { QMD_MAX_EMBED_FILE_BYTES: "1" }, // the flag is expected to take precedence over this env limit
+    });
+    // With --no-size-limit, no files should be skipped (even with tiny env limit)
+    expect(stderr).not.toContain("Skipping");
+  });
+
+  test("help text mentions --no-size-limit", async () => {
+    const { stdout, exitCode } = await runQmd(["--help"]);
+    expect(exitCode).toBe(0);
+    expect(stdout).toContain("--no-size-limit");
+  });
+});
diff --git a/src/embed-config.test.ts b/src/embed-config.test.ts
new file mode 100644
index 00000000..525b9256
--- /dev/null
+++ b/src/embed-config.test.ts
@@ -0,0 +1,76 @@
+/**
+ * embed-config.test.ts - Tests for embed configuration helpers
+ *
+ * Run with: bun test embed-config.test.ts
+ */
+
+import { describe, test, expect, beforeEach, afterEach } from "bun:test";
+import { getMaxEmbedFileBytes } from "./qmd.js";
+import { DEFAULT_MAX_EMBED_FILE_BYTES } from "./store.js";
+
+describe("getMaxEmbedFileBytes", () => { // Unit tests for parsing QMD_MAX_EMBED_FILE_BYTES into a byte limit
+  let originalEnv: string | undefined;
+
+  beforeEach(() => {
+    originalEnv = process.env.QMD_MAX_EMBED_FILE_BYTES; // remember the caller's value so afterEach can restore it
+    delete process.env.QMD_MAX_EMBED_FILE_BYTES; // every test starts with the variable unset
+  });
+
+  afterEach(() => {
+    if (originalEnv !== undefined) { // restore the exact pre-test state: either the old value or "unset"
+      process.env.QMD_MAX_EMBED_FILE_BYTES = originalEnv;
+    } else {
+      delete process.env.QMD_MAX_EMBED_FILE_BYTES;
+    }
+  });
+
+  test("returns default when env var is unset", () => {
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+    expect(getMaxEmbedFileBytes()).toBe(5 * 1024 * 1024); // also pins the default's literal value
+  });
+
+  test("respects valid numeric env var", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "1048576"; // 1MB
+    expect(getMaxEmbedFileBytes()).toBe(1048576);
+  });
+
+  test("respects large values", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "10485760"; // 10MB
+    expect(getMaxEmbedFileBytes()).toBe(10485760);
+  });
+
+  test("floors fractional values to integer", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "1500.7"; // fractional byte counts are rounded down, not rejected
+    expect(getMaxEmbedFileBytes()).toBe(1500);
+  });
+
+  test("falls back to default for non-numeric string", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "abc";
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+  });
+
+  test("falls back to default for empty string", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = ""; // empty string is falsy, so it is treated the same as unset
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+  });
+
+  test("falls back to default for zero", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "0"; // a non-positive limit is rejected
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+  });
+
+  test("falls back to default for negative value", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "-100";
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+  });
+
+  test("falls back to default for Infinity", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "Infinity"; // parses to Infinity, which fails the Number.isFinite check
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+  });
+
+  test("falls back to default for NaN", () => {
+    process.env.QMD_MAX_EMBED_FILE_BYTES = "NaN"; // parses to NaN, which fails the Number.isFinite check
+    expect(getMaxEmbedFileBytes()).toBe(DEFAULT_MAX_EMBED_FILE_BYTES);
+  });
+});
diff --git a/src/qmd.ts b/src/qmd.ts
index 8e47953d..aa22c05e 100755
--- a/src/qmd.ts
+++ b/src/qmd.ts
@@ -21,6 +21,7 @@ import {
   isDocid,
   matchFilesByGlob,
   getHashesNeedingEmbedding,
+  getEmbedBreakdown,
   getHashesForEmbedding,
   clearAllEmbeddings,
   insertEmbedding,
@@ -62,6 +63,7 @@ import {
   DEFAULT_RERANK_MODEL,
   DEFAULT_GLOB,
   DEFAULT_MULTI_GET_MAX_BYTES,
+  DEFAULT_MAX_EMBED_FILE_BYTES,
   createStore,
   getDefaultDbPath,
 } from "./store.js";
@@ -269,7 +271,8 @@ function showStatus(): void {
   // Overall stats
   const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
   const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
-  const needsEmbedding = getHashesNeedingEmbedding(db);
+  const maxEmbedSize = getMaxEmbedFileBytes();
+  const { needsEmbedding, tooLarge } = getEmbedBreakdown(db, maxEmbedSize);
 
   // Most recent update across all collections
   const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
@@ -301,6 +304,9 @@ function showStatus(): void {
   if (needsEmbedding > 0) {
     console.log(`  ${c.yellow}Pending:  ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
   }
+  if (tooLarge > 0) {
+    console.log(`  ${c.dim}Skipped:  ${tooLarge} exceed ${formatBytes(maxEmbedSize)} size limit${c.reset}`);
+  }
   if (mostRecent.latest) {
     const lastUpdate = new Date(mostRecent.latest);
     console.log(`  Updated:  ${formatTimeAgo(lastUpdate)}`);
@@ -1482,7 +1488,20 @@ function renderProgressBar(percent: number, width: number = 30): string {
   return bar;
 }
 
-async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
+// Resolves the embed size limit in bytes from QMD_MAX_EMBED_FILE_BYTES; unset or invalid values yield DEFAULT_MAX_EMBED_FILE_BYTES.
+// The value is floored BEFORE validation so a fraction below 1 (e.g. "0.5") is rejected instead of becoming a 0-byte limit.
+export function getMaxEmbedFileBytes(): number {
+  const env = process.env.QMD_MAX_EMBED_FILE_BYTES;
+  if (!env) return DEFAULT_MAX_EMBED_FILE_BYTES;
+  const parsed = Math.floor(Number(env));
+  if (!Number.isFinite(parsed) || parsed <= 0) {
+    process.stderr.write(`${c.yellow}Warning: Invalid QMD_MAX_EMBED_FILE_BYTES="${env}", using default ${formatBytes(DEFAULT_MAX_EMBED_FILE_BYTES)}${c.reset}\n`);
+    return DEFAULT_MAX_EMBED_FILE_BYTES;
+  }
+  return parsed;
+}
+
+async function vectorIndex({ model = DEFAULT_EMBED_MODEL, force = false, noSizeLimit = false }: { model?: string; force?: boolean; noSizeLimit?: boolean } = {}): Promise<void> {
   const db = getDb();
   const now = new Date().toISOString();
 
@@ -1507,12 +1526,23 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   let multiChunkDocs = 0;
 
   // Chunk all documents using actual token counts
+  const maxEmbedSize = noSizeLimit ? Infinity : getMaxEmbedFileBytes();
+  let skippedFiles = 0;
+
   process.stderr.write(`Chunking ${hashesToEmbed.length} documents by token count...\n`);
   for (const item of hashesToEmbed) {
-    const encoder = new TextEncoder();
-    const bodyBytes = encoder.encode(item.body).length;
+    const bodyBytes = Buffer.byteLength(item.body, 'utf8');
     if (bodyBytes === 0) continue; // Skip empty
 
+    // Content size limit check
+    if (bodyBytes > maxEmbedSize) {
+      process.stderr.write(
+        `${c.yellow}Skipping ${item.path} (${formatBytes(bodyBytes)} exceeds ${formatBytes(maxEmbedSize)} limit)${c.reset}\n`
+      );
+      skippedFiles++;
+      continue;
+    }
+
     const title = extractTitle(item.body, item.path);
     const displayName = item.path;
     const chunks = await chunkDocumentByTokens(item.body);  // Uses actual tokenizer
@@ -1527,12 +1557,16 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
         seq,
         pos: chunks[seq]!.pos,
         tokens: chunks[seq]!.tokens,
-        bytes: encoder.encode(chunks[seq]!.text).length,
+        bytes: Buffer.byteLength(chunks[seq]!.text, 'utf8'),
         displayName,
       });
     }
   }
 
+  if (skippedFiles > 0) {
+    console.log(`${c.yellow}${skippedFiles} file(s) skipped (exceeded ${formatBytes(maxEmbedSize)} file size limit). Use --no-size-limit to include all files.${c.reset}`);
+  }
+
   if (allChunks.length === 0) {
     console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
     closeDb();
@@ -1541,7 +1575,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
 
   const totalBytes = allChunks.reduce((sum, chk) => sum + chk.bytes, 0);
   const totalChunks = allChunks.length;
-  const totalDocs = hashesToEmbed.length;
+  const totalDocs = hashesToEmbed.length - skippedFiles;
 
   console.log(`${c.bold}Embedding ${totalDocs} documents${c.reset} ${c.dim}(${totalChunks} chunks, ${formatBytes(totalBytes)})${c.reset}`);
   if (multiChunkDocs > 0) {
@@ -2046,6 +2080,7 @@ function parseCLI() {
       mask: { type: "string" },  // glob pattern
       // Embed options
       force: { type: "boolean", short: "f" },
+      "no-size-limit": { type: "boolean" },
       // Update options
       pull: { type: "boolean" },  // git pull before update
       refresh: { type: "boolean" },
@@ -2116,7 +2151,7 @@ function showHelp(): void {
   console.log("  qmd multi-get <pattern> [-l N] [--max-bytes N]  - Get multiple docs by glob or comma-separated list");
   console.log("  qmd status                    - Show index status and collections");
   console.log("  qmd update [--pull]           - Re-index all collections (--pull: git pull first)");
-  console.log("  qmd embed [-f]                - Create vector embeddings (800 tokens/chunk, 15% overlap)");
+  console.log("  qmd embed [-f] [--no-size-limit]  - Create vector embeddings (800 tokens/chunk, 15% overlap)");
   console.log("  qmd cleanup                   - Remove cache and orphaned data, vacuum DB");
   console.log("  qmd search <query>            - Full-text search (BM25)");
   console.log("  qmd vsearch <query>           - Vector similarity search");
@@ -2147,6 +2182,10 @@ function showHelp(): void {
   console.log("  --max-bytes <num>          - Skip files larger than N bytes (default: 10240)");
   console.log("  --json/--csv/--md/--xml/--files - Output format (same as search)");
   console.log("");
+  console.log("Embed options:");
+  console.log("  -f, --force                - Force re-index all embeddings");
+  console.log("  --no-size-limit            - Embed all files regardless of size (default limit: 5MB)");
+  console.log("");
   console.log("Models (auto-downloaded from HuggingFace):");
   console.log("  Embedding: embeddinggemma-300M-Q8_0");
   console.log("  Reranking: qwen3-reranker-0.6b-q8_0");
@@ -2333,7 +2372,7 @@ if (import.meta.main) {
       break;
 
     case "embed":
-      await vectorIndex(DEFAULT_EMBED_MODEL, !!cli.values.force);
+      await vectorIndex({ force: !!cli.values.force, noSizeLimit: !!cli.values["no-size-limit"] });
       break;
 
     case "pull": {
diff --git a/src/store.test.ts b/src/store.test.ts
index d204d600..356977a3 100644
--- a/src/store.test.ts
+++ b/src/store.test.ts
@@ -37,6 +37,7 @@ import {
   isDocid,
   STRONG_SIGNAL_MIN_SCORE,
   STRONG_SIGNAL_MIN_GAP,
+  getEmbedBreakdown,
   type Store,
   type DocumentResult,
   type SearchResult,
@@ -2625,3 +2626,59 @@ describe("isDocid", () => {
     expect(isDocid("abc123.md")).toBe(false);
   });
 });
+
+describe("getEmbedBreakdown", () => { // Unit tests for the pending vs. too-large split reported by getEmbedBreakdown
+  let store: Store;
+
+  beforeEach(async () => {
+    store = await createTestStore(); // fresh store per test so counts are not affected by earlier inserts
+  });
+
+  afterEach(async () => {
+    await cleanupTestDb(store);
+  });
+
+  test("all docs need embedding when under size limit", async () => {
+    await insertTestDocument(store.db, "col", { name: "a", body: "short" });
+    await insertTestDocument(store.db, "col", { name: "b", body: "also short" });
+    const result = getEmbedBreakdown(store.db, 1_000_000);
+    expect(result.needsEmbedding).toBe(2);
+    expect(result.tooLarge).toBe(0);
+  });
+
+  test("all docs too large when over size limit", async () => {
+    await insertTestDocument(store.db, "col", { name: "a", body: "some content here" });
+    await insertTestDocument(store.db, "col", { name: "b", body: "more content here" });
+    const result = getEmbedBreakdown(store.db, 1); // 1 byte limit
+    expect(result.needsEmbedding).toBe(0);
+    expect(result.tooLarge).toBe(2);
+  });
+
+  test("mixed sizes split correctly", async () => {
+    const small = "hi";
+    const large = "x".repeat(500);
+    await insertTestDocument(store.db, "col", { name: "small", body: small });
+    await insertTestDocument(store.db, "col", { name: "large", body: large });
+    const result = getEmbedBreakdown(store.db, 100); // limit sits between the 2-character and 500-character bodies
+    expect(result.needsEmbedding).toBe(1);
+    expect(result.tooLarge).toBe(1);
+  });
+
+  test("already embedded docs are excluded", async () => {
+    const body = "embedded content";
+    const hash = await hashContent(body);
+    await insertTestDocument(store.db, "col", { name: "emb", body, hash });
+    // Simulate existing embedding
+    store.db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'test', datetime('now'))`).run(hash); // seq 0 is the row the breakdown query joins on
+    const result = getEmbedBreakdown(store.db, 1_000_000);
+    expect(result.needsEmbedding).toBe(0);
+    expect(result.tooLarge).toBe(0);
+  });
+
+  test("inactive docs are excluded", async () => {
+    await insertTestDocument(store.db, "col", { name: "inactive", body: "content", active: 0 }); // presumably stored as documents.active = 0 — verify against the helper
+    const result = getEmbedBreakdown(store.db, 1_000_000);
+    expect(result.needsEmbedding).toBe(0);
+    expect(result.tooLarge).toBe(0);
+  });
+});
diff --git a/src/store.ts b/src/store.ts
index 1b5791d4..bfd0d1ba 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -48,6 +48,7 @@ export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
 export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
 export const DEFAULT_GLOB = "**/*.md";
 export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
+export const DEFAULT_MAX_EMBED_FILE_BYTES = 5 * 1024 * 1024; // 5MB
 
 // Chunking: 800 tokens per chunk with 15% overlap
 export const CHUNK_SIZE_TOKENS = 800;
@@ -913,6 +914,19 @@ export function getHashesNeedingEmbedding(db: Database): number {
   return result.count;
 }
 
+// Counts active documents without a seq-0 vector, split at maxBytes; CAST(... AS BLOB) makes LENGTH count bytes (not characters) like the embed-time check.
+export function getEmbedBreakdown(db: Database, maxBytes: number): { needsEmbedding: number; tooLarge: number } {
+  return db.prepare(`
+    SELECT
+      COUNT(DISTINCT CASE WHEN LENGTH(CAST(c.doc AS BLOB)) <= ? THEN d.hash END) as needsEmbedding,
+      COUNT(DISTINCT CASE WHEN LENGTH(CAST(c.doc AS BLOB)) > ? THEN d.hash END) as tooLarge
+    FROM documents d
+    JOIN content c ON d.hash = c.hash
+    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
+    WHERE d.active = 1 AND v.hash IS NULL
+  `).get(maxBytes, maxBytes) as { needsEmbedding: number; tooLarge: number };
+}
+
 export type IndexHealthInfo = {
   needsEmbedding: number;
   totalDocs: number;