diff --git a/src/objects/pdf-name.test.ts b/src/objects/pdf-name.test.ts index 115b330..8684668 100644 --- a/src/objects/pdf-name.test.ts +++ b/src/objects/pdf-name.test.ts @@ -50,9 +50,9 @@ describe("PdfName", () => { expect(PdfName.of("")).toBe(empty); }); - describe("LRU cache", () => { + describe("WeakRef cache", () => { it("clearCache clears non-permanent names", () => { - const custom = PdfName.of("CustomName"); + PdfName.of("CustomName"); expect(PdfName.cacheSize).toBeGreaterThan(0); PdfName.clearCache(); @@ -70,5 +70,13 @@ describe("PdfName", () => { expect(PdfName.of("Type")).toBe(PdfName.Type); expect(PdfName.of("Page")).toBe(PdfName.Page); }); + + it("returns same instance while strong reference is held", () => { + const held = PdfName.of("HeldName"); + + // As long as we hold the reference, .of() returns the same instance + expect(PdfName.of("HeldName")).toBe(held); + expect(PdfName.of("HeldName")).toBe(held); + }); }); }); diff --git a/src/objects/pdf-name.ts b/src/objects/pdf-name.ts index d52561c..31adffb 100644 --- a/src/objects/pdf-name.ts +++ b/src/objects/pdf-name.ts @@ -1,6 +1,5 @@ import { HEX_TABLE } from "#src/helpers/buffer"; import { CHAR_HASH, DELIMITERS, WHITESPACE } from "#src/helpers/chars"; -import { LRUCache } from "#src/helpers/lru-cache"; import type { ByteWriter } from "#src/io/byte-writer"; import type { PdfPrimitive } from "./pdf-primitive"; @@ -60,37 +59,52 @@ function escapeName(name: string): string { } /** - * Default cache size for PdfName interning. - * Can be overridden via PdfName.setCacheSize(). - */ -const DEFAULT_NAME_CACHE_SIZE = 10000; - -/** - * PDF name object (interned). + * PDF name object (interned via WeakRef). * * In PDF: `/Type`, `/Page`, `/Length` * - * Names are interned using an LRU cache to prevent unbounded memory growth. - * `PdfName.of("Type") === PdfName.of("Type")` as long as both are in cache. - * Use `.of()` to get or create instances. 
+ * Names are interned using a WeakRef cache: as long as any live object + * (e.g. a PdfDict key) holds a strong reference to a PdfName, calling + * `PdfName.of()` with the same string returns the *same instance*. + * Once all strong references are dropped, the GC may collect the + * PdfName and a FinalizationRegistry cleans up the cache entry. + * + * This avoids the correctness bug of LRU-based caching, where eviction + * of a still-referenced name would break Map key identity in PdfDict. + * - * Common PDF names (Type, Page, etc.) are pre-cached and always available. + * Common PDF names (Type, Page, etc.) are held as static fields and + * therefore never collected. */ export class PdfName implements PdfPrimitive { get type(): "name" { return "name"; } - private static cache = new LRUCache({ max: DEFAULT_NAME_CACHE_SIZE }); + /** WeakRef cache for interning. Entries are cleaned up by the FinalizationRegistry. */ + private static cache = new Map<string, WeakRef<PdfName>>(); + + /** Cleans up dead WeakRef entries from the cache when a PdfName is GC'd. */ + private static registry = new FinalizationRegistry<string>(name => { + const ref = PdfName.cache.get(name); + + // Only delete if the entry is actually dead — a new instance for the + // same name may have been inserted since the old one was collected. + if (ref && ref.deref() === undefined) { + PdfName.cache.delete(name); + } + }); /** - * Pre-cached common names that should never be evicted. - * These are stored separately from the LRU cache. + * Pre-cached common names that are always available. + * These are stored as static readonly fields, so they always have + * strong references and their WeakRefs never die. 
*/ private static readonly permanentCache = new Map(); // Common PDF names (pre-cached in permanent cache) + // -- Document structure -- static readonly Type = PdfName.createPermanent("Type"); + static readonly Subtype = PdfName.createPermanent("Subtype"); static readonly Page = PdfName.createPermanent("Page"); static readonly Pages = PdfName.createPermanent("Pages"); static readonly Catalog = PdfName.createPermanent("Catalog"); @@ -100,9 +114,25 @@ export class PdfName implements PdfPrimitive { static readonly MediaBox = PdfName.createPermanent("MediaBox"); static readonly Resources = PdfName.createPermanent("Resources"); static readonly Contents = PdfName.createPermanent("Contents"); + static readonly Annots = PdfName.createPermanent("Annots"); + // -- Trailer / xref -- + static readonly Root = PdfName.createPermanent("Root"); + static readonly Size = PdfName.createPermanent("Size"); + static readonly Info = PdfName.createPermanent("Info"); + static readonly Prev = PdfName.createPermanent("Prev"); + static readonly ID = PdfName.createPermanent("ID"); + static readonly Encrypt = PdfName.createPermanent("Encrypt"); + // -- Streams -- static readonly Length = PdfName.createPermanent("Length"); static readonly Filter = PdfName.createPermanent("Filter"); static readonly FlateDecode = PdfName.createPermanent("FlateDecode"); + // -- Fonts / resources -- + static readonly Font = PdfName.createPermanent("Font"); + static readonly BaseFont = PdfName.createPermanent("BaseFont"); + static readonly Encoding = PdfName.createPermanent("Encoding"); + static readonly XObject = PdfName.createPermanent("XObject"); + // -- Name trees -- + static readonly Names = PdfName.createPermanent("Names"); /** Cached serialized form (e.g. "/Type"). Computed lazily on first toBytes(). */ private cachedBytes: Uint8Array | null = null; @@ -114,21 +144,31 @@ export class PdfName implements PdfPrimitive { * The leading `/` should NOT be included. 
*/ static of(name: string): PdfName { - // Check permanent cache first (common names) + // Check permanent cache first (common names — always alive) const permanent = PdfName.permanentCache.get(name); + if (permanent) { return permanent; } - // Check LRU cache - let cached = PdfName.cache.get(name); + // Check WeakRef cache + const ref = PdfName.cache.get(name); + + if (ref) { + const existing = ref.deref(); - if (!cached) { - cached = new PdfName(name); - PdfName.cache.set(name, cached); + if (existing) { + return existing; + } } - return cached; + // Create new instance, store WeakRef, register for cleanup + const instance = new PdfName(name); + + PdfName.cache.set(name, new WeakRef(instance)); + PdfName.registry.register(instance, name); + + return instance; } /** @@ -144,7 +184,9 @@ export class PdfName implements PdfPrimitive { } /** - * Get the current size of the LRU cache. + * Get the current number of entries in the WeakRef cache. + * This includes entries whose targets may have been GC'd but whose + * FinalizationRegistry callbacks haven't run yet. 
*/ static get cacheSize(): number { return PdfName.cache.size; diff --git a/src/parser/indirect-object-parser.test.ts b/src/parser/indirect-object-parser.test.ts index dcf1814..6b1fc41 100644 --- a/src/parser/indirect-object-parser.test.ts +++ b/src/parser/indirect-object-parser.test.ts @@ -217,15 +217,51 @@ endobj`, expect(new TextDecoder().decode(stream.data)).toBe("Hello"); }); - it("throws if indirect /Length cannot be resolved", () => { + it("falls back to endstream scan when indirect /Length cannot be resolved", () => { const p = parser(`1 0 obj << /Length 99 0 R >> stream Hello endstream endobj`); + const result = p.parseObject(); - expect(() => p.parseObject()).toThrow(/resolve.*length/i); + const stream = result.value as PdfStream; + expect(new TextDecoder().decode(stream.data)).toBe("Hello"); + }); + + it("falls back to endstream scan when no resolver provided", () => { + // Build input with actual binary bytes in the stream data + const prefix = new TextEncoder().encode("1 0 obj\n<< /Length 99 0 R >>\nstream\n"); + const binaryContent = new Uint8Array([0x00, 0x01, 0xff, 0xfe, 0x80]); + const suffix = new TextEncoder().encode("\nendstream\nendobj"); + + const fullBytes = new Uint8Array(prefix.length + binaryContent.length + suffix.length); + fullBytes.set(prefix); + fullBytes.set(binaryContent, prefix.length); + fullBytes.set(suffix, prefix.length + binaryContent.length); + + const scanner = new Scanner(fullBytes); + const p = new IndirectObjectParser(scanner); + const result = p.parseObject(); + + const stream = result.value as PdfStream; + expect(stream.data.length).toBe(5); + expect(stream.data[0]).toBe(0x00); + expect(stream.data[2]).toBe(0xff); + }); + + it("falls back to endstream scan when /Length is missing", () => { + const p = parser(`1 0 obj +<< /Filter /FlateDecode >> +stream +Hello +endstream +endobj`); + const result = p.parseObject(); + + const stream = result.value as PdfStream; + expect(new 
TextDecoder().decode(stream.data)).toBe("Hello"); }); it("preserves stream dict entries", () => { @@ -281,15 +317,17 @@ endobj`); expect(() => p.parseObject()).toThrow(/obj/i); }); - it("throws on missing /Length in stream", () => { + it("recovers stream with missing /Length via endstream scan", () => { const p = parser(`1 0 obj << /Type /XObject >> stream data endstream endobj`); + const result = p.parseObject(); - expect(() => p.parseObject()).toThrow(/length/i); + const stream = result.value as PdfStream; + expect(new TextDecoder().decode(stream.data)).toBe("data"); }); }); }); diff --git a/src/parser/indirect-object-parser.ts b/src/parser/indirect-object-parser.ts index c8923b5..411ee41 100644 --- a/src/parser/indirect-object-parser.ts +++ b/src/parser/indirect-object-parser.ts @@ -129,13 +129,26 @@ export class IndirectObjectParser { // Skip EOL after "stream" (required: LF or CRLF) this.skipStreamEOL(); - // Get the stream length - const length = this.resolveLength(dict); + const startPos = this.scanner.position; + + // Try to resolve /Length from the dict. If that fails (e.g. indirect + // ref during brute-force recovery with no resolver), fall back to + // scanning for the "endstream" keyword to determine the length. + let length: number; + + try { + length = this.resolveLength(dict); + } catch { + length = this.findEndStream(startPos); + + if (length < 0) { + throw new ObjectParseError("Stream missing /Length and no endstream found"); + } + } // Read exactly `length` bytes. // Use subarray (zero-copy view) since the underlying PDF bytes // are kept alive by the PDF object for the document's lifetime. - const startPos = this.scanner.position; const data = this.scanner.bytes.subarray(startPos, startPos + length); this.scanner.moveTo(startPos + length); @@ -220,6 +233,52 @@ export class IndirectObjectParser { } } + /** + * Scan forward from startPos looking for the "endstream" keyword. 
+ * Returns the stream data length (excluding any EOL before endstream), + * or -1 if not found. + */ + private findEndStream(startPos: number): number { + const bytes = this.scanner.bytes; + const len = bytes.length; + + // "endstream" as byte values + const sig = [0x65, 0x6e, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d]; + const sigLen = sig.length; + + for (let i = startPos; i <= len - sigLen; i++) { + let match = true; + + for (let j = 0; j < sigLen; j++) { + if (bytes[i + j] !== sig[j]) { + match = false; + break; + } + } + + if (match) { + // Found "endstream" at position i. + // Strip the optional EOL that precedes it (part of stream framing, + // not stream data — per PDF spec 7.3.8.1). + let end = i; + + if (end > startPos && bytes[end - 1] === LF) { + end--; + + if (end > startPos && bytes[end - 1] === CR) { + end--; + } + } else if (end > startPos && bytes[end - 1] === CR) { + end--; + } + + return end - startPos; + } + } + + return -1; + } + /** * Resolve the /Length value from the stream dict. * Handles both direct values and indirect references. 
diff --git a/src/parser/xref-parser.test.ts b/src/parser/xref-parser.test.ts index c323b58..a7134b1 100644 --- a/src/parser/xref-parser.test.ts +++ b/src/parser/xref-parser.test.ts @@ -356,6 +356,48 @@ some content without startxref expect(() => p.findStartXRef()).toThrow(/startxref/i); }); + + it("skips trailing null bytes to find startxref", () => { + const content = `%PDF-1.4 +some content +xref +0 1 +0000000000 65535 f +trailer +<< /Size 1 /Root 1 0 R >> +startxref +23 +%%EOF`; + // Append 2048 null bytes (exceeds the 1024-byte search window) + const contentBytes = new TextEncoder().encode(content); + const padded = new Uint8Array(contentBytes.length + 2048); + + padded.set(contentBytes); + // rest is already 0x00 + + const scanner = new Scanner(padded); + const p = new XRefParser(scanner); + const offset = p.findStartXRef(); + + expect(offset).toBe(23); + }); + + it("skips trailing whitespace mix to find startxref", () => { + const content = `%PDF-1.4\nstartxref\n50\n%%EOF`; + const contentBytes = new TextEncoder().encode(content); + // Append a mix of whitespace: spaces, newlines, tabs, nulls + const padding = new Uint8Array([0x20, 0x0a, 0x09, 0x00, 0x0d, 0x20, 0x00]); + const padded = new Uint8Array(contentBytes.length + padding.length); + + padded.set(contentBytes); + padded.set(padding, contentBytes.length); + + const scanner = new Scanner(padded); + const p = new XRefParser(scanner); + const offset = p.findStartXRef(); + + expect(offset).toBe(50); + }); }); describe("lenient parsing", () => { diff --git a/src/parser/xref-parser.ts b/src/parser/xref-parser.ts index 83e1f9c..8bd43d8 100644 --- a/src/parser/xref-parser.ts +++ b/src/parser/xref-parser.ts @@ -1,4 +1,4 @@ -import { CR, DIGIT_0, DIGIT_9, LF, SPACE, TAB } from "#src/helpers/chars"; +import { CR, DIGIT_0, DIGIT_9, isWhitespace, LF, SPACE, TAB } from "#src/helpers/chars"; import type { Scanner } from "#src/io/scanner"; import type { PdfDict } from "#src/objects/pdf-dict"; import { PdfNumber } from 
"#src/objects/pdf-number"; @@ -47,13 +47,21 @@ export class XRefParser { const bytes = this.scanner.bytes; const len = bytes.length; - // Search backwards from end, looking for "startxref" - // Usually within last 1024 bytes, but search more if needed - const searchStart = Math.max(0, len - 1024); + // Skip trailing whitespace/null bytes to find the effective end of file. + // Some systems pad PDFs with null bytes to block boundaries, which can + // push the actual %%EOF / startxref beyond the normal 1024-byte window. + let effectiveEnd = len; + + while (effectiveEnd > 0 && isWhitespace(bytes[effectiveEnd - 1])) { + effectiveEnd--; + } + + // Search backwards from effective end, looking for "startxref" + const searchStart = Math.max(0, effectiveEnd - 1024); let startxrefPos = -1; - for (let i = len - 9; i >= searchStart; i--) { + for (let i = effectiveEnd - 9; i >= searchStart; i--) { if (this.matchesAt(i, "startxref")) { startxrefPos = i; break; diff --git a/src/tests/issues/issue-54-name-interning-load.test.ts b/src/tests/issues/issue-54-name-interning-load.test.ts new file mode 100644 index 0000000..94a23b7 --- /dev/null +++ b/src/tests/issues/issue-54-name-interning-load.test.ts @@ -0,0 +1,205 @@ +/** + * Load test for PdfName interning under sustained PDF processing. + * + * Verifies that PdfDict lookups remain correct after processing many + * PDFs with diverse name sets. This is the scenario from issue #54 + * (point 3): in a long-running server processing hundreds of PDFs per + * second, the old LRU cache would evict PdfName instances that were + * still in use as PdfDict keys, causing silent lookup failures. + * + * The WeakRef-based cache fixes this by never evicting names that are + * still referenced. This test validates that guarantee under load. 
+ * + * Run with: + * bun run test:run -- -t "issue-54.*load" + */ + +import { PdfDict } from "#src/objects/pdf-dict"; +import { PdfName } from "#src/objects/pdf-name"; +import { PdfString } from "#src/objects/pdf-string"; +import { loadFixture } from "#src/test-utils"; +import { describe, expect, it } from "vitest"; + +import { PDF } from "../../api/pdf"; + +describe("issue-54: PdfName interning under load", () => { + describe("PdfDict correctness under name cache pressure", () => { + it("dict lookups work after flooding the cache with unique names", () => { + // Create dicts with custom keys, then flood the cache, then verify lookups. + // With the old LRU (max 10k), this would fail once the custom keys + // were evicted. With WeakRef, the dicts hold strong references so + // the names stay alive. + const dicts: PdfDict[] = []; + const expectedValues = new Map(); + + // Create 100 dicts, each with a unique key + for (let i = 0; i < 100; i++) { + const uniqueKey = `CustomField_Batch0_${i}`; + const dict = new PdfDict(); + + dict.set(uniqueKey, PdfString.fromString(`value_${i}`)); + dicts.push(dict); + expectedValues.set(i, uniqueKey); + } + + // Flood the name cache with 15,000 unique names (exceeds old LRU max of 10k). + // Don't hold references — these should be collectible. + for (let i = 0; i < 15_000; i++) { + PdfName.of(`FloodName_${i}_${Math.random().toString(36).slice(2)}`); + } + + // Verify all original dicts still work + for (let i = 0; i < dicts.length; i++) { + const key = expectedValues.get(i)!; + const value = dicts[i].get(key); + + expect(value, `dict[${i}].get("${key}") should not be undefined`).toBeDefined(); + expect((value as PdfString).asString()).toBe(`value_${i}`); + } + }); + + it("dict lookups work across interleaved create-flood-verify cycles", () => { + // Simulates a server that processes batches of PDFs: each batch + // creates dicts with unique names, and between batches we flood + // the cache with unrelated names. 
+ const allDicts: { dict: PdfDict; key: string; expected: string }[] = []; + + for (let batch = 0; batch < 20; batch++) { + // Create dicts with unique keys for this batch + for (let i = 0; i < 50; i++) { + const key = `Batch${batch}_Field${i}`; + const expected = `batch${batch}_value${i}`; + const dict = new PdfDict(); + + dict.set(key, PdfString.fromString(expected)); + allDicts.push({ dict, key, expected }); + } + + // Flood between batches — different names each time + for (let i = 0; i < 2000; i++) { + PdfName.of(`Flood_B${batch}_${i}`); + } + + // Verify ALL dicts from ALL previous batches still work + for (const { dict, key, expected } of allDicts) { + const value = dict.get(key); + + expect(value, `"${key}" lookup failed after batch ${batch}`).toBeDefined(); + expect((value as PdfString).asString()).toBe(expected); + } + } + + // 20 batches × 50 dicts = 1000 dicts, all still correct + expect(allDicts.length).toBe(1000); + }); + + it("permanent cache names always resolve to the same instance", () => { + // Flood heavily, then verify static names + for (let i = 0; i < 20_000; i++) { + PdfName.of(`StaticFlood_${i}`); + } + + expect(PdfName.of("Type")).toBe(PdfName.Type); + expect(PdfName.of("Root")).toBe(PdfName.Root); + expect(PdfName.of("Subtype")).toBe(PdfName.Subtype); + expect(PdfName.of("Font")).toBe(PdfName.Font); + expect(PdfName.of("Catalog")).toBe(PdfName.Catalog); + }); + }); + + describe("sustained PDF load/modify/save cycles", () => { + it("processes 200 PDFs with unique metadata without corruption", async () => { + const baseBytes = await loadFixture("basic", "rot0.pdf"); + + for (let i = 0; i < 200; i++) { + const title = `Document_${i}_${Date.now()}`; + + // Load a fresh copy + const pdf = await PDF.load(new Uint8Array(baseBytes)); + + // Set metadata with unique values + pdf.setTitle(title); + pdf.setAuthor(`Author_${i}`); + + // Save and verify round-trip + const saved = await pdf.save(); + + expect(saved.length).toBeGreaterThan(0); + + // 
Reload and verify metadata survived + const reloaded = await PDF.load(saved); + + expect(reloaded.getTitle()).toBe(title); + expect(reloaded.getPageCount()).toBe(1); + } + }); + + it("processes 100 PDFs with unique form fields and verifies values", async () => { + // Each iteration creates a PDF with uniquely-named form fields, + // saves it, reloads it, and verifies the field values. + // This creates thousands of unique PdfName instances for field + // names, appearances, fonts, etc. + for (let i = 0; i < 100; i++) { + const pdf = PDF.create(); + const form = pdf.getOrCreateForm(); + + // Create fields with unique names — each becomes a PdfName + const fieldName = `user_input_${i}_${Math.random().toString(36).slice(2, 8)}`; + const field = form.createTextField(fieldName); + + field.setValue(`Response #${i}`); + + const saved = await pdf.save(); + const reloaded = await PDF.load(saved); + + const reloadedForm = reloaded.getForm(); + + expect(reloadedForm, `form missing on iteration ${i}`).not.toBeNull(); + + const reloadedField = reloadedForm!.getField(fieldName); + + expect(reloadedField, `field "${fieldName}" missing on iteration ${i}`).toBeDefined(); + expect(reloadedField!.getValue()).toBe(`Response #${i}`); + } + }); + + it("holds multiple PDFs concurrently while processing more", async () => { + // Simulates a server that keeps recent PDFs in memory while + // processing new ones. The held PDFs must remain functional + // even as the name cache churns. 
+ const baseBytes = await loadFixture("forms", "sample_form.pdf"); + const heldPdfs: { pdf: PDF; expectedPageCount: number }[] = []; + + for (let i = 0; i < 150; i++) { + const pdf = await PDF.load(new Uint8Array(baseBytes)); + + // Hold onto every 10th PDF + if (i % 10 === 0) { + heldPdfs.push({ pdf, expectedPageCount: pdf.getPageCount() }); + } + + // Create name pressure via metadata + pdf.setTitle(`Concurrent_${i}`); + + const saved = await pdf.save(); + + expect(saved.length).toBeGreaterThan(0); + + // Periodically verify all held PDFs still work + if (i % 25 === 0) { + for (const { pdf: held, expectedPageCount } of heldPdfs) { + expect(held.getPageCount()).toBe(expectedPageCount); + expect(held.getTitle()).toBeDefined(); + } + } + } + + // Final verification of all held PDFs + expect(heldPdfs.length).toBe(15); + + for (const { pdf, expectedPageCount } of heldPdfs) { + expect(pdf.getPageCount()).toBe(expectedPageCount); + } + }); + }); +});