Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions src/objects/pdf-name.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ describe("PdfName", () => {
expect(PdfName.of("")).toBe(empty);
});

describe("LRU cache", () => {
describe("WeakRef cache", () => {
it("clearCache clears non-permanent names", () => {
const custom = PdfName.of("CustomName");
PdfName.of("CustomName");
expect(PdfName.cacheSize).toBeGreaterThan(0);

PdfName.clearCache();
Expand All @@ -70,5 +70,13 @@ describe("PdfName", () => {
expect(PdfName.of("Type")).toBe(PdfName.Type);
expect(PdfName.of("Page")).toBe(PdfName.Page);
});

it("returns same instance while strong reference is held", () => {
const held = PdfName.of("HeldName");

// As long as we hold the reference, .of() returns the same instance
expect(PdfName.of("HeldName")).toBe(held);
expect(PdfName.of("HeldName")).toBe(held);
});
});
});
88 changes: 65 additions & 23 deletions src/objects/pdf-name.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { HEX_TABLE } from "#src/helpers/buffer";
import { CHAR_HASH, DELIMITERS, WHITESPACE } from "#src/helpers/chars";
import { LRUCache } from "#src/helpers/lru-cache";
import type { ByteWriter } from "#src/io/byte-writer";

import type { PdfPrimitive } from "./pdf-primitive";
Expand Down Expand Up @@ -60,37 +59,52 @@ function escapeName(name: string): string {
}

/**
* Default cache size for PdfName interning.
* Can be overridden via PdfName.setCacheSize().
*/
const DEFAULT_NAME_CACHE_SIZE = 10000;

/**
* PDF name object (interned).
* PDF name object (interned via WeakRef).
*
* In PDF: `/Type`, `/Page`, `/Length`
*
* Names are interned using an LRU cache to prevent unbounded memory growth.
* `PdfName.of("Type") === PdfName.of("Type")` as long as both are in cache.
* Use `.of()` to get or create instances.
* Names are interned using a WeakRef cache: as long as any live object
* (e.g. a PdfDict key) holds a strong reference to a PdfName, calling
* `PdfName.of()` with the same string returns the *same instance*.
* Once all strong references are dropped, the GC may collect the
* PdfName and a FinalizationRegistry cleans up the cache entry.
*
* This avoids the correctness bug of LRU-based caching, where eviction
* of a still-referenced name would break Map key identity in PdfDict.
*
* Common PDF names (Type, Page, etc.) are pre-cached and always available.
* Common PDF names (Type, Page, etc.) are held as static fields and
* therefore never collected.
*/
export class PdfName implements PdfPrimitive {
get type(): "name" {
return "name";
}

private static cache = new LRUCache<string, PdfName>({ max: DEFAULT_NAME_CACHE_SIZE });
/** WeakRef cache for interning. Entries are cleaned up by the FinalizationRegistry. */
private static cache = new Map<string, WeakRef<PdfName>>();

/**
 * Finalization hook that prunes stale entries from the interning cache.
 *
 * The callback receives the name string that was passed as the held value
 * when the PdfName was registered, once that instance has been collected.
 */
private static registry = new FinalizationRegistry<string>(collectedName => {
  const entry = PdfName.cache.get(collectedName);

  // Nothing is cached under this name any more, so there is nothing to prune.
  if (!entry) {
    return;
  }

  // A fresh instance may have been interned under the same name after the
  // old one was collected; only drop the entry when its target is gone.
  if (entry.deref() !== undefined) {
    return;
  }

  PdfName.cache.delete(collectedName);
});

/**
* Pre-cached common names that should never be evicted.
* These are stored separately from the LRU cache.
* Pre-cached common names that are always available.
* These are stored as static readonly fields, so they always have
* strong references and their WeakRefs never die.
*/
private static readonly permanentCache = new Map<string, PdfName>();

// Common PDF names (pre-cached in permanent cache)
// -- Document structure --
static readonly Type = PdfName.createPermanent("Type");
static readonly Subtype = PdfName.createPermanent("Subtype");
static readonly Page = PdfName.createPermanent("Page");
static readonly Pages = PdfName.createPermanent("Pages");
static readonly Catalog = PdfName.createPermanent("Catalog");
Expand All @@ -100,9 +114,25 @@ export class PdfName implements PdfPrimitive {
static readonly MediaBox = PdfName.createPermanent("MediaBox");
static readonly Resources = PdfName.createPermanent("Resources");
static readonly Contents = PdfName.createPermanent("Contents");
static readonly Annots = PdfName.createPermanent("Annots");
// -- Trailer / xref --
static readonly Root = PdfName.createPermanent("Root");
static readonly Size = PdfName.createPermanent("Size");
static readonly Info = PdfName.createPermanent("Info");
static readonly Prev = PdfName.createPermanent("Prev");
static readonly ID = PdfName.createPermanent("ID");
static readonly Encrypt = PdfName.createPermanent("Encrypt");
// -- Streams --
static readonly Length = PdfName.createPermanent("Length");
static readonly Filter = PdfName.createPermanent("Filter");
static readonly FlateDecode = PdfName.createPermanent("FlateDecode");
// -- Fonts / resources --
static readonly Font = PdfName.createPermanent("Font");
static readonly BaseFont = PdfName.createPermanent("BaseFont");
static readonly Encoding = PdfName.createPermanent("Encoding");
static readonly XObject = PdfName.createPermanent("XObject");
// -- Name trees --
static readonly Names = PdfName.createPermanent("Names");

/** Cached serialized form (e.g. "/Type"). Computed lazily on first toBytes(). */
private cachedBytes: Uint8Array | null = null;
Expand All @@ -114,21 +144,31 @@ export class PdfName implements PdfPrimitive {
* The leading `/` should NOT be included.
*/
static of(name: string): PdfName {
// Check permanent cache first (common names)
// Check permanent cache first (common names — always alive)
const permanent = PdfName.permanentCache.get(name);

if (permanent) {
return permanent;
}

// Check LRU cache
let cached = PdfName.cache.get(name);
// Check WeakRef cache
const ref = PdfName.cache.get(name);

if (ref) {
const existing = ref.deref();

if (!cached) {
cached = new PdfName(name);
PdfName.cache.set(name, cached);
if (existing) {
return existing;
}
}

return cached;
// Create new instance, store WeakRef, register for cleanup
const instance = new PdfName(name);

PdfName.cache.set(name, new WeakRef(instance));
PdfName.registry.register(instance, name);

return instance;
}

/**
Expand All @@ -144,7 +184,9 @@ export class PdfName implements PdfPrimitive {
}

/**
* Get the current size of the LRU cache.
* Get the current number of entries in the WeakRef cache.
* This includes entries whose targets may have been GC'd but whose
* FinalizationRegistry callbacks haven't run yet.
*/
static get cacheSize(): number {
return PdfName.cache.size;
Expand Down
46 changes: 42 additions & 4 deletions src/parser/indirect-object-parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -217,15 +217,51 @@ endobj`,
expect(new TextDecoder().decode(stream.data)).toBe("Hello");
});

it("throws if indirect /Length cannot be resolved", () => {
it("falls back to endstream scan when indirect /Length cannot be resolved", () => {
const p = parser(`1 0 obj
<< /Length 99 0 R >>
stream
Hello
endstream
endobj`);
const result = p.parseObject();

expect(() => p.parseObject()).toThrow(/resolve.*length/i);
const stream = result.value as PdfStream;
expect(new TextDecoder().decode(stream.data)).toBe("Hello");
});

it("falls back to endstream scan when no resolver provided", () => {
// Build input with actual binary bytes in the stream data
const prefix = new TextEncoder().encode("1 0 obj\n<< /Length 99 0 R >>\nstream\n");
const binaryContent = new Uint8Array([0x00, 0x01, 0xff, 0xfe, 0x80]);
const suffix = new TextEncoder().encode("\nendstream\nendobj");

const fullBytes = new Uint8Array(prefix.length + binaryContent.length + suffix.length);
fullBytes.set(prefix);
fullBytes.set(binaryContent, prefix.length);
fullBytes.set(suffix, prefix.length + binaryContent.length);

const scanner = new Scanner(fullBytes);
const p = new IndirectObjectParser(scanner);
const result = p.parseObject();

const stream = result.value as PdfStream;
expect(stream.data.length).toBe(5);
expect(stream.data[0]).toBe(0x00);
expect(stream.data[2]).toBe(0xff);
});

it("falls back to endstream scan when /Length is missing", () => {
const p = parser(`1 0 obj
<< /Filter /FlateDecode >>
stream
Hello
endstream
endobj`);
const result = p.parseObject();

const stream = result.value as PdfStream;
expect(new TextDecoder().decode(stream.data)).toBe("Hello");
});

it("preserves stream dict entries", () => {
Expand Down Expand Up @@ -281,15 +317,17 @@ endobj`);
expect(() => p.parseObject()).toThrow(/obj/i);
});

it("throws on missing /Length in stream", () => {
it("recovers stream with missing /Length via endstream scan", () => {
const p = parser(`1 0 obj
<< /Type /XObject >>
stream
data
endstream
endobj`);
const result = p.parseObject();

expect(() => p.parseObject()).toThrow(/length/i);
const stream = result.value as PdfStream;
expect(new TextDecoder().decode(stream.data)).toBe("data");
});
});
});
65 changes: 62 additions & 3 deletions src/parser/indirect-object-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,26 @@ export class IndirectObjectParser {
// Skip EOL after "stream" (required: LF or CRLF)
this.skipStreamEOL();

// Get the stream length
const length = this.resolveLength(dict);
const startPos = this.scanner.position;

// Try to resolve /Length from the dict. If that fails (e.g. indirect
// ref during brute-force recovery with no resolver), fall back to
// scanning for the "endstream" keyword to determine the length.
let length: number;

try {
length = this.resolveLength(dict);
} catch {
length = this.findEndStream(startPos);

if (length < 0) {
throw new ObjectParseError("Stream missing /Length and no endstream found");
}
}

// Read exactly `length` bytes.
// Use subarray (zero-copy view) since the underlying PDF bytes
// are kept alive by the PDF object for the document's lifetime.
const startPos = this.scanner.position;
const data = this.scanner.bytes.subarray(startPos, startPos + length);

this.scanner.moveTo(startPos + length);
Expand Down Expand Up @@ -220,6 +233,52 @@ export class IndirectObjectParser {
}
}

/**
 * Locate the "endstream" keyword at or after `startPos` and derive the
 * stream payload length from its position.
 *
 * The single EOL (LF, CRLF, or a lone CR) directly in front of the keyword
 * is treated as stream framing rather than payload (PDF spec 7.3.8.1) and
 * is therefore excluded from the returned length.
 *
 * @param startPos - Offset of the first stream data byte.
 * @returns Number of data bytes between `startPos` and the keyword, or -1
 *   when the keyword does not occur.
 */
private findEndStream(startPos: number): number {
  const buf = this.scanner.bytes;

  // ASCII codes spelling "endstream"
  const keyword = [0x65, 0x6e, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d];
  const lastCandidate = buf.length - keyword.length;

  for (let pos = startPos; pos <= lastCandidate; pos++) {
    // Count how many leading keyword bytes agree at this offset.
    let matched = 0;

    while (matched < keyword.length && buf[pos + matched] === keyword[matched]) {
      matched++;
    }

    if (matched < keyword.length) {
      continue;
    }

    // Keyword starts at `pos`. Peek at the byte just before it (if that
    // byte still belongs to the stream) to trim the framing EOL.
    let dataEnd = pos;
    const preceding = dataEnd > startPos ? buf[dataEnd - 1] : undefined;

    if (preceding === LF) {
      dataEnd--;

      // CRLF: the CR in front of the LF is framing too.
      if (dataEnd > startPos && buf[dataEnd - 1] === CR) {
        dataEnd--;
      }
    } else if (preceding === CR) {
      dataEnd--;
    }

    return dataEnd - startPos;
  }

  return -1;
}

/**
* Resolve the /Length value from the stream dict.
* Handles both direct values and indirect references.
Expand Down
42 changes: 42 additions & 0 deletions src/parser/xref-parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,48 @@ some content without startxref

expect(() => p.findStartXRef()).toThrow(/startxref/i);
});

it("skips trailing null bytes to find startxref", () => {
const content = `%PDF-1.4
some content
xref
0 1
0000000000 65535 f
trailer
<< /Size 1 /Root 1 0 R >>
startxref
23
%%EOF`;
// Append 2048 null bytes (exceeds the 1024-byte search window)
const contentBytes = new TextEncoder().encode(content);
const padded = new Uint8Array(contentBytes.length + 2048);

padded.set(contentBytes);
// rest is already 0x00

const scanner = new Scanner(padded);
const p = new XRefParser(scanner);
const offset = p.findStartXRef();

expect(offset).toBe(23);
});

it("skips trailing whitespace mix to find startxref", () => {
const content = `%PDF-1.4\nstartxref\n50\n%%EOF`;
const contentBytes = new TextEncoder().encode(content);
// Append a mix of whitespace: spaces, newlines, tabs, nulls
const padding = new Uint8Array([0x20, 0x0a, 0x09, 0x00, 0x0d, 0x20, 0x00]);
const padded = new Uint8Array(contentBytes.length + padding.length);

padded.set(contentBytes);
padded.set(padding, contentBytes.length);

const scanner = new Scanner(padded);
const p = new XRefParser(scanner);
const offset = p.findStartXRef();

expect(offset).toBe(50);
});
});

describe("lenient parsing", () => {
Expand Down
Loading
Loading