From e7900fcb87f4d3e22f05cb0e8c440fcf7a628cfd Mon Sep 17 00:00:00 2001
From: calixteman
Date: Thu, 29 Jan 2026 21:43:30 +0100
Subject: [PATCH] Decompress object streams asynchronously when it's possible

Most of the time, object streams are compressed using FlateDecode (and, in
the future, BrotliDecode). To improve performance, we can decompress those
streams with a built-in decompressor, but that has to be done
asynchronously.
Since it cannot be done while fetching, which is synchronous, we need to do
it as part of the PDF parsing process.
The drawback is that this requires more memory, since we need to keep both
the compressed and uncompressed versions of the object streams in memory
until the parsing is done.
---
 src/core/core_utils.js  |   3 +-
 src/core/document.js    |   4 +-
 src/core/pdf_manager.js |   9 +++-
 src/core/xref.js        | 102 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/core/core_utils.js b/src/core/core_utils.js
index f3218ac614053..0e4aa9dddc477 100644
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@@ -62,10 +62,11 @@ function getLookupTableFactory(initializer) {
 }

 class MissingDataException extends BaseException {
-  constructor(begin, end) {
+  constructor(begin, end, objStreamRefNum = 0) {
     super(`Missing data [${begin}, ${end})`, "MissingDataException");
     this.begin = begin;
     this.end = end;
+    this.objStreamRefNum = objStreamRefNum;
   }
 }

diff --git a/src/core/document.js b/src/core/document.js
index 9d65d04ac84ab..c1e9800e7553f 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -1038,8 +1038,8 @@ class PDFDocument {
     };
   }

-  parse(recoveryMode) {
-    this.xref.parse(recoveryMode);
+  async parse(recoveryMode) {
+    await this.xref.parse(recoveryMode);
     this.catalog = new Catalog(this.pdfManager, this.xref);
   }

diff --git a/src/core/pdf_manager.js b/src/core/pdf_manager.js
index 83ea377bc47a2..ec26a197cfbfb 100644
--- a/src/core/pdf_manager.js
+++ b/src/core/pdf_manager.js
@@ -191,7 +191,7 @@ class NetworkPdfManager extends BasePdfManager {
     try {
       const value = obj[prop];
       if (typeof value === "function") {
-        return value.apply(obj, args);
+        return await value.apply(obj, args);
       }
       return value;
     } catch (ex) {
@@ -199,6 +199,13 @@ class NetworkPdfManager extends BasePdfManager {
         throw ex;
       }
       await this.requestRange(ex.begin, ex.end);
+      if (ex.objStreamRefNum) {
+        // The missing data belongs to an object stream: now that its range
+        // has been requested, decompress it asynchronously before retrying.
+        await this.pdfDocument.xref.decompressObjectStreams(
+          ex.objStreamRefNum
+        );
+      }
       return this.ensure(obj, prop, args);
     }
   }
diff --git a/src/core/xref.js b/src/core/xref.js
index c13c08a7d3bde..7b3489cef3100 100644
--- a/src/core/xref.js
+++ b/src/core/xref.js
@@ -31,6 +31,7 @@ import {
 } from "./core_utils.js";
 import { BaseStream } from "./base_stream.js";
 import { CipherTransformFactory } from "./crypto.js";
+import { Stream } from "./stream.js";

 class XRef {
   constructor(stream, pdfManager) {
@@ -43,6 +44,7 @@ class XRef {
     this._newPersistentRefNum = null;
     this._newTemporaryRefNum = null;
     this._persistentRefsCache = null;
+    this._objectStreams = new Map();
   }

   getNewPersistentRef(obj) {
@@ -96,7 +98,7 @@ class XRef {
     this.startXRefQueue = [startXRef];
   }

-  parse(recoveryMode = false) {
+  async parse(recoveryMode = false) {
     let trailerDict;
     if (!recoveryMode) {
       trailerDict = this.readXRef();
@@ -107,6 +109,8 @@ class XRef {
     trailerDict.assignXref(this);
     this.trailer = trailerDict;

+    await this.decompressObjectStreams();
+
     let encrypt;
     try {
       encrypt = trailerDict.get("Encrypt");
@@ -925,7 +929,34 @@ class XRef {

   fetchCompressed(ref, xrefEntry, suppressEncryption = false) {
     const tableOffset = xrefEntry.offset;
-    const stream = this.fetch(Ref.get(tableOffset, 0));
+    const objectStream = this._objectStreams.get(tableOffset);
+    let stream;
+    if (objectStream) {
+      // The object stream was already decompressed asynchronously: use that
+      // copy and release it from the cache.
+      stream = objectStream;
+      this._objectStreams.delete(tableOffset);
+    } else {
+      try {
+        stream = this.fetch(Ref.get(tableOffset, 0));
+      } catch (ex) {
+        if (ex instanceof MissingDataException) {
+          // Request the whole object stream, i.e. up to the offset of the
+          // next xref entry, so that it can be decompressed asynchronously
+          // once loaded. Fall back to the range which is actually missing
+          // when those bounds are unknown or not ordered.
+          const start = this.entries[tableOffset]?.offset;
+          const end = this.entries[tableOffset + 1]?.offset;
+          const hasBounds = end > start;
+          throw new MissingDataException(
+            hasBounds ? this.stream.start + start : ex.begin,
+            hasBounds ? this.stream.start + end : ex.end,
+            /* objStreamRefNum = */ ref.num
+          );
+        }
+        throw new FormatError("bad ObjStm stream");
+      }
+    }
     if (!(stream instanceof BaseStream)) {
       throw new FormatError("bad ObjStm stream");
     }
@@ -1030,6 +1061,73 @@ class XRef {
   getCatalogObj() {
     return this.root;
   }
+
+  /**
+   * Asynchronously decompress object streams and cache the result in
+   * `_objectStreams`, where `fetchCompressed` picks it up.
+   * Only the streams whose `isAsync` and `isDataLoaded` flags are set are
+   * handled; failures are ignored since `fetchCompressed` falls back to
+   * `fetch` for the streams which aren't in the cache.
+   *
+   * @param {number|null} [entryOffset] - Index, in `this.entries`, of the
+   *   only entry to consider; when `null` all the entries are scanned.
+   * @returns {Promise<undefined>}
+   */
+  async decompressObjectStreams(entryOffset = null) {
+    // Seeded with 0 in order to skip the entries whose offset is 0.
+    const done = new Set([0]);
+    const promises = [];
+    let entries = this.entries;
+    if (entryOffset !== null) {
+      entries = { [entryOffset]: this.entries[entryOffset] };
+    }
+    for (const num in entries) {
+      if (!Object.hasOwn(entries, num)) {
+        continue;
+      }
+      const entry = entries[num];
+      // `entry` is undefined when `entryOffset` isn't a known index.
+      if (!entry || entry.uncompressed) {
+        continue;
+      }
+      const tableOffset = entry.offset;
+      if (done.has(tableOffset)) {
+        continue;
+      }
+      done.add(tableOffset);
+      let stream;
+      try {
+        stream = this.fetch(Ref.get(tableOffset, 0));
+      } catch {
+        // Best effort: `stream` stays undefined and is skipped below.
+      }
+
+      if (
+        !(stream instanceof BaseStream) ||
+        !stream.isAsync ||
+        !stream.isDataLoaded
+      ) {
+        continue;
+      }
+
+      promises.push(
+        stream
+          .asyncGetBytes()
+          .then(bytes => {
+            if (bytes) {
+              this._objectStreams.set(
+                tableOffset,
+                new Stream(bytes, 0, bytes.length, stream.dict)
+              );
+            }
+          })
+          .catch(() => {
+            // Best effort: nothing is cached for this stream.
+          })
+      );
+    }
+    await Promise.all(promises);
+  }
 }

 export { XRef };