diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index d72a27a3..296b8997 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -1,4 +1,5 @@ import json +import logging from enum import Enum from io import BytesIO from pathlib import Path @@ -17,6 +18,8 @@ from docling_eval.datamodels.types import EvaluationModality, PredictionFormats from docling_eval.utils.utils import extract_images +_log = logging.getLogger(__name__) + seg_adapter = TypeAdapter(Dict[int, SegmentedPage]) @@ -151,6 +154,10 @@ def _extract_images( return pictures, page_images def as_record_dict(self): + # Convert images to bytes format BEFORE closing them + gt_pictures_bytes = self._images_to_bytes(self.ground_truth_pictures) + gt_page_images_bytes = self._images_to_bytes(self.ground_truth_page_images) + record = { self.get_field_alias("doc_id"): self.doc_id, self.get_field_alias("doc_path"): str(self.doc_path), @@ -158,13 +165,11 @@ def as_record_dict(self): self.get_field_alias("ground_truth_doc"): json.dumps( self.ground_truth_doc.export_to_dict() ), - self.get_field_alias("ground_truth_pictures"): self.ground_truth_pictures, + self.get_field_alias("ground_truth_pictures"): gt_pictures_bytes, self.get_field_alias("ground_truth_segmented_pages"): seg_adapter.dump_json( self.ground_truth_segmented_pages ).decode("utf-8"), - self.get_field_alias( - "ground_truth_page_images" - ): self.ground_truth_page_images, + self.get_field_alias("ground_truth_page_images"): gt_page_images_bytes, self.get_field_alias("mime_type"): self.mime_type, self.get_field_alias("modalities"): list( [m.value for m in self.modalities] @@ -183,6 +188,33 @@ def as_record_dict(self): return record + @staticmethod + def _pil_to_bytes(img: PIL.Image.Image) -> bytes: + """Convert PIL image to PNG bytes.""" + buffered = BytesIO() + img.save(buffered, format="PNG") + return buffered.getvalue() + + def _images_to_bytes(self, images: List[PIL.Image.Image]) -> List[dict]: + """Convert list of PIL Images to HuggingFace-compatible bytes format.""" + return [{"bytes": self._pil_to_bytes(img), "path": None} for img in images] + + @staticmethod + def _close_image_list(images: List[PIL.Image.Image]) -> None: + """Close all PIL Images in a list, logging any errors at debug level.""" + for img in images: + try: + img.close() + except Exception as e: + _log.debug(f"Failed to close PIL image: {e}") + + def _close_images(self) -> None: + """Close ground truth PIL Images to prevent memory leaks.""" + self._close_image_list(self.ground_truth_page_images) + self._close_image_list(self.ground_truth_pictures) + self.ground_truth_page_images = [] + self.ground_truth_pictures = [] + @model_validator(mode="after") def validate_images(self) -> "DatasetRecord": if not len(self.ground_truth_pictures) and not len( @@ -240,9 +272,9 @@ def validate_record_dict(cls, data: dict): img_buffer.seek(0) data[gt_binary] = DocumentStream(name="image.png", stream=img_buffer) - # Backward compatibility: ensure tags field exists for old datasets + # Backward compatibility: ensure tags field exists for old datasets and is not None tags_alias = cls.get_field_alias("tags") - if tags_alias not in data: + if tags_alias not in data or data[tags_alias] is None: data[tags_alias] = [] return data @@ -317,6 +349,10 @@ def as_record_dict(self): ) if self.predicted_doc is not None: + # Convert prediction images to bytes BEFORE closing + pred_pictures_bytes = self._images_to_bytes(self.predicted_pictures) + pred_page_images_bytes = self._images_to_bytes(self.predicted_page_images) + record.update( { self.get_field_alias("predicted_doc"): json.dumps( @@ -327,18 +363,27 @@ def as_record_dict(self): ): seg_adapter.dump_json(self.predicted_segmented_pages).decode( "utf-8" ), - self.get_field_alias("predicted_pictures"): self.predicted_pictures, + self.get_field_alias("predicted_pictures"): pred_pictures_bytes, self.get_field_alias( "predicted_page_images" - ): self.predicted_page_images, + ): pred_page_images_bytes, self.get_field_alias("original_prediction"): ( self.original_prediction ), } ) + # Close prediction images (parent already closed ground truth images) + self._close_prediction_images() return record + def _close_prediction_images(self) -> None: + """Close prediction PIL Images to prevent memory leaks.""" + self._close_image_list(self.predicted_page_images) + self._close_image_list(self.predicted_pictures) + self.predicted_page_images = [] + self.predicted_pictures = [] + @model_validator(mode="after") def validate_images(self) -> "DatasetRecordWithPrediction": # super().validate_images() diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index acd6d829..74749334 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -192,23 +192,20 @@ def retrieve_input_dataset(self) -> Path: Path to the retrieved dataset """ if isinstance(self.dataset_source, HFSource): + download_kwargs = { + "repo_id": self.dataset_source.repo_id, + "revision": self.dataset_source.revision, + "repo_type": "dataset", + "token": self.dataset_source.hf_token, + } + if not self.dataset_local_path: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - revision=self.dataset_source.revision, - repo_type="dataset", - token=self.dataset_source.hf_token, - ) + path_str = snapshot_download(**download_kwargs) path: Path = Path(path_str) self.dataset_local_path = path else: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - revision=self.dataset_source.revision, - repo_type="dataset", - token=self.dataset_source.hf_token, - local_dir=self.dataset_local_path, - ) + download_kwargs["local_dir"] = str(self.dataset_local_path) + path_str = snapshot_download(**download_kwargs) path = Path(path_str) elif isinstance(self.dataset_source, Path): path = self.dataset_source @@ -315,7 +312,6 @@ def save_to_disk( for record_chunk in chunkify(self.iterate(), chunk_size): record_list = [] for r in record_chunk: - record_list.append(r.as_record_dict()) if do_visualization: viz_path_split = self.target / "visualizations" / f"{r.doc_id}.html" @@ -333,6 +329,7 @@ def save_to_disk( doc=tmp, draw_reading_order=True, ) + record_list.append(r.as_record_dict()) save_shard_to_disk( items=record_list, diff --git a/docling_eval/dataset_builders/doclaynet_v1_builder.py b/docling_eval/dataset_builders/doclaynet_v1_builder.py index e5355153..7074b939 100644 --- a/docling_eval/dataset_builders/doclaynet_v1_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v1_builder.py @@ -335,33 +335,37 @@ def iterate(self) -> Iterable[DatasetRecord]: ) # Set up document dimensions - img = true_page_images[0] - old_w, old_h = doc["image"].size - old_size = Size(width=old_w, height=old_h) - - # Process elements - current_list = None - labels = list( - map(lambda cid: self.category_map[int(cid)], doc["category_id"]) - ) - bboxes = doc["bboxes"] - segments = doc["pdf_cells"] - contents = [ - " ".join(map(lambda cell: cell["text"], cells)) - for cells in segments - ] - - for l, b, c in zip(labels, bboxes, contents): - current_list = self.update_doc_with_gt( - true_doc, current_list, img, old_size, l, b, c + hf_image = doc["image"] + try: + img = true_page_images[0] + old_w, old_h = hf_image.size + old_size = Size(width=old_w, height=old_h) + + # Process elements + current_list = None + labels = list( + map(lambda cid: self.category_map[int(cid)], doc["category_id"]) ) - - # Extract images from the ground truth document - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) + bboxes = doc["bboxes"] + segments = doc["pdf_cells"] + contents = [ + " ".join(map(lambda cell: cell["text"], cells)) + for cells in segments + ] + + for l, b, c in zip(labels, bboxes, contents): + current_list = self.update_doc_with_gt( + true_doc, current_list, img, old_size, l, b, c + ) + + # Extract images from the ground truth document + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + finally: + hf_image.close() pdf_stream.seek(0) doc_stream = DocumentStream(name=page_hash, stream=pdf_stream) diff --git a/docling_eval/dataset_builders/doclaynet_v2_builder.py b/docling_eval/dataset_builders/doclaynet_v2_builder.py index f0386d39..92e1601a 100644 --- a/docling_eval/dataset_builders/doclaynet_v2_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v2_builder.py @@ -633,59 +633,64 @@ def iterate(self) -> Iterable[DatasetRecord]: # Extract image img = doc["image"] - # Convert image to bytes for storage - with io.BytesIO() as img_byte_stream: - img.save(img_byte_stream, format=img.format or "PNG") - img_byte_stream.seek(0) - img_bytes = img_byte_stream.getvalue() - - # Create ground truth document - doc_id = doc["page_hash"] - true_doc = DoclingDocument(name=doc_id) - - # Add page with image - image_ref = ImageRef( - mimetype=f"image/{img.format.lower() if img.format else 'png'}", - dpi=72, - size=Size(width=float(img.width), height=float(img.height)), - uri=from_pil_to_base64uri(img), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(img.width), height=float(img.height)), - image=image_ref, - ) - true_doc.pages[1] = page_item - - # Create key-value pairs if present - kv_pairs = self.create_kv_pairs(doc) - if kv_pairs: - self.populate_key_value_item(true_doc, kv_pairs) - - # Process layout elements - current_list = None - boxes = doc["boxes"] - labels = list( - map( - lambda label: label.lower() - .replace("-", "_") - .replace(" ", "_"), - doc["labels"], + try: + # Convert image to bytes for storage + with io.BytesIO() as img_byte_stream: + img.save(img_byte_stream, format=img.format or "PNG") + img_byte_stream.seek(0) + img_bytes = img_byte_stream.getvalue() + + # Create ground truth document + doc_id = doc["page_hash"] + true_doc = DoclingDocument(name=doc_id) + + # Add page with image + image_ref = ImageRef( + mimetype=f"image/{img.format.lower() if img.format else 'png'}", + dpi=72, + size=Size(width=float(img.width), height=float(img.height)), + uri=from_pil_to_base64uri(img), ) - ) - segments = doc["segments"] - - for label, segment, box in zip(labels, segments, boxes): - current_list = self.update_doc( - true_doc, current_list, img, label, segment, box + page_item = PageItem( + page_no=1, + size=Size(width=float(img.width), height=float(img.height)), + image=image_ref, + ) + true_doc.pages[1] = page_item + + # Create key-value pairs if present + kv_pairs = self.create_kv_pairs(doc) + if kv_pairs: + self.populate_key_value_item(true_doc, kv_pairs) + + # Process layout elements + current_list = None + boxes = doc["boxes"] + labels = list( + map( + lambda label: label.lower() + .replace("-", "_") + .replace(" ", "_"), + doc["labels"], + ) + ) + segments = doc["segments"] + + for label, segment, box in zip(labels, segments, boxes): + current_list = self.update_doc( + true_doc, current_list, img, label, segment, box + ) + + # Extract images from ground truth document + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, ) - # Extract images from ground truth document - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) + img_format = img.format + finally: + img.close() # Create dataset record record = DatasetRecord( @@ -695,7 +700,7 @@ def iterate(self) -> Iterable[DatasetRecord]: original=DocumentStream( name=doc_id, stream=io.BytesIO(img_bytes) ), - mime_type=f"image/{img.format.lower() if img.format else 'png'}", + mime_type=f"image/{img_format.lower() if img_format else 'png'}", modalities=[ EvaluationModality.LAYOUT, EvaluationModality.MARKDOWN_TEXT, diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py index 6f2178bf..cd087e67 100644 --- a/docling_eval/dataset_builders/doclingdpbench_builder.py +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -101,3 +101,9 @@ def iterate(self) -> Iterable[DatasetRecord]: ) yield record + + # Close PIL images to prevent memory leaks + for img in page_images: + img.close() + for img in pictures: + img.close() diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py index 2f173dd3..0019e4e5 100644 --- a/docling_eval/dataset_builders/file_dataset_builder.py +++ b/docling_eval/dataset_builders/file_dataset_builder.py @@ -118,26 +118,30 @@ def iterate(self) -> Iterable[DatasetRecord]: if filename.suffix.lower() in [".tif", ".tiff"]: # Process all pages/frames in multipage TIFF page_no = 1 + img = None try: img = Image.open(filename) while True: img.seek(page_no - 1) image = img.convert("RGB") - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=image.width, height=image.height), - uri=from_pil_to_base64uri(image), - ) - page_item = PageItem( - page_no=page_no, - size=Size( - width=float(image.width), - height=float(image.height), - ), - image=image_ref, - ) - true_doc.pages[page_no] = page_item + try: + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=image.width, height=image.height), + uri=from_pil_to_base64uri(image), + ) + page_item = PageItem( + page_no=page_no, + size=Size( + width=float(image.width), + height=float(image.height), + ), + image=image_ref, + ) + true_doc.pages[page_no] = page_item + finally: + image.close() page_no += 1 # Try to seek to next frame try: @@ -145,15 +149,39 @@ def iterate(self) -> Iterable[DatasetRecord]: except EOFError: # No more frames break - img.close() except Exception as e: _log.warning( f"Failed to process multipage TIFF {filename}: {e}. " "Falling back to single-page processing." ) # Fallback to single-page processing - image = Image.open(filename) - image = image.convert("RGB") + with Image.open(filename) as src_img: + image = src_img.convert("RGB") + try: + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=image.width, height=image.height), + uri=from_pil_to_base64uri(image), + ) + page_item = PageItem( + page_no=1, + size=Size( + width=float(image.width), height=float(image.height) + ), + image=image_ref, + ) + true_doc.pages[1] = page_item + finally: + image.close() + finally: + if img is not None: + img.close() + else: + # Single-page image formats + with Image.open(filename) as src_img: + image = src_img.convert("RGB") + try: image_ref = ImageRef( mimetype="image/png", dpi=72, @@ -167,25 +195,11 @@ def iterate(self) -> Iterable[DatasetRecord]: ), image=image_ref, ) - true_doc.pages[1] = page_item - else: - # Single-page image formats - image = Image.open(filename) - image = image.convert("RGB") - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=image.width, height=image.height), - uri=from_pil_to_base64uri(image), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(image.width), height=float(image.height)), - image=image_ref, - ) - # _log.debug(f"add_pages_to_true_doc: {filename}") - true_doc.pages[1] = page_item + # _log.debug(f"add_pages_to_true_doc: {filename}") + true_doc.pages[1] = page_item + finally: + image.close() elif mime_type == "application/json": # .. support DoclingDocument json files try: diff --git a/docling_eval/dataset_builders/funsd_builder.py b/docling_eval/dataset_builders/funsd_builder.py index 21c3cf4b..ac85cc7b 100644 --- a/docling_eval/dataset_builders/funsd_builder.py +++ b/docling_eval/dataset_builders/funsd_builder.py @@ -378,50 +378,57 @@ def iterate(self) -> Iterable[DatasetRecord]: ) # Load image and annotation - img: PillowImageType = Image.open(img_path) - if img.mode != "RGBA": - _log.debug( - f"Converting image {img_path.name} from {img.mode} to RGBA during dataset preparation." + src_img = Image.open(img_path) + try: + if src_img.mode != "RGBA": + _log.debug( + f"Converting image {img_path.name} from {src_img.mode} to RGBA during dataset preparation." + ) + img = src_img.convert("RGBA") + src_img.close() + else: + img = src_img + + with open(annotation_path, "r", encoding="utf-8") as f: + funsd_data = json.load(f) + + # Get image bytes + with io.BytesIO() as img_byte_stream: + img.save(img_byte_stream, format="PNG") + img_byte_stream.seek(0) + img_bytes = img_byte_stream.getvalue() + + # Create ground truth document + true_doc = DoclingDocument(name=img_path.stem) + + # Add page with image + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=float(img.width), height=float(img.height)), + uri=from_pil_to_base64uri(img), ) - img = img.convert("RGBA") - - with open(annotation_path, "r", encoding="utf-8") as f: - funsd_data = json.load(f) - - # Get image bytes - with io.BytesIO() as img_byte_stream: - img.save(img_byte_stream, format="PNG") - img_byte_stream.seek(0) - img_bytes = img_byte_stream.getvalue() - - # Create ground truth document - true_doc = DoclingDocument(name=img_path.stem) - - # Add page with image - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=float(img.width), height=float(img.height)), - uri=from_pil_to_base64uri(img), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(img.width), height=float(img.height)), - image=image_ref, - ) - true_doc.pages[1] = page_item + page_item = PageItem( + page_no=1, + size=Size(width=float(img.width), height=float(img.height)), + image=image_ref, + ) + true_doc.pages[1] = page_item - # Populate document with key-value data - true_doc, seg_pages = self._create_ground_truth_doc( - true_doc, funsd_data - ) + # Populate document with key-value data + true_doc, seg_pages = self._create_ground_truth_doc( + true_doc, funsd_data + ) + + # Extract images + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + finally: + img.close() - # Extract images - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) image_stream = DocumentStream( name=img_path.stem, stream=io.BytesIO(img_bytes) ) diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py index 68d0d9a7..8d6ab721 100644 --- a/docling_eval/dataset_builders/omnidocbench_builder.py +++ b/docling_eval/dataset_builders/omnidocbench_builder.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Dict, Iterable, List, Tuple +from datasets import load_dataset from docling_core.types import DoclingDocument from docling_core.types.doc import ( BoundingBox, @@ -17,6 +18,8 @@ Size, ) from docling_core.types.io import DocumentStream +from huggingface_hub import snapshot_download +from PIL import Image as PILImage from PIL.Image import Image from tqdm import tqdm @@ -88,11 +91,18 @@ class OmniDocBenchDatasetBuilder(BaseEvaluationDatasetBuilder): This builder processes the OmniDocBench dataset, which contains document layout annotations for a variety of document types. + + Supports two modes: + - Raw mode: Downloads raw files via snapshot_download (many requests) + - Parquet mode: Downloads Parquet shards and extracts files (few requests) """ def __init__( self, target: Path, + repo_id: str = "opendatalab/OmniDocBench", + revision: str = "v1_0", + use_parquet: bool = False, split: str = "test", begin_index: int = 0, end_index: int = -1, @@ -102,23 +112,56 @@ def __init__( Args: target: Path where processed dataset will be saved + repo_id: HuggingFace repository ID (default: opendatalab/OmniDocBench) + revision: Repository revision/branch + use_parquet: If True, download Parquet and extract files (avoids rate limits) split: Dataset split to use begin_index: Start index for processing (inclusive) end_index: End index for processing (exclusive), -1 means process all """ super().__init__( name="OmniDocBench: end-to-end", - dataset_source=HFSource( - repo_id="opendatalab/OmniDocBench", revision="v1_0" - ), + dataset_source=HFSource(repo_id=repo_id, revision=revision), target=target, split=split, begin_index=begin_index, end_index=end_index, ) + self.use_parquet = use_parquet self.must_retrieve = True + def retrieve_input_dataset(self) -> Path: + """ + Download and retrieve the input dataset. + + In Parquet mode, this is a no-op since iterate() loads data directly. + In raw mode, downloads all files via snapshot_download. + """ + if self.use_parquet: + # Parquet mode: iterate() uses load_dataset directly, no download needed + _log.info("Parquet mode: skipping download (data loaded in iterate)") + self.retrieved = True + return self.target + + # Raw mode: download all raw files + if not self.dataset_local_path: + self.dataset_local_path = self.target / "source_data" + + self.dataset_local_path.mkdir(parents=True, exist_ok=True) + + _log.info("Downloading files (raw mode)...") + assert isinstance(self.dataset_source, HFSource) + snapshot_download( + repo_id=self.dataset_source.repo_id, + revision=self.dataset_source.revision, + repo_type="dataset", + token=self.dataset_source.hf_token, + local_dir=self.dataset_local_path, + ) + self.retrieved = True + return self.dataset_local_path + def update_gt_into_map(self, gt: List[Dict]) -> Dict[str, Dict]: """ Convert list of annotation items to a map keyed by image path. @@ -330,6 +373,11 @@ def iterate(self) -> Iterable[DatasetRecord]: Yields: DatasetRecord objects """ + # Parquet mode: use load_dataset directly + if self.use_parquet: + yield from self._iterate_parquet() + return + if not self.retrieved and self.must_retrieve: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." @@ -421,3 +469,88 @@ def iterate(self) -> Iterable[DatasetRecord]: ) yield record + + def _iterate_parquet(self) -> Iterable[DatasetRecord]: + """ + Iterate through the Parquet dataset and yield DatasetRecord objects. + + This method loads data directly via load_dataset, avoiding rate limits + from downloading many individual files. + """ + _log.info("Loading dataset via load_dataset (Parquet mode)...") + assert isinstance(self.dataset_source, HFSource) + + ds = load_dataset( + self.dataset_source.repo_id, + split="train", + ) + + total_items = len(ds) + begin, end = self.get_effective_indices(total_items) + ds = ds.select(range(begin, end)) + selected_items = len(ds) + + self.log_dataset_stats(total_items, selected_items) + + for item in tqdm( + ds, total=selected_items, ncols=128, desc="Processing Parquet records" + ): + filename = item["filename"] + gt_data = json.loads(item["ground_truth"]) + pdf_bytes = item["pdf"] + page_image: PILImage.Image = item["image"] + + # Create document and add page + true_doc = DoclingDocument(name=f"ground-truth {filename}") + page_image_rgb = page_image.convert("RGB") + page_width = float(page_image_rgb.width) + page_height = float(page_image_rgb.height) + + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=page_width, height=page_height), + uri=from_pil_to_base64uri(page_image_rgb), + ) + page_item = PageItem( + page_no=1, + size=Size(width=page_width, height=page_height), + image=image_ref, + ) + true_doc.pages[1] = page_item + + # Update document with ground truth + true_doc = self.update_doc_with_gt( + gt=gt_data, + true_doc=true_doc, + page=true_doc.pages[1], + page_image=page_image_rgb, + page_width=page_width, + page_height=page_height, + ) + + # Extract images from the ground truth document + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + + # Create PDF stream from bytes + pdf_stream = DocumentStream( + name=Path(filename).stem + ".pdf", + stream=BytesIO(pdf_bytes), + ) + + # Create dataset record + record = DatasetRecord( + doc_id=filename, + doc_hash=get_binhash(pdf_bytes), + ground_truth_doc=true_doc, + ground_truth_pictures=true_pictures, + ground_truth_page_images=true_page_images, + original=pdf_stream, + mime_type="application/pdf", + ) + + yield record diff --git a/docling_eval/dataset_builders/otsl_table_dataset_builder.py b/docling_eval/dataset_builders/otsl_table_dataset_builder.py index 768fc2ac..6a496516 100644 --- a/docling_eval/dataset_builders/otsl_table_dataset_builder.py +++ b/docling_eval/dataset_builders/otsl_table_dataset_builder.py @@ -159,82 +159,89 @@ def iterate(self) -> Iterable[DatasetRecord]: filename = item["filename"] table_image = item["image"] - page_tokens = self.create_page_tokens( - data=item["cells"], - height=table_image.height, - width=table_image.width, - ) - - # Create ground truth document - true_doc = DoclingDocument(name=f"ground-truth {filename}") - - # Add page to document - page_index = 1 - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size( - width=float(table_image.width), height=float(table_image.height) - ), - uri=from_pil_to_base64uri(table_image), - ) - page_item = PageItem( - page_no=page_index, - size=Size( - width=float(table_image.width), height=float(table_image.height) - ), - image=image_ref, - ) - true_doc.pages[1] = page_item - - # Create table data - html = "" + "".join(item["html"]) + "
" - table_data = convert_html_table_into_docling_tabledata( - html, text_cells=item["cells"][0] - ) - - for tbl_cell, page_token in zip( - table_data.table_cells, page_tokens.tokens, strict=True - ): - tbl_cell.bbox = page_token.bbox - - # Create bounding box for table - l = 0.0 - b = 0.0 - r = table_image.width - t = table_image.height - if "table_bbox" in item: - l = item["table_bbox"][0] - b = table_image.height - item["table_bbox"][3] - r = item["table_bbox"][2] - t = table_image.height - item["table_bbox"][1] - - bbox = BoundingBox( - l=l, - r=r, - b=b, - t=t, - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - # Create provenance - prov = ProvenanceItem(page_no=page_index, bbox=bbox, charspan=(0, 0)) - - # Add table to document - true_doc.add_table(data=table_data, caption=None, prov=prov) - - # Extract images - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) - - # Create dataset record - with io.BytesIO() as img_byte_stream: - table_image.save(img_byte_stream, format="PNG") - img_byte_stream.seek(0) - img_bytes = img_byte_stream.read() + try: + page_tokens = self.create_page_tokens( + data=item["cells"], + height=table_image.height, + width=table_image.width, + ) + + # Create ground truth document + true_doc = DoclingDocument(name=f"ground-truth {filename}") + + # Add page to document + page_index = 1 + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size( + width=float(table_image.width), + height=float(table_image.height), + ), + uri=from_pil_to_base64uri(table_image), + ) + page_item = PageItem( + page_no=page_index, + size=Size( + width=float(table_image.width), + height=float(table_image.height), + ), + image=image_ref, + ) + true_doc.pages[1] = page_item + + # Create table data + html = "" + "".join(item["html"]) + "
" + table_data = convert_html_table_into_docling_tabledata( + html, text_cells=item["cells"][0] + ) + + for tbl_cell, page_token in zip( + table_data.table_cells, page_tokens.tokens, strict=True + ): + tbl_cell.bbox = page_token.bbox + + # Create bounding box for table + l = 0.0 + b = 0.0 + r = table_image.width + t = table_image.height + if "table_bbox" in item: + l = item["table_bbox"][0] + b = table_image.height - item["table_bbox"][3] + r = item["table_bbox"][2] + t = table_image.height - item["table_bbox"][1] + + bbox = BoundingBox( + l=l, + r=r, + b=b, + t=t, + coord_origin=CoordOrigin.BOTTOMLEFT, + ) + + # Create provenance + prov = ProvenanceItem( + page_no=page_index, bbox=bbox, charspan=(0, 0) + ) + + # Add table to document + true_doc.add_table(data=table_data, caption=None, prov=prov) + + # Extract images + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + + # Create dataset record + with io.BytesIO() as img_byte_stream: + table_image.save(img_byte_stream, format="PNG") + img_byte_stream.seek(0) + img_bytes = img_byte_stream.read() + finally: + table_image.close() record = DatasetRecord( doc_id=str(Path(filename).stem), diff --git a/docling_eval/dataset_builders/pixparse_builder.py b/docling_eval/dataset_builders/pixparse_builder.py index cc769a4b..b8ace19a 100644 --- a/docling_eval/dataset_builders/pixparse_builder.py +++ b/docling_eval/dataset_builders/pixparse_builder.py @@ -1,4 +1,5 @@ import base64 +import gc import json import logging from io import BytesIO @@ -179,23 +180,29 @@ def iterate(self) -> Iterable[DatasetRecord]: continue image_bytes = image_data["bytes"] - image = Image.open(BytesIO(image_bytes)).convert("RGB") gt_data = json.loads(sample["json_data"]) - true_doc, seg_pages = self._create_ground_truth_doc( - doc_id, gt_data, image - ) + with BytesIO(image_bytes) as input_stream: + with Image.open(input_stream) as src_img: + image = src_img.convert("RGB") - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) + try: + true_doc, seg_pages = self._create_ground_truth_doc( + doc_id, gt_data, image + ) + + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) - with BytesIO() as img_byte_stream: - image.save(img_byte_stream, format="PNG") - img_byte_stream.seek(0) - img_bytes = img_byte_stream.getvalue() + with BytesIO() as img_byte_stream: + image.save(img_byte_stream, format="PNG") + img_byte_stream.seek(0) + img_bytes = img_byte_stream.getvalue() + finally: + image.close() image_stream = DocumentStream( name=f"{doc_id}.png", stream=BytesIO(img_bytes) @@ -213,6 +220,17 @@ def iterate(self) -> Iterable[DatasetRecord]: ground_truth_page_images=true_page_images, ) + # Manual cleanup to prevent memory accumulation + del ( + true_doc, + seg_pages, + true_pictures, + true_page_images, + image_stream, + img_bytes, + ) + gc.collect() + except Exception as e: logging.error( f"Error processing sample {sample.get('key', 'unknown')}: {e}" diff --git a/docling_eval/dataset_builders/xfund_builder.py b/docling_eval/dataset_builders/xfund_builder.py index f08d62fb..a997f6d2 100644 --- a/docling_eval/dataset_builders/xfund_builder.py +++ b/docling_eval/dataset_builders/xfund_builder.py @@ -418,45 +418,50 @@ def iterate(self) -> Iterable[DatasetRecord]: # Load image img = Image.open(img_path) + try: + img_format = img.format + # Get image bytes + with io.BytesIO() as img_byte_stream: + img.save(img_byte_stream, format=img_format) + img_byte_stream.seek(0) + img_bytes = img_byte_stream.getvalue() + + # Create ground truth document + true_doc = DoclingDocument(name=Path(img_filename).stem) + + assert img.format is not None + + # Add page with image + image_ref = ImageRef( + mimetype=f"image/{img_format.lower()}", + dpi=72, + size=Size(width=float(img.width), height=float(img.height)), + uri=from_pil_to_base64uri(img), + ) + page_item = PageItem( + page_no=1, + size=Size(width=float(img.width), height=float(img.height)), + image=image_ref, + ) + true_doc.pages[1] = page_item - # Get image bytes - with io.BytesIO() as img_byte_stream: - img.save(img_byte_stream, format=img.format) - img_byte_stream.seek(0) - img_bytes = img_byte_stream.getvalue() - - # Create ground truth document - true_doc = DoclingDocument(name=Path(img_filename).stem) - - assert img.format is not None - - # Add page with image - image_ref = ImageRef( - mimetype=f"image/{img.format.lower()}", - dpi=72, - size=Size(width=float(img.width), height=float(img.height)), - uri=from_pil_to_base64uri(img), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(img.width), height=float(img.height)), - image=image_ref, - ) - true_doc.pages[1] = page_item + # Populate document with key-value data + true_doc, seg_pages = self._create_ground_truth_doc( + true_doc, doc_data + ) - # Populate document with key-value data - true_doc, seg_pages = self._create_ground_truth_doc(true_doc, doc_data) + # Extract images + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + finally: + img.close() - # Extract images - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) image_stream = DocumentStream( name=img_path.stem, stream=io.BytesIO(img_bytes) ) - assert img.format is not None # Create dataset record record = DatasetRecord( doc_id=Path(img_filename).stem, @@ -464,7 +469,7 @@ def iterate(self) -> Iterable[DatasetRecord]: ground_truth_doc=true_doc, ground_truth_segmented_pages=seg_pages, original=image_stream, - mime_type=f"image/{img.format.lower()}", + mime_type=f"image/{img_format.lower()}", modalities=[EvaluationModality.KEY_VALUE, EvaluationModality.OCR], ground_truth_pictures=true_pictures, ground_truth_page_images=true_page_images, diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py index 2e15f72b..8f855a1d 100644 --- a/docling_eval/evaluators/layout_evaluator.py +++ b/docling_eval/evaluators/layout_evaluator.py @@ -202,7 +202,12 @@ def __call__( split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) logging.info("#-files: %s", len(split_files)) - ds = load_dataset("parquet", data_files={split: split_files}) + + ds = load_dataset( + "parquet", + data_files={split: split_files}, + features=DatasetRecordWithPrediction.features(), + ) logging.info("Overview of dataset: %s", ds) # Select the split diff --git a/docling_eval/evaluators/ocr_evaluator.py b/docling_eval/evaluators/ocr_evaluator.py index 2364583d..c8c95bca 100644 --- a/docling_eval/evaluators/ocr_evaluator.py +++ b/docling_eval/evaluators/ocr_evaluator.py @@ -108,7 +108,9 @@ def __call__( "Found %d files for processing: %s", len(dataset_files), dataset_files ) hf_dataset = load_dataset( - "parquet", data_files={data_split_name: dataset_files} + "parquet", + data_files={data_split_name: dataset_files}, + features=DatasetRecordWithPrediction.features(), ) _log.info("Dataset overview: %s", hf_dataset) @@ -394,7 +396,9 @@ def __call__( path_to_parquet_files: str = str(dataset_path / data_split_name / "*.parquet") hf_dataset: Dataset = load_dataset( - "parquet", data_files={data_split_name: path_to_parquet_files} + "parquet", + data_files={data_split_name: path_to_parquet_files}, + features=DatasetRecordWithPrediction.features(), ) generated_visualization_paths: List[Path] = [] diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py index b82ceda6..c2d2c717 100644 --- a/docling_eval/evaluators/table_evaluator.py +++ b/docling_eval/evaluators/table_evaluator.py @@ -208,8 +208,14 @@ def __call__( # Load the dataset split_path = str(ds_path / split / "*.parquet") split_files = glob.glob(split_path) + _log.debug("Files: %s", split_files) - ds = load_dataset("parquet", data_files={split: split_files}) + + ds = load_dataset( + "parquet", + data_files={split: split_files}, + features=DatasetRecordWithPrediction.features(), + ) _log.info("Overview of dataset: %s", ds) # Select the split diff --git a/docling_eval/prediction_providers/aws_prediction_provider.py b/docling_eval/prediction_providers/aws_prediction_provider.py index 54e8bcc0..07a246a3 100644 --- a/docling_eval/prediction_providers/aws_prediction_provider.py +++ b/docling_eval/prediction_providers/aws_prediction_provider.py @@ -207,6 +207,11 @@ def convert_aws_output_to_docling( processed_pages = set() segmented_pages: Dict[int, SegmentedPage] = {} + if not record.ground_truth_page_images: + _log.warning( + "No ground truth page images available for AWS Textract conversion" + ) + return doc, segmented_pages # Get page dimensions from page block # AWS provides normalized coordinates, so we need to multiply by a typical page size # width = 8.5 * 72 # Standard US Letter width in points @@ -542,6 +547,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: """For the given document stream (single document), run the API and create the doclingDocument.""" status = ConversionStatus.SUCCESS + result_orig = None + pred_segmented_pages: Dict[int, SegmentedPage] = {} + pred_doc = None assert record.original is not None if not isinstance(record.original, DocumentStream): @@ -557,7 +565,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: "image/jpeg", ]: # Call the AWS Textract API by passing in the image for prediction - + record.original.stream.seek(0) file_bytes = record.original.stream.read() response = self.textract_client.analyze_document( Document={"Bytes": file_bytes}, diff --git a/docling_eval/prediction_providers/azure_prediction_provider.py b/docling_eval/prediction_providers/azure_prediction_provider.py index 71b09795..f60e6734 100644 --- a/docling_eval/prediction_providers/azure_prediction_provider.py +++ b/docling_eval/prediction_providers/azure_prediction_provider.py @@ -128,6 +128,10 @@ def convert_azure_output_to_docling( doc = DoclingDocument(name=record.doc_id) segmented_pages: Dict[int, SegmentedPage] = {} + if not record.ground_truth_page_images: + _log.warning("No ground truth page images available for Azure conversion") + return doc, segmented_pages + for page in analyze_result.get("pages", []): page_no = page.get("pageNumber", 1) @@ -414,7 +418,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: from azure.ai.documentintelligence.models import AnalyzeOutputOption status = ConversionStatus.SUCCESS - result_orig = None + result_json = None + pred_segmented_pages: Dict[int, SegmentedPage] = {} + pred_doc = None assert record.original is not None try: @@ -424,6 +430,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction: raise RuntimeError( "Original document must be a DocumentStream for PDF files" ) + record.original.stream.seek(0) # Call the Azure API by passing in the image for prediction poller = self.doc_intelligence_client.begin_analyze_document( "prebuilt-layout", diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py index 43c034bb..1e3c25b4 100644 --- a/docling_eval/prediction_providers/base_prediction_provider.py +++ b/docling_eval/prediction_providers/base_prediction_provider.py @@ -149,6 +149,7 @@ def visualize_results( if ( prediction_record.predicted_doc is not None and prediction_record.ground_truth_page_images + and prediction_record.predicted_page_images ): gt_doc = insert_images_from_pil( prediction_record.ground_truth_doc.model_copy(), @@ -453,9 +454,13 @@ def create_prediction_dataset( end_index: End index for processing (exclusive), -1 means process all chunk_size: items per chunk """ - # Load the dataset + # Load the dataset with proper schema to ensure PIL images are decoded parquet_files = str(gt_dataset_dir / split / "*.parquet") - ds = load_dataset("parquet", data_files={split: parquet_files}) + ds = load_dataset( + "parquet", + data_files={split: parquet_files}, + features=DatasetRecord.features(), + ) if ds is None: _log.error(f"Failed to load dataset from {parquet_files}") @@ -501,6 +506,13 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]: if not self.ignore_missing_predictions: raise + def _serialize_predictions(): + for pred_record in _iterate_predictions(): + if self.do_visualization: + self.visualize_results(pred_record, target_dataset_dir) + # Serialize immediately to release PIL image memory + yield pred_record.as_record_dict() + # Create output directories test_dir = target_dataset_dir / split test_dir.mkdir(parents=True, exist_ok=True) @@ -513,20 +525,15 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]: count = 0 chunk_count = 0 - for record_chunk in chunkify(_iterate_predictions(), chunk_size): - if self.do_visualization: - for r in record_chunk: - self.visualize_results(r, target_dataset_dir) - - record_chunk = [r.as_record_dict() for r in record_chunk] - + # Use _serialize_predictions to ensure we hold dicts (bytes), not open PIL images + for record_chunk_dicts in chunkify(_serialize_predictions(), chunk_size): save_shard_to_disk( - items=record_chunk, + items=record_chunk_dicts, dataset_path=test_dir, schema=DatasetRecordWithPrediction.pyarrow_schema(), shard_id=chunk_count, ) - count += len(record_chunk) + count += len(record_chunk_dicts) chunk_count += 1 if chunk_count >= max_num_chunks: diff --git a/docling_eval/prediction_providers/google_prediction_provider.py b/docling_eval/prediction_providers/google_prediction_provider.py index 48dc352f..f8ac3546 100644 --- a/docling_eval/prediction_providers/google_prediction_provider.py +++ b/docling_eval/prediction_providers/google_prediction_provider.py @@ -597,6 +597,10 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord): doc = DoclingDocument(name=record.doc_id) segmented_pages: Dict[int, SegmentedPage] = {} + if not record.ground_truth_page_images: + _log.warning("No ground truth page images available for Google conversion") + return doc, segmented_pages + for page in document.get("pages", []): page_no = page.get("pageNumber", 1) page_width = page.get("dimension", {}).get("width", 0) diff --git a/uv.lock b/uv.lock index 4e943437..589bdd10 100644 --- a/uv.lock +++ b/uv.lock @@ -7721,4 +7721,4 @@ source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, -] +] \ No newline at end of file