diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py
index d72a27a3..296b8997 100644
--- a/docling_eval/datamodels/dataset_record.py
+++ b/docling_eval/datamodels/dataset_record.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from enum import Enum
 from io import BytesIO
 from pathlib import Path
@@ -17,6 +18,8 @@
 from docling_eval.datamodels.types import EvaluationModality, PredictionFormats
 from docling_eval.utils.utils import extract_images
 
+_log = logging.getLogger(__name__)
+
 seg_adapter = TypeAdapter(Dict[int, SegmentedPage])
 
 
@@ -151,6 +154,10 @@ def _extract_images(
         return pictures, page_images
 
     def as_record_dict(self):
+        # Convert images to bytes format BEFORE closing them
+        gt_pictures_bytes = self._images_to_bytes(self.ground_truth_pictures)
+        gt_page_images_bytes = self._images_to_bytes(self.ground_truth_page_images)
+
         record = {
             self.get_field_alias("doc_id"): self.doc_id,
             self.get_field_alias("doc_path"): str(self.doc_path),
@@ -158,13 +165,11 @@ def as_record_dict(self):
             self.get_field_alias("ground_truth_doc"): json.dumps(
                 self.ground_truth_doc.export_to_dict()
             ),
-            self.get_field_alias("ground_truth_pictures"): self.ground_truth_pictures,
+            self.get_field_alias("ground_truth_pictures"): gt_pictures_bytes,
             self.get_field_alias("ground_truth_segmented_pages"): seg_adapter.dump_json(
                 self.ground_truth_segmented_pages
             ).decode("utf-8"),
-            self.get_field_alias(
-                "ground_truth_page_images"
-            ): self.ground_truth_page_images,
+            self.get_field_alias("ground_truth_page_images"): gt_page_images_bytes,
             self.get_field_alias("mime_type"): self.mime_type,
             self.get_field_alias("modalities"): list(
                 [m.value for m in self.modalities]
@@ -183,6 +188,33 @@ def as_record_dict(self):
 
         return record
 
+    @staticmethod
+    def _pil_to_bytes(img: PIL.Image.Image) -> bytes:
+        """Encode ``img`` as PNG into an in-memory buffer and return the raw bytes."""
+        png_buffer = BytesIO()
+        img.save(png_buffer, format="PNG")
+        return png_buffer.getvalue()
+
+    def _images_to_bytes(self, images: List[PIL.Image.Image]) -> List[dict]:
+        """Map each PIL image to a ``{"bytes": <PNG bytes>, "path": None}`` dict."""
+        return [dict(bytes=self._pil_to_bytes(image), path=None) for image in images]
+
+    @staticmethod
+    def _close_image_list(images: List[PIL.Image.Image]) -> None:
+        """Best-effort close of every image; a failed close is logged at debug level, not raised."""
+        for img in images:
+            try:
+                img.close()
+            except Exception as e:
+                _log.debug("Failed to close PIL image: %s", e)
+
+    def _close_images(self) -> None:
+        """Close the ground truth page images and pictures, then reset both fields to []."""
+        for gt_images in (self.ground_truth_page_images, self.ground_truth_pictures):
+            self._close_image_list(gt_images)
+        self.ground_truth_page_images = []
+        self.ground_truth_pictures = []
+
     @model_validator(mode="after")
     def validate_images(self) -> "DatasetRecord":
         if not len(self.ground_truth_pictures) and not len(
@@ -240,9 +272,9 @@ def validate_record_dict(cls, data: dict):
                 img_buffer.seek(0)
                 data[gt_binary] = DocumentStream(name="image.png", stream=img_buffer)
 
-        # Backward compatibility: ensure tags field exists for old datasets
+        # Backward compatibility: ensure tags field exists for old datasets and is not None
         tags_alias = cls.get_field_alias("tags")
-        if tags_alias not in data:
+        if tags_alias not in data or data[tags_alias] is None:
             data[tags_alias] = []
 
         return data
@@ -317,6 +349,10 @@ def as_record_dict(self):
         )
 
         if self.predicted_doc is not None:
+            # Convert prediction images to bytes BEFORE closing
+            pred_pictures_bytes = self._images_to_bytes(self.predicted_pictures)
+            pred_page_images_bytes = self._images_to_bytes(self.predicted_page_images)
+
             record.update(
                 {
                     self.get_field_alias("predicted_doc"): json.dumps(
@@ -327,18 +363,27 @@ def as_record_dict(self):
                     ): seg_adapter.dump_json(self.predicted_segmented_pages).decode(
                         "utf-8"
                     ),
-                    self.get_field_alias("predicted_pictures"): self.predicted_pictures,
+                    self.get_field_alias("predicted_pictures"): pred_pictures_bytes,
                     self.get_field_alias(
                         "predicted_page_images"
-                    ): self.predicted_page_images,
+                    ): pred_page_images_bytes,
                     self.get_field_alias("original_prediction"): (
                         self.original_prediction
                     ),
                 }
             )
 
+        # Close prediction images (parent already closed ground truth images)
+        self._close_prediction_images()
         return record
 
+    def _close_prediction_images(self) -> None:
+        """Close the predicted page images and pictures, then reset both fields to []."""
+        for pred_images in (self.predicted_page_images, self.predicted_pictures):
+            self._close_image_list(pred_images)
+        self.predicted_page_images = []
+        self.predicted_pictures = []
+
     @model_validator(mode="after")
     def validate_images(self) -> "DatasetRecordWithPrediction":
         # super().validate_images()
diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py
index acd6d829..74749334 100644
--- a/docling_eval/dataset_builders/dataset_builder.py
+++ b/docling_eval/dataset_builders/dataset_builder.py
@@ -192,23 +192,20 @@ def retrieve_input_dataset(self) -> Path:
             Path to the retrieved dataset
         """
         if isinstance(self.dataset_source, HFSource):
+            download_kwargs = {
+                "repo_id": self.dataset_source.repo_id,
+                "revision": self.dataset_source.revision,
+                "repo_type": "dataset",
+                "token": self.dataset_source.hf_token,
+            }
+
             if not self.dataset_local_path:
-                path_str = snapshot_download(
-                    repo_id=self.dataset_source.repo_id,
-                    revision=self.dataset_source.revision,
-                    repo_type="dataset",
-                    token=self.dataset_source.hf_token,
-                )
+                path_str = snapshot_download(**download_kwargs)
                 path: Path = Path(path_str)
                 self.dataset_local_path = path
             else:
-                path_str = snapshot_download(
-                    repo_id=self.dataset_source.repo_id,
-                    revision=self.dataset_source.revision,
-                    repo_type="dataset",
-                    token=self.dataset_source.hf_token,
-                    local_dir=self.dataset_local_path,
-                )
+                download_kwargs["local_dir"] = str(self.dataset_local_path)
+                path_str = snapshot_download(**download_kwargs)
                 path = Path(path_str)
         elif isinstance(self.dataset_source, Path):
             path = self.dataset_source
@@ -315,7 +312,6 @@ def save_to_disk(
         for record_chunk in chunkify(self.iterate(), chunk_size):
             record_list = []
             for r in record_chunk:
-                record_list.append(r.as_record_dict())
                 if do_visualization:
                     viz_path_split = self.target / "visualizations" / f"{r.doc_id}.html"
 
@@ -333,6 +329,7 @@ def save_to_disk(
                         doc=tmp,
                         draw_reading_order=True,
                     )
+                record_list.append(r.as_record_dict())
 
             save_shard_to_disk(
                 items=record_list,
diff --git a/docling_eval/dataset_builders/doclaynet_v1_builder.py b/docling_eval/dataset_builders/doclaynet_v1_builder.py
index e5355153..7074b939 100644
--- a/docling_eval/dataset_builders/doclaynet_v1_builder.py
+++ b/docling_eval/dataset_builders/doclaynet_v1_builder.py
@@ -335,33 +335,37 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 )
 
                 # Set up document dimensions
-                img = true_page_images[0]
-                old_w, old_h = doc["image"].size
-                old_size = Size(width=old_w, height=old_h)
-
-                # Process elements
-                current_list = None
-                labels = list(
-                    map(lambda cid: self.category_map[int(cid)], doc["category_id"])
-                )
-                bboxes = doc["bboxes"]
-                segments = doc["pdf_cells"]
-                contents = [
-                    " ".join(map(lambda cell: cell["text"], cells))
-                    for cells in segments
-                ]
-
-                for l, b, c in zip(labels, bboxes, contents):
-                    current_list = self.update_doc_with_gt(
-                        true_doc, current_list, img, old_size, l, b, c
+                hf_image = doc["image"]
+                try:
+                    img = true_page_images[0]
+                    old_w, old_h = hf_image.size
+                    old_size = Size(width=old_w, height=old_h)
+
+                    # Process elements
+                    current_list = None
+                    labels = list(
+                        map(lambda cid: self.category_map[int(cid)], doc["category_id"])
                     )
-
-                # Extract images from the ground truth document
-                true_doc, true_pictures, true_page_images = extract_images(
-                    document=true_doc,
-                    pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-                    page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-                )
+                    bboxes = doc["bboxes"]
+                    segments = doc["pdf_cells"]
+                    contents = [
+                        " ".join(map(lambda cell: cell["text"], cells))
+                        for cells in segments
+                    ]
+
+                    for l, b, c in zip(labels, bboxes, contents):
+                        current_list = self.update_doc_with_gt(
+                            true_doc, current_list, img, old_size, l, b, c
+                        )
+
+                    # Extract images from the ground truth document
+                    true_doc, true_pictures, true_page_images = extract_images(
+                        document=true_doc,
+                        pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                        page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+                    )
+                finally:
+                    hf_image.close()
 
                 pdf_stream.seek(0)
                 doc_stream = DocumentStream(name=page_hash, stream=pdf_stream)
diff --git a/docling_eval/dataset_builders/doclaynet_v2_builder.py b/docling_eval/dataset_builders/doclaynet_v2_builder.py
index f0386d39..92e1601a 100644
--- a/docling_eval/dataset_builders/doclaynet_v2_builder.py
+++ b/docling_eval/dataset_builders/doclaynet_v2_builder.py
@@ -633,59 +633,64 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     # Extract image
                     img = doc["image"]
 
-                    # Convert image to bytes for storage
-                    with io.BytesIO() as img_byte_stream:
-                        img.save(img_byte_stream, format=img.format or "PNG")
-                        img_byte_stream.seek(0)
-                        img_bytes = img_byte_stream.getvalue()
-
-                    # Create ground truth document
-                    doc_id = doc["page_hash"]
-                    true_doc = DoclingDocument(name=doc_id)
-
-                    # Add page with image
-                    image_ref = ImageRef(
-                        mimetype=f"image/{img.format.lower() if img.format else 'png'}",
-                        dpi=72,
-                        size=Size(width=float(img.width), height=float(img.height)),
-                        uri=from_pil_to_base64uri(img),
-                    )
-                    page_item = PageItem(
-                        page_no=1,
-                        size=Size(width=float(img.width), height=float(img.height)),
-                        image=image_ref,
-                    )
-                    true_doc.pages[1] = page_item
-
-                    # Create key-value pairs if present
-                    kv_pairs = self.create_kv_pairs(doc)
-                    if kv_pairs:
-                        self.populate_key_value_item(true_doc, kv_pairs)
-
-                    # Process layout elements
-                    current_list = None
-                    boxes = doc["boxes"]
-                    labels = list(
-                        map(
-                            lambda label: label.lower()
-                            .replace("-", "_")
-                            .replace(" ", "_"),
-                            doc["labels"],
+                    try:
+                        # Convert image to bytes for storage
+                        with io.BytesIO() as img_byte_stream:
+                            img.save(img_byte_stream, format=img.format or "PNG")
+                            img_byte_stream.seek(0)
+                            img_bytes = img_byte_stream.getvalue()
+
+                        # Create ground truth document
+                        doc_id = doc["page_hash"]
+                        true_doc = DoclingDocument(name=doc_id)
+
+                        # Add page with image
+                        image_ref = ImageRef(
+                            mimetype=f"image/{img.format.lower() if img.format else 'png'}",
+                            dpi=72,
+                            size=Size(width=float(img.width), height=float(img.height)),
+                            uri=from_pil_to_base64uri(img),
                         )
-                    )
-                    segments = doc["segments"]
-
-                    for label, segment, box in zip(labels, segments, boxes):
-                        current_list = self.update_doc(
-                            true_doc, current_list, img, label, segment, box
+                        page_item = PageItem(
+                            page_no=1,
+                            size=Size(width=float(img.width), height=float(img.height)),
+                            image=image_ref,
+                        )
+                        true_doc.pages[1] = page_item
+
+                        # Create key-value pairs if present
+                        kv_pairs = self.create_kv_pairs(doc)
+                        if kv_pairs:
+                            self.populate_key_value_item(true_doc, kv_pairs)
+
+                        # Process layout elements
+                        current_list = None
+                        boxes = doc["boxes"]
+                        labels = list(
+                            map(
+                                lambda label: label.lower()
+                                .replace("-", "_")
+                                .replace(" ", "_"),
+                                doc["labels"],
+                            )
+                        )
+                        segments = doc["segments"]
+
+                        for label, segment, box in zip(labels, segments, boxes):
+                            current_list = self.update_doc(
+                                true_doc, current_list, img, label, segment, box
+                            )
+
+                        # Extract images from ground truth document
+                        true_doc, true_pictures, true_page_images = extract_images(
+                            document=true_doc,
+                            pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                            page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
                         )
 
-                    # Extract images from ground truth document
-                    true_doc, true_pictures, true_page_images = extract_images(
-                        document=true_doc,
-                        pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-                        page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-                    )
+                        img_format = img.format
+                    finally:
+                        img.close()
 
                     # Create dataset record
                     record = DatasetRecord(
@@ -695,7 +700,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                         original=DocumentStream(
                             name=doc_id, stream=io.BytesIO(img_bytes)
                         ),
-                        mime_type=f"image/{img.format.lower() if img.format else 'png'}",
+                        mime_type=f"image/{img_format.lower() if img_format else 'png'}",
                         modalities=[
                             EvaluationModality.LAYOUT,
                             EvaluationModality.MARKDOWN_TEXT,
diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py
index 6f2178bf..cd087e67 100644
--- a/docling_eval/dataset_builders/doclingdpbench_builder.py
+++ b/docling_eval/dataset_builders/doclingdpbench_builder.py
@@ -101,3 +101,9 @@ def iterate(self) -> Iterable[DatasetRecord]:
             )
 
             yield record
+
+            # Close PIL images to prevent memory leaks
+            for img in page_images:
+                img.close()
+            for img in pictures:
+                img.close()
diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py
index 2f173dd3..0019e4e5 100644
--- a/docling_eval/dataset_builders/file_dataset_builder.py
+++ b/docling_eval/dataset_builders/file_dataset_builder.py
@@ -118,26 +118,30 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 if filename.suffix.lower() in [".tif", ".tiff"]:
                     # Process all pages/frames in multipage TIFF
                     page_no = 1
+                    img = None
                     try:
                         img = Image.open(filename)
                         while True:
                             img.seek(page_no - 1)
                             image = img.convert("RGB")
-                            image_ref = ImageRef(
-                                mimetype="image/png",
-                                dpi=72,
-                                size=Size(width=image.width, height=image.height),
-                                uri=from_pil_to_base64uri(image),
-                            )
-                            page_item = PageItem(
-                                page_no=page_no,
-                                size=Size(
-                                    width=float(image.width),
-                                    height=float(image.height),
-                                ),
-                                image=image_ref,
-                            )
-                            true_doc.pages[page_no] = page_item
+                            try:
+                                image_ref = ImageRef(
+                                    mimetype="image/png",
+                                    dpi=72,
+                                    size=Size(width=image.width, height=image.height),
+                                    uri=from_pil_to_base64uri(image),
+                                )
+                                page_item = PageItem(
+                                    page_no=page_no,
+                                    size=Size(
+                                        width=float(image.width),
+                                        height=float(image.height),
+                                    ),
+                                    image=image_ref,
+                                )
+                                true_doc.pages[page_no] = page_item
+                            finally:
+                                image.close()
                             page_no += 1
                             # Try to seek to next frame
                             try:
@@ -145,15 +149,39 @@ def iterate(self) -> Iterable[DatasetRecord]:
                             except EOFError:
                                 # No more frames
                                 break
-                        img.close()
                     except Exception as e:
                         _log.warning(
                             f"Failed to process multipage TIFF {filename}: {e}. "
                             "Falling back to single-page processing."
                         )
                         # Fallback to single-page processing
-                        image = Image.open(filename)
-                        image = image.convert("RGB")
+                        with Image.open(filename) as src_img:
+                            image = src_img.convert("RGB")
+                        try:
+                            image_ref = ImageRef(
+                                mimetype="image/png",
+                                dpi=72,
+                                size=Size(width=image.width, height=image.height),
+                                uri=from_pil_to_base64uri(image),
+                            )
+                            page_item = PageItem(
+                                page_no=1,
+                                size=Size(
+                                    width=float(image.width), height=float(image.height)
+                                ),
+                                image=image_ref,
+                            )
+                            true_doc.pages[1] = page_item
+                        finally:
+                            image.close()
+                    finally:
+                        if img is not None:
+                            img.close()
+                else:
+                    # Single-page image formats
+                    with Image.open(filename) as src_img:
+                        image = src_img.convert("RGB")
+                    try:
                         image_ref = ImageRef(
                             mimetype="image/png",
                             dpi=72,
@@ -167,25 +195,11 @@ def iterate(self) -> Iterable[DatasetRecord]:
                             ),
                             image=image_ref,
                         )
-                        true_doc.pages[1] = page_item
-                else:
-                    # Single-page image formats
-                    image = Image.open(filename)
-                    image = image.convert("RGB")
-                    image_ref = ImageRef(
-                        mimetype="image/png",
-                        dpi=72,
-                        size=Size(width=image.width, height=image.height),
-                        uri=from_pil_to_base64uri(image),
-                    )
-                    page_item = PageItem(
-                        page_no=1,
-                        size=Size(width=float(image.width), height=float(image.height)),
-                        image=image_ref,
-                    )
 
-                    # _log.debug(f"add_pages_to_true_doc: {filename}")
-                    true_doc.pages[1] = page_item
+                        # _log.debug(f"add_pages_to_true_doc: {filename}")
+                        true_doc.pages[1] = page_item
+                    finally:
+                        image.close()
             elif mime_type == "application/json":
                 # .. support DoclingDocument json files
                 try:
diff --git a/docling_eval/dataset_builders/funsd_builder.py b/docling_eval/dataset_builders/funsd_builder.py
index 21c3cf4b..ac85cc7b 100644
--- a/docling_eval/dataset_builders/funsd_builder.py
+++ b/docling_eval/dataset_builders/funsd_builder.py
@@ -378,50 +378,57 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 )
 
                 # Load image and annotation
-                img: PillowImageType = Image.open(img_path)
-                if img.mode != "RGBA":
-                    _log.debug(
-                        f"Converting image {img_path.name} from {img.mode} to RGBA during dataset preparation."
+                src_img = Image.open(img_path)
+                try:
+                    if src_img.mode != "RGBA":
+                        _log.debug(
+                            f"Converting image {img_path.name} from {src_img.mode} to RGBA during dataset preparation."
+                        )
+                        img = src_img.convert("RGBA")
+                        src_img.close()
+                    else:
+                        img = src_img
+
+                    with open(annotation_path, "r", encoding="utf-8") as f:
+                        funsd_data = json.load(f)
+
+                    # Get image bytes
+                    with io.BytesIO() as img_byte_stream:
+                        img.save(img_byte_stream, format="PNG")
+                        img_byte_stream.seek(0)
+                        img_bytes = img_byte_stream.getvalue()
+
+                    # Create ground truth document
+                    true_doc = DoclingDocument(name=img_path.stem)
+
+                    # Add page with image
+                    image_ref = ImageRef(
+                        mimetype="image/png",
+                        dpi=72,
+                        size=Size(width=float(img.width), height=float(img.height)),
+                        uri=from_pil_to_base64uri(img),
                     )
-                    img = img.convert("RGBA")
-
-                with open(annotation_path, "r", encoding="utf-8") as f:
-                    funsd_data = json.load(f)
-
-                # Get image bytes
-                with io.BytesIO() as img_byte_stream:
-                    img.save(img_byte_stream, format="PNG")
-                    img_byte_stream.seek(0)
-                    img_bytes = img_byte_stream.getvalue()
-
-                # Create ground truth document
-                true_doc = DoclingDocument(name=img_path.stem)
-
-                # Add page with image
-                image_ref = ImageRef(
-                    mimetype="image/png",
-                    dpi=72,
-                    size=Size(width=float(img.width), height=float(img.height)),
-                    uri=from_pil_to_base64uri(img),
-                )
-                page_item = PageItem(
-                    page_no=1,
-                    size=Size(width=float(img.width), height=float(img.height)),
-                    image=image_ref,
-                )
-                true_doc.pages[1] = page_item
+                    page_item = PageItem(
+                        page_no=1,
+                        size=Size(width=float(img.width), height=float(img.height)),
+                        image=image_ref,
+                    )
+                    true_doc.pages[1] = page_item
 
-                # Populate document with key-value data
-                true_doc, seg_pages = self._create_ground_truth_doc(
-                    true_doc, funsd_data
-                )
+                    # Populate document with key-value data
+                    true_doc, seg_pages = self._create_ground_truth_doc(
+                        true_doc, funsd_data
+                    )
+
+                    # Extract images
+                    true_doc, true_pictures, true_page_images = extract_images(
+                        document=true_doc,
+                        pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                        page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+                    )
+                finally:
+                    img.close()
 
-                # Extract images
-                true_doc, true_pictures, true_page_images = extract_images(
-                    document=true_doc,
-                    pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-                    page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-                )
                 image_stream = DocumentStream(
                     name=img_path.stem, stream=io.BytesIO(img_bytes)
                 )
diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py
index 68d0d9a7..8d6ab721 100644
--- a/docling_eval/dataset_builders/omnidocbench_builder.py
+++ b/docling_eval/dataset_builders/omnidocbench_builder.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from typing import Dict, Iterable, List, Tuple
 
+from datasets import load_dataset
 from docling_core.types import DoclingDocument
 from docling_core.types.doc import (
     BoundingBox,
@@ -17,6 +18,8 @@
     Size,
 )
 from docling_core.types.io import DocumentStream
+from huggingface_hub import snapshot_download
+from PIL import Image as PILImage
 from PIL.Image import Image
 from tqdm import tqdm
 
@@ -88,11 +91,18 @@ class OmniDocBenchDatasetBuilder(BaseEvaluationDatasetBuilder):
 
     This builder processes the OmniDocBench dataset, which contains document
     layout annotations for a variety of document types.
+
+    Supports two modes:
+    - Raw mode: Downloads raw files via snapshot_download (many requests)
+    - Parquet mode: Downloads Parquet shards and extracts files (few requests)
     """
 
     def __init__(
         self,
         target: Path,
+        repo_id: str = "opendatalab/OmniDocBench",
+        revision: str = "v1_0",
+        use_parquet: bool = False,
         split: str = "test",
         begin_index: int = 0,
         end_index: int = -1,
@@ -102,23 +112,56 @@ def __init__(
 
         Args:
             target: Path where processed dataset will be saved
+            repo_id: HuggingFace repository ID (default: opendatalab/OmniDocBench)
+            revision: Repository revision/branch
+            use_parquet: If True, download Parquet and extract files (avoids rate limits)
             split: Dataset split to use
             begin_index: Start index for processing (inclusive)
             end_index: End index for processing (exclusive), -1 means process all
         """
         super().__init__(
             name="OmniDocBench: end-to-end",
-            dataset_source=HFSource(
-                repo_id="opendatalab/OmniDocBench", revision="v1_0"
-            ),
+            dataset_source=HFSource(repo_id=repo_id, revision=revision),
             target=target,
             split=split,
             begin_index=begin_index,
             end_index=end_index,
         )
 
+        self.use_parquet = use_parquet
         self.must_retrieve = True
 
+    def retrieve_input_dataset(self) -> Path:
+        """
+        Make the source data available and flag the builder as retrieved.
+
+        Parquet mode downloads nothing here and returns ``self.target``.
+        Raw mode runs snapshot_download into the local path and returns it.
+        """
+        if self.use_parquet:
+            # Nothing to fetch up front; the data is loaded while iterating
+            _log.info("Parquet mode: skipping download (data loaded in iterate)")
+            self.retrieved = True
+            return self.target
+
+        # Raw mode: fall back to <target>/source_data when no path is configured
+        local_dir = self.dataset_local_path or self.target / "source_data"
+        self.dataset_local_path = local_dir
+        local_dir.mkdir(parents=True, exist_ok=True)
+
+        _log.info("Downloading files (raw mode)...")
+        source = self.dataset_source
+        assert isinstance(source, HFSource)
+        snapshot_download(
+            repo_id=source.repo_id,
+            revision=source.revision,
+            repo_type="dataset",
+            token=source.hf_token,
+            local_dir=local_dir,
+        )
+        self.retrieved = True
+        return local_dir
+
     def update_gt_into_map(self, gt: List[Dict]) -> Dict[str, Dict]:
         """
         Convert list of annotation items to a map keyed by image path.
@@ -330,6 +373,11 @@ def iterate(self) -> Iterable[DatasetRecord]:
         Yields:
             DatasetRecord objects
         """
+        # Parquet mode: use load_dataset directly
+        if self.use_parquet:
+            yield from self._iterate_parquet()
+            return
+
         if not self.retrieved and self.must_retrieve:
             raise RuntimeError(
                 "You must first retrieve the source dataset. Call retrieve_input_dataset()."
@@ -421,3 +469,95 @@ def iterate(self) -> Iterable[DatasetRecord]:
             )
 
             yield record
+
+    def _iterate_parquet(self) -> Iterable[DatasetRecord]:
+        """
+        Iterate through the Parquet dataset and yield DatasetRecord objects.
+
+        This method loads data directly via load_dataset, avoiding rate limits
+        from downloading many individual files. The revision and token set on
+        the HFSource are passed through, as in raw mode's snapshot_download.
+
+        Yields:
+            DatasetRecord objects, one single-page document per selected row
+        """
+        _log.info("Loading dataset via load_dataset (Parquet mode)...")
+        assert isinstance(self.dataset_source, HFSource)
+
+        # Pin revision/token so both modes read the same configured source
+        ds = load_dataset(
+            self.dataset_source.repo_id,
+            split="train",
+            revision=self.dataset_source.revision,
+            token=self.dataset_source.hf_token,
+        )
+
+        total_items = len(ds)
+        begin, end = self.get_effective_indices(total_items)
+        ds = ds.select(range(begin, end))
+        selected_items = len(ds)
+
+        self.log_dataset_stats(total_items, selected_items)
+
+        for item in tqdm(
+            ds, total=selected_items, ncols=128, desc="Processing Parquet records"
+        ):
+            filename = item["filename"]
+            gt_data = json.loads(item["ground_truth"])
+            pdf_bytes = item["pdf"]
+            page_image: PILImage.Image = item["image"]
+
+            # Create document and add page
+            true_doc = DoclingDocument(name=f"ground-truth {filename}")
+            page_image_rgb = page_image.convert("RGB")
+            page_width = float(page_image_rgb.width)
+            page_height = float(page_image_rgb.height)
+
+            image_ref = ImageRef(
+                mimetype="image/png",
+                dpi=72,
+                size=Size(width=page_width, height=page_height),
+                uri=from_pil_to_base64uri(page_image_rgb),
+            )
+            page_item = PageItem(
+                page_no=1,
+                size=Size(width=page_width, height=page_height),
+                image=image_ref,
+            )
+            true_doc.pages[1] = page_item
+
+            # Update document with ground truth
+            true_doc = self.update_doc_with_gt(
+                gt=gt_data,
+                true_doc=true_doc,
+                page=true_doc.pages[1],
+                page_image=page_image_rgb,
+                page_width=page_width,
+                page_height=page_height,
+            )
+
+            # Extract images from the ground truth document
+            true_doc, true_pictures, true_page_images = extract_images(
+                document=true_doc,
+                pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+            )
+
+            # Create PDF stream from bytes
+            pdf_stream = DocumentStream(
+                name=Path(filename).stem + ".pdf",
+                stream=BytesIO(pdf_bytes),
+            )
+
+            # Create dataset record
+            record = DatasetRecord(
+                doc_id=filename,
+                doc_hash=get_binhash(pdf_bytes),
+                ground_truth_doc=true_doc,
+                ground_truth_pictures=true_pictures,
+                ground_truth_page_images=true_page_images,
+                original=pdf_stream,
+                mime_type="application/pdf",
+            )
+
+            yield record
diff --git a/docling_eval/dataset_builders/otsl_table_dataset_builder.py b/docling_eval/dataset_builders/otsl_table_dataset_builder.py
index 768fc2ac..6a496516 100644
--- a/docling_eval/dataset_builders/otsl_table_dataset_builder.py
+++ b/docling_eval/dataset_builders/otsl_table_dataset_builder.py
@@ -159,82 +159,89 @@ def iterate(self) -> Iterable[DatasetRecord]:
                 filename = item["filename"]
                 table_image = item["image"]
 
-                page_tokens = self.create_page_tokens(
-                    data=item["cells"],
-                    height=table_image.height,
-                    width=table_image.width,
-                )
-
-                # Create ground truth document
-                true_doc = DoclingDocument(name=f"ground-truth {filename}")
-
-                # Add page to document
-                page_index = 1
-                image_ref = ImageRef(
-                    mimetype="image/png",
-                    dpi=72,
-                    size=Size(
-                        width=float(table_image.width), height=float(table_image.height)
-                    ),
-                    uri=from_pil_to_base64uri(table_image),
-                )
-                page_item = PageItem(
-                    page_no=page_index,
-                    size=Size(
-                        width=float(table_image.width), height=float(table_image.height)
-                    ),
-                    image=image_ref,
-                )
-                true_doc.pages[1] = page_item
-
-                # Create table data
-                html = "<table>" + "".join(item["html"]) + "</table>"
-                table_data = convert_html_table_into_docling_tabledata(
-                    html, text_cells=item["cells"][0]
-                )
-
-                for tbl_cell, page_token in zip(
-                    table_data.table_cells, page_tokens.tokens, strict=True
-                ):
-                    tbl_cell.bbox = page_token.bbox
-
-                # Create bounding box for table
-                l = 0.0
-                b = 0.0
-                r = table_image.width
-                t = table_image.height
-                if "table_bbox" in item:
-                    l = item["table_bbox"][0]
-                    b = table_image.height - item["table_bbox"][3]
-                    r = item["table_bbox"][2]
-                    t = table_image.height - item["table_bbox"][1]
-
-                bbox = BoundingBox(
-                    l=l,
-                    r=r,
-                    b=b,
-                    t=t,
-                    coord_origin=CoordOrigin.BOTTOMLEFT,
-                )
-
-                # Create provenance
-                prov = ProvenanceItem(page_no=page_index, bbox=bbox, charspan=(0, 0))
-
-                # Add table to document
-                true_doc.add_table(data=table_data, caption=None, prov=prov)
-
-                # Extract images
-                true_doc, true_pictures, true_page_images = extract_images(
-                    document=true_doc,
-                    pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-                    page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-                )
-
-                # Create dataset record
-                with io.BytesIO() as img_byte_stream:
-                    table_image.save(img_byte_stream, format="PNG")
-                    img_byte_stream.seek(0)
-                    img_bytes = img_byte_stream.read()
+                try:
+                    page_tokens = self.create_page_tokens(
+                        data=item["cells"],
+                        height=table_image.height,
+                        width=table_image.width,
+                    )
+
+                    # Create ground truth document
+                    true_doc = DoclingDocument(name=f"ground-truth {filename}")
+
+                    # Add page to document
+                    page_index = 1
+                    image_ref = ImageRef(
+                        mimetype="image/png",
+                        dpi=72,
+                        size=Size(
+                            width=float(table_image.width),
+                            height=float(table_image.height),
+                        ),
+                        uri=from_pil_to_base64uri(table_image),
+                    )
+                    page_item = PageItem(
+                        page_no=page_index,
+                        size=Size(
+                            width=float(table_image.width),
+                            height=float(table_image.height),
+                        ),
+                        image=image_ref,
+                    )
+                    true_doc.pages[1] = page_item
+
+                    # Create table data
+                    html = "<table>" + "".join(item["html"]) + "</table>"
+                    table_data = convert_html_table_into_docling_tabledata(
+                        html, text_cells=item["cells"][0]
+                    )
+
+                    for tbl_cell, page_token in zip(
+                        table_data.table_cells, page_tokens.tokens, strict=True
+                    ):
+                        tbl_cell.bbox = page_token.bbox
+
+                    # Create bounding box for table
+                    l = 0.0
+                    b = 0.0
+                    r = table_image.width
+                    t = table_image.height
+                    if "table_bbox" in item:
+                        l = item["table_bbox"][0]
+                        b = table_image.height - item["table_bbox"][3]
+                        r = item["table_bbox"][2]
+                        t = table_image.height - item["table_bbox"][1]
+
+                    bbox = BoundingBox(
+                        l=l,
+                        r=r,
+                        b=b,
+                        t=t,
+                        coord_origin=CoordOrigin.BOTTOMLEFT,
+                    )
+
+                    # Create provenance
+                    prov = ProvenanceItem(
+                        page_no=page_index, bbox=bbox, charspan=(0, 0)
+                    )
+
+                    # Add table to document
+                    true_doc.add_table(data=table_data, caption=None, prov=prov)
+
+                    # Extract images
+                    true_doc, true_pictures, true_page_images = extract_images(
+                        document=true_doc,
+                        pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                        page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+                    )
+
+                    # Create dataset record
+                    with io.BytesIO() as img_byte_stream:
+                        table_image.save(img_byte_stream, format="PNG")
+                        img_byte_stream.seek(0)
+                        img_bytes = img_byte_stream.read()
+                finally:
+                    table_image.close()
 
                 record = DatasetRecord(
                     doc_id=str(Path(filename).stem),
diff --git a/docling_eval/dataset_builders/pixparse_builder.py b/docling_eval/dataset_builders/pixparse_builder.py
index cc769a4b..b8ace19a 100644
--- a/docling_eval/dataset_builders/pixparse_builder.py
+++ b/docling_eval/dataset_builders/pixparse_builder.py
@@ -1,4 +1,5 @@
 import base64
+import gc
 import json
 import logging
 from io import BytesIO
@@ -179,23 +180,29 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     continue
 
                 image_bytes = image_data["bytes"]
-                image = Image.open(BytesIO(image_bytes)).convert("RGB")
                 gt_data = json.loads(sample["json_data"])
 
-                true_doc, seg_pages = self._create_ground_truth_doc(
-                    doc_id, gt_data, image
-                )
+                with BytesIO(image_bytes) as input_stream:
+                    with Image.open(input_stream) as src_img:
+                        image = src_img.convert("RGB")
 
-                true_doc, true_pictures, true_page_images = extract_images(
-                    document=true_doc,
-                    pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-                    page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-                )
+                        try:
+                            true_doc, seg_pages = self._create_ground_truth_doc(
+                                doc_id, gt_data, image
+                            )
+
+                            true_doc, true_pictures, true_page_images = extract_images(
+                                document=true_doc,
+                                pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                                page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+                            )
 
-                with BytesIO() as img_byte_stream:
-                    image.save(img_byte_stream, format="PNG")
-                    img_byte_stream.seek(0)
-                    img_bytes = img_byte_stream.getvalue()
+                            with BytesIO() as img_byte_stream:
+                                image.save(img_byte_stream, format="PNG")
+                                img_byte_stream.seek(0)
+                                img_bytes = img_byte_stream.getvalue()
+                        finally:
+                            image.close()
 
                 image_stream = DocumentStream(
                     name=f"{doc_id}.png", stream=BytesIO(img_bytes)
@@ -213,6 +220,17 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     ground_truth_page_images=true_page_images,
                 )
 
+                # Manual cleanup to prevent memory accumulation
+                del (
+                    true_doc,
+                    seg_pages,
+                    true_pictures,
+                    true_page_images,
+                    image_stream,
+                    img_bytes,
+                )
+                gc.collect()
+
             except Exception as e:
                 logging.error(
                     f"Error processing sample {sample.get('key', 'unknown')}: {e}"
diff --git a/docling_eval/dataset_builders/xfund_builder.py b/docling_eval/dataset_builders/xfund_builder.py
index f08d62fb..a997f6d2 100644
--- a/docling_eval/dataset_builders/xfund_builder.py
+++ b/docling_eval/dataset_builders/xfund_builder.py
@@ -418,45 +418,50 @@ def iterate(self) -> Iterable[DatasetRecord]:
 
                 # Load image
                 img = Image.open(img_path)
+                try:
+                    img_format = img.format
+                    # Validate up front: the format is needed by save() and the mimetype
+                    assert img_format is not None
+                    # Get image bytes
+                    with io.BytesIO() as img_byte_stream:
+                        img.save(img_byte_stream, format=img_format)
+                        img_byte_stream.seek(0)
+                        img_bytes = img_byte_stream.getvalue()
+
+                    # Create ground truth document
+                    true_doc = DoclingDocument(name=Path(img_filename).stem)
+
+                    # Add page with image
+                    image_ref = ImageRef(
+                        mimetype=f"image/{img_format.lower()}",
+                        dpi=72,
+                        size=Size(width=float(img.width), height=float(img.height)),
+                        uri=from_pil_to_base64uri(img),
+                    )
+                    page_item = PageItem(
+                        page_no=1,
+                        size=Size(width=float(img.width), height=float(img.height)),
+                        image=image_ref,
+                    )
+                    true_doc.pages[1] = page_item
 
-                # Get image bytes
-                with io.BytesIO() as img_byte_stream:
-                    img.save(img_byte_stream, format=img.format)
-                    img_byte_stream.seek(0)
-                    img_bytes = img_byte_stream.getvalue()
-
-                # Create ground truth document
-                true_doc = DoclingDocument(name=Path(img_filename).stem)
-
-                assert img.format is not None
-
-                # Add page with image
-                image_ref = ImageRef(
-                    mimetype=f"image/{img.format.lower()}",
-                    dpi=72,
-                    size=Size(width=float(img.width), height=float(img.height)),
-                    uri=from_pil_to_base64uri(img),
-                )
-                page_item = PageItem(
-                    page_no=1,
-                    size=Size(width=float(img.width), height=float(img.height)),
-                    image=image_ref,
-                )
-                true_doc.pages[1] = page_item
+                    # Populate document with key-value data
+                    true_doc, seg_pages = self._create_ground_truth_doc(
+                        true_doc, doc_data
+                    )
 
-                # Populate document with key-value data
-                true_doc, seg_pages = self._create_ground_truth_doc(true_doc, doc_data)
+                    # Extract images
+                    true_doc, true_pictures, true_page_images = extract_images(
+                        document=true_doc,
+                        pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                        page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+                    )
+                finally:
+                    img.close()
 
-                # Extract images
-                true_doc, true_pictures, true_page_images = extract_images(
-                    document=true_doc,
-                    pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
-                    page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
-                )
                 image_stream = DocumentStream(
                     name=img_path.stem, stream=io.BytesIO(img_bytes)
                 )
-                assert img.format is not None
                 # Create dataset record
                 record = DatasetRecord(
                     doc_id=Path(img_filename).stem,
@@ -464,7 +469,7 @@ def iterate(self) -> Iterable[DatasetRecord]:
                     ground_truth_doc=true_doc,
                     ground_truth_segmented_pages=seg_pages,
                     original=image_stream,
-                    mime_type=f"image/{img.format.lower()}",
+                    mime_type=f"image/{img_format.lower()}",
                     modalities=[EvaluationModality.KEY_VALUE, EvaluationModality.OCR],
                     ground_truth_pictures=true_pictures,
                     ground_truth_page_images=true_page_images,
diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py
index 2e15f72b..8f855a1d 100644
--- a/docling_eval/evaluators/layout_evaluator.py
+++ b/docling_eval/evaluators/layout_evaluator.py
@@ -202,7 +202,12 @@ def __call__(
         split_path = str(ds_path / split / "*.parquet")
         split_files = glob.glob(split_path)
         logging.info("#-files: %s", len(split_files))
-        ds = load_dataset("parquet", data_files={split: split_files})
+
+        ds = load_dataset(
+            "parquet",
+            data_files={split: split_files},
+            features=DatasetRecordWithPrediction.features(),
+        )
         logging.info("Overview of dataset: %s", ds)
 
         # Select the split
diff --git a/docling_eval/evaluators/ocr_evaluator.py b/docling_eval/evaluators/ocr_evaluator.py
index 2364583d..c8c95bca 100644
--- a/docling_eval/evaluators/ocr_evaluator.py
+++ b/docling_eval/evaluators/ocr_evaluator.py
@@ -108,7 +108,9 @@ def __call__(
             "Found %d files for processing: %s", len(dataset_files), dataset_files
         )
         hf_dataset = load_dataset(
-            "parquet", data_files={data_split_name: dataset_files}
+            "parquet",
+            data_files={data_split_name: dataset_files},
+            features=DatasetRecordWithPrediction.features(),
         )
         _log.info("Dataset overview: %s", hf_dataset)
 
@@ -394,7 +396,9 @@ def __call__(
 
         path_to_parquet_files: str = str(dataset_path / data_split_name / "*.parquet")
         hf_dataset: Dataset = load_dataset(
-            "parquet", data_files={data_split_name: path_to_parquet_files}
+            "parquet",
+            data_files={data_split_name: path_to_parquet_files},
+            features=DatasetRecordWithPrediction.features(),
         )
 
         generated_visualization_paths: List[Path] = []
diff --git a/docling_eval/evaluators/table_evaluator.py b/docling_eval/evaluators/table_evaluator.py
index b82ceda6..c2d2c717 100644
--- a/docling_eval/evaluators/table_evaluator.py
+++ b/docling_eval/evaluators/table_evaluator.py
@@ -208,8 +208,14 @@ def __call__(
         # Load the dataset
         split_path = str(ds_path / split / "*.parquet")
         split_files = glob.glob(split_path)
+
         _log.debug("Files: %s", split_files)
-        ds = load_dataset("parquet", data_files={split: split_files})
+
+        ds = load_dataset(
+            "parquet",
+            data_files={split: split_files},
+            features=DatasetRecordWithPrediction.features(),
+        )
         _log.info("Overview of dataset: %s", ds)
 
         # Select the split
diff --git a/docling_eval/prediction_providers/aws_prediction_provider.py b/docling_eval/prediction_providers/aws_prediction_provider.py
index 54e8bcc0..07a246a3 100644
--- a/docling_eval/prediction_providers/aws_prediction_provider.py
+++ b/docling_eval/prediction_providers/aws_prediction_provider.py
@@ -207,6 +207,11 @@ def convert_aws_output_to_docling(
         processed_pages = set()
         segmented_pages: Dict[int, SegmentedPage] = {}
 
+        if not record.ground_truth_page_images:
+            _log.warning(
+                "No ground truth page images available for AWS Textract conversion"
+            )
+            return doc, segmented_pages
         # Get page dimensions from page block
         # AWS provides normalized coordinates, so we need to multiply by a typical page size
         # width = 8.5 * 72  # Standard US Letter width in points
@@ -542,6 +547,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
         """For the given document stream (single document), run the API and create the doclingDocument."""
 
         status = ConversionStatus.SUCCESS
+        result_orig = None
+        pred_segmented_pages: Dict[int, SegmentedPage] = {}
+        pred_doc = None
         assert record.original is not None
 
         if not isinstance(record.original, DocumentStream):
@@ -557,7 +565,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
                 "image/jpeg",
             ]:
                 # Call the AWS Textract API by passing in the image for prediction
-
+                record.original.stream.seek(0)
                 file_bytes = record.original.stream.read()
                 response = self.textract_client.analyze_document(
                     Document={"Bytes": file_bytes},
diff --git a/docling_eval/prediction_providers/azure_prediction_provider.py b/docling_eval/prediction_providers/azure_prediction_provider.py
index 71b09795..f60e6734 100644
--- a/docling_eval/prediction_providers/azure_prediction_provider.py
+++ b/docling_eval/prediction_providers/azure_prediction_provider.py
@@ -128,6 +128,10 @@ def convert_azure_output_to_docling(
         doc = DoclingDocument(name=record.doc_id)
         segmented_pages: Dict[int, SegmentedPage] = {}
 
+        if not record.ground_truth_page_images:
+            _log.warning("No ground truth page images available for Azure conversion")
+            return doc, segmented_pages
+
         for page in analyze_result.get("pages", []):
             page_no = page.get("pageNumber", 1)
 
@@ -414,7 +418,9 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
         from azure.ai.documentintelligence.models import AnalyzeOutputOption
 
         status = ConversionStatus.SUCCESS
-        result_orig = None
+        result_json = None
+        pred_segmented_pages: Dict[int, SegmentedPage] = {}
+        pred_doc = None
         assert record.original is not None
 
         try:
@@ -424,6 +430,7 @@ def predict(self, record: DatasetRecord) -> DatasetRecordWithPrediction:
                     raise RuntimeError(
                         "Original document must be a DocumentStream for PDF files"
                     )
+                record.original.stream.seek(0)
                 # Call the Azure API by passing in the image for prediction
                 poller = self.doc_intelligence_client.begin_analyze_document(
                     "prebuilt-layout",
diff --git a/docling_eval/prediction_providers/base_prediction_provider.py b/docling_eval/prediction_providers/base_prediction_provider.py
index 43c034bb..1e3c25b4 100644
--- a/docling_eval/prediction_providers/base_prediction_provider.py
+++ b/docling_eval/prediction_providers/base_prediction_provider.py
@@ -149,6 +149,7 @@ def visualize_results(
         if (
             prediction_record.predicted_doc is not None
             and prediction_record.ground_truth_page_images
+            and prediction_record.predicted_page_images
         ):
             gt_doc = insert_images_from_pil(
                 prediction_record.ground_truth_doc.model_copy(),
@@ -453,9 +454,13 @@ def create_prediction_dataset(
             end_index: End index for processing (exclusive), -1 means process all
             chunk_size: items per chunk
         """
-        # Load the dataset
+        # Load the dataset with proper schema to ensure PIL images are decoded
         parquet_files = str(gt_dataset_dir / split / "*.parquet")
-        ds = load_dataset("parquet", data_files={split: parquet_files})
+        ds = load_dataset(
+            "parquet",
+            data_files={split: parquet_files},
+            features=DatasetRecord.features(),
+        )
 
         if ds is None:
             _log.error(f"Failed to load dataset from {parquet_files}")
@@ -501,6 +506,13 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]:
                     if not self.ignore_missing_predictions:
                         raise
 
+        def _serialize_predictions() -> Iterable[dict]:
+            for pred_record in _iterate_predictions():
+                if self.do_visualization:
+                    self.visualize_results(pred_record, target_dataset_dir)
+                # Serialize right away so chunks buffer dicts, not PIL-holding records
+                yield pred_record.as_record_dict()
+
         # Create output directories
         test_dir = target_dataset_dir / split
         test_dir.mkdir(parents=True, exist_ok=True)
@@ -513,20 +525,15 @@ def _iterate_predictions() -> Iterable[DatasetRecordWithPrediction]:
 
         count = 0
         chunk_count = 0
-        for record_chunk in chunkify(_iterate_predictions(), chunk_size):
-            if self.do_visualization:
-                for r in record_chunk:
-                    self.visualize_results(r, target_dataset_dir)
-
-            record_chunk = [r.as_record_dict() for r in record_chunk]
-
+        # Use _serialize_predictions to ensure we hold dicts (bytes), not open PIL images
+        for record_chunk_dicts in chunkify(_serialize_predictions(), chunk_size):
             save_shard_to_disk(
-                items=record_chunk,
+                items=record_chunk_dicts,
                 dataset_path=test_dir,
                 schema=DatasetRecordWithPrediction.pyarrow_schema(),
                 shard_id=chunk_count,
             )
-            count += len(record_chunk)
+            count += len(record_chunk_dicts)
             chunk_count += 1
 
             if chunk_count >= max_num_chunks:
diff --git a/docling_eval/prediction_providers/google_prediction_provider.py b/docling_eval/prediction_providers/google_prediction_provider.py
index 48dc352f..f8ac3546 100644
--- a/docling_eval/prediction_providers/google_prediction_provider.py
+++ b/docling_eval/prediction_providers/google_prediction_provider.py
@@ -597,6 +597,10 @@ def convert_google_output_to_docling(self, document, record: DatasetRecord):
         doc = DoclingDocument(name=record.doc_id)
         segmented_pages: Dict[int, SegmentedPage] = {}
 
+        if not record.ground_truth_page_images:
+            _log.warning("No ground truth page images available for Google conversion")
+            return doc, segmented_pages
+
         for page in document.get("pages", []):
             page_no = page.get("pageNumber", 1)
             page_width = page.get("dimension", {}).get("width", 0)
diff --git a/uv.lock b/uv.lock
index 4e943437..589bdd10 100644
--- a/uv.lock
+++ b/uv.lock
@@ -7721,4 +7721,4 @@ source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" },
-]
+]
\ No newline at end of file