diff --git a/docling_eval/datamodels/dataset_record.py b/docling_eval/datamodels/dataset_record.py index d72a27a3..296b8997 100644 --- a/docling_eval/datamodels/dataset_record.py +++ b/docling_eval/datamodels/dataset_record.py @@ -1,4 +1,5 @@ import json +import logging from enum import Enum from io import BytesIO from pathlib import Path @@ -17,6 +18,8 @@ from docling_eval.datamodels.types import EvaluationModality, PredictionFormats from docling_eval.utils.utils import extract_images +_log = logging.getLogger(__name__) + seg_adapter = TypeAdapter(Dict[int, SegmentedPage]) @@ -151,6 +154,10 @@ def _extract_images( return pictures, page_images def as_record_dict(self): + # Convert images to bytes format BEFORE closing them + gt_pictures_bytes = self._images_to_bytes(self.ground_truth_pictures) + gt_page_images_bytes = self._images_to_bytes(self.ground_truth_page_images) + record = { self.get_field_alias("doc_id"): self.doc_id, self.get_field_alias("doc_path"): str(self.doc_path), @@ -158,13 +165,11 @@ def as_record_dict(self): self.get_field_alias("ground_truth_doc"): json.dumps( self.ground_truth_doc.export_to_dict() ), - self.get_field_alias("ground_truth_pictures"): self.ground_truth_pictures, + self.get_field_alias("ground_truth_pictures"): gt_pictures_bytes, self.get_field_alias("ground_truth_segmented_pages"): seg_adapter.dump_json( self.ground_truth_segmented_pages ).decode("utf-8"), - self.get_field_alias( - "ground_truth_page_images" - ): self.ground_truth_page_images, + self.get_field_alias("ground_truth_page_images"): gt_page_images_bytes, self.get_field_alias("mime_type"): self.mime_type, self.get_field_alias("modalities"): list( [m.value for m in self.modalities] @@ -183,6 +188,33 @@ def as_record_dict(self): return record + @staticmethod + def _pil_to_bytes(img: PIL.Image.Image) -> bytes: + """Convert PIL image to PNG bytes.""" + buffered = BytesIO() + img.save(buffered, format="PNG") + return buffered.getvalue() + + def _images_to_bytes(self, images: List[PIL.Image.Image]) -> List[dict]: + """Convert list of PIL Images to HuggingFace-compatible bytes format.""" + return [{"bytes": self._pil_to_bytes(img), "path": None} for img in images] + + @staticmethod + def _close_image_list(images: List[PIL.Image.Image]) -> None: + """Close all PIL Images in a list, logging any errors at debug level.""" + for img in images: + try: + img.close() + except Exception as e: + _log.debug(f"Failed to close PIL image: {e}") + + def _close_images(self) -> None: + """Close ground truth PIL Images to prevent memory leaks.""" + self._close_image_list(self.ground_truth_page_images) + self._close_image_list(self.ground_truth_pictures) + self.ground_truth_page_images = [] + self.ground_truth_pictures = [] + @model_validator(mode="after") def validate_images(self) -> "DatasetRecord": if not len(self.ground_truth_pictures) and not len( @@ -240,9 +272,9 @@ def validate_record_dict(cls, data: dict): img_buffer.seek(0) data[gt_binary] = DocumentStream(name="image.png", stream=img_buffer) - # Backward compatibility: ensure tags field exists for old datasets + # Backward compatibility: ensure tags field exists for old datasets and is not None tags_alias = cls.get_field_alias("tags") - if tags_alias not in data: + if tags_alias not in data or data[tags_alias] is None: data[tags_alias] = [] return data @@ -317,6 +349,10 @@ def as_record_dict(self): ) if self.predicted_doc is not None: + # Convert prediction images to bytes BEFORE closing + pred_pictures_bytes = self._images_to_bytes(self.predicted_pictures) + pred_page_images_bytes = self._images_to_bytes(self.predicted_page_images) + record.update( { self.get_field_alias("predicted_doc"): json.dumps( @@ -327,18 +363,27 @@ def as_record_dict(self): ): seg_adapter.dump_json(self.predicted_segmented_pages).decode( "utf-8" ), - self.get_field_alias("predicted_pictures"): self.predicted_pictures, + self.get_field_alias("predicted_pictures"): pred_pictures_bytes, self.get_field_alias( "predicted_page_images" - ): self.predicted_page_images, + ): pred_page_images_bytes, self.get_field_alias("original_prediction"): ( self.original_prediction ), } ) + # Close prediction images (parent already closed ground truth images) + self._close_prediction_images() return record + def _close_prediction_images(self) -> None: + """Close prediction PIL Images to prevent memory leaks.""" + self._close_image_list(self.predicted_page_images) + self._close_image_list(self.predicted_pictures) + self.predicted_page_images = [] + self.predicted_pictures = [] + @model_validator(mode="after") def validate_images(self) -> "DatasetRecordWithPrediction": # super().validate_images() diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index acd6d829..74749334 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -192,23 +192,20 @@ def retrieve_input_dataset(self) -> Path: Path to the retrieved dataset """ if isinstance(self.dataset_source, HFSource): + download_kwargs = { + "repo_id": self.dataset_source.repo_id, + "revision": self.dataset_source.revision, + "repo_type": "dataset", + "token": self.dataset_source.hf_token, + } + if not self.dataset_local_path: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - revision=self.dataset_source.revision, - repo_type="dataset", - token=self.dataset_source.hf_token, - ) + path_str = snapshot_download(**download_kwargs) path: Path = Path(path_str) self.dataset_local_path = path else: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - revision=self.dataset_source.revision, - repo_type="dataset", - token=self.dataset_source.hf_token, - local_dir=self.dataset_local_path, - ) + download_kwargs["local_dir"] = str(self.dataset_local_path) + path_str = snapshot_download(**download_kwargs) path = Path(path_str) elif isinstance(self.dataset_source, Path): path = self.dataset_source @@ -315,7 +312,6 @@ def save_to_disk( for record_chunk in chunkify(self.iterate(), chunk_size): record_list = [] for r in record_chunk: - record_list.append(r.as_record_dict()) if do_visualization: viz_path_split = self.target / "visualizations" / f"{r.doc_id}.html" @@ -333,6 +329,7 @@ def save_to_disk( doc=tmp, draw_reading_order=True, ) + record_list.append(r.as_record_dict()) save_shard_to_disk( items=record_list, diff --git a/docling_eval/dataset_builders/doclaynet_v1_builder.py b/docling_eval/dataset_builders/doclaynet_v1_builder.py index e5355153..7074b939 100644 --- a/docling_eval/dataset_builders/doclaynet_v1_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v1_builder.py @@ -335,33 +335,37 @@ def iterate(self) -> Iterable[DatasetRecord]: ) # Set up document dimensions - img = true_page_images[0] - old_w, old_h = doc["image"].size - old_size = Size(width=old_w, height=old_h) - - # Process elements - current_list = None - labels = list( - map(lambda cid: self.category_map[int(cid)], doc["category_id"]) - ) - bboxes = doc["bboxes"] - segments = doc["pdf_cells"] - contents = [ - " ".join(map(lambda cell: cell["text"], cells)) - for cells in segments - ] - - for l, b, c in zip(labels, bboxes, contents): - current_list = self.update_doc_with_gt( - true_doc, current_list, img, old_size, l, b, c + hf_image = doc["image"] + try: + img = true_page_images[0] + old_w, old_h = hf_image.size + old_size = Size(width=old_w, height=old_h) + + # Process elements + current_list = None + labels = list( + map(lambda cid: self.category_map[int(cid)], doc["category_id"]) ) - - # Extract images from the ground truth document - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) + bboxes = doc["bboxes"] + segments = doc["pdf_cells"] + contents = [ + " ".join(map(lambda cell: cell["text"], cells)) + for cells in segments + ] + + for l, b, c in zip(labels, bboxes, contents): + current_list = self.update_doc_with_gt( + true_doc, current_list, img, old_size, l, b, c + ) + + # Extract images from the ground truth document + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + finally: + hf_image.close() pdf_stream.seek(0) doc_stream = DocumentStream(name=page_hash, stream=pdf_stream) diff --git a/docling_eval/dataset_builders/doclaynet_v2_builder.py b/docling_eval/dataset_builders/doclaynet_v2_builder.py index f0386d39..92e1601a 100644 --- a/docling_eval/dataset_builders/doclaynet_v2_builder.py +++ b/docling_eval/dataset_builders/doclaynet_v2_builder.py @@ -633,59 +633,64 @@ def iterate(self) -> Iterable[DatasetRecord]: # Extract image img = doc["image"] - # Convert image to bytes for storage - with io.BytesIO() as img_byte_stream: - img.save(img_byte_stream, format=img.format or "PNG") - img_byte_stream.seek(0) - img_bytes = img_byte_stream.getvalue() - - # Create ground truth document - doc_id = doc["page_hash"] - true_doc = DoclingDocument(name=doc_id) - - # Add page with image - image_ref = ImageRef( - mimetype=f"image/{img.format.lower() if img.format else 'png'}", - dpi=72, - size=Size(width=float(img.width), height=float(img.height)), - uri=from_pil_to_base64uri(img), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(img.width), height=float(img.height)), - image=image_ref, - ) - true_doc.pages[1] = page_item - - # Create key-value pairs if present - kv_pairs = self.create_kv_pairs(doc) - if kv_pairs: - self.populate_key_value_item(true_doc, kv_pairs) - - # Process layout elements - current_list = None - boxes = doc["boxes"] - labels = list( - map( - lambda label: label.lower() - .replace("-", "_") - .replace(" ", "_"), - doc["labels"], + try: + # Convert image to bytes for storage + with io.BytesIO() as img_byte_stream: + img.save(img_byte_stream, format=img.format or "PNG") + img_byte_stream.seek(0) + img_bytes = img_byte_stream.getvalue() + + # Create ground truth document + doc_id = doc["page_hash"] + true_doc = DoclingDocument(name=doc_id) + + # Add page with image + image_ref = ImageRef( + mimetype=f"image/{img.format.lower() if img.format else 'png'}", + dpi=72, + size=Size(width=float(img.width), height=float(img.height)), + uri=from_pil_to_base64uri(img), ) - ) - segments = doc["segments"] - - for label, segment, box in zip(labels, segments, boxes): - current_list = self.update_doc( - true_doc, current_list, img, label, segment, box + page_item = PageItem( + page_no=1, + size=Size(width=float(img.width), height=float(img.height)), + image=image_ref, + ) + true_doc.pages[1] = page_item + + # Create key-value pairs if present + kv_pairs = self.create_kv_pairs(doc) + if kv_pairs: + self.populate_key_value_item(true_doc, kv_pairs) + + # Process layout elements + current_list = None + boxes = doc["boxes"] + labels = list( + map( + lambda label: label.lower() + .replace("-", "_") + .replace(" ", "_"), + doc["labels"], + ) + ) + segments = doc["segments"] + + for label, segment, box in zip(labels, segments, boxes): + current_list = self.update_doc( + true_doc, current_list, img, label, segment, box + ) + + # Extract images from ground truth document + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, ) - # Extract images from ground truth document - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) + img_format = img.format + finally: + img.close() # Create dataset record record = DatasetRecord( @@ -695,7 +700,7 @@ def iterate(self) -> Iterable[DatasetRecord]: original=DocumentStream( name=doc_id, stream=io.BytesIO(img_bytes) ), - mime_type=f"image/{img.format.lower() if img.format else 'png'}", + mime_type=f"image/{img_format.lower() if img_format else 'png'}", modalities=[ EvaluationModality.LAYOUT, EvaluationModality.MARKDOWN_TEXT, diff --git a/docling_eval/dataset_builders/doclingdpbench_builder.py b/docling_eval/dataset_builders/doclingdpbench_builder.py index 6f2178bf..cd087e67 100644 --- a/docling_eval/dataset_builders/doclingdpbench_builder.py +++ b/docling_eval/dataset_builders/doclingdpbench_builder.py @@ -101,3 +101,9 @@ def iterate(self) -> Iterable[DatasetRecord]: ) yield record + + # Close PIL images to prevent memory leaks + for img in page_images: + img.close() + for img in pictures: + img.close() diff --git a/docling_eval/dataset_builders/file_dataset_builder.py b/docling_eval/dataset_builders/file_dataset_builder.py index 2f173dd3..0019e4e5 100644 --- a/docling_eval/dataset_builders/file_dataset_builder.py +++ b/docling_eval/dataset_builders/file_dataset_builder.py @@ -118,26 +118,30 @@ def iterate(self) -> Iterable[DatasetRecord]: if filename.suffix.lower() in [".tif", ".tiff"]: # Process all pages/frames in multipage TIFF page_no = 1 + img = None try: img = Image.open(filename) while True: img.seek(page_no - 1) image = img.convert("RGB") - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=image.width, height=image.height), - uri=from_pil_to_base64uri(image), - ) - page_item = PageItem( - page_no=page_no, - size=Size( - width=float(image.width), - height=float(image.height), - ), - image=image_ref, - ) - true_doc.pages[page_no] = page_item + try: + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=image.width, height=image.height), + uri=from_pil_to_base64uri(image), + ) + page_item = PageItem( + page_no=page_no, + size=Size( + width=float(image.width), + height=float(image.height), + ), + image=image_ref, + ) + true_doc.pages[page_no] = page_item + finally: + image.close() page_no += 1 # Try to seek to next frame try: @@ -145,15 +149,39 @@ def iterate(self) -> Iterable[DatasetRecord]: except EOFError: # No more frames break - img.close() except Exception as e: _log.warning( f"Failed to process multipage TIFF {filename}: {e}. " "Falling back to single-page processing." ) # Fallback to single-page processing - image = Image.open(filename) - image = image.convert("RGB") + with Image.open(filename) as src_img: + image = src_img.convert("RGB") + try: + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=image.width, height=image.height), + uri=from_pil_to_base64uri(image), + ) + page_item = PageItem( + page_no=1, + size=Size( + width=float(image.width), height=float(image.height) + ), + image=image_ref, + ) + true_doc.pages[1] = page_item + finally: + image.close() + finally: + if img is not None: + img.close() + else: + # Single-page image formats + with Image.open(filename) as src_img: + image = src_img.convert("RGB") + try: image_ref = ImageRef( mimetype="image/png", dpi=72, @@ -167,25 +195,11 @@ def iterate(self) -> Iterable[DatasetRecord]: ), image=image_ref, ) - true_doc.pages[1] = page_item - else: - # Single-page image formats - image = Image.open(filename) - image = image.convert("RGB") - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=image.width, height=image.height), - uri=from_pil_to_base64uri(image), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(image.width), height=float(image.height)), - image=image_ref, - ) - # _log.debug(f"add_pages_to_true_doc: {filename}") - true_doc.pages[1] = page_item + # _log.debug(f"add_pages_to_true_doc: {filename}") + true_doc.pages[1] = page_item + finally: + image.close() elif mime_type == "application/json": # .. support DoclingDocument json files try: diff --git a/docling_eval/dataset_builders/funsd_builder.py b/docling_eval/dataset_builders/funsd_builder.py index 21c3cf4b..ac85cc7b 100644 --- a/docling_eval/dataset_builders/funsd_builder.py +++ b/docling_eval/dataset_builders/funsd_builder.py @@ -378,50 +378,57 @@ def iterate(self) -> Iterable[DatasetRecord]: ) # Load image and annotation - img: PillowImageType = Image.open(img_path) - if img.mode != "RGBA": - _log.debug( - f"Converting image {img_path.name} from {img.mode} to RGBA during dataset preparation." + src_img = Image.open(img_path) + try: + if src_img.mode != "RGBA": + _log.debug( + f"Converting image {img_path.name} from {src_img.mode} to RGBA during dataset preparation." + ) + img = src_img.convert("RGBA") + src_img.close() + else: + img = src_img + + with open(annotation_path, "r", encoding="utf-8") as f: + funsd_data = json.load(f) + + # Get image bytes + with io.BytesIO() as img_byte_stream: + img.save(img_byte_stream, format="PNG") + img_byte_stream.seek(0) + img_bytes = img_byte_stream.getvalue() + + # Create ground truth document + true_doc = DoclingDocument(name=img_path.stem) + + # Add page with image + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=float(img.width), height=float(img.height)), + uri=from_pil_to_base64uri(img), ) - img = img.convert("RGBA") - - with open(annotation_path, "r", encoding="utf-8") as f: - funsd_data = json.load(f) - - # Get image bytes - with io.BytesIO() as img_byte_stream: - img.save(img_byte_stream, format="PNG") - img_byte_stream.seek(0) - img_bytes = img_byte_stream.getvalue() - - # Create ground truth document - true_doc = DoclingDocument(name=img_path.stem) - - # Add page with image - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size(width=float(img.width), height=float(img.height)), - uri=from_pil_to_base64uri(img), - ) - page_item = PageItem( - page_no=1, - size=Size(width=float(img.width), height=float(img.height)), - image=image_ref, - ) - true_doc.pages[1] = page_item + page_item = PageItem( + page_no=1, + size=Size(width=float(img.width), height=float(img.height)), + image=image_ref, + ) + true_doc.pages[1] = page_item - # Populate document with key-value data - true_doc, seg_pages = self._create_ground_truth_doc( - true_doc, funsd_data - ) + # Populate document with key-value data + true_doc, seg_pages = self._create_ground_truth_doc( + true_doc, funsd_data + ) + + # Extract images + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + finally: + img.close() - # Extract images - true_doc, true_pictures, true_page_images = extract_images( - document=true_doc, - pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, - page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, - ) image_stream = DocumentStream( name=img_path.stem, stream=io.BytesIO(img_bytes) ) diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py index 68d0d9a7..8d6ab721 100644 --- a/docling_eval/dataset_builders/omnidocbench_builder.py +++ b/docling_eval/dataset_builders/omnidocbench_builder.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Dict, Iterable, List, Tuple +from datasets import load_dataset from docling_core.types import DoclingDocument from docling_core.types.doc import ( BoundingBox, @@ -17,6 +18,8 @@ Size, ) from docling_core.types.io import DocumentStream +from huggingface_hub import snapshot_download +from PIL import Image as PILImage from PIL.Image import Image from tqdm import tqdm @@ -88,11 +91,18 @@ class OmniDocBenchDatasetBuilder(BaseEvaluationDatasetBuilder): This builder processes the OmniDocBench dataset, which contains document layout annotations for a variety of document types. + + Supports two modes: + - Raw mode: Downloads raw files via snapshot_download (many requests) + - Parquet mode: Downloads Parquet shards and extracts files (few requests) """ def __init__( self, target: Path, + repo_id: str = "opendatalab/OmniDocBench", + revision: str = "v1_0", + use_parquet: bool = False, split: str = "test", begin_index: int = 0, end_index: int = -1, @@ -102,23 +112,56 @@ def __init__( Args: target: Path where processed dataset will be saved + repo_id: HuggingFace repository ID (default: opendatalab/OmniDocBench) + revision: Repository revision/branch + use_parquet: If True, download Parquet and extract files (avoids rate limits) split: Dataset split to use begin_index: Start index for processing (inclusive) end_index: End index for processing (exclusive), -1 means process all """ super().__init__( name="OmniDocBench: end-to-end", - dataset_source=HFSource( - repo_id="opendatalab/OmniDocBench", revision="v1_0" - ), + dataset_source=HFSource(repo_id=repo_id, revision=revision), target=target, split=split, begin_index=begin_index, end_index=end_index, ) + self.use_parquet = use_parquet self.must_retrieve = True + def retrieve_input_dataset(self) -> Path: + """ + Download and retrieve the input dataset. + + In Parquet mode, this is a no-op since iterate() loads data directly. + In raw mode, downloads all files via snapshot_download. + """ + if self.use_parquet: + # Parquet mode: iterate() uses load_dataset directly, no download needed + _log.info("Parquet mode: skipping download (data loaded in iterate)") + self.retrieved = True + return self.target + + # Raw mode: download all raw files + if not self.dataset_local_path: + self.dataset_local_path = self.target / "source_data" + + self.dataset_local_path.mkdir(parents=True, exist_ok=True) + + _log.info("Downloading files (raw mode)...") + assert isinstance(self.dataset_source, HFSource) + snapshot_download( + repo_id=self.dataset_source.repo_id, + revision=self.dataset_source.revision, + repo_type="dataset", + token=self.dataset_source.hf_token, + local_dir=self.dataset_local_path, + ) + self.retrieved = True + return self.dataset_local_path + def update_gt_into_map(self, gt: List[Dict]) -> Dict[str, Dict]: """ Convert list of annotation items to a map keyed by image path. @@ -330,6 +373,11 @@ def iterate(self) -> Iterable[DatasetRecord]: Yields: DatasetRecord objects """ + # Parquet mode: use load_dataset directly + if self.use_parquet: + yield from self._iterate_parquet() + return + if not self.retrieved and self.must_retrieve: raise RuntimeError( "You must first retrieve the source dataset. Call retrieve_input_dataset()." @@ -421,3 +469,88 @@ def iterate(self) -> Iterable[DatasetRecord]: ) yield record + + def _iterate_parquet(self) -> Iterable[DatasetRecord]: + """ + Iterate through the Parquet dataset and yield DatasetRecord objects. + + This method loads data directly via load_dataset, avoiding rate limits + from downloading many individual files. + """ + _log.info("Loading dataset via load_dataset (Parquet mode)...") + assert isinstance(self.dataset_source, HFSource) + + ds = load_dataset( + self.dataset_source.repo_id, + split="train", + ) + + total_items = len(ds) + begin, end = self.get_effective_indices(total_items) + ds = ds.select(range(begin, end)) + selected_items = len(ds) + + self.log_dataset_stats(total_items, selected_items) + + for item in tqdm( + ds, total=selected_items, ncols=128, desc="Processing Parquet records" + ): + filename = item["filename"] + gt_data = json.loads(item["ground_truth"]) + pdf_bytes = item["pdf"] + page_image: PILImage.Image = item["image"] + + # Create document and add page + true_doc = DoclingDocument(name=f"ground-truth {filename}") + page_image_rgb = page_image.convert("RGB") + page_width = float(page_image_rgb.width) + page_height = float(page_image_rgb.height) + + image_ref = ImageRef( + mimetype="image/png", + dpi=72, + size=Size(width=page_width, height=page_height), + uri=from_pil_to_base64uri(page_image_rgb), + ) + page_item = PageItem( + page_no=1, + size=Size(width=page_width, height=page_height), + image=image_ref, + ) + true_doc.pages[1] = page_item + + # Update document with ground truth + true_doc = self.update_doc_with_gt( + gt=gt_data, + true_doc=true_doc, + page=true_doc.pages[1], + page_image=page_image_rgb, + page_width=page_width, + page_height=page_height, + ) + + # Extract images from the ground truth document + true_doc, true_pictures, true_page_images = extract_images( + document=true_doc, + pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value, + page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value, + ) + + # Create PDF stream from bytes + pdf_stream = DocumentStream( + name=Path(filename).stem + ".pdf", + stream=BytesIO(pdf_bytes), + ) + + # Create dataset record + record = DatasetRecord( + doc_id=filename, + doc_hash=get_binhash(pdf_bytes), + ground_truth_doc=true_doc, + ground_truth_pictures=true_pictures, + ground_truth_page_images=true_page_images, + original=pdf_stream, + mime_type="application/pdf", + ) + + yield record diff --git a/docling_eval/dataset_builders/otsl_table_dataset_builder.py b/docling_eval/dataset_builders/otsl_table_dataset_builder.py index 768fc2ac..6a496516 100644 --- a/docling_eval/dataset_builders/otsl_table_dataset_builder.py +++ b/docling_eval/dataset_builders/otsl_table_dataset_builder.py @@ -159,82 +159,89 @@ def iterate(self) -> Iterable[DatasetRecord]: filename = item["filename"] table_image = item["image"] - page_tokens = self.create_page_tokens( - data=item["cells"], - height=table_image.height, - width=table_image.width, - ) - - # Create ground truth document - true_doc = DoclingDocument(name=f"ground-truth {filename}") - - # Add page to document - page_index = 1 - image_ref = ImageRef( - mimetype="image/png", - dpi=72, - size=Size( - width=float(table_image.width), height=float(table_image.height) - ), - uri=from_pil_to_base64uri(table_image), - ) - page_item = PageItem( - page_no=page_index, - size=Size( - width=float(table_image.width), height=float(table_image.height) - ), - image=image_ref, - ) - true_doc.pages[1] = page_item - - # Create table data - html = "