From 9d375e7db1c69357f0ffd1ff769c7e39a850c0c3 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Fri, 27 Mar 2026 13:50:58 +0100 Subject: [PATCH] Added new builder DoclingSDGDatasetBuilder Signed-off-by: Maksym Lysak --- docling_eval/cli/main.py | 26 +- docling_eval/datamodels/types.py | 1 + .../dataset_builders/doclingsdg_builder.py | 247 ++++++++++++++++++ 3 files changed, 271 insertions(+), 3 deletions(-) create mode 100644 docling_eval/dataset_builders/doclingsdg_builder.py diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py index f898a40d..27eba766 100644 --- a/docling_eval/cli/main.py +++ b/docling_eval/cli/main.py @@ -70,6 +70,7 @@ from docling_eval.dataset_builders.doclingdpbench_builder import ( DoclingDPBenchDatasetBuilder, ) +from docling_eval.dataset_builders.doclingsdg_builder import DoclingSDGDatasetBuilder from docling_eval.dataset_builders.docvqa_builder import DocVQADatasetBuilder from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder from docling_eval.dataset_builders.file_dataset_builder import FileDatasetBuilder @@ -136,9 +137,17 @@ from docling_eval.prediction_providers.google_prediction_provider import ( GoogleDocAIPredictionProvider, ) -from docling_eval.prediction_providers.tableformer_provider import ( - TableFormerPredictionProvider, -) + +# TableFormer provider may not be available for all docling installations. 
+try: + from docling_eval.prediction_providers.tableformer_provider import ( + TableFormerPredictionProvider, + ) + + TABLEFORMER_AVAILABLE = True +except ImportError: + TABLEFORMER_AVAILABLE = False + TableFormerPredictionProvider = None # type: ignore from docling_eval.utils.external_docling_document_loader import ( ExternalDoclingDocumentLoader, ) @@ -315,6 +324,11 @@ def get_dataset_builder( elif benchmark == BenchMarkNames.DOCLING_DPBENCH: return DoclingDPBenchDatasetBuilder(**common_params) # type: ignore + elif benchmark == BenchMarkNames.DOCLING_SDG: + if dataset_source is None: + raise ValueError("dataset_source is required for DOCLING_SDG") + return DoclingSDGDatasetBuilder(dataset_source=dataset_source, **common_params) # type: ignore + elif benchmark == BenchMarkNames.DOCLAYNETV1: return DocLayNetV1DatasetBuilder(**common_params) # type: ignore @@ -620,6 +634,12 @@ def get_prediction_provider( ignore_missing_predictions=True, ) elif provider_type == PredictionProviderType.TABLEFORMER: + if not TABLEFORMER_AVAILABLE: + raise ImportError( + "TableFormer provider is not available in this environment. " + "Please install a compatible docling/docling-eval setup " + "that provides `docling.models.stages`." 
+            )
         return TableFormerPredictionProvider(
             do_visualization=do_visualization,
             ignore_missing_predictions=True,
diff --git a/docling_eval/datamodels/types.py b/docling_eval/datamodels/types.py
index 8786f48b..0a3057c2 100644
--- a/docling_eval/datamodels/types.py
+++ b/docling_eval/datamodels/types.py
@@ -59,6 +59,7 @@ class BenchMarkNames(str, Enum):
     # End-to-End
     DPBENCH = "DPBench"
     DOCLING_DPBENCH = "DoclingDPBench"
+    DOCLING_SDG = "DoclingSDG"
     OMNIDOCBENCH = "OmniDocBench"
     WORDSCAPE = "WordScape"
     CANVA_A = "canva_a"
diff --git a/docling_eval/dataset_builders/doclingsdg_builder.py b/docling_eval/dataset_builders/doclingsdg_builder.py
new file mode 100644
index 00000000..60e62dca
--- /dev/null
+++ b/docling_eval/dataset_builders/doclingsdg_builder.py
@@ -0,0 +1,247 @@
+import logging
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import Iterable, List
+
+from docling_core.types import DoclingDocument
+from docling_core.types.doc import ImageRef, PageItem, Size
+from docling_core.types.io import DocumentStream
+from PIL import Image
+from pydantic import ValidationError
+from tqdm import tqdm
+
+from docling_eval.datamodels.dataset_record import DatasetRecord
+from docling_eval.datamodels.types import BenchMarkColumns
+from docling_eval.dataset_builders.dataset_builder import BaseEvaluationDatasetBuilder
+from docling_eval.utils.utils import (
+    extract_images,
+    from_pil_to_base64uri,
+    get_binary,
+    get_binhash,
+)
+
+_log = logging.getLogger(__name__)
+
+_PAGE_SUFFIX_PATTERN = re.compile(r"_page_(\d+)$", re.IGNORECASE)
+
+
+class DoclingSDGDatasetBuilder(BaseEvaluationDatasetBuilder):
+    """
+    Dataset builder for local Docling JSON + PNG source files.
+
+    Expected input layout in ``dataset_source``:
+    - one Docling JSON file per document (``.json``), and
+    - either one PNG (``.png``) or page-wise PNGs
+      (``_page_000001.png``, ...).
+ """ + + def __init__( + self, + dataset_source: Path, + target: Path, + split: str = "test", + begin_index: int = 0, + end_index: int = -1, + ): + super().__init__( + name="DoclingSDG", + dataset_source=dataset_source, + target=target, + split=split, + begin_index=begin_index, + end_index=end_index, + ) + + self.must_retrieve = False + + @staticmethod + def _sort_by_page_suffix(path: Path) -> tuple[int, str]: + match = _PAGE_SUFFIX_PATTERN.search(path.stem) + page_no = int(match.group(1)) if match else 0 + return page_no, path.name.lower() + + def _find_json_files(self) -> List[Path]: + assert isinstance(self.dataset_source, Path) + + files = list(self.dataset_source.glob("*.json")) + files.extend(self.dataset_source.glob("*.JSON")) + + deduped = {f.resolve(): f for f in files} + return sorted(deduped.values(), key=lambda p: p.name.lower()) + + def _find_png_files_for_doc(self, doc_id: str) -> List[Path]: + assert isinstance(self.dataset_source, Path) + + base_names = [doc_id] + if doc_id.lower().endswith(".png"): + base_names.append(doc_id[:-4]) + + exact_matches: List[Path] = [] + paged_matches: List[Path] = [] + + for base_name in dict.fromkeys(base_names): + if not base_name: + continue + + exact_matches.extend(self.dataset_source.glob(f"{base_name}.png")) + exact_matches.extend(self.dataset_source.glob(f"{base_name}.PNG")) + + paged_matches.extend(self.dataset_source.glob(f"{base_name}_page_*.png")) + paged_matches.extend(self.dataset_source.glob(f"{base_name}_page_*.PNG")) + + if exact_matches: + deduped = {f.resolve(): f for f in exact_matches} + return sorted(deduped.values(), key=lambda p: p.name.lower()) + + deduped = {f.resolve(): f for f in paged_matches} + return sorted(deduped.values(), key=self._sort_by_page_suffix) + + @staticmethod + def _load_png_images(files: List[Path]) -> List[Image.Image]: + images: List[Image.Image] = [] + + for png_path in files: + with Image.open(png_path) as img: + images.append(img.convert("RGB")) + + return images + + 
+    @staticmethod
+    def _attach_page_images(
+        document: DoclingDocument,
+        page_images: List[Image.Image],
+    ) -> DoclingDocument:
+        for page_no, page_image in enumerate(page_images, start=1):
+            image_ref = ImageRef(
+                mimetype="image/png",
+                dpi=72,
+                size=Size(width=page_image.width, height=page_image.height),
+                uri=from_pil_to_base64uri(page_image),
+            )
+
+            if page_no in document.pages:
+                page_item = document.pages[page_no]
+                page_item.image = image_ref
+                if (
+                    page_item.size is None
+                    or page_item.size.width <= 0
+                    or page_item.size.height <= 0
+                ):
+                    page_item.size = Size(
+                        width=float(page_image.width),
+                        height=float(page_image.height),
+                    )
+            else:
+                document.pages[page_no] = PageItem(
+                    page_no=page_no,
+                    size=Size(
+                        width=float(page_image.width),
+                        height=float(page_image.height),
+                    ),
+                    image=image_ref,
+                )
+
+        return document
+
+    @staticmethod
+    def _page_images_to_pdf_bytes(page_images: List[Image.Image]) -> bytes:
+        if not page_images:
+            raise ValueError("page_images must not be empty")
+
+        pdf_buffer = BytesIO()
+        first_page = page_images[0]
+        other_pages = page_images[1:]
+
+        first_page.save(
+            pdf_buffer,
+            format="PDF",
+            save_all=True,
+            append_images=other_pages,
+        )
+
+        return pdf_buffer.getvalue()
+
+    def iterate(self) -> Iterable[DatasetRecord]:
+        assert isinstance(self.dataset_source, Path)
+
+        json_files = self._find_json_files()
+
+        begin, end = self.get_effective_indices(len(json_files))
+        selected_json_files = json_files[begin:end]
+
+        self.log_dataset_stats(len(json_files), len(selected_json_files))
+        _log.info(
+            "Processing DoclingSDG dataset with %d documents",
+            len(selected_json_files),
+        )
+
+        for json_path in tqdm(
+            selected_json_files,
+            desc="Processing files for DoclingSDG",
+            ncols=128,
+        ):
+            doc_id = json_path.stem
+
+            try:
+                document = DoclingDocument.load_from_json(json_path)
+            except ValidationError as exc:
+                _log.warning("Validation error for %s: %s. Skipping.", json_path, exc)
+                continue
+            except Exception as exc:  # noqa: BLE001
+                _log.warning("Failed to load %s: %s. Skipping.", json_path, exc)
+                continue
+
+            png_files = self._find_png_files_for_doc(doc_id)
+            if len(png_files) == 0:
+                _log.warning(
+                    "No matching PNG found for %s. Expected '%s.png' or '%s_page_*.png'. Skipping.",
+                    json_path.name,
+                    doc_id,
+                    doc_id,
+                )
+                continue
+
+            try:
+                page_images = self._load_png_images(png_files)
+            except Exception as exc:  # noqa: BLE001
+                _log.warning(
+                    "Failed to read PNG files for %s: %s. Skipping.", doc_id, exc
+                )
+                continue
+
+            self._attach_page_images(document, page_images)
+            document, pictures, extracted_page_images = extract_images(
+                document=document,
+                pictures_column=BenchMarkColumns.GROUNDTRUTH_PICTURES.value,
+                page_images_column=BenchMarkColumns.GROUNDTRUTH_PAGE_IMAGES.value,
+            )
+
+            if len(extracted_page_images) == 0:
+                extracted_page_images = page_images
+
+            if len(png_files) == 1:
+                original_bytes = get_binary(png_files[0])
+                original_stream = DocumentStream(
+                    name=png_files[0].name,
+                    stream=BytesIO(original_bytes),
+                )
+                mime_type = "image/png"
+            else:
+                original_bytes = self._page_images_to_pdf_bytes(page_images)
+                original_stream = DocumentStream(
+                    name=f"{doc_id}.pdf",
+                    stream=BytesIO(original_bytes),
+                )
+                mime_type = "application/pdf"
+
+            yield DatasetRecord(
+                doc_id=doc_id,
+                doc_path=json_path,
+                doc_hash=get_binhash(original_bytes),
+                ground_truth_doc=document,
+                ground_truth_pictures=pictures,
+                ground_truth_page_images=extracted_page_images,
+                original=original_stream,
+                mime_type=mime_type,
+            )