From 5cc61d93fb8ec8136fd52cbd087b92722fea86b3 Mon Sep 17 00:00:00 2001 From: odelliab Date: Wed, 25 Feb 2026 21:04:58 +0200 Subject: [PATCH 01/17] line_chunker --- .../transforms/chunker/line_chunker.py | 199 +++++++++++ test/test_line_chunker.py | 308 ++++++++++++++++++ 2 files changed, 507 insertions(+) create mode 100644 docling_core/transforms/chunker/line_chunker.py create mode 100644 test/test_line_chunker.py diff --git a/docling_core/transforms/chunker/line_chunker.py b/docling_core/transforms/chunker/line_chunker.py new file mode 100644 index 00000000..c2813f3d --- /dev/null +++ b/docling_core/transforms/chunker/line_chunker.py @@ -0,0 +1,199 @@ +import warnings +from typing import Any, Tuple, Optional + +from collections.abc import Iterator + +from pydantic import ConfigDict, Field + +from docling_core.types import DoclingDocument +from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta +from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer +from docling_core.transforms.chunker.hybrid_chunker import _get_default_tokenizer +from docling_core.transforms.chunker.hierarchical_chunker import ( + ChunkingSerializerProvider, +) +from docling_core.transforms.serializer.base import ( + BaseSerializerProvider, +) + + +class LineBasedTokenChunker(BaseChunker): + r"""Chunker doing tokenization-aware chunking of document text. Chunk contains full lines. + + Args: + tokenizer: The tokenizer to use; either instantiated object or name or path of + respective pretrained model + max_tokens: The maximum number of tokens per chunk. If not set, limit is + resolved from the tokenizer + prefix: a text that should appear at the beginning of each chunks, default is an empty string + """ + model_config = ConfigDict(arbitrary_types_allowed=True) + tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer) + prefix: str = "" + prefix_len: int = Field(default=0, init=False) + serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() + + @property + def max_tokens(self) -> int: + """Get maximum number of tokens allowed.""" + return self.tokenizer.get_max_tokens() + + def model_post_init(self, __context) -> None: + + self.prefix_len = self.tokenizer.count_tokens(self.prefix) + if self.prefix_len >= self.max_tokens: + warnings.warn( + f"Chunks prefix: {self.prefix} is too long for chunk size {self.max_tokens} and will be ignored" + ) + self.prefix = "" + self.prefix_len = 0 + + + def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: + """Chunk the provided document using line-based token-aware chunking. + + Args: + dl_doc (DoclingDocument): document to chunk + + Yields: + Iterator[BaseChunk]: iterator over extracted chunks + """ + my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc) + + # Serialize the entire document to get the text + ser_res = my_doc_ser.serialize() + + if not ser_res.text: + return + + # Use chunk_text to split the text into chunks + text_chunks = self.chunk_text(lines = ser_res.text.splitlines(True)) + + # Yield DocChunk objects for each text chunk + for chunk_text in text_chunks: + yield DocChunk( + text=chunk_text, + meta=DocMeta( + doc_items=ser_res.get_unique_doc_items(), + headings=None, + origin=dl_doc.origin, + ), + ) + + def chunk_text(self, lines: list[str]) -> list[str]: + chunks = [] + current = self.prefix + current_len = self.prefix_len + + for line in lines: + remaining = line + + while True: + line_tokens = self.tokenizer.count_tokens(remaining) + available = self.max_tokens - current_len + + # If the remaining part fits entirely into current chunk → append and stop + if line_tokens <= available: + current += remaining + current_len += line_tokens + break + + # Remaining does NOT fit into current chunk. + # If it CAN fit into a fresh chunk → flush current and start new one. + if line_tokens + self.prefix_len <= self.max_tokens: + chunks.append(current) + current = self.prefix + current_len = self.prefix_len + # loop continues to retry fitting `remaining` + continue + + # Remaining is too large even for an empty chunk → split it. + # Split off the first segment that fits into current. + take, remaining = self.split_by_token_limit( + remaining, + available + ) + + # Add the taken part + current += "\n" + take + current_len += self.tokenizer.count_tokens(take) + + # flush the current chunk (full) + chunks.append(current) + current = self.prefix + current_len = self.prefix_len + + # end while for this line + + # push final chunk if non-empty + if current != self.prefix: + chunks.append(current) + + return chunks + + + def split_by_token_limit( + self, + text: str, + token_limit: int, + prefer_word_boundary: bool = True, + ) -> Tuple[str, str]: + """ + Split `text` into (head, tail) where `head` has at most `token_limit` tokens, + and `tail` is the remainder. Uses binary search on character indices to minimize + calls to `count_tokens`. + + Parameters + ---------- + text : str + Input string to split. + token_limit: int + Maximum number of tokens allowed in the head. + prefer_word_boundary : bool + If True, try to end the head on a whitespace boundary (without violating + the token limit). If no boundary exists in range, fall back to the + exact max index found by search. + + Returns + ------- + (head, tail) : Tuple[str, str] + `head` contains at most `token_limit` tokens, `tail` is the remaining suffix. + If `token_limit <= 0`, returns ("", text). + """ + if token_limit <= 0 or not text: + return "", text + + # if the whole text already fits, return as is. + if self.tokenizer.count_tokens(text) <= token_limit: + return text, "" + + # Binary search over character indices [0, len(text)] + lo, hi = 0, len(text) + best_idx: Optional[int] = None + + while lo <= hi: + mid = (lo + hi) // 2 + head = text[:mid] + tok_count = self.tokenizer.count_tokens(head) + + if tok_count <= token_limit: + best_idx = mid # feasible; try to extend + lo = mid + 1 + else: + hi = mid - 1 + + if best_idx is None or best_idx <= 0: + # Even the first character exceeds the limit (e.g., tokenizer behavior). + # Return nothing in head, everything in tail. + return "", text + + # Optionally adjust to a previous whitespace boundary without violating the limit + if prefer_word_boundary: + # Search backwards from best_idx to find whitespace; keep within token limit. + + last_space_index= text[:best_idx].rfind(" ") + if last_space_index > 0: + best_idx = last_space_index + + head, tail = text[:best_idx], text[best_idx:] + return head, tail diff --git a/test/test_line_chunker.py b/test/test_line_chunker.py new file mode 100644 index 00000000..33f5e4cf --- /dev/null +++ b/test/test_line_chunker.py @@ -0,0 +1,308 @@ +import json +import pytest +from transformers import AutoTokenizer + +from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker +from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer +from docling_core.types.doc import DoclingDocument as DLDocument +from docling_core.types.doc.labels import DocItemLabel + +from .test_data_gen_flag import GEN_TEST_DATA + +EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" +MAX_TOKENS = 25 +INNER_TOKENIZER = AutoTokenizer.from_pretrained(EMBED_MODEL_ID) + + +def _process(act_data, exp_path_str): + """Helper function to either generate or compare test data.""" + if GEN_TEST_DATA: + with open(exp_path_str, mode="w", encoding="utf-8") as f: + json.dump(act_data, fp=f, indent=4) + f.write("\n") + else: + with open(exp_path_str, encoding="utf-8") as f: + exp_data = json.load(fp=f) + assert exp_data == act_data + + +def test_chunk_text_with_prefix(): + """Test text chunking with a prefix.""" + prefix = "Context: " + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + prefix=prefix, + ) + + lines = ["Line 1\n", "Line 2\n", "Line 3"] + chunks = chunker.chunk_text(lines) + + assert isinstance(chunks, list) + assert len(chunks) > 0 + # Each chunk should start with the prefix + for chunk in chunks: + assert isinstance(chunk, str) + assert chunk.startswith(prefix) + + +def test_chunk_text_long_prefix_warning(): + """Test that a warning is issued when prefix is too long.""" + # Create a very long prefix that exceeds max_tokens + long_prefix = "This is a very long prefix " * 50 + + with pytest.warns(UserWarning, match="too long for chunk size"): + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + prefix=long_prefix, + ) + + # Prefix should be reset to empty string + assert chunker.prefix == "" + assert chunker.prefix_len == 0 + + +def test_chunk_text_single_long_line(): + """Test chunking when a single line exceeds max_tokens.""" + + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + # Create a very long line + long_line = "word " * MAX_TOKENS * 5 + lines = [long_line] + chunks = chunker.chunk_text(lines) + + assert len(chunks) > 1 + # Verify each chunk respects token limit + for chunk in chunks: + token_count = chunker.tokenizer.count_tokens(chunk) + assert token_count <= MAX_TOKENS + + +def test_chunk_text_empty_string(): + """Test chunking an empty list.""" + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = chunker.chunk_text([]) + assert len(chunks) == 0 + + +def test_chunk_text_single_line(): + """Test chunking a single line that fits in one chunk.""" + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + text = "This is a single short line.\n" + lines = [text] + chunks = chunker.chunk_text(lines) + + assert len(chunks) == 1 + assert chunks[0] == text + # newline should be preserved + assert "\n" in chunks[0] + + +def test_split_by_token_limit(): + """Test the split_by_token_limit method.""" + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + available = 10 + text = "This is a test sentence with multiple words that should be split." + head, tail = chunker.split_by_token_limit(text, token_limit=available) + + assert len(head) > 0 + assert len(tail) > 0 + assert chunker.tokenizer.count_tokens(head) <= available + assert head + tail == text + + +def test_split_by_token_limit_zero_limit(): + """Test split_by_token_limit with zero token limit.""" + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + text = "Some text" + head, tail = chunker.split_by_token_limit(text, token_limit=0) + + assert head == "" + assert tail == text + + +def test_split_by_token_limit_fits_entirely(): + """Test split_by_token_limit when text fits within limit.""" + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + text = "Short text" + head, tail = chunker.split_by_token_limit(text, token_limit=100) + + assert head == text + assert tail == "" + + +def test_split_by_token_limit_word_boundary(): + """Test that split_by_token_limit prefers word boundaries.""" + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + text = "word1 word2 word3 word4 word5" + head, tail = chunker.split_by_token_limit(text, token_limit=5, prefer_word_boundary=True) + + # Head should end at a word boundary (space) + if len(head) > 0 and len(tail) > 0: + # Either head ends with a space or tail starts with a space + assert head[-1].isspace() or tail[0].isspace() or not head[-1].isalnum() + + + +def test_chunk_text_with_prefix_and_long_lines(): + """Test chunking with prefix when lines are long.""" + prefix = "PREFIX: " + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + prefix=prefix, + ) + + long_line = "This is a long line that will need to be split " * 3 + lines = [long_line] + chunks = chunker.chunk_text(lines) + + assert len(chunks) > 0 + for chunk in chunks: + assert chunk.startswith(prefix) + token_count = chunker.tokenizer.count_tokens(chunk) + assert token_count <= MAX_TOKENS + + + +def test_chunk_document(): + """Test the chunk() method with a DoclingDocument.""" + # Create a simple DoclingDocument + doc = DLDocument(name="test_doc") + paragraphs = ["This is the first paragraph with some content.", + "This is the second paragraph with more content", + "This is the third paragraph with even more content."] + + + # Add some text items to the document + for t in paragraphs: + doc.add_text(label=DocItemLabel.PARAGRAPH, text=t) + + # Create chunker + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + # Chunk the document + chunks = list(chunker.chunk(doc)) + + # Verify chunks were created + assert len(chunks) > 0 + + # Verify each chunk is a DocChunk with proper structure + for chunk in chunks: + assert hasattr(chunk, 'text') + assert hasattr(chunk, 'meta') + assert isinstance(chunk.text, str) + assert len(chunk.text) > 0 + + # Verify token count is within limit + token_count = chunker.tokenizer.count_tokens(chunk.text) + assert token_count <= MAX_TOKENS + + # Verify each paragraph resides fully in a chunk + for t in paragraphs: + assert any(t in c.text for c in chunks) + + +def test_chunk_empty_document(): + """Test the chunk() method with an empty document.""" + # Create an empty DoclingDocument + doc = DLDocument(name="empty_doc") + + # Create chunker + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + ) + + # Chunk the document + chunks = list(chunker.chunk(doc)) + + # Should return no chunks for empty document + assert len(chunks) == 0 + + +def test_chunk_document_with_long_content(): + """Test the chunk() method with long content that requires multiple chunks.""" + # Create a DoclingDocument with long content + doc = DLDocument(name="long_doc") + prefix = "Document: " + + # Add a very long paragraph + long_text = "This is a sentence with multiple words. " * 50 + doc.add_text(label=DocItemLabel.PARAGRAPH, text=long_text) + + chunker = LineBasedTokenChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=MAX_TOKENS, + ), + prefix=prefix + ) + + # Chunk the document + chunks = list(chunker.chunk(doc)) + + # Should create multiple chunks + assert len(chunks) > 1 + + # Verify each chunk respects token limit + for chunk in chunks: + assert chunk.text.startswith(prefix) + token_count = chunker.tokenizer.count_tokens(chunk.text) + assert token_count <= MAX_TOKENS From 91b43f97e44c7c06ef35695222faa41053109cfb Mon Sep 17 00:00:00 2001 From: odelliab Date: Wed, 25 Feb 2026 21:09:56 +0200 Subject: [PATCH 02/17] split table to header and body --- docling_core/transforms/serializer/base.py | 20 +++++++++++++ .../transforms/serializer/markdown.py | 30 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/docling_core/transforms/serializer/base.py b/docling_core/transforms/serializer/base.py index 6dfbaab3..d5a67b0a 100644 --- a/docling_core/transforms/serializer/base.py +++ b/docling_core/transforms/serializer/base.py @@ -78,6 +78,26 @@ def serialize( """Serializes the passed item.""" ... + def get_header_and_body_lines( + self, + *, + table_text: str, + **kwargs: Any, + ) -> tuple[list[str], list[str]]: + """Get header lines and body lines from the table. + + Returns: + A tuple of (header_lines, body_lines) where header_lines is a list + of strings representing table headers and body_lines is a list of + strings representing table body rows. + + Default implementation returns empty header lines and all content in body lines. + """ + # default: empty headers, all content in body + header_lines: list[str] = [] + body_lines = [line for line in table_text.split("\n") if line.strip()] + return header_lines, body_lines + class BasePictureSerializer(ABC): """Base class for picture item serializers.""" diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 30bf623a..5d4a029e 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -351,6 +351,36 @@ def serialize( class MarkdownTableSerializer(BaseTableSerializer): """Markdown-specific table item serializer.""" + @override + def get_header_and_body_lines( + self, + *, + table_text: str, + + **kwargs: Any, + ) -> tuple[list[str], list[str]]: + """Get header lines and body lines from the markdown table. + + Returns: + A tuple of (header_lines, body_lines) where header_lines contains + the header row and separator row, and body_lines contains the data rows. + """ + + lines = [line for line in table_text.split("\n") if line.strip()] + + if len(lines) < 2: + # Not enough lines for a proper markdown table (need at least header + separator) + return [], lines + + # In markdown tables: + # Line 0: Header row + # Line 1: Separator row (with dashes) + # Lines 2+: Body rows + header_lines = lines[:2] + body_lines = lines[2:] + + return header_lines, body_lines + @staticmethod def _compact_table(table_text: str) -> str: """Remove padding from a markdown table. From 5d17bdacf2acb6d300e7e44e83309bb182e81189 Mon Sep 17 00:00:00 2001 From: odelliab Date: Wed, 25 Feb 2026 21:12:04 +0200 Subject: [PATCH 03/17] duplicat table headers --- .../transforms/chunker/hybrid_chunker.py | 41 +- test/data/chunker/0c_out_chunks.json | 1111 +++++++++++++++++ test/test_hybrid_chunker.py | 101 +- 3 files changed, 1243 insertions(+), 10 deletions(-) create mode 100644 test/data/chunker/0c_out_chunks.json diff --git a/docling_core/transforms/chunker/hybrid_chunker.py b/docling_core/transforms/chunker/hybrid_chunker.py index 46ca1651..cef90688 100644 --- a/docling_core/transforms/chunker/hybrid_chunker.py +++ b/docling_core/transforms/chunker/hybrid_chunker.py @@ -8,10 +8,12 @@ from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator from docling_core.transforms.chunker.hierarchical_chunker import ( + ChunkingDocSerializer, ChunkingSerializerProvider, ) +from docling_core.transforms.serializer.base import BaseDocSerializer from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer -from docling_core.types.doc.document import SectionHeaderItem, TitleItem +from docling_core.types.doc.document import SectionHeaderItem, TitleItem, TableItem try: import semchunk @@ -37,7 +39,6 @@ ) from docling_core.types import DoclingDocument - def _get_default_tokenizer(): from docling_core.transforms.chunker.tokenizer.huggingface import ( HuggingFaceTokenizer, @@ -61,6 +62,7 @@ class HybridChunker(BaseChunker): model_config = ConfigDict(arbitrary_types_allowed=True) tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer) + duplicate_table_header: bool = True merge_peers: bool = True serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() @@ -214,7 +216,9 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial def _split_using_plain_text( self, - doc_chunk: DocChunk, + doc_chunk: DocChunk, + doc_serializer:ChunkingDocSerializer, + ) -> list[DocChunk]: lengths = self._doc_chunk_length(doc_chunk) if lengths.total_len <= self.max_tokens: @@ -223,7 +227,7 @@ def _split_using_plain_text( # How much room is there for text after subtracting out the headers and # captions: available_length = self.max_tokens - lengths.other_len - sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length) + if available_length <= 0: warnings.warn( "Headers and captions for this chunk are longer than the total " @@ -233,12 +237,31 @@ def _split_using_plain_text( new_chunk = DocChunk(**doc_chunk.export_json_dict()) new_chunk.meta.captions = None new_chunk.meta.headings = None - return self._split_using_plain_text(doc_chunk=new_chunk) - text = doc_chunk.text - segments = sem_chunker.chunk(text) + return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer) + + segments = self.segment(doc_chunk,available_length,doc_serializer) chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments] return chunks + def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer:ChunkingDocSerializer) -> list[str]: + segments = [] + if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem): + + header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines( + table_text=doc_chunk.text) + from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker + line_chunker = LineBasedTokenChunker( + tokenizer=self.tokenizer, + max_tokens=available_length, + prefix="\n".join(header_lines) + ) + segments = line_chunker.chunk_text(lines=body_lines) + else: + sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length) + segments= sem_chunker.chunk(doc_chunk.text) + return segments + + def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): output_chunks = [] window_start = 0 @@ -246,7 +269,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): num_chunks = len(chunks) while window_end < num_chunks: chunk = chunks[window_end] - headings = chunk.meta.headings + headings = chunk.meta.headings ready_to_append = False if window_start == window_end: current_headings = headings @@ -306,7 +329,7 @@ def chunk( **kwargs, ) # type: ignore res = [x for c in res for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)] - res = [x for c in res for x in self._split_using_plain_text(c)] + res = [x for c in res for x in self._split_using_plain_text(c, doc_serializer=my_doc_ser)] if self.merge_peers: res = self._merge_chunks_with_matching_metadata(res) return iter(res) diff --git a/test/data/chunker/0c_out_chunks.json b/test/data/chunker/0c_out_chunks.json new file mode 100644 index 00000000..395d8ae5 --- /dev/null +++ b/test/data/chunker/0c_out_chunks.json @@ -0,0 +1,1111 @@ +[ + { + "text": "In this image we can see a cartoon image of a duck holding a paper.\n\n", + "meta": { + "doc_items": [ + "#/pictures/0" + ], + "headings": null + } + }, + { + "text": "Version 1.0\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\nAI4K Group, IBM Research R\u00a8 uschlikon, Switzerland", + "meta": { + "doc_items": [ + "#/texts/2", + "#/texts/3", + "#/texts/4" + ], + "headings": [ + "Docling Technical Report" + ] + } + }, + { + "text": "This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.", + "meta": { + "doc_items": [ + "#/texts/6" + ], + "headings": [ + "Docling Technical Report", + "Abstract" + ] + } + }, + { + "text": "Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.\nWith Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.", + "meta": { + "doc_items": [ + "#/texts/8", + "#/texts/9" + ], + "headings": [ + "Docling Technical Report", + "1 Introduction" + ] + } + }, + { + "text": "Here is what Docling delivers today:\n- \u00b7 Converts PDF documents to JSON or Markdown format, stable and lightning fast\n- \u00b7 Understands detailed page layout, reading order, locates figures and recovers table structures\n- \u00b7 Extracts metadata from the document, such as title, authors, references and language\n- \u00b7 Optionally applies OCR, e.g. for scanned PDFs\n- \u00b7 Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)\n- \u00b7 Can leverage different accelerators (GPU, MPS, etc).", + "meta": { + "doc_items": [ + "#/texts/12", + "#/texts/13", + "#/texts/14", + "#/texts/15", + "#/texts/16", + "#/texts/17", + "#/texts/18" + ], + "headings": [ + "Docling Technical Report", + "1 Introduction" + ] + } + }, + { + "text": "To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling. All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance.\nDocling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository.\n```\nfrom docling.document_converter import DocumentConverter Large\n```", + "meta": { + "doc_items": [ + "#/texts/20", + "#/texts/21", + "#/texts/22" + ], + "headings": [ + "Docling Technical Report", + "2 Getting Started" + ] + } + }, + { + "text": "```\nsource = \"https://arxiv.org/pdf/2206.01062\" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: \"## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]\"\n```\nOptionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.", + "meta": { + "doc_items": [ + "#/texts/23", + "#/texts/24" + ], + "headings": [ + "Docling Technical Report", + "2 Getting Started" + ] + } + }, + { + "text": "Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.", + "meta": { + "doc_items": [ + "#/texts/26" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline" + ] + } + }, + { + "text": "Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive\n1 see huggingface.co/ds4sd/docling-models/\nIn this image, we can see some text and images.\n\nFigure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.\n\n\nlicensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].", + "meta": { + "doc_items": [ + "#/texts/28", + "#/texts/29", + "#/pictures/1", + "#/texts/31", + "#/texts/47" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.1 PDF backends" + ] + } + }, + { + "text": "We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.", + "meta": { + "doc_items": [ + "#/texts/48" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.1 PDF backends" + ] + } + }, + { + "text": "As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.", + "meta": { + "doc_items": [ + "#/texts/50" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.2 AI models" + ] + } + }, + { + "text": "Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].\nThe Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.", + "meta": { + "doc_items": [ + "#/texts/52", + "#/texts/53" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.2 AI models", + "Layout Analysis Model" + ] + } + }, + { + "text": "The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].", + "meta": { + "doc_items": [ + "#/texts/55" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.2 AI models", + "Table Structure Recognition" + ] + } + }, + { + "text": "The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells.", + "meta": { + "doc_items": [ + "#/texts/57" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.2 AI models", + "Table Structure Recognition" + ] + } + }, + { + "text": "Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page).\nWe are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.", + "meta": { + "doc_items": [ + "#/texts/59", + "#/texts/60" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.2 AI models", + "OCR" + ] + } + }, + { + "text": "In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request.", + "meta": { + "doc_items": [ + "#/texts/62" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.3 Assembly" + ] + } + }, + { + "text": "Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.\nImplementations of model classes must satisfy the python Callable interface. The \\_\\_call\\_\\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.", + "meta": { + "doc_items": [ + "#/texts/64", + "#/texts/65" + ], + "headings": [ + "Docling Technical Report", + "3 Processing pipeline", + "3.4 Extensibility" + ] + } + }, + { + "text": "In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.\nIf you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.", + "meta": { + "doc_items": [ + "#/texts/67", + "#/texts/68" + ], + "headings": [ + "Docling Technical Report", + "4 Performance" + ] + } + }, + { + "text": "Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and\ntorch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.\n", + "meta": { + "doc_items": [ + "#/texts/69", + "#/texts/71", + "#/texts/72" + ], + "headings": [ + "Docling Technical Report", + "4 Performance" + ] + } + }, + { + "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || - | - | - | - | - | - | - | - || | | TTS | Pages/s | Mem | TTS | Pages/s | Mem || Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |", + "meta": { + "doc_items": [ + "#/tables/0" + ], + "headings": [ + "Docling Technical Report", + "4 Performance" + ] + } + }, + { + "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |", + "meta": { + "doc_items": [ + "#/tables/0" + ], + "headings": [ + "Docling Technical Report", + "4 Performance" + ] + } + }, + { + "text": "Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.", + "meta": { + "doc_items": [ + "#/texts/74" + ], + "headings": [ + "Docling Technical Report", + "5 Applications" + ] + } + }, + { + "text": "Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.\nWe encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.", + "meta": { + "doc_items": [ + "#/texts/76", + "#/texts/77" + ], + "headings": [ + "Docling Technical Report", + "6 Future work and contributions" + ] + } + }, + { + "text": "- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0.", + "meta": { + "doc_items": [ + "#/texts/79" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S.", + "meta": { + "doc_items": [ + "#/texts/80" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "Chintala. Pytorch 2: Faster\nmachine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf .", + "meta": { + "doc_items": [ + "#/texts/80", + "#/texts/82" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.\n- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .\n- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.\n- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit .", + "meta": { + "doc_items": [ + "#/texts/83", + "#/texts/84", + "#/texts/85", + "#/texts/86" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .\n- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\\_index .", + "meta": { + "doc_items": [ + "#/texts/87", + "#/texts/88" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos\u00b4 e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\\_3 .", + "meta": { + "doc_items": [ + "#/texts/89" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .", + "meta": { + "doc_items": [ + "#/texts/90" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .\n- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.", + "meta": { + "doc_items": [ + "#/texts/91", + "#/texts/92" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.\n- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .\n- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .\n- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023.", + "meta": { + "doc_items": [ + "#/texts/93", + "#/texts/94", + "#/texts/95", + "#/texts/96" + ], + "headings": [ + "Docling Technical Report", + "References" + ] + } + }, + { + "text": "In this section, we illustrate a few examples of Docling's output in Markdown and JSON.", + "meta": { + "doc_items": [ + "#/texts/100" + ], + "headings": [ + "Docling Technical Report", + "Appendix" + ] + } + }, + { + "text": "Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com\nChristoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com\nMichele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com\nAhmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com\nPeter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", + "meta": { + "doc_items": [ + "#/texts/103", + "#/texts/104", + "#/texts/105", + "#/texts/106", + "#/texts/107" + ], + "headings": [ + "Docling Technical Report", + "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" + ] + } + }, + { + "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models.", + "meta": { + "doc_items": [ + "#/texts/109" + ], + "headings": [ + "Docling Technical Report", + "ABSTRACT" + ] + } + }, + { + "text": "We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", + "meta": { + "doc_items": [ + "#/texts/109" + ], + "headings": [ + "Docling Technical Report", + "ABSTRACT" + ] + } + }, + { + "text": "\u00b7 Informationsystems \u2192 Documentstructure ; \u00b7 Appliedcomputing \u2192 Document analysis ; \u00b7 Computing methodologies \u2192 Machine learning Computer vision ; ; Object detection ;\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA \u00a9 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\nBirgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com\nChristoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com", + "meta": { + "doc_items": [ + "#/texts/111", + "#/texts/112", + "#/texts/113", + "#/texts/114" + ], + "headings": [ + "Docling Technical Report", + "CCS CONCEPTS" + ] + } + }, + { + "text": "Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com\nAhmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com\nPeter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", + "meta": { + "doc_items": [ + "#/texts/115", + "#/texts/116", + "#/texts/117" + ], + "headings": [ + "Docling Technical Report", + "CCS CONCEPTS" + ] + } + }, + { + "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models.", + "meta": { + "doc_items": [ + "#/texts/119" + ], + "headings": [ + "Docling Technical Report", + "ABSTRACT" + ] + } + }, + { + "text": "We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", + "meta": { + "doc_items": [ + "#/texts/119" + ], + "headings": [ + "Docling Technical Report", + "ABSTRACT" + ] + } + }, + { + "text": "\u00c6 Information systems \u2192 Document structure ; \u00c6 Applied computing \u2192 Document analysis ; \u00c6 Computing methodologies \u2192 Machine learning ; Computer vision ; Object detection ;\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).\nKDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\nFigure 1: Four examples of complex page layouts across different document categories", + "meta": { + "doc_items": [ + "#/texts/121", + "#/texts/122", + "#/texts/123", + "#/texts/124" + ], + "headings": [ + "Docling Technical Report", + "CCS CONCEPTS" + ] + } + }, + { + "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", + "meta": { + "doc_items": [ + "#/texts/126" + ], + "headings": [ + "Docling Technical Report", + "KEYWORDS" + ] + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043\nIn this image there is a table with some text on it.\n\n\nIn this image we can see a text.\n\n\nAGL Energy Limited ABN 74 1\n5 061 375\nIn this image I can see the text on the image.\n\n\nIn this image there is a paper with some text on it.\n\n\nFigure 1: Four examples of complex page layouts across different document categories", + "meta": { + "doc_items": [ + "#/texts/128", + "#/pictures/2", + "#/pictures/3", + "#/texts/393", + "#/texts/394", + "#/pictures/4", + "#/pictures/5", + "#/texts/503" + ], + "headings": [ + "Docling Technical Report", + "ACM Reference Format:" + ] + } + }, + { + "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", + "meta": { + "doc_items": [ + "#/texts/505" + ], + "headings": [ + "Docling Technical Report", + "KEYWORDS" + ] + } + }, + { + "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043\n1 INTRODUCTION", + "meta": { + "doc_items": [ + "#/texts/507", + "#/texts/508" + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ] + } + }, + { + "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", + "meta": { + "doc_items": [ + "#/texts/509" + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ] + } + }, + { + "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", + "meta": { + "doc_items": [ + "#/texts/511", + "#/texts/512" + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ] + } + }, + { + "text": "| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |\n| - | - | - | - | - |\n| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |", + "meta": { + "doc_items": [ + "#/tables/1" + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ] + } + }, + { + "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page", + "meta": { + "doc_items": [ + "#/texts/513" + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ] + } + }, + { + "text": "in a typical timeframe of 20s to 60s, depending on its complexity.", + "meta": { + "doc_items": [ + "#/texts/513" + ], + "headings": [ + "Docling Technical Report", + "ACMReference Format:" + ] + } + }, + { + "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this\nIn this image, we can see a table.\n\n\nThird, achienec", + "meta": { + "doc_items": [ + "#/texts/515", + "#/pictures/6", + "#/texts/516" + ], + "headings": [ + "Docling Technical Report", + "5 EXPERIMENTS" + ] + } + }, + { + "text": "chalenongayouls ground-vuth dawa such WC\nThe image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the\n\n", + "meta": { + "doc_items": [ + "#/texts/518", + "#/pictures/7" + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ] + } + }, + { + "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.\nIn this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", + "meta": { + "doc_items": [ + "#/texts/519", + "#/texts/520", + "#/texts/521" + ], + "headings": [ + "Docling Technical Report", + "EXPERIMENTS" + ] + } + }, + { + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and", + "meta": { + "doc_items": [ + "#/texts/523" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.\nof row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", + "meta": { + "doc_items": [ + "#/texts/523", + "#/texts/524", + "#/texts/526", + "#/texts/527", + "#/texts/529", + "#/texts/530", + "#/texts/531", + "#/texts/532", + "#/texts/533" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.\n\n", + "meta": { + "doc_items": [ + "#/pictures/8" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.\n\n", + "meta": { + "doc_items": [ + "#/pictures/9" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", + "meta": { + "doc_items": [ + "#/tables/3" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "In this image I can see a blue circle.\n\n\ninclude publication repositories such as arXiv\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-\nannotated pages, from which we obtain accuracy ranges.\nA table with different columns and rows.\n\n", + "meta": { + "doc_items": [ + "#/pictures/10", + "#/texts/695", + "#/texts/696", + "#/texts/697", + "#/pictures/11" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", + "meta": { + "doc_items": [ + "#/tables/4" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |\n3\n,\ngovernment offices,\nWe reviewed the col-\n,\nPage-\nTitle and\n.", + "meta": { + "doc_items": [ + "#/tables/4", + "#/texts/906", + "#/texts/907", + "#/texts/908", + "#/texts/909", + "#/texts/910", + "#/texts/911", + "#/texts/912", + "#/texts/913" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "page. Specificity ensures that the choice of label is not ambiguous,\nIn this image there is a table with some text on it.\n\n\nwe distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific\nonly. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can\nquality controls. Phase one and two required a small team of experts to a document category, such as\nAbstract in the\nScientific Articles were assembled and supervised.\ncategory. We also avoided class labels that are tightly linked to the\nPhase 1: Data selection and preparation.\nOur inclusion cri-\nAuthor\nAffiliation", + "meta": { + "doc_items": [ + "#/texts/914", + "#/pictures/12", + "#/texts/981", + "#/texts/982", + "#/texts/983", + "#/texts/984", + "#/texts/985", + "#/texts/986", + "#/texts/987", + "#/texts/988", + "#/texts/989", + "#/texts/990" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + }, + { + "text": "teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).\nsemantics of the text. Labels such as and\n,\nas seen", + "meta": { + "doc_items": [ + "#/texts/991", + "#/texts/992", + "#/texts/993", + "#/texts/994" + ], + "headings": [ + "Docling Technical Report", + "Baselines for Object Detection" + ] + } + } +] diff --git a/test/test_hybrid_chunker.py b/test/test_hybrid_chunker.py index 79e976fd..b9a95863 100644 --- a/test/test_hybrid_chunker.py +++ b/test/test_hybrid_chunker.py @@ -15,7 +15,7 @@ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer -from docling_core.transforms.serializer.markdown import MarkdownTableSerializer +from docling_core.transforms.serializer.markdown import MarkdownTableSerializer, MarkdownParams from docling_core.types.doc import DoclingDocument as DLDocument from docling_core.types.doc.document import DoclingDocument from docling_core.types.doc.labels import DocItemLabel @@ -412,3 +412,102 @@ class Setup: act_data=act_data, exp_path_str=setup.exp, ) + +def test_chunk_with_duplicate_table_header(): + """Test that table headers are repeated when a table is split across chunks.""" + INPUT_FILE = "test/data/chunker/0_inp_dl_doc.json" + EXPECTED_OUT_FILE = "test/data/chunker/0c_out_chunks.json" + + with open(INPUT_FILE, encoding="utf-8") as f: + data_json = f.read() + dl_doc = DLDocument.model_validate_json(data_json) + + # Verify the document has tables + assert len(dl_doc.tables) > 0, "Input file should contain at least one table" + + class MarkdownSerializerProvider(ChunkingSerializerProvider): + def get_serializer(self, doc: DoclingDocument): + return ChunkingDocSerializer( + doc=doc, + table_serializer=MarkdownTableSerializer(), + params = MarkdownParams(compact_tables=True), # Use compact table format to reduce token count + ) + + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer( + tokenizer=INNER_TOKENIZER, + max_tokens=250, + ), + merge_peers=True, + duplicate_table_header=True, + serializer_provider=MarkdownSerializerProvider(), + ) + # Create table serializer to serialize individual tables + serializer = chunker.serializer_provider.get_serializer(dl_doc) + + # Serialize each table item individually to get expected content + table_contents = {} + for table_item in dl_doc.tables: + # Serialize the table + ser_result = serializer.serialize( + item=table_item, + ) + table_contents[table_item.self_ref] = ser_result.text + + + chunks = list(chunker.chunk(dl_doc=dl_doc)) + # for chunk in chunks: + # print(chunk, file=open("output.txt", "a", encoding="utf-8")) + # print("+"*50, file=open("output.txt", "a", encoding="utf-8")) + + # Verify we got chunks + assert len(chunks) > 0, "Expected at least one chunk from the input document" + + # For each table, verify its content appears in chunks + for table_ref, table_text in table_contents.items(): + # Get header and body lines from the serialized table + if table_text: + header_lines, body_lines = serializer.table_serializer.get_header_and_body_lines( + table_text=table_text + ) + + # Find all chunks that contain content from this table + chunks_with_table = [chunk for chunk in chunks if table_ref in [i.self_ref for i in chunk.meta.doc_items]] + + # Verify table content appears in at least one chunk + assert len(chunks_with_table) > 0, f"Table {table_ref} content should appear in at least one chunk" + + # If table is split across multiple chunks, verify header is repeated + if len(chunks_with_table) > 1: + # Each chunk with table content should have the header + for chunk in chunks_with_table: + # Check if header lines are present + has_header = all( + header_line.strip() in chunk.text + for header_line in header_lines + ) + assert has_header, ( + f"Table {table_ref} split across chunks should have header repeated in each chunk. " + f"Missing header in chunk: {chunk.text[:200]}..." + ) + + # Verify all body lines appear somewhere in the chunks + all_chunk_text = "\n".join(chunk.text for chunk in chunks_with_table) + for body_line in body_lines: + assert body_line.strip() in all_chunk_text, ( + f"Table {table_ref} body line '{body_line.strip()}' should appear in chunks" + ) + + # Save chunks to output file for inspection + chunks_data = [ + { + "text": chunk.text, + "meta": { + "doc_items": [item.self_ref for item in chunk.meta.doc_items] if chunk.meta.doc_items else [], + "headings": chunk.meta.headings, + } + } + for chunk in chunks + ] + + _process(chunks_data, EXPECTED_OUT_FILE) \ No newline at end of file From a50392e53cccea75144f4307bd70055ceb1b150c Mon Sep 17 00:00:00 2001 From: odelliab Date: Wed, 25 Feb 2026 21:26:38 +0200 Subject: [PATCH 04/17] Revert "duplicat table headers" This reverts commit 5d17bdacf2acb6d300e7e44e83309bb182e81189. --- .../transforms/chunker/hybrid_chunker.py | 41 +- test/data/chunker/0c_out_chunks.json | 1111 ----------------- test/test_hybrid_chunker.py | 101 +- 3 files changed, 10 insertions(+), 1243 deletions(-) delete mode 100644 test/data/chunker/0c_out_chunks.json diff --git a/docling_core/transforms/chunker/hybrid_chunker.py b/docling_core/transforms/chunker/hybrid_chunker.py index cef90688..46ca1651 100644 --- a/docling_core/transforms/chunker/hybrid_chunker.py +++ b/docling_core/transforms/chunker/hybrid_chunker.py @@ -8,12 +8,10 @@ from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator from docling_core.transforms.chunker.hierarchical_chunker import ( - ChunkingDocSerializer, ChunkingSerializerProvider, ) -from docling_core.transforms.serializer.base import BaseDocSerializer from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer -from docling_core.types.doc.document import SectionHeaderItem, TitleItem, TableItem +from docling_core.types.doc.document import SectionHeaderItem, TitleItem try: import semchunk @@ -39,6 +37,7 @@ ) from docling_core.types import DoclingDocument + def _get_default_tokenizer(): from docling_core.transforms.chunker.tokenizer.huggingface import ( HuggingFaceTokenizer, @@ -62,7 +61,6 @@ class HybridChunker(BaseChunker): model_config = ConfigDict(arbitrary_types_allowed=True) tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer) - duplicate_table_header: bool = True merge_peers: bool = True serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() @@ -216,9 +214,7 @@ def _split_by_doc_items(self, doc_chunk: DocChunk, doc_serializer: BaseDocSerial def _split_using_plain_text( self, - doc_chunk: DocChunk, - doc_serializer:ChunkingDocSerializer, - + doc_chunk: DocChunk, ) -> list[DocChunk]: lengths = self._doc_chunk_length(doc_chunk) if lengths.total_len <= self.max_tokens: @@ -227,7 +223,7 @@ def _split_using_plain_text( # How much room is there for text after subtracting out the headers and # captions: available_length = self.max_tokens - lengths.other_len - + sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length) if available_length <= 0: warnings.warn( "Headers and captions for this chunk are longer than the total " @@ -237,31 +233,12 @@ def _split_using_plain_text( new_chunk = DocChunk(**doc_chunk.export_json_dict()) new_chunk.meta.captions = None new_chunk.meta.headings = None - return self._split_using_plain_text(doc_chunk=new_chunk, doc_serializer=doc_serializer) - - segments = self.segment(doc_chunk,available_length,doc_serializer) + return self._split_using_plain_text(doc_chunk=new_chunk) + text = doc_chunk.text + segments = sem_chunker.chunk(text) chunks = [DocChunk(text=s, meta=doc_chunk.meta) for s in segments] return chunks - def segment(self, doc_chunk: DocChunk, available_length: int, doc_serializer:ChunkingDocSerializer) -> list[str]: - segments = [] - if self.duplicate_table_header and len(doc_chunk.meta.doc_items) == 1 and isinstance(doc_chunk.meta.doc_items[0], TableItem): - - header_lines, body_lines = doc_serializer.table_serializer.get_header_and_body_lines( - table_text=doc_chunk.text) - from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker - line_chunker = LineBasedTokenChunker( - tokenizer=self.tokenizer, - max_tokens=available_length, - prefix="\n".join(header_lines) - ) - segments = line_chunker.chunk_text(lines=body_lines) - else: - sem_chunker = semchunk.chunkerify(self.tokenizer.get_tokenizer(), chunk_size=available_length) - segments= sem_chunker.chunk(doc_chunk.text) - return segments - - def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): output_chunks = [] window_start = 0 @@ -269,7 +246,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]): num_chunks = len(chunks) while window_end < num_chunks: chunk = chunks[window_end] - headings = chunk.meta.headings + headings = chunk.meta.headings ready_to_append = False if window_start == window_end: current_headings = headings @@ -329,7 +306,7 @@ def chunk( **kwargs, ) # type: ignore res = [x for c in res for x in self._split_by_doc_items(c, doc_serializer=my_doc_ser)] - res = [x for c in res for x in self._split_using_plain_text(c, doc_serializer=my_doc_ser)] + res = [x for c in res for x in self._split_using_plain_text(c)] if self.merge_peers: res = self._merge_chunks_with_matching_metadata(res) return iter(res) diff --git a/test/data/chunker/0c_out_chunks.json b/test/data/chunker/0c_out_chunks.json deleted file mode 100644 index 395d8ae5..00000000 --- a/test/data/chunker/0c_out_chunks.json +++ /dev/null @@ -1,1111 +0,0 @@ -[ - { - "text": "In this image we can see a cartoon image of a duck holding a paper.\n\n", - "meta": { - "doc_items": [ - "#/pictures/0" - ], - "headings": null - } - }, - { - "text": "Version 1.0\nChristoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar\nAI4K Group, IBM Research R\u00a8 uschlikon, Switzerland", - "meta": { - "doc_items": [ - "#/texts/2", - "#/texts/3", - "#/texts/4" - ], - "headings": [ - "Docling Technical Report" - ] - } - }, - { - "text": "This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.", - "meta": { - "doc_items": [ - "#/texts/6" - ], - "headings": [ - "Docling Technical Report", - "Abstract" - ] - } - }, - { - "text": "Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.\nWith Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.", - "meta": { - "doc_items": [ - "#/texts/8", - "#/texts/9" - ], - "headings": [ - "Docling Technical Report", - "1 Introduction" - ] - } - }, - { - "text": "Here is what Docling delivers today:\n- \u00b7 Converts PDF documents to JSON or Markdown format, stable and lightning fast\n- \u00b7 Understands detailed page layout, reading order, locates figures and recovers table structures\n- \u00b7 Extracts metadata from the document, such as title, authors, references and language\n- \u00b7 Optionally applies OCR, e.g. for scanned PDFs\n- \u00b7 Can be configured to be optimal for batch-mode (i.e high throughput, low time-to-solution) or interactive mode (compromise on efficiency, low time-to-solution)\n- \u00b7 Can leverage different accelerators (GPU, MPS, etc).", - "meta": { - "doc_items": [ - "#/texts/12", - "#/texts/13", - "#/texts/14", - "#/texts/15", - "#/texts/16", - "#/texts/17", - "#/texts/18" - ], - "headings": [ - "Docling Technical Report", - "1 Introduction" - ] - } - }, - { - "text": "To use Docling, you can simply install the docling package from PyPI. Documentation and examples are available in our GitHub repository at github.com/DS4SD/docling. All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance.\nDocling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository.\n```\nfrom docling.document_converter import DocumentConverter Large\n```", - "meta": { - "doc_items": [ - "#/texts/20", - "#/texts/21", - "#/texts/22" - ], - "headings": [ - "Docling Technical Report", - "2 Getting Started" - ] - } - }, - { - "text": "```\nsource = \"https://arxiv.org/pdf/2206.01062\" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: \"## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]\"\n```\nOptionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container.", - "meta": { - "doc_items": [ - "#/texts/23", - "#/texts/24" - ], - "headings": [ - "Docling Technical Report", - "2 Getting Started" - ] - } - }, - { - "text": "Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown.", - "meta": { - "doc_items": [ - "#/texts/26" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline" - ] - } - }, - { - "text": "Two basic requirements to process PDF documents in our pipeline are a) to retrieve all text content and their geometric coordinates on each page and b) to render the visual representation of each page as it would appear in a PDF viewer. Both these requirements are encapsulated in Docling's PDF backend interface. While there are several open-source PDF parsing libraries available for python, we faced major obstacles with all of them for different reasons, among which were restrictive\n1 see huggingface.co/ds4sd/docling-models/\nIn this image, we can see some text and images.\n\nFigure 1: Sketch of Docling's default processing pipeline. The inner part of the model pipeline is easily customizable and extensible.\n\n\nlicensing (e.g. pymupdf [7]), poor speed or unrecoverable quality issues, such as merged text cells across far-apart text tokens or table columns (pypdfium, PyPDF) [15, 14].", - "meta": { - "doc_items": [ - "#/texts/28", - "#/texts/29", - "#/pictures/1", - "#/texts/31", - "#/texts/47" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.1 PDF backends" - ] - } - }, - { - "text": "We therefore decided to provide multiple backend choices, and additionally open-source a custombuilt PDF parser, which is based on the low-level qpdf [4] library. It is made available in a separate package named docling-parse and powers the default PDF backend in Docling. As an alternative, we provide a PDF backend relying on pypdfium , which may be a safe backup choice in certain cases, e.g. if issues are seen with particular font encodings.", - "meta": { - "doc_items": [ - "#/texts/48" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.1 PDF backends" - ] - } - }, - { - "text": "As part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure recognition model. We provide the pre-trained weights (hosted on huggingface) and a separate package for the inference code as docling-ibm-models . Both models are also powering the open-access deepsearch-experience, our cloud-native service for knowledge exploration tasks.", - "meta": { - "doc_items": [ - "#/texts/50" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.2 AI models" - ] - } - }, - { - "text": "Our layout analysis model is an object-detector which predicts the bounding-boxes and classes of various elements on the image of a given page. Its architecture is derived from RT-DETR [16] and re-trained on DocLayNet [13], our popular human-annotated dataset for document-layout analysis, among other proprietary datasets. For inference, our implementation relies on the onnxruntime [5].\nThe Docling pipeline feeds page images at 72 dpi resolution, which can be processed on a single CPU with sub-second latency. All predicted bounding-box proposals for document elements are post-processed to remove overlapping proposals based on confidence and size, and then intersected with the text tokens in the PDF to group them into meaningful and complete units such as paragraphs, section titles, list items, captions, figures or tables.", - "meta": { - "doc_items": [ - "#/texts/52", - "#/texts/53" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.2 AI models", - "Layout Analysis Model" - ] - } - }, - { - "text": "The TableFormer model [12], first published in 2022 and since refined with a custom structure token language [9], is a vision-transformer model for table structure recovery. It can predict the logical row and column structure of a given table based on an input image, and determine which table cells belong to column headers, row headers or the table body. Compared to earlier approaches, TableFormer handles many characteristics of tables, such as partial or no borderlines, empty cells, rows or columns, cell spans and hierarchy both on column-heading or row-heading level, tables with inconsistent indentation or alignment and other complexities. For inference, our implementation relies on PyTorch [2].", - "meta": { - "doc_items": [ - "#/texts/55" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.2 AI models", - "Table Structure Recognition" - ] - } - }, - { - "text": "The Docling pipeline feeds all table objects detected in the layout analysis to the TableFormer model, by providing an image-crop of the table and the included text cells. TableFormer structure predictions are matched back to the PDF cells in post-processing to avoid expensive re-transcription text in the table image. Typical tables require between 2 and 6 seconds to be processed on a standard CPU, strongly depending on the amount of included table cells.", - "meta": { - "doc_items": [ - "#/texts/57" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.2 AI models", - "Table Structure Recognition" - ] - } - }, - { - "text": "Docling provides optional support for OCR, for example to cover scanned PDFs or content in bitmaps images embedded on a page. In our initial release, we rely on EasyOCR [1], a popular thirdparty OCR library with support for many languages. Docling, by default, feeds a high-resolution page image (216 dpi) to the OCR engine, to allow capturing small print detail in decent quality. While EasyOCR delivers reasonable transcription quality, we observe that it runs fairly slow on CPU (upwards of 30 seconds per page).\nWe are actively seeking collaboration from the open-source community to extend Docling with additional OCR backends and speed improvements.", - "meta": { - "doc_items": [ - "#/texts/59", - "#/texts/60" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.2 AI models", - "OCR" - ] - } - }, - { - "text": "In the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliary package docling-core . The generated document object is passed through a post-processing model which leverages several algorithms to augment features, such as detection of the document language, correcting the reading order, matching figures with captions and labelling metadata such as title, authors and references. The final output can then be serialized to JSON or transformed into a Markdown representation at the users request.", - "meta": { - "doc_items": [ - "#/texts/62" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.3 Assembly" - ] - } - }, - { - "text": "Docling provides a straight-forward interface to extend its capabilities, namely the model pipeline. A model pipeline constitutes the central part in the processing, following initial document parsing and preceding output assembly, and can be fully customized by sub-classing from an abstract baseclass ( BaseModelPipeline ) or cloning the default model pipeline. This effectively allows to fully customize the chain of models, add or replace models, and introduce additional pipeline configuration parameters. To use a custom model pipeline, the custom pipeline class to instantiate can be provided as an argument to the main document conversion methods. We invite everyone in the community to propose additional or alternative models and improvements.\nImplementations of model classes must satisfy the python Callable interface. The \\_\\_call\\_\\_ method must accept an iterator over page objects, and produce another iterator over the page objects which were augmented with the additional features predicted by the model, by extending the provided PagePredictions data model accordingly.", - "meta": { - "doc_items": [ - "#/texts/64", - "#/texts/65" - ], - "headings": [ - "Docling Technical Report", - "3 Processing pipeline", - "3.4 Extensibility" - ] - } - }, - { - "text": "In this section, we establish some reference numbers for the processing speed of Docling and the resource budget it requires. All tests in this section are run with default options on our standard test set distributed with Docling, which consists of three papers from arXiv and two IBM Redbooks, with a total of 225 pages. Measurements were taken using both available PDF backends on two different hardware systems: one MacBook Pro M3 Max, and one bare-metal server running Ubuntu 20.04 LTS on an Intel Xeon E5-2690 CPU. For reproducibility, we fixed the thread budget (through setting OMP NUM THREADS environment variable ) once to 4 (Docling default) and once to 16 (equal to full core count on the test hardware). All results are shown in Table 1.\nIf you need to run Docling in very low-resource environments, please consider configuring the pypdfium backend. While it is faster and more memory efficient than the default docling-parse backend, it will come at the expense of worse quality results, especially in table structure recovery.", - "meta": { - "doc_items": [ - "#/texts/67", - "#/texts/68" - ], - "headings": [ - "Docling Technical Report", - "4 Performance" - ] - } - }, - { - "text": "Establishing GPU acceleration support for the AI models is currently work-in-progress and largely untested, but may work implicitly when CUDA is available and discovered by the onnxruntime and\ntorch runtimes backing the Docling pipeline. We will deliver updates on this topic at in a future version of this report.\n", - "meta": { - "doc_items": [ - "#/texts/69", - "#/texts/71", - "#/texts/72" - ], - "headings": [ - "Docling Technical Report", - "4 Performance" - ] - } - }, - { - "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || - | - | - | - | - | - | - | - || | | TTS | Pages/s | Mem | TTS | Pages/s | Mem || Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |", - "meta": { - "doc_items": [ - "#/tables/0" - ], - "headings": [ - "Docling Technical Report", - "4 Performance" - ] - } - }, - { - "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |", - "meta": { - "doc_items": [ - "#/tables/0" - ], - "headings": [ - "Docling Technical Report", - "4 Performance" - ] - } - }, - { - "text": "Thanks to the high-quality, richly structured document conversion achieved by Docling, its output qualifies for numerous downstream applications. For example, Docling can provide a base for detailed enterprise document search, passage retrieval or classification use-cases, or support knowledge extraction pipelines, allowing specific treatment of different structures in the document, such as tables, figures, section structure or references. For popular generative AI application patterns, such as retrieval-augmented generation (RAG), we provide quackling , an open-source package which capitalizes on Docling's feature-rich document output to enable document-native optimized vector embedding and chunking. It plugs in seamlessly with LLM frameworks such as LlamaIndex [8]. Since Docling is fast, stable and cheap to run, it also makes for an excellent choice to build document-derived datasets. With its powerful table structure recognition, it provides significant benefit to automated knowledge-base construction [11, 10]. Docling is also integrated within the open IBM data prep kit [6], which implements scalable data transforms to build large-scale multi-modal training datasets.", - "meta": { - "doc_items": [ - "#/texts/74" - ], - "headings": [ - "Docling Technical Report", - "5 Applications" - ] - } - }, - { - "text": "Docling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of content, as well as augment extracted document metadata with additional information. Further investment into testing and optimizing GPU acceleration as well as improving the Docling-native PDF backend are on our roadmap, too.\nWe encourage everyone to propose or implement additional features and models, and will gladly take your inputs and contributions under review . The codebase of Docling is open for use and contribution, under the MIT license agreement and in alignment with our contributing guidelines included in the Docling repository. If you use Docling in your projects, please consider citing this technical report.", - "meta": { - "doc_items": [ - "#/texts/76", - "#/texts/77" - ], - "headings": [ - "Docling Technical Report", - "6 Future work and contributions" - ] - } - }, - { - "text": "- [1] J. AI. Easyocr: Ready-to-use ocr with 80+ supported languages. https://github.com/ JaidedAI/EasyOCR , 2024. Version: 1.7.0.", - "meta": { - "doc_items": [ - "#/texts/79" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [2] J. Ansel, E. Yang, H. He, N. Gimelshein, A. Jain, M. Voznesensky, B. Bao, P. Bell, D. Berard, E. Burovski, G. Chauhan, A. Chourdia, W. Constable, A. Desmaison, Z. DeVito, E. Ellison, W. Feng, J. Gong, M. Gschwind, B. Hirsh, S. Huang, K. Kalambarkar, L. Kirsch, M. Lazos, M. Lezcano, Y. Liang, J. Liang, Y. Lu, C. Luk, B. Maher, Y. Pan, C. Puhrsch, M. Reso, M. Saroufim, M. Y. Siraichi, H. Suk, M. Suo, P. Tillet, E. Wang, X. Wang, W. Wen, S. Zhang, X. Zhao, K. Zhou, R. Zou, A. Mathews, G. Chanan, P. Wu, and S.", - "meta": { - "doc_items": [ - "#/texts/80" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "Chintala. Pytorch 2: Faster\nmachine learning through dynamic python bytecode transformation and graph compilation. In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24) . ACM, 4 2024. doi: 10.1145/3620665.3640366. URL https://pytorch.org/assets/pytorch2-2.pdf .", - "meta": { - "doc_items": [ - "#/texts/80", - "#/texts/82" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [3] C. Auer, M. Dolfi, A. Carvalho, C. B. Ramis, and P. W. Staar. Delivering document conversion as a cloud service with high throughput and responsiveness. In 2022 IEEE 15th International Conference on Cloud Computing (CLOUD) , pages 363-373. IEEE, 2022.\n- [4] J. Berkenbilt. Qpdf: A content-preserving pdf document transformer, 2024. URL https: //github.com/qpdf/qpdf .\n- [5] O. R. developers. Onnx runtime. https://onnxruntime.ai/ , 2024. Version: 1.18.1.\n- [6] IBM. Data Prep Kit: a community project to democratize and accelerate unstructured data preparation for LLM app developers, 2024. URL https://github.com/IBM/ data-prep-kit .", - "meta": { - "doc_items": [ - "#/texts/83", - "#/texts/84", - "#/texts/85", - "#/texts/86" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [7] A. S. Inc. PyMuPDF, 2024. URL https://github.com/pymupdf/PyMuPDF .\n- [8] J. Liu. LlamaIndex, 11 2022. URL https://github.com/jerryjliu/llama\\_index .", - "meta": { - "doc_items": [ - "#/texts/87", - "#/texts/88" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [9] M. Lysak, A. Nassar, N. Livathinos, C. Auer, and P. Staar. Optimized Table Tokenization for Table Structure Recognition. In Document Analysis and Recognition - ICDAR 2023: 17th International Conference, San Jos\u00b4 e, CA, USA, August 21-26, 2023, Proceedings, Part II , pages 37-50, Berlin, Heidelberg, Aug. 2023. Springer-Verlag. ISBN 978-3-031-41678-1. doi: 10. 1007/978-3-031-41679-8 3. URL https://doi.org/10.1007/978-3-031-41679-8\\_3 .", - "meta": { - "doc_items": [ - "#/texts/89" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [10] L. Mishra, S. Dhibi, Y. Kim, C. Berrospi Ramis, S. Gupta, M. Dolfi, and P. Staar. Statements: Universal information extraction from tables with large language models for ESG KPIs. In D. Stammbach, J. Ni, T. Schimanski, K. Dutia, A. Singh, J. Bingler, C. Christiaen, N. Kushwaha, V. Muccione, S. A. Vaghefi, and M. Leippold, editors, Proceedings of the 1st Workshop on Natural Language Processing Meets Climate Change (ClimateNLP 2024) , pages 193-214, Bangkok, Thailand, Aug. 2024. Association for Computational Linguistics. URL https://aclanthology.org/2024.climatenlp-1.15 .", - "meta": { - "doc_items": [ - "#/texts/90" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [11] L. Morin, V. Weber, G. I. Meijer, F. Yu, and P. W. J. Staar. Patcid: an open-access dataset of chemical structures in patent documents. Nature Communications , 15(1):6532, August 2024. ISSN 2041-1723. doi: 10.1038/s41467-024-50779-y. URL https://doi.org/10.1038/ s41467-024-50779-y .\n- [12] A. Nassar, N. Livathinos, M. Lysak, and P. Staar. Tableformer: Table structure understanding with transformers. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 4614-4623, 2022.", - "meta": { - "doc_items": [ - "#/texts/91", - "#/texts/92" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "- [13] B. Pfitzmann, C. Auer, M. Dolfi, A. S. Nassar, and P. Staar. Doclaynet: a large humanannotated dataset for document-layout segmentation. pages 3743-3751, 2022.\n- [14] pypdf Maintainers. pypdf: A Pure-Python PDF Library, 2024. URL https://github.com/ py-pdf/pypdf .\n- [15] P. Team. PyPDFium2: Python bindings for PDFium, 2024. URL https://github.com/ pypdfium2-team/pypdfium2 .\n- [16] Y. Zhao, W. Lv, S. Xu, J. Wei, G. Wang, Q. Dang, Y. Liu, and J. Chen. Detrs beat yolos on real-time object detection, 2023.", - "meta": { - "doc_items": [ - "#/texts/93", - "#/texts/94", - "#/texts/95", - "#/texts/96" - ], - "headings": [ - "Docling Technical Report", - "References" - ] - } - }, - { - "text": "In this section, we illustrate a few examples of Docling's output in Markdown and JSON.", - "meta": { - "doc_items": [ - "#/texts/100" - ], - "headings": [ - "Docling Technical Report", - "Appendix" - ] - } - }, - { - "text": "Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com\nChristoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com\nMichele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com\nAhmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com\nPeter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", - "meta": { - "doc_items": [ - "#/texts/103", - "#/texts/104", - "#/texts/105", - "#/texts/106", - "#/texts/107" - ], - "headings": [ - "Docling Technical Report", - "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" - ] - } - }, - { - "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models.", - "meta": { - "doc_items": [ - "#/texts/109" - ], - "headings": [ - "Docling Technical Report", - "ABSTRACT" - ] - } - }, - { - "text": "We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", - "meta": { - "doc_items": [ - "#/texts/109" - ], - "headings": [ - "Docling Technical Report", - "ABSTRACT" - ] - } - }, - { - "text": "\u00b7 Informationsystems \u2192 Documentstructure ; \u00b7 Appliedcomputing \u2192 Document analysis ; \u00b7 Computing methodologies \u2192 Machine learning Computer vision ; ; Object detection ;\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s). KDD '22, August 14-18, 2022, Washington, DC, USA \u00a9 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\nBirgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com\nChristoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com", - "meta": { - "doc_items": [ - "#/texts/111", - "#/texts/112", - "#/texts/113", - "#/texts/114" - ], - "headings": [ - "Docling Technical Report", - "CCS CONCEPTS" - ] - } - }, - { - "text": "Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com\nAhmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com\nPeter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com", - "meta": { - "doc_items": [ - "#/texts/115", - "#/texts/116", - "#/texts/117" - ], - "headings": [ - "Docling Technical Report", - "CCS CONCEPTS" - ] - } - }, - { - "text": "Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large groundtruth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models.", - "meta": { - "doc_items": [ - "#/texts/119" - ], - "headings": [ - "Docling Technical Report", - "ABSTRACT" - ] - } - }, - { - "text": "We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.", - "meta": { - "doc_items": [ - "#/texts/119" - ], - "headings": [ - "Docling Technical Report", - "ABSTRACT" - ] - } - }, - { - "text": "\u00c6 Information systems \u2192 Document structure ; \u00c6 Applied computing \u2192 Document analysis ; \u00c6 Computing methodologies \u2192 Machine learning ; Computer vision ; Object detection ;\nPermission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).\nKDD '22, August 14-18, 2022, Washington, DC, USA ' 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08. https://doi.org/10.1145/3534678.3539043\nFigure 1: Four examples of complex page layouts across different document categories", - "meta": { - "doc_items": [ - "#/texts/121", - "#/texts/122", - "#/texts/123", - "#/texts/124" - ], - "headings": [ - "Docling Technical Report", - "CCS CONCEPTS" - ] - } - }, - { - "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", - "meta": { - "doc_items": [ - "#/texts/126" - ], - "headings": [ - "Docling Technical Report", - "KEYWORDS" - ] - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043\nIn this image there is a table with some text on it.\n\n\nIn this image we can see a text.\n\n\nAGL Energy Limited ABN 74 1\n5 061 375\nIn this image I can see the text on the image.\n\n\nIn this image there is a paper with some text on it.\n\n\nFigure 1: Four examples of complex page layouts across different document categories", - "meta": { - "doc_items": [ - "#/texts/128", - "#/pictures/2", - "#/pictures/3", - "#/texts/393", - "#/texts/394", - "#/pictures/4", - "#/pictures/5", - "#/texts/503" - ], - "headings": [ - "Docling Technical Report", - "ACM Reference Format:" - ] - } - }, - { - "text": "PDF document conversion, layout segmentation, object-detection, data set, Machine Learning", - "meta": { - "doc_items": [ - "#/texts/505" - ], - "headings": [ - "Docling Technical Report", - "KEYWORDS" - ] - } - }, - { - "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar. 2022. DocLayNet: A Large Human-Annotated Dataset for DocumentLayout Analysis. In Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD '22), August 14-18, 2022, Washington, DC, USA. ACM, New York, NY, USA, 9 pages. https://doi.org/10.1145/ 3534678.3539043\n1 INTRODUCTION", - "meta": { - "doc_items": [ - "#/texts/507", - "#/texts/508" - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ] - } - }, - { - "text": "Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. Figure 2: Title page of the DocLayNet paper (arxiv.org/pdf/2206.01062) - left PDF, right rendered Markdown. If recognized, metadata such as authors are appearing first under the title. Text content inside figures is currently dropped, the caption is retained and linked to the figure in the JSON representation (not shown).", - "meta": { - "doc_items": [ - "#/texts/509" - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ] - } - }, - { - "text": "KDD '22, August 14-18, 2022, Washington, DC, USA Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.", - "meta": { - "doc_items": [ - "#/texts/511", - "#/texts/512" - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ] - } - }, - { - "text": "| | human | MRCNN R50 R101 | FRCNN R101 | YOLO v5x6 |\n| - | - | - | - | - |\n| Caption Footnote Formula List-item Page-footer Page-header Picture Section-header Table Text Title All | 84-89 83-91 83-85 87-88 93-94 85-89 69-71 83-84 77-81 84-86 | 68.4 71.5 70.9 60.1 63.4 81.2 80.8 61.6 59.3 71.9 70.0 71.7 72.7 67.6 69.3 82.2 82.9 85.8 76.7 80.4 72.4 73.5 | 70.1 73.7 63.5 81.0 58.9 72.0 72.0 68.4 82.2 85.4 79.9 73.4 | 77.7 77.2 66.2 86.2 61.1 67.9 74.6 86.3 88.1 82.7 76.8 |", - "meta": { - "doc_items": [ - "#/tables/1" - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ] - } - }, - { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page", - "meta": { - "doc_items": [ - "#/texts/513" - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ] - } - }, - { - "text": "in a typical timeframe of 20s to 60s, depending on its complexity.", - "meta": { - "doc_items": [ - "#/texts/513" - ], - "headings": [ - "Docling Technical Report", - "ACMReference Format:" - ] - } - }, - { - "text": "The primary goal of DocLayNet is to obtain high-quality ML models capable of accurate document-layout analysis on a wide variety of challenging layouts. As discussed in Section 2, object detection models are currently the easiest to use, due to the standardisation of ground-truth data in COCO format [16] and the availability of general frameworks such as detectron2 [17]. Furthermore, baseline numbers in PubLayNet and DocBank were obtained using standard object detection models such as Mask R-CNN and Faster R-CNN. As such, we will relate to these object detection methods in this\nIn this image, we can see a table.\n\n\nThird, achienec", - "meta": { - "doc_items": [ - "#/texts/515", - "#/pictures/6", - "#/texts/516" - ], - "headings": [ - "Docling Technical Report", - "5 EXPERIMENTS" - ] - } - }, - { - "text": "chalenongayouls ground-vuth dawa such WC\nThe image is a line graph that shows the percentage of respondents who have used a specific type of training in the past 24 hours. The x-axis represents the time in hours, while the y-axis represents the percentage of respondents. The graph is titled \"Training.\"\n\nThe graph has two lines: one for the training type and one for the training duration. The training type line is labeled \"Training\" and the training duration line is labeled \"Training Duration.\"\n\nThe graph shows that the percentage of respondents who have used the training type increases as the time increases. Specifically, the percentage of respondents who have used the training type increases from 10% to 20% over the 24-hour period. The percentage of respondents who have used the training duration increases from 10% to 20% over the 24-hour period.\n\nThe graph also shows that the percentage of respondents who have used the training type increases as the\n\n", - "meta": { - "doc_items": [ - "#/texts/518", - "#/pictures/7" - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ] - } - }, - { - "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.\npaper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.\nIn this section, we will present several aspects related to the performance of object detection models on DocLayNet. Similarly as in PubLayNet, we will evaluate the quality of their predictions using mean average precision (mAP) with 10 overlaps that range from 0.5 to 0.95 in steps of 0.05 (mAP@0.5-0.95). These scores are computed by leveraging the evaluation code provided by the COCO API [16].", - "meta": { - "doc_items": [ - "#/texts/519", - "#/texts/520", - "#/texts/521" - ], - "headings": [ - "Docling Technical Report", - "EXPERIMENTS" - ] - } - }, - { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and", - "meta": { - "doc_items": [ - "#/texts/523" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.\nof row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "doc_items": [ - "#/texts/523", - "#/texts/524", - "#/texts/526", - "#/texts/527", - "#/texts/529", - "#/texts/530", - "#/texts/531", - "#/texts/532", - "#/texts/533" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "The image consists of a blue circle with the letter \"A\" written in white color. The circle is placed on a white background, which makes the letter \"A\" stand out prominently. The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom. The gradient effect gives a sense of depth and movement, making the letter \"A\" stand out more.\n\n### Description of Objects in the Image:\n1. **Circle**: The central element of the image is a blue circle.\n2. **White Letter \"A\"**: The letter \"A\" is written in white color.\n3. **Background**: The background is a gradient of blue and white, transitioning from a lighter shade at the top to a darker shade at the bottom.\n4. **Color Gradient**: The gradient effect is present, with the color transitioning from a lighter shade at the top to a darker shade at the bottom.\n\n", - "meta": { - "doc_items": [ - "#/pictures/8" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "In this image, there is a table with two columns. The first column is labeled \"Class label,\" and the second column is labeled \"Count.\" The first row in the table has the label \"Class label,\" and the count is 22524. The second row in the table has the label \"Count,\" and the count is 204.\n\n", - "meta": { - "doc_items": [ - "#/pictures/9" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", - "meta": { - "doc_items": [ - "#/tables/3" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "In this image I can see a blue circle.\n\n\ninclude publication repositories such as arXiv\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row \"Total\") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-\nannotated pages, from which we obtain accuracy ranges.\nA table with different columns and rows.\n\n", - "meta": { - "doc_items": [ - "#/pictures/10", - "#/texts/695", - "#/texts/696", - "#/texts/697", - "#/pictures/11" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", - "meta": { - "doc_items": [ - "#/tables/4" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |\n3\n,\ngovernment offices,\nWe reviewed the col-\n,\nPage-\nTitle and\n.", - "meta": { - "doc_items": [ - "#/tables/4", - "#/texts/906", - "#/texts/907", - "#/texts/908", - "#/texts/909", - "#/texts/910", - "#/texts/911", - "#/texts/912", - "#/texts/913" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "page. Specificity ensures that the choice of label is not ambiguous,\nIn this image there is a table with some text on it.\n\n\nwe distributed the annotation workload and performed continuous be annotated. We refrained from class labels that are very specific\nonly. For phases three and four, a group of 40 dedicated annotators while coverage ensures that all meaningful items on a page can\nquality controls. Phase one and two required a small team of experts to a document category, such as\nAbstract in the\nScientific Articles were assembled and supervised.\ncategory. We also avoided class labels that are tightly linked to the\nPhase 1: Data selection and preparation.\nOur inclusion cri-\nAuthor\nAffiliation", - "meta": { - "doc_items": [ - "#/texts/914", - "#/pictures/12", - "#/texts/981", - "#/texts/982", - "#/texts/983", - "#/texts/984", - "#/texts/985", - "#/texts/986", - "#/texts/987", - "#/texts/988", - "#/texts/989", - "#/texts/990" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "teria for documents were described in Section 3. A large effort went into ensuring that all documents are free to use. The data sources in DocBank, are often only distinguishable by discriminating on 3 https://arxiv.org/ Figure 4: Table 1 from the DocLayNet paper in the original PDF (A), as rendered Markdown (B) and in JSON representation (C). Spanning table cells, such as the multi-column header 'triple interannotator mAP@0.5-0.95 (%)', is repeated for each column in the Markdown representation (B), which guarantees that every data point can be traced back to row and column headings only by its grid coordinates in the table. In the JSON representation, the span information is reflected in the fields of each table cell (C).\nsemantics of the text. Labels such as and\n,\nas seen", - "meta": { - "doc_items": [ - "#/texts/991", - "#/texts/992", - "#/texts/993", - "#/texts/994" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - } -] diff --git a/test/test_hybrid_chunker.py b/test/test_hybrid_chunker.py index b9a95863..79e976fd 100644 --- a/test/test_hybrid_chunker.py +++ b/test/test_hybrid_chunker.py @@ -15,7 +15,7 @@ from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer -from docling_core.transforms.serializer.markdown import MarkdownTableSerializer, MarkdownParams +from docling_core.transforms.serializer.markdown import MarkdownTableSerializer from docling_core.types.doc import DoclingDocument as DLDocument from docling_core.types.doc.document import DoclingDocument from docling_core.types.doc.labels import DocItemLabel @@ -412,102 +412,3 @@ class Setup: act_data=act_data, exp_path_str=setup.exp, ) - -def test_chunk_with_duplicate_table_header(): - """Test that table headers are repeated when a table is split across chunks.""" - INPUT_FILE = "test/data/chunker/0_inp_dl_doc.json" - EXPECTED_OUT_FILE = "test/data/chunker/0c_out_chunks.json" - - with open(INPUT_FILE, encoding="utf-8") as f: - data_json = f.read() - dl_doc = DLDocument.model_validate_json(data_json) - - # Verify the document has tables - assert len(dl_doc.tables) > 0, "Input file should contain at least one table" - - class MarkdownSerializerProvider(ChunkingSerializerProvider): - def get_serializer(self, doc: DoclingDocument): - return ChunkingDocSerializer( - doc=doc, - table_serializer=MarkdownTableSerializer(), - params = MarkdownParams(compact_tables=True), # Use compact table format to reduce token count - ) - - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=250, - ), - merge_peers=True, - duplicate_table_header=True, - serializer_provider=MarkdownSerializerProvider(), - ) - # Create table serializer to serialize individual tables - serializer = chunker.serializer_provider.get_serializer(dl_doc) - - # Serialize each table item individually to get expected content - table_contents = {} - for table_item in dl_doc.tables: - # Serialize the table - ser_result = serializer.serialize( - item=table_item, - ) - table_contents[table_item.self_ref] = ser_result.text - - - chunks = list(chunker.chunk(dl_doc=dl_doc)) - # for chunk in chunks: - # print(chunk, file=open("output.txt", "a", encoding="utf-8")) - # print("+"*50, file=open("output.txt", "a", encoding="utf-8")) - - # Verify we got chunks - assert len(chunks) > 0, "Expected at least one chunk from the input document" - - # For each table, verify its content appears in chunks - for table_ref, table_text in table_contents.items(): - # Get header and body lines from the serialized table - if table_text: - header_lines, body_lines = serializer.table_serializer.get_header_and_body_lines( - table_text=table_text - ) - - # Find all chunks that contain content from this table - chunks_with_table = [chunk for chunk in chunks if table_ref in [i.self_ref for i in chunk.meta.doc_items]] - - # Verify table content appears in at least one chunk - assert len(chunks_with_table) > 0, f"Table {table_ref} content should appear in at least one chunk" - - # If table is split across multiple chunks, verify header is repeated - if len(chunks_with_table) > 1: - # Each chunk with table content should have the header - for chunk in chunks_with_table: - # Check if header lines are present - has_header = all( - header_line.strip() in chunk.text - for header_line in header_lines - ) - assert has_header, ( - f"Table {table_ref} split across chunks should have header repeated in each chunk. " - f"Missing header in chunk: {chunk.text[:200]}..." - ) - - # Verify all body lines appear somewhere in the chunks - all_chunk_text = "\n".join(chunk.text for chunk in chunks_with_table) - for body_line in body_lines: - assert body_line.strip() in all_chunk_text, ( - f"Table {table_ref} body line '{body_line.strip()}' should appear in chunks" - ) - - # Save chunks to output file for inspection - chunks_data = [ - { - "text": chunk.text, - "meta": { - "doc_items": [item.self_ref for item in chunk.meta.doc_items] if chunk.meta.doc_items else [], - "headings": chunk.meta.headings, - } - } - for chunk in chunks - ] - - _process(chunks_data, EXPECTED_OUT_FILE) \ No newline at end of file From e5894290d5498afc8f66c61419e7fb5fac47ca19 Mon Sep 17 00:00:00 2001 From: odelliab Date: Wed, 25 Feb 2026 21:29:24 +0200 Subject: [PATCH 05/17] Revert "split table to header and body" This reverts commit 91b43f97e44c7c06ef35695222faa41053109cfb. --- docling_core/transforms/serializer/base.py | 20 ------------- .../transforms/serializer/markdown.py | 30 ------------------- 2 files changed, 50 deletions(-) diff --git a/docling_core/transforms/serializer/base.py b/docling_core/transforms/serializer/base.py index d5a67b0a..6dfbaab3 100644 --- a/docling_core/transforms/serializer/base.py +++ b/docling_core/transforms/serializer/base.py @@ -78,26 +78,6 @@ def serialize( """Serializes the passed item.""" ... - def get_header_and_body_lines( - self, - *, - table_text: str, - **kwargs: Any, - ) -> tuple[list[str], list[str]]: - """Get header lines and body lines from the table. - - Returns: - A tuple of (header_lines, body_lines) where header_lines is a list - of strings representing table headers and body_lines is a list of - strings representing table body rows. - - Default implementation returns empty header lines and all content in body lines. - """ - # default: empty headers, all content in body - header_lines: list[str] = [] - body_lines = [line for line in table_text.split("\n") if line.strip()] - return header_lines, body_lines - class BasePictureSerializer(ABC): """Base class for picture item serializers.""" diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 5d4a029e..30bf623a 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -351,36 +351,6 @@ def serialize( class MarkdownTableSerializer(BaseTableSerializer): """Markdown-specific table item serializer.""" - @override - def get_header_and_body_lines( - self, - *, - table_text: str, - - **kwargs: Any, - ) -> tuple[list[str], list[str]]: - """Get header lines and body lines from the markdown table. - - Returns: - A tuple of (header_lines, body_lines) where header_lines contains - the header row and separator row, and body_lines contains the data rows. - """ - - lines = [line for line in table_text.split("\n") if line.strip()] - - if len(lines) < 2: - # Not enough lines for a proper markdown table (need at least header + separator) - return [], lines - - # In markdown tables: - # Line 0: Header row - # Line 1: Separator row (with dashes) - # Lines 2+: Body rows - header_lines = lines[:2] - body_lines = lines[2:] - - return header_lines, body_lines - @staticmethod def _compact_table(table_text: str) -> str: """Remove padding from a markdown table. From 30c72a99be398e0b3e3828ba8951d1ee8b55df9d Mon Sep 17 00:00:00 2001 From: odelliab Date: Wed, 25 Feb 2026 21:30:31 +0200 Subject: [PATCH 06/17] Revert "line_chunker" This reverts commit 5cc61d93fb8ec8136fd52cbd087b92722fea86b3. --- .../transforms/chunker/line_chunker.py | 199 ----------- test/test_line_chunker.py | 308 ------------------ 2 files changed, 507 deletions(-) delete mode 100644 docling_core/transforms/chunker/line_chunker.py delete mode 100644 test/test_line_chunker.py diff --git a/docling_core/transforms/chunker/line_chunker.py b/docling_core/transforms/chunker/line_chunker.py deleted file mode 100644 index c2813f3d..00000000 --- a/docling_core/transforms/chunker/line_chunker.py +++ /dev/null @@ -1,199 +0,0 @@ -import warnings -from typing import Any, Tuple, Optional - -from collections.abc import Iterator - -from pydantic import ConfigDict, Field - -from docling_core.types import DoclingDocument -from docling_core.transforms.chunker import BaseChunk, BaseChunker, DocChunk, DocMeta -from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer -from docling_core.transforms.chunker.hybrid_chunker import _get_default_tokenizer -from docling_core.transforms.chunker.hierarchical_chunker import ( - ChunkingSerializerProvider, -) -from docling_core.transforms.serializer.base import ( - BaseSerializerProvider, -) - - -class LineBasedTokenChunker(BaseChunker): - r"""Chunker doing tokenization-aware chunking of document text. Chunk contains full lines. - - Args: - tokenizer: The tokenizer to use; either instantiated object or name or path of - respective pretrained model - max_tokens: The maximum number of tokens per chunk. If not set, limit is - resolved from the tokenizer - prefix: a text that should appear at the beginning of each chunks, default is an empty string - """ - model_config = ConfigDict(arbitrary_types_allowed=True) - tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer) - prefix: str = "" - prefix_len: int = Field(default=0, init=False) - serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() - - @property - def max_tokens(self) -> int: - """Get maximum number of tokens allowed.""" - return self.tokenizer.get_max_tokens() - - def model_post_init(self, __context) -> None: - - self.prefix_len = self.tokenizer.count_tokens(self.prefix) - if self.prefix_len >= self.max_tokens: - warnings.warn( - f"Chunks prefix: {self.prefix} is too long for chunk size {self.max_tokens} and will be ignored" - ) - self.prefix = "" - self.prefix_len = 0 - - - def chunk(self, dl_doc: DoclingDocument, **kwargs: Any) -> Iterator[BaseChunk]: - """Chunk the provided document using line-based token-aware chunking. - - Args: - dl_doc (DoclingDocument): document to chunk - - Yields: - Iterator[BaseChunk]: iterator over extracted chunks - """ - my_doc_ser = self.serializer_provider.get_serializer(doc=dl_doc) - - # Serialize the entire document to get the text - ser_res = my_doc_ser.serialize() - - if not ser_res.text: - return - - # Use chunk_text to split the text into chunks - text_chunks = self.chunk_text(lines = ser_res.text.splitlines(True)) - - # Yield DocChunk objects for each text chunk - for chunk_text in text_chunks: - yield DocChunk( - text=chunk_text, - meta=DocMeta( - doc_items=ser_res.get_unique_doc_items(), - headings=None, - origin=dl_doc.origin, - ), - ) - - def chunk_text(self, lines: list[str]) -> list[str]: - chunks = [] - current = self.prefix - current_len = self.prefix_len - - for line in lines: - remaining = line - - while True: - line_tokens = self.tokenizer.count_tokens(remaining) - available = self.max_tokens - current_len - - # If the remaining part fits entirely into current chunk → append and stop - if line_tokens <= available: - current += remaining - current_len += line_tokens - break - - # Remaining does NOT fit into current chunk. - # If it CAN fit into a fresh chunk → flush current and start new one. - if line_tokens + self.prefix_len <= self.max_tokens: - chunks.append(current) - current = self.prefix - current_len = self.prefix_len - # loop continues to retry fitting `remaining` - continue - - # Remaining is too large even for an empty chunk → split it. - # Split off the first segment that fits into current. - take, remaining = self.split_by_token_limit( - remaining, - available - ) - - # Add the taken part - current += "\n" + take - current_len += self.tokenizer.count_tokens(take) - - # flush the current chunk (full) - chunks.append(current) - current = self.prefix - current_len = self.prefix_len - - # end while for this line - - # push final chunk if non-empty - if current != self.prefix: - chunks.append(current) - - return chunks - - - def split_by_token_limit( - self, - text: str, - token_limit: int, - prefer_word_boundary: bool = True, - ) -> Tuple[str, str]: - """ - Split `text` into (head, tail) where `head` has at most `token_limit` tokens, - and `tail` is the remainder. Uses binary search on character indices to minimize - calls to `count_tokens`. - - Parameters - ---------- - text : str - Input string to split. - token_limit: int - Maximum number of tokens allowed in the head. - prefer_word_boundary : bool - If True, try to end the head on a whitespace boundary (without violating - the token limit). If no boundary exists in range, fall back to the - exact max index found by search. - - Returns - ------- - (head, tail) : Tuple[str, str] - `head` contains at most `token_limit` tokens, `tail` is the remaining suffix. - If `token_limit <= 0`, returns ("", text). - """ - if token_limit <= 0 or not text: - return "", text - - # if the whole text already fits, return as is. - if self.tokenizer.count_tokens(text) <= token_limit: - return text, "" - - # Binary search over character indices [0, len(text)] - lo, hi = 0, len(text) - best_idx: Optional[int] = None - - while lo <= hi: - mid = (lo + hi) // 2 - head = text[:mid] - tok_count = self.tokenizer.count_tokens(head) - - if tok_count <= token_limit: - best_idx = mid # feasible; try to extend - lo = mid + 1 - else: - hi = mid - 1 - - if best_idx is None or best_idx <= 0: - # Even the first character exceeds the limit (e.g., tokenizer behavior). - # Return nothing in head, everything in tail. - return "", text - - # Optionally adjust to a previous whitespace boundary without violating the limit - if prefer_word_boundary: - # Search backwards from best_idx to find whitespace; keep within token limit. - - last_space_index= text[:best_idx].rfind(" ") - if last_space_index > 0: - best_idx = last_space_index - - head, tail = text[:best_idx], text[best_idx:] - return head, tail diff --git a/test/test_line_chunker.py b/test/test_line_chunker.py deleted file mode 100644 index 33f5e4cf..00000000 --- a/test/test_line_chunker.py +++ /dev/null @@ -1,308 +0,0 @@ -import json -import pytest -from transformers import AutoTokenizer - -from docling_core.transforms.chunker.line_chunker import LineBasedTokenChunker -from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer -from docling_core.types.doc import DoclingDocument as DLDocument -from docling_core.types.doc.labels import DocItemLabel - -from .test_data_gen_flag import GEN_TEST_DATA - -EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" -MAX_TOKENS = 25 -INNER_TOKENIZER = AutoTokenizer.from_pretrained(EMBED_MODEL_ID) - - -def _process(act_data, exp_path_str): - """Helper function to either generate or compare test data.""" - if GEN_TEST_DATA: - with open(exp_path_str, mode="w", encoding="utf-8") as f: - json.dump(act_data, fp=f, indent=4) - f.write("\n") - else: - with open(exp_path_str, encoding="utf-8") as f: - exp_data = json.load(fp=f) - assert exp_data == act_data - - -def test_chunk_text_with_prefix(): - """Test text chunking with a prefix.""" - prefix = "Context: " - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - prefix=prefix, - ) - - lines = ["Line 1\n", "Line 2\n", "Line 3"] - chunks = chunker.chunk_text(lines) - - assert isinstance(chunks, list) - assert len(chunks) > 0 - # Each chunk should start with the prefix - for chunk in chunks: - assert isinstance(chunk, str) - assert chunk.startswith(prefix) - - -def test_chunk_text_long_prefix_warning(): - """Test that a warning is issued when prefix is too long.""" - # Create a very long prefix that exceeds max_tokens - long_prefix = "This is a very long prefix " * 50 - - with pytest.warns(UserWarning, match="too long for chunk size"): - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - prefix=long_prefix, - ) - - # Prefix should be reset to empty string - assert chunker.prefix == "" - assert chunker.prefix_len == 0 - - -def test_chunk_text_single_long_line(): - """Test chunking when a single line exceeds max_tokens.""" - - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - # Create a very long line - long_line = "word " * MAX_TOKENS * 5 - lines = [long_line] - chunks = chunker.chunk_text(lines) - - assert len(chunks) > 1 - # Verify each chunk respects token limit - for chunk in chunks: - token_count = chunker.tokenizer.count_tokens(chunk) - assert token_count <= MAX_TOKENS - - -def test_chunk_text_empty_string(): - """Test chunking an empty list.""" - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = chunker.chunk_text([]) - assert len(chunks) == 0 - - -def test_chunk_text_single_line(): - """Test chunking a single line that fits in one chunk.""" - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - text = "This is a single short line.\n" - lines = [text] - chunks = chunker.chunk_text(lines) - - assert len(chunks) == 1 - assert chunks[0] == text - # newline should be preserved - assert "\n" in chunks[0] - - -def test_split_by_token_limit(): - """Test the split_by_token_limit method.""" - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - available = 10 - text = "This is a test sentence with multiple words that should be split." - head, tail = chunker.split_by_token_limit(text, token_limit=available) - - assert len(head) > 0 - assert len(tail) > 0 - assert chunker.tokenizer.count_tokens(head) <= available - assert head + tail == text - - -def test_split_by_token_limit_zero_limit(): - """Test split_by_token_limit with zero token limit.""" - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - text = "Some text" - head, tail = chunker.split_by_token_limit(text, token_limit=0) - - assert head == "" - assert tail == text - - -def test_split_by_token_limit_fits_entirely(): - """Test split_by_token_limit when text fits within limit.""" - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - text = "Short text" - head, tail = chunker.split_by_token_limit(text, token_limit=100) - - assert head == text - assert tail == "" - - -def test_split_by_token_limit_word_boundary(): - """Test that split_by_token_limit prefers word boundaries.""" - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - text = "word1 word2 word3 word4 word5" - head, tail = chunker.split_by_token_limit(text, token_limit=5, prefer_word_boundary=True) - - # Head should end at a word boundary (space) - if len(head) > 0 and len(tail) > 0: - # Either head ends with a space or tail starts with a space - assert head[-1].isspace() or tail[0].isspace() or not head[-1].isalnum() - - - -def test_chunk_text_with_prefix_and_long_lines(): - """Test chunking with prefix when lines are long.""" - prefix = "PREFIX: " - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - prefix=prefix, - ) - - long_line = "This is a long line that will need to be split " * 3 - lines = [long_line] - chunks = chunker.chunk_text(lines) - - assert len(chunks) > 0 - for chunk in chunks: - assert chunk.startswith(prefix) - token_count = chunker.tokenizer.count_tokens(chunk) - assert token_count <= MAX_TOKENS - - - -def test_chunk_document(): - """Test the chunk() method with a DoclingDocument.""" - # Create a simple DoclingDocument - doc = DLDocument(name="test_doc") - paragraphs = ["This is the first paragraph with some content.", - "This is the second paragraph with more content", - "This is the third paragraph with even more content."] - - - # Add some text items to the document - for t in paragraphs: - doc.add_text(label=DocItemLabel.PARAGRAPH, text=t) - - # Create chunker - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - # Chunk the document - chunks = list(chunker.chunk(doc)) - - # Verify chunks were created - assert len(chunks) > 0 - - # Verify each chunk is a DocChunk with proper structure - for chunk in chunks: - assert hasattr(chunk, 'text') - assert hasattr(chunk, 'meta') - assert isinstance(chunk.text, str) - assert len(chunk.text) > 0 - - # Verify token count is within limit - token_count = chunker.tokenizer.count_tokens(chunk.text) - assert token_count <= MAX_TOKENS - - # Verify each paragraph resides fully in a chunk - for t in paragraphs: - assert any(t in c.text for c in chunks) - - -def test_chunk_empty_document(): - """Test the chunk() method with an empty document.""" - # Create an empty DoclingDocument - doc = DLDocument(name="empty_doc") - - # Create chunker - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - ) - - # Chunk the document - chunks = list(chunker.chunk(doc)) - - # Should return no chunks for empty document - assert len(chunks) == 0 - - -def test_chunk_document_with_long_content(): - """Test the chunk() method with long content that requires multiple chunks.""" - # Create a DoclingDocument with long content - doc = DLDocument(name="long_doc") - prefix = "Document: " - - # Add a very long paragraph - long_text = "This is a sentence with multiple words. " * 50 - doc.add_text(label=DocItemLabel.PARAGRAPH, text=long_text) - - chunker = LineBasedTokenChunker( - tokenizer=HuggingFaceTokenizer( - tokenizer=INNER_TOKENIZER, - max_tokens=MAX_TOKENS, - ), - prefix=prefix - ) - - # Chunk the document - chunks = list(chunker.chunk(doc)) - - # Should create multiple chunks - assert len(chunks) > 1 - - # Verify each chunk respects token limit - for chunk in chunks: - assert chunk.text.startswith(prefix) - token_count = chunker.tokenizer.count_tokens(chunk.text) - assert token_count <= MAX_TOKENS From df0c30a78ee86830539c365ac765233a187e1947 Mon Sep 17 00:00:00 2001 From: odelliab Date: Sun, 15 Mar 2026 22:30:31 +0100 Subject: [PATCH 07/17] chunk expansion + test Signed-off-by: odelliab --- docling_core/transforms/chunker/doc_chunk.py | 70 ++- test/test_doc_chunk_expansion.py | 532 +++++++++++++++++++ 2 files changed, 601 insertions(+), 1 deletion(-) create mode 100644 test/test_doc_chunk_expansion.py diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index b0bcc6fe..2966e6bc 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -1,14 +1,19 @@ """Data model for chunk metadata.""" +from __future__ import annotations + import logging import re +from copy import copy from typing import Annotated, ClassVar, Final, Literal, Optional from pydantic import Field, StringConstraints, field_validator from docling_core.search.package import VERSION_PATTERN from docling_core.transforms.chunker import BaseChunk, BaseMeta -from docling_core.types.doc.document import DocItem, DocumentOrigin +from docling_core.transforms.serializer.base import BaseDocSerializer +from docling_core.transforms.serializer.common import CommonParams +from docling_core.types.doc.document import DocItem, DoclingDocument, DocumentOrigin, InlineGroup, ListGroup, RefItem _VERSION: Final = "1.0.0" @@ -87,3 +92,66 @@ class DocChunk(BaseChunk): """Data model for document chunks.""" meta: DocMeta + + def get_top_containing_objects(self, doc: DoclingDocument) -> list[DocItem] | None: + objects = {} + ref_items = [item.self_ref for item in self.meta.doc_items] + for item in ref_items: + # traverse document tree till top level (body) + obj = RefItem(cref=item).resolve(doc) + while obj.parent != doc.body.get_ref(): + obj = obj.parent.resolve(doc) + objects[obj.self_ref] = obj + + # maintain the reading order as in the original document + doc_body_refs = [ref.cref for ref in doc.body.children] + doc_ordered_refs = [ref for ref in doc_body_refs if ref in objects] + if len(doc_ordered_refs) > 0: + return [objects[ref] for ref in doc_ordered_refs] + return None + + def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk: + top_objects = self.get_top_containing_objects(dl_doc) + if not top_objects: + _logger.warning(f"error in getting top objects of {self}") + return self + + content = "" + doc_items = [] + + for top_object in top_objects: + if isinstance(top_object, ListGroup | InlineGroup | DocItem): + try: + ser_res = serializer.serialize(item=top_object) + content += ser_res.text + " " + doc_items.append(top_object) + + except Exception as e: + _logger.warning(f"error in extacting text of {top_object}: {e}") + if len(content.strip()) == 0: + _logger.warning(f"expansion of {self} did not yield any text") + return self + + # fix me: update meta.headings + + meta = copy(self.meta) + meta.doc_items = doc_items + return DocChunk( + text=content, + meta=self.meta, + ) + + def expand_to_page(self, doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk | None: + page_ids = [i.page_no for item in self.meta.doc_items for i in item.prov] + ser_params: CommonParams | None = getattr(serializer, "params", None) + if len(doc.pages) == 0 or page_ids is None or len(page_ids) == 0 or not ser_params: + _logger.warning(f"cannot expand to page the following chunk: {self}") + return self + + ser_params.pages = page_ids + pages_content = serializer.serialize().text + + return DocChunk( + text=pages_content, + meta=self.meta, + ) diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py new file mode 100644 index 00000000..53b3d035 --- /dev/null +++ b/test/test_doc_chunk_expansion.py @@ -0,0 +1,532 @@ +"""Tests for DocChunk expansion methods.""" + +from ast import Or +import re + +import pytest + +from docling_core.transforms.chunker.doc_chunk import DocChunk, DocMeta +from docling_core.transforms.chunker.hierarchical_chunker import ( + ChunkingDocSerializer, + ChunkingSerializerProvider, +) +from docling_core.transforms.chunker.hybrid_chunker import HybridChunker +from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer +from docling_core.transforms.serializer.markdown import MarkdownTableSerializer +from docling_core.types.doc import DocItemLabel, DoclingDocument, Size +from docling_core.types.doc.document import TableData + +EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" +MAX_TOKENS = 64 +INPUT_FILE = "test/data/chunker/2_inp_dl_doc.json" + + +def text_contains_ignoring_whitespace(haystack: str, needle: str) -> bool: + """ + Check if needle text is contained in haystack, ignoring whitespace differences. + + Normalizes both strings by removing all whitespace characters. + + Args: + haystack: The text to search in + needle: The text to search for + + Returns: + True if needle is found in haystack (ignoring whitespace), False otherwise + """ + # Remove all whitespace + haystack_normalized = re.sub(r'\s+', '', haystack) + needle_normalized = re.sub(r'\s+', '', needle) + return needle_normalized in haystack_normalized + + +@pytest.fixture +def sample_doc(): + """Create a sample document for testing.""" + doc = DoclingDocument(name="test_doc") + + # Add some content with hierarchy + doc.add_heading(text="Section 1", level=1) + doc.add_text(text="This is the first paragraph.", label=DocItemLabel.PARAGRAPH) + doc.add_text(text="This is the second paragraph.", label=DocItemLabel.PARAGRAPH) + + doc.add_heading(text="Section 2", level=1) + doc.add_text(text="Content in section 2.", label=DocItemLabel.PARAGRAPH) + + # Add a table + table_data = TableData(num_cols=2) + table_data.add_row(["Header 1", "Header 2"]) + table_data.add_row(["Value 1", "Value 2"]) + doc.add_table(data=table_data) + + return doc + + +@pytest.fixture +def sample_doc_with_pages(): + """Create a sample document with page information.""" + doc = DoclingDocument(name="test_doc_pages") + + # Add page + page = doc.add_page(size=Size(width=612, height=792), page_no=1) + + # Add content to page 1 + doc.add_heading(text="Page 1 Heading", level=1) + doc.add_text(text="Content on page 1.", label=DocItemLabel.PARAGRAPH) + + # Add another page + page2 = doc.add_page(size=Size(width=612, height=792), page_no=2) + + # Add content to page 2 + doc.add_heading(text="Page 2 Heading", level=1) + doc.add_text(text="Content on page 2.", label=DocItemLabel.PARAGRAPH) + + return doc + + +@pytest.fixture +def chunking_serializer(sample_doc): + """Create a chunking serializer for testing.""" + return ChunkingDocSerializer(doc=sample_doc) + + +class TestGetTopContainingObjects: + """Tests for get_top_containing_objects method.""" + + def test_get_top_objects_basic(self, sample_doc): + """Test getting top-level objects from a chunk.""" + # Create a chunker and get chunks + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc)) + assert len(chunks) > 0, "Should have at least one chunk" + + # Test the first chunk - convert to DocChunk + chunk = DocChunk.model_validate(chunks[0]) + top_objects = chunk.get_top_containing_objects(sample_doc) + + assert top_objects is not None, "Should return top objects" + assert len(top_objects) > 0, "Should have at least one top object" + + # Verify all returned objects are direct children of body + for obj in top_objects: + assert obj.parent == sample_doc.body.get_ref(), ( + f"Object {obj.self_ref} should be direct child of body" + ) + + # Verify that at least one doc_item from the chunk is a descendant of a top object + chunk_item_refs = {item.self_ref for item in chunk.meta.doc_items} + + # Additional check: recursively traverse top object children to find chunk items + def find_chunk_item_in_descendants(obj, doc, target_refs): + """Recursively check if any target_refs are in obj's descendants.""" + # Check if this object itself is a target + if obj.self_ref in target_refs: + return True + + # Check children if object has them + if hasattr(obj, 'children') and obj.children: + for child_ref in obj.children: + child = child_ref.resolve(doc) + if find_chunk_item_in_descendants(child, doc, target_refs): + return True + + return False + + + for top_obj in top_objects: + assert find_chunk_item_in_descendants(top_obj, sample_doc, chunk_item_refs), ( + f"Could not find any chunk items in descendants of top object {top_obj.self_ref}" + ) + + + def test_get_top_objects_maintains_order(self, sample_doc): + """Test that top objects maintain document reading order.""" + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc)) + + for chunk in chunks: + top_objects = chunk.get_top_containing_objects(sample_doc) + if top_objects and len(top_objects) > 1: + # Get the order in the document body + body_refs = [ref.cref for ref in sample_doc.body.children] + top_refs = [obj.self_ref for obj in top_objects] + + # Verify order is maintained + prev_idx = -1 + for ref in top_refs: + curr_idx = body_refs.index(ref) + assert curr_idx > prev_idx, "Objects should maintain reading order" + prev_idx = curr_idx + + def test_get_top_objects_empty_chunk(self): + """Test get_top_containing_objects with chunk containing non-body items.""" + doc = DoclingDocument(name="empty_doc") + text_item = doc.add_text(text="Some text", label=DocItemLabel.PARAGRAPH) + + # Create a chunk with a doc item that doesn't have proper parent + # This simulates an edge case where get_top_containing_objects might return None + meta = DocMeta(doc_items=[text_item]) + chunk = DocChunk(text="test", meta=meta) + + # Should return the text item as top object since it's a direct child of body + result = chunk.get_top_containing_objects(doc) + assert result is not None, "Should return top objects for valid doc_items" + assert len(result) > 0, "Should have at least one top object" + + +class TestExpandToObject: + """Tests for expand_to_object method.""" + + def test_expand_to_object_basic(self, sample_doc, chunking_serializer): + """Test basic expansion to full objects.""" + # Create chunks + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc)) + assert len(chunks) > 0, "Should have chunks" + + # Expand the first chunk - convert to DocChunk + original_chunk = DocChunk.model_validate(chunks[0]) + expanded_chunk = original_chunk.expand_to_object( + dl_doc=sample_doc, + serializer=chunking_serializer + ) + + assert expanded_chunk is not None, "Should return expanded chunk" + assert isinstance(expanded_chunk, DocChunk), "Should return DocChunk instance" + + # Expanded chunk should have content + assert len(expanded_chunk.text.strip()) > 0, "Expanded chunk should have text" + + # Expanded chunk text should contain original chunk text (or be a superset) + assert text_contains_ignoring_whitespace(expanded_chunk.text, needle=original_chunk.text), ( + f"Expanded chunk should contain of original chunk text. " + f"original {original_chunk.text}" + f"expanded: {expanded_chunk.text}" + ) + + + + def test_expand_to_object_with_table(self): + """Test expansion with table content.""" + doc = DoclingDocument(name="table_doc") + doc.add_heading(text="Table Section", level=1) + + # Add a table + table_data = TableData(num_cols=3) + table_data.add_row(["Col1", "Col2", "Col3"]) + table_data.add_row(["A", "B", "C"]) + table_data.add_row(["D", "E", "F"]) + table_item = doc.add_table(data=table_data) + + # Create chunks + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=doc)) + serializer = chunker.serializer_provider.get_serializer(doc) + + # Serialize the table to get expected text + table_serialized = serializer.serialize(item=table_item) + table_text = table_serialized.text + + # Find chunk with table + table_chunk = None + for c in chunks: + chunk = DocChunk.model_validate(c) + if any(hasattr(item, 'data') for item in chunk.meta.doc_items): + table_chunk = chunk + break + + if table_chunk: + expanded = table_chunk.expand_to_object( + dl_doc=doc, + serializer=serializer + ) + + + + # Verify that the serialized table text is in expanded text + assert table_text in expanded.text, ( + f"Expanded chunk should contain the full serialized table text. " + f"table text: {table_text}\n" + f"expanded: {expanded.text}" + ) + + def test_expand_to_object_error_handling(self, sample_doc): + """Test error handling in expand_to_object when serialization fails.""" + # Create a mock serializer that raises an exception + class FailingSerializer: + def serialize(self, item): + raise RuntimeError("Serialization failed") + + # Create a chunk with valid doc items + text_item = sample_doc.texts[0] + meta = DocMeta(doc_items=[text_item]) + original_chunk = DocChunk(text="original text", meta=meta) + + # Call expand_to_object with failing serializer + # Should catch the exception and return original chunk + result = original_chunk.expand_to_object( + dl_doc=sample_doc, + serializer=FailingSerializer() + ) + + # Should return original chunk when serialization fails + assert result == original_chunk, "Should return original chunk when serialization fails" + assert result.text == "original text", "Original text should be preserved" + + def test_expand_to_object_preserves_metadata(self, sample_doc, chunking_serializer): + """Test that expansion preserves chunk metadata.""" + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc)) + if len(chunks) > 0: + original = chunks[0] + expanded = original.expand_to_object( + dl_doc=sample_doc, + serializer=chunking_serializer + ) + + + assert expanded.meta.origin == original.meta.origin, ( + "Origin should be preserved" + ) + + +class TestExpandToPage: + """Tests for expand_to_page method.""" + + def test_expand_to_page_basic(self, sample_doc_with_pages): + """Test basic page expansion.""" + # Create chunks + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc_with_pages)) + serializer = chunker.serializer_provider.get_serializer(sample_doc_with_pages) + + if len(chunks) > 0: + chunk = chunks[0] + expanded = chunk.expand_to_page( + doc=sample_doc_with_pages, + serializer=serializer + ) + + assert expanded is not None, "Should return expanded chunk" + assert isinstance(expanded, DocChunk), "Should return DocChunk" + + def test_expand_to_page_includes_page_content(self, sample_doc_with_pages): + """Test that page expansion includes all page content.""" + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc_with_pages)) + serializer = chunker.serializer_provider.get_serializer(sample_doc_with_pages) + + for c in chunks: + chunk = DocChunk.model_validate(c) + # Get page numbers from chunk + page_ids = [ + i.page_no for item in chunk.meta.doc_items for i in item.prov + ] + + if page_ids: + expanded = chunk.expand_to_page( + doc=sample_doc_with_pages, + serializer=serializer + ) + + # Expanded text should contain page content + assert len(expanded.text) > 0, "Expanded chunk should have text" + + # Verify it contains original + assert chunk.text in expanded.text, ( + "Expanded text should contain original" + ) + + def test_expand_to_page_no_pages(self, sample_doc): + """Test expand_to_page when document has no pages.""" + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc)) + serializer = chunker.serializer_provider.get_serializer(sample_doc) + + if len(chunks) > 0: + chunk = DocChunk.model_validate(chunks[0]) + result = chunk.expand_to_page( + doc=sample_doc, + serializer=serializer + ) + + # Should return original chunk when no pages + assert result == chunk, "Should return original chunk when no pages" + + + def test_expand_to_page_preserves_metadata(self, sample_doc_with_pages): + """Test that page expansion preserves metadata.""" + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc_with_pages)) + serializer = chunker.serializer_provider.get_serializer(sample_doc_with_pages) + + if len(chunks) > 0: + original = DocChunk.model_validate(chunks[0]) + expanded = original.expand_to_page( + doc=sample_doc_with_pages, + serializer=serializer + ) + + # Metadata should be preserved + assert expanded.meta == original.meta, "Metadata should be preserved" + + +class TestExpandToObjectWithRealDocument: + """Tests using real document from test data.""" + + def test_expand_with_real_document(self): + """Test expansion methods with real document data.""" + with open(INPUT_FILE, encoding="utf-8") as f: + data_json = f.read() + dl_doc = DoclingDocument.model_validate_json(data_json) + + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=dl_doc)) + serializer = chunker.serializer_provider.get_serializer(dl_doc) + + assert len(chunks) > 0, "Should have chunks from real document" + + # Test expand_to_object on first chunk + chunk = DocChunk.model_validate(chunks[0]) + expanded_obj = chunk.expand_to_object( + dl_doc=dl_doc, + serializer=serializer + ) + + assert expanded_obj is not None, "Should expand successfully" + assert len(expanded_obj.text) > 0, "Expanded chunk should have text" + + # Test expand_to_page if document has pages + if len(dl_doc.pages) > 0: + expanded_page = chunk.expand_to_page( + doc=dl_doc, + serializer=serializer + ) + + assert expanded_page is not None, "Should expand to page successfully" + + def test_expand_all_chunks(self): + """Test expanding all chunks from a document.""" + with open(INPUT_FILE, encoding="utf-8") as f: + data_json = f.read() + dl_doc = DoclingDocument.model_validate_json(data_json) + + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=dl_doc)) + serializer = chunker.serializer_provider.get_serializer(dl_doc) + + # Expand all chunks to objects + expanded_chunks = [] + for c in chunks: + chunk = DocChunk.model_validate(c) + expanded = chunk.expand_to_object( + dl_doc=dl_doc, + serializer=serializer + ) + expanded_chunks.append(expanded) + + assert len(expanded_chunks) == len(chunks), ( + "Should have same number of expanded chunks" + ) + + # All expanded chunks should have content + for expanded in expanded_chunks: + assert len(expanded.text.strip()) > 0, ( + "Each expanded chunk should have text" + ) + + +class TestEdgeCases: + """Test edge cases and error conditions.""" + + + def test_expand_with_none_serializer(self, sample_doc): + """Test expansion with None serializer.""" + chunker = HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + chunks = list(chunker.chunk(dl_doc=sample_doc)) + if len(chunks) > 0: + # Convert to DocChunk to access expansion methods + chunk = DocChunk.model_validate(chunks[0]) + # Should handle None serializer gracefully by returning original chunk + # (errors are caught and logged, not raised) + result = chunk.expand_to_object( + dl_doc=sample_doc, + serializer=None + ) + # Should return original chunk when serializer fails + assert result == chunk, "Should return original chunk when serializer is None" + + + From d5bb63b92abb8f6b7eda0a9e10d510e7b4ca52cd Mon Sep 17 00:00:00 2001 From: odelliab Date: Mon, 16 Mar 2026 13:35:51 +0200 Subject: [PATCH 08/17] small fixes Signed-off-by: odelliab --- docling_core/transforms/chunker/doc_chunk.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index 2966e6bc..528b3c27 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -123,7 +123,7 @@ def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerialize if isinstance(top_object, ListGroup | InlineGroup | DocItem): try: ser_res = serializer.serialize(item=top_object) - content += ser_res.text + " " + content += ser_res.text + "\n" doc_items.append(top_object) except Exception as e: @@ -132,8 +132,6 @@ def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerialize _logger.warning(f"expansion of {self} did not yield any text") return self - # fix me: update meta.headings - meta = copy(self.meta) meta.doc_items = doc_items return DocChunk( @@ -148,10 +146,16 @@ def expand_to_page(self, doc: DoclingDocument, serializer: BaseDocSerializer) -> _logger.warning(f"cannot expand to page the following chunk: {self}") return self - ser_params.pages = page_ids - pages_content = serializer.serialize().text + ser_params.pages = set(page_ids) + ser_res = serializer.serialize() + + # Extract doc_items from serialization result + expanded_doc_items = ser_res.get_unique_doc_items() + # Update metadata + meta = copy(self.meta) + meta.doc_items = expanded_doc_items return DocChunk( - text=pages_content, - meta=self.meta, + text=ser_res.text, + meta=meta, ) From 322ae619e795711d3f840e4c6ec5b0d24390ec05 Mon Sep 17 00:00:00 2001 From: odelliab Date: Mon, 16 Mar 2026 12:46:55 +0100 Subject: [PATCH 09/17] small fixes Signed-off-by: odelliab --- docling_core/transforms/chunker/doc_chunk.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index 528b3c27..b6586673 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -124,6 +124,7 @@ def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerialize try: ser_res = serializer.serialize(item=top_object) content += ser_res.text + "\n" + # Extract doc_items from serialization result doc_items.append(top_object) except Exception as e: @@ -136,7 +137,7 @@ def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerialize meta.doc_items = doc_items return DocChunk( text=content, - meta=self.meta, + meta=meta, ) def expand_to_page(self, doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk | None: From d97412536c1a74c409f41fa1c940217236983d18 Mon Sep 17 00:00:00 2001 From: odelliab Date: Mon, 16 Mar 2026 13:03:07 +0100 Subject: [PATCH 10/17] remove unnecessary code Signed-off-by: odelliab --- test/test_doc_chunk_expansion.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py index 53b3d035..9fdb4a57 100644 --- a/test/test_doc_chunk_expansion.py +++ b/test/test_doc_chunk_expansion.py @@ -1,6 +1,5 @@ """Tests for DocChunk expansion methods.""" -from ast import Or import re import pytest @@ -21,25 +20,6 @@ INPUT_FILE = "test/data/chunker/2_inp_dl_doc.json" -def text_contains_ignoring_whitespace(haystack: str, needle: str) -> bool: - """ - Check if needle text is contained in haystack, ignoring whitespace differences. - - Normalizes both strings by removing all whitespace characters. - - Args: - haystack: The text to search in - needle: The text to search for - - Returns: - True if needle is found in haystack (ignoring whitespace), False otherwise - """ - # Remove all whitespace - haystack_normalized = re.sub(r'\s+', '', haystack) - needle_normalized = re.sub(r'\s+', '', needle) - return needle_normalized in haystack_normalized - - @pytest.fixture def sample_doc(): """Create a sample document for testing.""" @@ -216,7 +196,7 @@ def test_expand_to_object_basic(self, sample_doc, chunking_serializer): assert len(expanded_chunk.text.strip()) > 0, "Expanded chunk should have text" # Expanded chunk text should contain original chunk text (or be a superset) - assert text_contains_ignoring_whitespace(expanded_chunk.text, needle=original_chunk.text), ( + assert original_chunk.text in expanded_chunk.text, ( f"Expanded chunk should contain of original chunk text. " f"original {original_chunk.text}" f"expanded: {expanded_chunk.text}" From be481f4a8f2f17389259e9938f69091e617305f9 Mon Sep 17 00:00:00 2001 From: odelliab Date: Mon, 16 Mar 2026 17:22:02 +0200 Subject: [PATCH 11/17] DCO Remediation Commit for odelliab I, odelliab , hereby add my Signed-off-by to this commit: 5cc61d93fb8ec8136fd52cbd087b92722fea86b3 I, odelliab , hereby add my Signed-off-by to this commit: 91b43f97e44c7c06ef35695222faa41053109cfb I, odelliab , hereby add my Signed-off-by to this commit: 5d17bdacf2acb6d300e7e44e83309bb182e81189 I, odelliab , hereby add my Signed-off-by to this commit: a50392e53cccea75144f4307bd70055ceb1b150c I, odelliab , hereby add my Signed-off-by to this commit: e5894290d5498afc8f66c61419e7fb5fac47ca19 I, odelliab , hereby add my Signed-off-by to this commit: 30c72a99be398e0b3e3828ba8951d1ee8b55df9d I, odelliab <91875866+odelliab@users.noreply.github.com>, hereby add my Signed-off-by to this commit: 6aa0019fe08707a29e6e7cb9620af23db93f88a1 Signed-off-by: odelliab From 916ccb5a2c4145e1bef86bd1b1f0f687dcd9b558 Mon Sep 17 00:00:00 2001 From: odelliab Date: Thu, 19 Mar 2026 17:43:00 +0200 Subject: [PATCH 12/17] change names Signed-off-by: odelliab --- docling_core/transforms/chunker/doc_chunk.py | 38 +-- test/test_doc_chunk_expansion.py | 235 +++++++------------ 2 files changed, 102 insertions(+), 171 deletions(-) diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index b6586673..df9915d3 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -93,48 +93,48 @@ class DocChunk(BaseChunk): meta: DocMeta - def get_top_containing_objects(self, doc: DoclingDocument) -> list[DocItem] | None: - objects = {} + def get_top_containing_items(self, doc: DoclingDocument) -> list[DocItem] | None: + items = {} ref_items = [item.self_ref for item in self.meta.doc_items] for item in ref_items: # traverse document tree till top level (body) - obj = RefItem(cref=item).resolve(doc) - while obj.parent != doc.body.get_ref(): - obj = obj.parent.resolve(doc) - objects[obj.self_ref] = obj + top_item = RefItem(cref=item).resolve(doc) + while top_item.parent != doc.body.get_ref(): + top_item = top_item.parent.resolve(doc) + items[top_item.self_ref] = top_item # maintain the reading order as in the original document doc_body_refs = [ref.cref for ref in doc.body.children] - doc_ordered_refs = [ref for ref in doc_body_refs if ref in objects] + doc_ordered_refs = [ref for ref in doc_body_refs if ref in items] if len(doc_ordered_refs) > 0: - return [objects[ref] for ref in doc_ordered_refs] + return [items[ref] for ref in doc_ordered_refs] return None - def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk: - top_objects = self.get_top_containing_objects(dl_doc) - if not top_objects: - _logger.warning(f"error in getting top objects of {self}") + def expand_to_item(self, dl_doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk: + top_items = self.get_top_containing_items(dl_doc) + if not top_items: + _logger.warning(f"error in getting top items of {self}") return self content = "" - doc_items = [] + all_doc_items = [] - for top_object in top_objects: - if isinstance(top_object, ListGroup | InlineGroup | DocItem): + for top_item in top_items: + if isinstance(top_item, ListGroup | InlineGroup | DocItem): try: - ser_res = serializer.serialize(item=top_object) + ser_res = serializer.serialize(item=top_item) content += ser_res.text + "\n" # Extract doc_items from serialization result - doc_items.append(top_object) + all_doc_items.extend(ser_res.get_unique_doc_items()) except Exception as e: - _logger.warning(f"error in extacting text of {top_object}: {e}") + _logger.warning(f"error in extacting text of {top_item}: {e}") if len(content.strip()) == 0: _logger.warning(f"expansion of {self} did not yield any text") return self meta = copy(self.meta) - meta.doc_items = doc_items + meta.doc_items = all_doc_items return DocChunk( text=content, meta=meta, diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py index 9fdb4a57..caa21ec9 100644 --- a/test/test_doc_chunk_expansion.py +++ b/test/test_doc_chunk_expansion.py @@ -1,17 +1,11 @@ """Tests for DocChunk expansion methods.""" -import re - import pytest from docling_core.transforms.chunker.doc_chunk import DocChunk, DocMeta -from docling_core.transforms.chunker.hierarchical_chunker import ( - ChunkingDocSerializer, - ChunkingSerializerProvider, -) +from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer -from docling_core.transforms.serializer.markdown import MarkdownTableSerializer from docling_core.types.doc import DocItemLabel, DoclingDocument, Size from docling_core.types.doc.document import TableData @@ -64,54 +58,57 @@ def sample_doc_with_pages(): return doc +@pytest.fixture +def hybrid_chunker(): + """Create a reusable HybridChunker instance.""" + return HybridChunker( + tokenizer=HuggingFaceTokenizer.from_pretrained( + model_name=EMBED_MODEL_ID, + max_tokens=MAX_TOKENS, + ), + ) + + @pytest.fixture def chunking_serializer(sample_doc): """Create a chunking serializer for testing.""" return ChunkingDocSerializer(doc=sample_doc) -class TestGetTopContainingObjects: - """Tests for get_top_containing_objects method.""" +class TestGetTopContainingItems: + """Tests for get_top_containing_items method.""" - def test_get_top_objects_basic(self, sample_doc): - """Test getting top-level objects from a chunk.""" - # Create a chunker and get chunks - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc)) + def test_get_top_items_basic(self, sample_doc, hybrid_chunker): + """Test getting top-level items from a chunk.""" + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) assert len(chunks) > 0, "Should have at least one chunk" # Test the first chunk - convert to DocChunk chunk = DocChunk.model_validate(chunks[0]) - top_objects = chunk.get_top_containing_objects(sample_doc) + top_items = chunk.get_top_containing_items(sample_doc) - assert top_objects is not None, "Should return top objects" - assert len(top_objects) > 0, "Should have at least one top object" + assert top_items is not None, "Should return top items" + assert len(top_items) > 0, "Should have at least one top item" - # Verify all returned objects are direct children of body - for obj in top_objects: - assert obj.parent == sample_doc.body.get_ref(), ( - f"Object {obj.self_ref} should be direct child of body" + # Verify all returned items are direct children of body + for item in top_items: + assert item.parent == sample_doc.body.get_ref(), ( + f"Item {item.self_ref} should be direct child of body" ) - # Verify that at least one doc_item from the chunk is a descendant of a top object + # Verify that at least one doc_item from the chunk is a descendant of a top item chunk_item_refs = {item.self_ref for item in chunk.meta.doc_items} - # Additional check: recursively traverse top object children to find chunk items - def find_chunk_item_in_descendants(obj, doc, target_refs): - """Recursively check if any target_refs are in obj's descendants.""" - # Check if this object itself is a target - if obj.self_ref in target_refs: + # Additional check: recursively traverse top item children to find chunk items + def find_chunk_item_in_descendants(item, doc, target_refs): + """Recursively check if any target_refs are in item's descendants.""" + # Check if this item itself is a target + if item.self_ref in target_refs: return True - # Check children if object has them - if hasattr(obj, 'children') and obj.children: - for child_ref in obj.children: + # Check children if item has them + if hasattr(item, 'children') and item.children: + for child_ref in item.children: child = child_ref.resolve(doc) if find_chunk_item_in_descendants(child, doc, target_refs): return True @@ -119,72 +116,57 @@ def find_chunk_item_in_descendants(obj, doc, target_refs): return False - for top_obj in top_objects: - assert find_chunk_item_in_descendants(top_obj, sample_doc, chunk_item_refs), ( - f"Could not find any chunk items in descendants of top object {top_obj.self_ref}" + for top_item in top_items: + assert find_chunk_item_in_descendants(top_item, sample_doc, chunk_item_refs), ( + f"Could not find any chunk items in descendants of top item {top_item.self_ref}" ) - def test_get_top_objects_maintains_order(self, sample_doc): - """Test that top objects maintain document reading order.""" - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc)) + def test_get_top_items_maintains_order(self, sample_doc, hybrid_chunker): + """Test that top items maintain document reading order.""" + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) for chunk in chunks: - top_objects = chunk.get_top_containing_objects(sample_doc) - if top_objects and len(top_objects) > 1: + top_items = chunk.get_top_containing_items(sample_doc) + if top_items and len(top_items) > 1: # Get the order in the document body body_refs = [ref.cref for ref in sample_doc.body.children] - top_refs = [obj.self_ref for obj in top_objects] + top_refs = [item.self_ref for item in top_items] # Verify order is maintained prev_idx = -1 for ref in top_refs: curr_idx = body_refs.index(ref) - assert curr_idx > prev_idx, "Objects should maintain reading order" + assert curr_idx > prev_idx, "Items should maintain reading order" prev_idx = curr_idx - def test_get_top_objects_empty_chunk(self): - """Test get_top_containing_objects with chunk containing non-body items.""" + def test_get_top_items_empty_chunk(self): + """Test get_top_containing_items with chunk containing non-body items.""" doc = DoclingDocument(name="empty_doc") text_item = doc.add_text(text="Some text", label=DocItemLabel.PARAGRAPH) # Create a chunk with a doc item that doesn't have proper parent - # This simulates an edge case where get_top_containing_objects might return None + # This simulates an edge case where get_top_containing_items might return None meta = DocMeta(doc_items=[text_item]) chunk = DocChunk(text="test", meta=meta) - # Should return the text item as top object since it's a direct child of body - result = chunk.get_top_containing_objects(doc) - assert result is not None, "Should return top objects for valid doc_items" - assert len(result) > 0, "Should have at least one top object" + # Should return the text item as top item since it's a direct child of body + result = chunk.get_top_containing_items(doc) + assert result is not None, "Should return top items for valid doc_items" + assert len(result) > 0, "Should have at least one top item" -class TestExpandToObject: - """Tests for expand_to_object method.""" +class TestExpandToItem: + """Tests for expand_to_item method.""" - def test_expand_to_object_basic(self, sample_doc, chunking_serializer): - """Test basic expansion to full objects.""" - # Create chunks - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc)) + def test_expand_to_item_basic(self, sample_doc, chunking_serializer, hybrid_chunker): + """Test basic expansion to full items.""" + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) assert len(chunks) > 0, "Should have chunks" # Expand the first chunk - convert to DocChunk original_chunk = DocChunk.model_validate(chunks[0]) - expanded_chunk = original_chunk.expand_to_object( + expanded_chunk = original_chunk.expand_to_item( dl_doc=sample_doc, serializer=chunking_serializer ) @@ -204,7 +186,7 @@ def test_expand_to_object_basic(self, sample_doc, chunking_serializer): - def test_expand_to_object_with_table(self): + def test_expand_to_item_with_table(self, hybrid_chunker): """Test expansion with table content.""" doc = DoclingDocument(name="table_doc") doc.add_heading(text="Table Section", level=1) @@ -216,16 +198,8 @@ def test_expand_to_object_with_table(self): table_data.add_row(["D", "E", "F"]) table_item = doc.add_table(data=table_data) - # Create chunks - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=doc)) - serializer = chunker.serializer_provider.get_serializer(doc) + chunks = list(hybrid_chunker.chunk(dl_doc=doc)) + serializer = hybrid_chunker.serializer_provider.get_serializer(doc) # Serialize the table to get expected text table_serialized = serializer.serialize(item=table_item) @@ -240,7 +214,7 @@ def test_expand_to_object_with_table(self): break if table_chunk: - expanded = table_chunk.expand_to_object( + expanded = table_chunk.expand_to_item( dl_doc=doc, serializer=serializer ) @@ -254,8 +228,8 @@ def test_expand_to_object_with_table(self): f"expanded: {expanded.text}" ) - def test_expand_to_object_error_handling(self, sample_doc): - """Test error handling in expand_to_object when serialization fails.""" + def test_expand_to_item_error_handling(self, sample_doc, hybrid_chunker): + """Test error handling in expand_to_item when serialization fails.""" # Create a mock serializer that raises an exception class FailingSerializer: def serialize(self, item): @@ -268,7 +242,7 @@ def serialize(self, item): # Call expand_to_object with failing serializer # Should catch the exception and return original chunk - result = original_chunk.expand_to_object( + result = original_chunk.expand_to_item( dl_doc=sample_doc, serializer=FailingSerializer() ) @@ -277,19 +251,12 @@ def serialize(self, item): assert result == original_chunk, "Should return original chunk when serialization fails" assert result.text == "original text", "Original text should be preserved" - def test_expand_to_object_preserves_metadata(self, sample_doc, chunking_serializer): + def test_expand_to_item_preserves_metadata(self, sample_doc, chunking_serializer, hybrid_chunker): """Test that expansion preserves chunk metadata.""" - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc)) + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) if len(chunks) > 0: original = chunks[0] - expanded = original.expand_to_object( + expanded = original.expand_to_item( dl_doc=sample_doc, serializer=chunking_serializer ) @@ -303,18 +270,10 @@ def test_expand_to_object_preserves_metadata(self, sample_doc, chunking_serializ class TestExpandToPage: """Tests for expand_to_page method.""" - def test_expand_to_page_basic(self, sample_doc_with_pages): + def test_expand_to_page_basic(self, sample_doc_with_pages, hybrid_chunker): """Test basic page expansion.""" - # Create chunks - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc_with_pages)) - serializer = chunker.serializer_provider.get_serializer(sample_doc_with_pages) + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc_with_pages)) + serializer = hybrid_chunker.serializer_provider.get_serializer(sample_doc_with_pages) if len(chunks) > 0: chunk = chunks[0] @@ -326,17 +285,10 @@ def test_expand_to_page_basic(self, sample_doc_with_pages): assert expanded is not None, "Should return expanded chunk" assert isinstance(expanded, DocChunk), "Should return DocChunk" - def test_expand_to_page_includes_page_content(self, sample_doc_with_pages): + def test_expand_to_page_includes_page_content(self, sample_doc_with_pages, hybrid_chunker): """Test that page expansion includes all page content.""" - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc_with_pages)) - serializer = chunker.serializer_provider.get_serializer(sample_doc_with_pages) + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc_with_pages)) + serializer = hybrid_chunker.serializer_provider.get_serializer(sample_doc_with_pages) for c in chunks: chunk = DocChunk.model_validate(c) @@ -382,17 +334,10 @@ def test_expand_to_page_no_pages(self, sample_doc): assert result == chunk, "Should return original chunk when no pages" - def test_expand_to_page_preserves_metadata(self, sample_doc_with_pages): + def test_expand_to_page_preserves_metadata(self, sample_doc_with_pages, hybrid_chunker): """Test that page expansion preserves metadata.""" - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc_with_pages)) - serializer = chunker.serializer_provider.get_serializer(sample_doc_with_pages) + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc_with_pages)) + serializer = hybrid_chunker.serializer_provider.get_serializer(sample_doc_with_pages) if len(chunks) > 0: original = DocChunk.model_validate(chunks[0]) @@ -428,7 +373,7 @@ def test_expand_with_real_document(self): # Test expand_to_object on first chunk chunk = DocChunk.model_validate(chunks[0]) - expanded_obj = chunk.expand_to_object( + expanded_obj = chunk.expand_to_item( dl_doc=dl_doc, serializer=serializer ) @@ -445,27 +390,20 @@ def test_expand_with_real_document(self): assert expanded_page is not None, "Should expand to page successfully" - def test_expand_all_chunks(self): + def test_expand_all_chunks(self, hybrid_chunker): """Test expanding all chunks from a document.""" with open(INPUT_FILE, encoding="utf-8") as f: data_json = f.read() dl_doc = DoclingDocument.model_validate_json(data_json) - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=dl_doc)) - serializer = chunker.serializer_provider.get_serializer(dl_doc) + chunks = list(hybrid_chunker.chunk(dl_doc=dl_doc)) + serializer = hybrid_chunker.serializer_provider.get_serializer(dl_doc) # Expand all chunks to objects expanded_chunks = [] for c in chunks: chunk = DocChunk.model_validate(c) - expanded = chunk.expand_to_object( + expanded = chunk.expand_to_item( dl_doc=dl_doc, serializer=serializer ) @@ -486,22 +424,15 @@ class TestEdgeCases: """Test edge cases and error conditions.""" - def test_expand_with_none_serializer(self, sample_doc): + def test_expand_with_none_serializer(self, sample_doc, hybrid_chunker): """Test expansion with None serializer.""" - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=sample_doc)) + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) if len(chunks) > 0: # Convert to DocChunk to access expansion methods chunk = DocChunk.model_validate(chunks[0]) # Should handle None serializer gracefully by returning original chunk # (errors are caught and logged, not raised) - result = chunk.expand_to_object( + result = chunk.expand_to_item( dl_doc=sample_doc, serializer=None ) From 0ae0b648c01f036113aeb33dae0a1d64ecbab590 Mon Sep 17 00:00:00 2001 From: odelliab Date: Thu, 19 Mar 2026 17:43:56 +0200 Subject: [PATCH 13/17] remove some tests Signed-off-by: odelliab --- test/test_doc_chunk_expansion.py | 71 -------------------------------- 1 file changed, 71 deletions(-) diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py index caa21ec9..7d088f31 100644 --- a/test/test_doc_chunk_expansion.py +++ b/test/test_doc_chunk_expansion.py @@ -349,77 +349,6 @@ def test_expand_to_page_preserves_metadata(self, sample_doc_with_pages, hybrid_c # Metadata should be preserved assert expanded.meta == original.meta, "Metadata should be preserved" - -class TestExpandToObjectWithRealDocument: - """Tests using real document from test data.""" - - def test_expand_with_real_document(self): - """Test expansion methods with real document data.""" - with open(INPUT_FILE, encoding="utf-8") as f: - data_json = f.read() - dl_doc = DoclingDocument.model_validate_json(data_json) - - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) - - chunks = list(chunker.chunk(dl_doc=dl_doc)) - serializer = chunker.serializer_provider.get_serializer(dl_doc) - - assert len(chunks) > 0, "Should have chunks from real document" - - # Test expand_to_object on first chunk - chunk = DocChunk.model_validate(chunks[0]) - expanded_obj = chunk.expand_to_item( - dl_doc=dl_doc, - serializer=serializer - ) - - assert expanded_obj is not None, "Should expand successfully" - assert len(expanded_obj.text) > 0, "Expanded chunk should have text" - - # Test expand_to_page if document has pages - if len(dl_doc.pages) > 0: - expanded_page = chunk.expand_to_page( - doc=dl_doc, - serializer=serializer - ) - - assert expanded_page is not None, "Should expand to page successfully" - - def test_expand_all_chunks(self, hybrid_chunker): - """Test expanding all chunks from a document.""" - with open(INPUT_FILE, encoding="utf-8") as f: - data_json = f.read() - dl_doc = DoclingDocument.model_validate_json(data_json) - - chunks = list(hybrid_chunker.chunk(dl_doc=dl_doc)) - serializer = hybrid_chunker.serializer_provider.get_serializer(dl_doc) - - # Expand all chunks to objects - expanded_chunks = [] - for c in chunks: - chunk = DocChunk.model_validate(c) - expanded = chunk.expand_to_item( - dl_doc=dl_doc, - serializer=serializer - ) - expanded_chunks.append(expanded) - - assert len(expanded_chunks) == len(chunks), ( - "Should have same number of expanded chunks" - ) - - # All expanded chunks should have content - for expanded in expanded_chunks: - assert len(expanded.text.strip()) > 0, ( - "Each expanded chunk should have text" - ) - - class TestEdgeCases: """Test edge cases and error conditions.""" From 8fc78a48ac12f3449a5e3cb420a78695eac054a0 Mon Sep 17 00:00:00 2001 From: odelliab <91875866+odelliab@users.noreply.github.com> Date: Sun, 22 Mar 2026 13:42:02 +0200 Subject: [PATCH 14/17] Apply suggestions from code review Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: odelliab <91875866+odelliab@users.noreply.github.com> --- docling_core/transforms/chunker/doc_chunk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index b6586673..4fc5606d 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -128,7 +128,7 @@ def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerialize doc_items.append(top_object) except Exception as e: - _logger.warning(f"error in extacting text of {top_object}: {e}") + _logger.warning(f"error in extracting text of {top_object}: {e}") if len(content.strip()) == 0: _logger.warning(f"expansion of {self} did not yield any text") return self @@ -140,7 +140,7 @@ def expand_to_object(self, dl_doc: DoclingDocument, serializer: BaseDocSerialize meta=meta, ) - def expand_to_page(self, doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk | None: + def expand_to_page(self, doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk: page_ids = [i.page_no for item in self.meta.doc_items for i in item.prov] ser_params: CommonParams | None = getattr(serializer, "params", None) if len(doc.pages) == 0 or page_ids is None or len(page_ids) == 0 or not ser_params: From 0dfecfcb1055688cd3d0b9a89353872ff0a82c7a Mon Sep 17 00:00:00 2001 From: odelliab Date: Sun, 22 Mar 2026 13:42:48 +0200 Subject: [PATCH 15/17] address review comments Signed-off-by: odelliab --- docling_core/transforms/chunker/doc_chunk.py | 86 ++++++++++++++++---- test/test_doc_chunk_expansion.py | 10 +-- 2 files changed, 77 insertions(+), 19 deletions(-) diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index df9915d3..eaf7cc66 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -4,15 +4,15 @@ import logging import re -from copy import copy +import warnings +from copy import deepcopy from typing import Annotated, ClassVar, Final, Literal, Optional from pydantic import Field, StringConstraints, field_validator from docling_core.search.package import VERSION_PATTERN from docling_core.transforms.chunker import BaseChunk, BaseMeta -from docling_core.transforms.serializer.base import BaseDocSerializer -from docling_core.transforms.serializer.common import CommonParams +from docling_core.transforms.serializer.common import DocSerializer from docling_core.types.doc.document import DocItem, DoclingDocument, DocumentOrigin, InlineGroup, ListGroup, RefItem _VERSION: Final = "1.0.0" @@ -93,7 +93,20 @@ class DocChunk(BaseChunk): meta: DocMeta - def get_top_containing_items(self, doc: DoclingDocument) -> list[DocItem] | None: + def _get_top_containing_items(self, doc: DoclingDocument) -> list[DocItem] | None: + """Get top-level document items that contain this chunk's items. + + Traverses the document tree upward from each item in the chunk to find + the top-level items (direct children of document body) that contain them. + Maintains the original document reading order. + + Args: + doc: The DoclingDocument containing this chunk. + + Returns: + List of top-level DocItems in document order, or None if no items found. + """ + items = {} ref_items = [item.self_ref for item in self.meta.doc_items] for item in ref_items: @@ -110,8 +123,25 @@ def get_top_containing_items(self, doc: DoclingDocument) -> list[DocItem] | None return [items[ref] for ref in doc_ordered_refs] return None - def expand_to_item(self, dl_doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk: - top_items = self.get_top_containing_items(dl_doc) + def expand_to_item(self, dl_doc: DoclingDocument, serializer: DocSerializer) -> DocChunk: + """Expand chunk to include complete top-level document items. + + Expands the chunk to contain full top-level items (sections, tables, lists) + rather than partial content. This ensures semantic completeness by including + all content from the top-level items that contain any part of the original chunk. + + Args: + dl_doc: The DoclingDocument containing this chunk. + serializer: Serializer to convert document items to text. + + Returns: + New DocChunk with expanded content and updated metadata, or the original + chunk if expansion fails or yields no content. + + Note: + - It is recommended to use same serializer as the original document + """ + top_items = self._get_top_containing_items(dl_doc) if not top_items: _logger.warning(f"error in getting top items of {self}") return self @@ -133,28 +163,56 @@ def expand_to_item(self, dl_doc: DoclingDocument, serializer: BaseDocSerializer) _logger.warning(f"expansion of {self} did not yield any text") return self - meta = copy(self.meta) + meta = deepcopy(self.meta) meta.doc_items = all_doc_items return DocChunk( text=content, meta=meta, ) - def expand_to_page(self, doc: DoclingDocument, serializer: BaseDocSerializer) -> DocChunk | None: + def expand_to_page(self, doc: DoclingDocument, serializer: DocSerializer) -> DocChunk: + """Expand chunk to include all content from its pages. + + Expands the chunk to contain all content from the pages it spans. This is + useful for maintaining page-level context and ensuring complete page coverage + in retrieval applications. + + Args: + doc: The DoclingDocument containing this chunk. + serializer: Serializer to convert document content to text. + + Returns: + New DocChunk with all content from the chunk's pages and updated metadata, + or the original chunk if expansion is not possible. + + Raises: + UserWarning: If document has no pages or chunk items have no page provenance. + + Example: + If a chunk spans pages 2-3, this expands it to include all content + from both pages, not just the original chunk's items. + + Note: + - It is recommended to use same serializer as the original document + """ + page_ids = [i.page_no for item in self.meta.doc_items for i in item.prov] - ser_params: CommonParams | None = getattr(serializer, "params", None) - if len(doc.pages) == 0 or page_ids is None or len(page_ids) == 0 or not ser_params: - _logger.warning(f"cannot expand to page the following chunk: {self}") + + if len(doc.pages) == 0 or page_ids is None or len(page_ids) == 0: + warnings.warn( + f"cannot expand to page the following chunk: {self}. \n Probably pagination was not supported in document conversion" + ) return self - ser_params.pages = set(page_ids) - ser_res = serializer.serialize() + page_serializer = deepcopy(serializer) # avoid mutating the serializer + page_serializer.params.pages = set(page_ids) + ser_res = page_serializer.serialize() # Extract doc_items from serialization result expanded_doc_items = ser_res.get_unique_doc_items() # Update metadata - meta = copy(self.meta) + meta = deepcopy(self.meta) meta.doc_items = expanded_doc_items return DocChunk( text=ser_res.text, diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py index 7d088f31..6a1d8cce 100644 --- a/test/test_doc_chunk_expansion.py +++ b/test/test_doc_chunk_expansion.py @@ -76,7 +76,7 @@ def chunking_serializer(sample_doc): class TestGetTopContainingItems: - """Tests for get_top_containing_items method.""" + """Tests for _get_top_containing_items method.""" def test_get_top_items_basic(self, sample_doc, hybrid_chunker): """Test getting top-level items from a chunk.""" @@ -85,7 +85,7 @@ def test_get_top_items_basic(self, sample_doc, hybrid_chunker): # Test the first chunk - convert to DocChunk chunk = DocChunk.model_validate(chunks[0]) - top_items = chunk.get_top_containing_items(sample_doc) + top_items = chunk._get_top_containing_items(sample_doc) assert top_items is not None, "Should return top items" assert len(top_items) > 0, "Should have at least one top item" @@ -127,7 +127,7 @@ def test_get_top_items_maintains_order(self, sample_doc, hybrid_chunker): chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) for chunk in chunks: - top_items = chunk.get_top_containing_items(sample_doc) + top_items = chunk._get_top_containing_items(sample_doc) if top_items and len(top_items) > 1: # Get the order in the document body body_refs = [ref.cref for ref in sample_doc.body.children] @@ -141,7 +141,7 @@ def test_get_top_items_maintains_order(self, sample_doc, hybrid_chunker): prev_idx = curr_idx def test_get_top_items_empty_chunk(self): - """Test get_top_containing_items with chunk containing non-body items.""" + """Test _get_top_containing_items with chunk containing non-body items.""" doc = DoclingDocument(name="empty_doc") text_item = doc.add_text(text="Some text", label=DocItemLabel.PARAGRAPH) @@ -151,7 +151,7 @@ def test_get_top_items_empty_chunk(self): chunk = DocChunk(text="test", meta=meta) # Should return the text item as top item since it's a direct child of body - result = chunk.get_top_containing_items(doc) + result = chunk._get_top_containing_items(doc) assert result is not None, "Should return top items for valid doc_items" assert len(result) > 0, "Should have at least one top item" From 1607cd1f58b28099549d51bd6969bb7eeb83d387 Mon Sep 17 00:00:00 2001 From: odelliab Date: Sun, 22 Mar 2026 20:28:27 +0200 Subject: [PATCH 16/17] consolidate tests Signed-off-by: odelliab --- .../transforms/serializer/markdown.py | 2 +- test/test_doc_chunk_expansion.py | 445 +++++++++--------- 2 files changed, 221 insertions(+), 226 deletions(-) diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 903fcbbd..95850e11 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -440,7 +440,7 @@ def get_header_and_body_lines( the header row and separator row, and body_lines contains the data rows. """ - lines = [line for line in table_text.split("\n") if line.strip()] + lines = [line for line in table_text.splitlines(True) if line.strip()] if len(lines) < 2: # Not enough lines for a proper markdown table (need at least header + separator) diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py index 6a1d8cce..9d94ec3a 100644 --- a/test/test_doc_chunk_expansion.py +++ b/test/test_doc_chunk_expansion.py @@ -3,60 +3,153 @@ import pytest from docling_core.transforms.chunker.doc_chunk import DocChunk, DocMeta -from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer +from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer, ChunkingSerializerProvider from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer +from docling_core.transforms.serializer.markdown import MarkdownTableSerializer from docling_core.types.doc import DocItemLabel, DoclingDocument, Size from docling_core.types.doc.document import TableData EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2" -MAX_TOKENS = 64 -INPUT_FILE = "test/data/chunker/2_inp_dl_doc.json" +MAX_TOKENS = 50 - -@pytest.fixture -def sample_doc(): - """Create a sample document for testing.""" - doc = DoclingDocument(name="test_doc") +def check_lines_equal_in_order(text_a: str, text_b: str) -> bool: + """ + Check if lines of string A are equal to lines of string B in the same order. - # Add some content with hierarchy - doc.add_heading(text="Section 1", level=1) - doc.add_text(text="This is the first paragraph.", label=DocItemLabel.PARAGRAPH) - doc.add_text(text="This is the second paragraph.", label=DocItemLabel.PARAGRAPH) + This function splits both strings into lines and verifies that: + 1. All lines from A appear in B + 2. They appear in the same order + 3. Lines can be non-consecutive in B (other lines can appear between them) - doc.add_heading(text="Section 2", level=1) - doc.add_text(text="Content in section 2.", label=DocItemLabel.PARAGRAPH) + Args: + text_a (str): First string (subset) to check + text_b (str): Second string (superset) to check against - # Add a table - table_data = TableData(num_cols=2) - table_data.add_row(["Header 1", "Header 2"]) - table_data.add_row(["Value 1", "Value 2"]) - doc.add_table(data=table_data) + Returns: + bool: True if all lines of A appear in B in the same order, False otherwise + """ + if not isinstance(text_a, str) or not isinstance(text_b, str): + raise TypeError("Both inputs must be strings.") - return doc + lines_a = [line for line in text_a.splitlines() if line.strip()] + lines_b = [line for line in text_b.splitlines() if line.strip()] + + # If A is empty, it's always contained in B + if not lines_a: + return True + + # If B is empty but A is not, A cannot be contained in B + if not lines_b: + return False + + # Track position in B + b_index = 0 + + # Try to find each line of A in B in order + for line_a in lines_a: + found = False + # Search for line_a starting from current position in B + while b_index < len(lines_b): + if lines_b[b_index] == line_a: + found = True + b_index += 1 # Move to next position in B + break + b_index += 1 + + # If we couldn't find this line of A in B, return False + if not found: + return False + + return True @pytest.fixture -def sample_doc_with_pages(): - """Create a sample document with page information.""" - doc = DoclingDocument(name="test_doc_pages") +def sample_doc(): + """Create a comprehensive sample document for testing with pages and various content types. - # Add page - page = doc.add_page(size=Size(width=612, height=792), page_no=1) + Content is associated with pages through ProvenanceItem which includes page_no. + When add_text/add_heading is called with prov parameter containing page_no, + that content is associated with that specific page. + """ + from docling_core.types.doc.document import ProvenanceItem, BoundingBox - # Add content to page 1 - doc.add_heading(text="Page 1 Heading", level=1) - doc.add_text(text="Content on page 1.", label=DocItemLabel.PARAGRAPH) + doc = DoclingDocument(name="test_doc") - # Add another page + # Add pages + page1 = doc.add_page(size=Size(width=612, height=792), page_no=1) page2 = doc.add_page(size=Size(width=612, height=792), page_no=2) - # Add content to page 2 - doc.add_heading(text="Page 2 Heading", level=1) - doc.add_text(text="Content on page 2.", label=DocItemLabel.PARAGRAPH) + # Section 1 on page 1 (explicitly set page_no in prov) + doc.add_heading( + text="Section 1", + level=1, + prov=ProvenanceItem(page_no=1, bbox=BoundingBox(l=50, t=50, r=550, b=80), charspan=(0, 9)) + ) + doc.add_text( + text="This is the first paragraph.", + label=DocItemLabel.PARAGRAPH, + prov=ProvenanceItem(page_no=1, bbox=BoundingBox(l=50, t=90, r=550, b=120), charspan=(10, 38)) + ) + doc.add_text( + text="This is the second paragraph.", + label=DocItemLabel.PARAGRAPH, + prov=ProvenanceItem(page_no=1, bbox=BoundingBox(l=50, t=130, r=550, b=160), charspan=(39, 68)) + ) + + # Section 2 on page 2 with list (explicitly set page_no=2 in prov) + doc.add_heading( + text="Section 2", + level=1, + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=50, t=50, r=550, b=80), charspan=(69, 78)) + ) + doc.add_text( + text="Content in section 2.", + label=DocItemLabel.PARAGRAPH, + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=50, t=90, r=550, b=120), charspan=(79, 100)) + ) + + # Add a list in section 2 on page 2 + list_group = doc.add_list_group() + doc.add_list_item( + text="First list item", + enumerated=False, + parent=list_group, + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=70, t=130, r=550, b=150), charspan=(101, 116)) + ) + doc.add_list_item( + text="Second list item", + enumerated=False, + parent=list_group, + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=70, t=160, r=550, b=180), charspan=(117, 133)) + ) + doc.add_list_item( + text="Third list item", + enumerated=False, + parent=list_group, + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=70, t=190, r=550, b=210), charspan=(134, 149)) + ) + + # Add a table on page 2 + table_data = TableData(num_cols=2) + table_data.add_row(["Header 1", "Header 2"]) + table_data.add_row(["Value 1", "Value 2"]) + table_data.add_row(["Value 3", "Value 4"]) + table_data.add_row(["Value 5", "Value 6"]) + doc.add_table( + data=table_data, + prov=ProvenanceItem(page_no=2, bbox=BoundingBox(l=50, t=220, r=550, b=300), charspan=(150, 200)) + ) return doc +class MarkdownSerializerProvider(ChunkingSerializerProvider): + def get_serializer(self, doc: DoclingDocument): + return ChunkingDocSerializer( + doc=doc, + table_serializer=MarkdownTableSerializer(), + + ) @pytest.fixture def hybrid_chunker(): @@ -66,41 +159,29 @@ def hybrid_chunker(): model_name=EMBED_MODEL_ID, max_tokens=MAX_TOKENS, ), - ) - + serializer_provider=MarkdownSerializerProvider(), + repeat_table_header=True + ) @pytest.fixture -def chunking_serializer(sample_doc): - """Create a chunking serializer for testing.""" - return ChunkingDocSerializer(doc=sample_doc) +def sample_chunks(sample_doc, hybrid_chunker): + """Create chunks from sample_doc once and cache them.""" + chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) + assert len(chunks) > 0, "Expected at least one chunk to be created" + return chunks + +@pytest.fixture +def sample_serializer(sample_doc, hybrid_chunker): + """Create serializer for sample_doc once and cache it.""" + return hybrid_chunker.serializer_provider.get_serializer(sample_doc) + class TestGetTopContainingItems: """Tests for _get_top_containing_items method.""" - def test_get_top_items_basic(self, sample_doc, hybrid_chunker): - """Test getting top-level items from a chunk.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) - assert len(chunks) > 0, "Should have at least one chunk" - - # Test the first chunk - convert to DocChunk - chunk = DocChunk.model_validate(chunks[0]) - top_items = chunk._get_top_containing_items(sample_doc) - - assert top_items is not None, "Should return top items" - assert len(top_items) > 0, "Should have at least one top item" - - # Verify all returned items are direct children of body - for item in top_items: - assert item.parent == sample_doc.body.get_ref(), ( - f"Item {item.self_ref} should be direct child of body" - ) - - # Verify that at least one doc_item from the chunk is a descendant of a top item - chunk_item_refs = {item.self_ref for item in chunk.meta.doc_items} - - # Additional check: recursively traverse top item children to find chunk items - def find_chunk_item_in_descendants(item, doc, target_refs): + # helper method: recursively traverse top item children to find chunk items + def _find_chunk_item_in_descendants(self, item, doc, target_refs): """Recursively check if any target_refs are in item's descendants.""" # Check if this item itself is a target if item.self_ref in target_refs: @@ -110,23 +191,40 @@ def find_chunk_item_in_descendants(item, doc, target_refs): if hasattr(item, 'children') and item.children: for child_ref in item.children: child = child_ref.resolve(doc) - if find_chunk_item_in_descendants(child, doc, target_refs): + if self._find_chunk_item_in_descendants(child, doc, target_refs): return True return False + + + def test_get_top_items_basic(self, sample_doc, sample_chunks): + """Test getting top-level items from a chunk.""" + assert len(sample_chunks) > 0, "Should have at least one chunk" + + for chunk in sample_chunks: + top_items = chunk._get_top_containing_items(sample_doc) + + assert top_items is not None, "Should return top items" + assert len(top_items) > 0, "Should have at least one top item" + # Verify all returned items are direct children of body + for item in top_items: + assert item.parent == sample_doc.body.get_ref(), ( + f"Item {item.self_ref} should be direct child of body" + ) - for top_item in top_items: - assert find_chunk_item_in_descendants(top_item, sample_doc, chunk_item_refs), ( - f"Could not find any chunk items in descendants of top item {top_item.self_ref}" + # Verify that at least one doc_item from the chunk is a descendant of a top item + chunk_item_refs = {item.self_ref for item in chunk.meta.doc_items} + + for top_item in top_items: + assert self._find_chunk_item_in_descendants(top_item, sample_doc, chunk_item_refs), ( + f"Could not find any chunk items in descendants of top item {top_item.self_ref}" ) - def test_get_top_items_maintains_order(self, sample_doc, hybrid_chunker): + def test_get_top_items_maintains_order(self, sample_doc, sample_chunks): """Test that top items maintain document reading order.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) - - for chunk in chunks: + for chunk in sample_chunks: top_items = chunk._get_top_containing_items(sample_doc) if top_items and len(top_items) > 1: # Get the order in the document body @@ -159,75 +257,32 @@ def test_get_top_items_empty_chunk(self): class TestExpandToItem: """Tests for expand_to_item method.""" - def test_expand_to_item_basic(self, sample_doc, chunking_serializer, hybrid_chunker): + def test_expand_to_item_basic(self, sample_doc, sample_serializer, sample_chunks): """Test basic expansion to full items.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) - assert len(chunks) > 0, "Should have chunks" - - # Expand the first chunk - convert to DocChunk - original_chunk = DocChunk.model_validate(chunks[0]) - expanded_chunk = original_chunk.expand_to_item( - dl_doc=sample_doc, - serializer=chunking_serializer - ) - - assert expanded_chunk is not None, "Should return expanded chunk" - assert isinstance(expanded_chunk, DocChunk), "Should return DocChunk instance" - - # Expanded chunk should have content - assert len(expanded_chunk.text.strip()) > 0, "Expanded chunk should have text" - - # Expanded chunk text should contain original chunk text (or be a superset) - assert original_chunk.text in expanded_chunk.text, ( - f"Expanded chunk should contain of original chunk text. " - f"original {original_chunk.text}" - f"expanded: {expanded_chunk.text}" - ) - - - - def test_expand_to_item_with_table(self, hybrid_chunker): - """Test expansion with table content.""" - doc = DoclingDocument(name="table_doc") - doc.add_heading(text="Table Section", level=1) - # Add a table - table_data = TableData(num_cols=3) - table_data.add_row(["Col1", "Col2", "Col3"]) - table_data.add_row(["A", "B", "C"]) - table_data.add_row(["D", "E", "F"]) - table_item = doc.add_table(data=table_data) + for chunk in sample_chunks: + expanded = chunk.expand_to_item( + dl_doc=sample_doc, + serializer=sample_serializer + ) - chunks = list(hybrid_chunker.chunk(dl_doc=doc)) - serializer = hybrid_chunker.serializer_provider.get_serializer(doc) + assert expanded is not None, "Should return expanded chunk" + assert isinstance(expanded, DocChunk), "Should return DocChunk instance" - # Serialize the table to get expected text - table_serialized = serializer.serialize(item=table_item) - table_text = table_serialized.text + # Expanded chunk should have content + assert len(expanded.text.strip()) > 0, "Expanded chunk should have text" - # Find chunk with table - table_chunk = None - for c in chunks: - chunk = DocChunk.model_validate(c) - if any(hasattr(item, 'data') for item in chunk.meta.doc_items): - table_chunk = chunk - break - - if table_chunk: - expanded = table_chunk.expand_to_item( - dl_doc=doc, - serializer=serializer - ) - - - - # Verify that the serialized table text is in expanded text - assert table_text in expanded.text, ( - f"Expanded chunk should contain the full serialized table text. " - f"table text: {table_text}\n" + # Expanded chunk text should contain original chunk text (or be a superset) + assert check_lines_equal_in_order(chunk.text,expanded.text), ( + f"Expanded chunk should contain of original chunk text. " + f"original {chunk.text}" f"expanded: {expanded.text}" ) - + assert expanded.meta.origin == chunk.meta.origin, ( + "Origin should be preserved" + ) + + def test_expand_to_item_error_handling(self, sample_doc, hybrid_chunker): """Test error handling in expand_to_item when serialization fails.""" # Create a mock serializer that raises an exception @@ -238,60 +293,28 @@ def serialize(self, item): # Create a chunk with valid doc items text_item = sample_doc.texts[0] meta = DocMeta(doc_items=[text_item]) - original_chunk = DocChunk(text="original text", meta=meta) + chunk = DocChunk(text="original text", meta=meta) # Call expand_to_object with failing serializer # Should catch the exception and return original chunk - result = original_chunk.expand_to_item( + expanded = chunk.expand_to_item( dl_doc=sample_doc, serializer=FailingSerializer() ) # Should return original chunk when serialization fails - assert result == original_chunk, "Should return original chunk when serialization fails" - assert result.text == "original text", "Original text should be preserved" - - def test_expand_to_item_preserves_metadata(self, sample_doc, chunking_serializer, hybrid_chunker): - """Test that expansion preserves chunk metadata.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) - if len(chunks) > 0: - original = chunks[0] - expanded = original.expand_to_item( - dl_doc=sample_doc, - serializer=chunking_serializer - ) - - - assert expanded.meta.origin == original.meta.origin, ( - "Origin should be preserved" - ) - + assert expanded == chunk, "Should return original chunk when serialization fails" + class TestExpandToPage: """Tests for expand_to_page method.""" + - def test_expand_to_page_basic(self, sample_doc_with_pages, hybrid_chunker): - """Test basic page expansion.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc_with_pages)) - serializer = hybrid_chunker.serializer_provider.get_serializer(sample_doc_with_pages) - - if len(chunks) > 0: - chunk = chunks[0] - expanded = chunk.expand_to_page( - doc=sample_doc_with_pages, - serializer=serializer - ) - - assert expanded is not None, "Should return expanded chunk" - assert isinstance(expanded, DocChunk), "Should return DocChunk" - - def test_expand_to_page_includes_page_content(self, sample_doc_with_pages, hybrid_chunker): + def test_expand_to_page_basic(self, sample_doc, sample_chunks, sample_serializer): """Test that page expansion includes all page content.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc_with_pages)) - serializer = hybrid_chunker.serializer_provider.get_serializer(sample_doc_with_pages) - for c in chunks: - chunk = DocChunk.model_validate(c) + for chunk in sample_chunks: + # Get page numbers from chunk page_ids = [ i.page_no for item in chunk.meta.doc_items for i in item.prov @@ -299,74 +322,46 @@ def test_expand_to_page_includes_page_content(self, sample_doc_with_pages, hybri if page_ids: expanded = chunk.expand_to_page( - doc=sample_doc_with_pages, - serializer=serializer + doc=sample_doc, + serializer=sample_serializer ) - + assert expanded is not None, "Should return expanded chunk" + assert isinstance(expanded, DocChunk), "Should return DocChunk" # Expanded text should contain page content assert len(expanded.text) > 0, "Expanded chunk should have text" # Verify it contains original - assert chunk.text in expanded.text, ( + assert check_lines_equal_in_order(chunk.text,expanded.text), ( "Expanded text should contain original" ) + + # Metadata fields should be updated with expanded content + assert expanded.meta.origin == chunk.meta.origin, "Expanded chunk should have metadata" + def get_ref_items(chunk:DocChunk): + return [item.self_ref for item in chunk.meta.doc_items] + assert set(get_ref_items(chunk)).issubset(get_ref_items(expanded)) , ( + "Expanded chunk should have at least as many doc_items as original" + ) - def test_expand_to_page_no_pages(self, sample_doc): - """Test expand_to_page when document has no pages.""" - chunker = HybridChunker( - tokenizer=HuggingFaceTokenizer.from_pretrained( - model_name=EMBED_MODEL_ID, - max_tokens=MAX_TOKENS, - ), - ) + def test_expand_to_page_no_pages(self, hybrid_chunker): + """Test expand_to_page when document has no pages for all chunks.""" + # Create a document without pages + doc_no_pages = DoclingDocument(name="no_pages_doc") + doc_no_pages.add_heading(text="Section 1", level=1) + doc_no_pages.add_text(text="Some content.", label=DocItemLabel.PARAGRAPH) + + chunks = list(hybrid_chunker.chunk(dl_doc=doc_no_pages)) + serializer = hybrid_chunker.serializer_provider.get_serializer(doc_no_pages) - chunks = list(chunker.chunk(dl_doc=sample_doc)) - serializer = chunker.serializer_provider.get_serializer(sample_doc) + assert len(chunks) > 0, "Should have at least one chunk" - if len(chunks) > 0: - chunk = DocChunk.model_validate(chunks[0]) + for chunk in chunks: result = chunk.expand_to_page( - doc=sample_doc, + doc=doc_no_pages, serializer=serializer ) # Should return original chunk when no pages - assert result == chunk, "Should return original chunk when no pages" - - - def test_expand_to_page_preserves_metadata(self, sample_doc_with_pages, hybrid_chunker): - """Test that page expansion preserves metadata.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc_with_pages)) - serializer = hybrid_chunker.serializer_provider.get_serializer(sample_doc_with_pages) - - if len(chunks) > 0: - original = DocChunk.model_validate(chunks[0]) - expanded = original.expand_to_page( - doc=sample_doc_with_pages, - serializer=serializer - ) - - # Metadata should be preserved - assert expanded.meta == original.meta, "Metadata should be preserved" - -class TestEdgeCases: - """Test edge cases and error conditions.""" + assert result == chunk, "Should return original chunk when document has no pages" - - def test_expand_with_none_serializer(self, sample_doc, hybrid_chunker): - """Test expansion with None serializer.""" - chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc)) - if len(chunks) > 0: - # Convert to DocChunk to access expansion methods - chunk = DocChunk.model_validate(chunks[0]) - # Should handle None serializer gracefully by returning original chunk - # (errors are caught and logged, not raised) - result = chunk.expand_to_item( - dl_doc=sample_doc, - serializer=None - ) - # Should return original chunk when serializer fails - assert result == chunk, "Should return original chunk when serializer is None" - - - + \ No newline at end of file From 62e1f08025aaa71515b3883371b799febd7c4763 Mon Sep 17 00:00:00 2001 From: odelliab Date: Mon, 23 Mar 2026 13:24:26 +0200 Subject: [PATCH 17/17] fix failing test Signed-off-by: odelliab --- test/data/chunker/0d_out_chunks.json | 73 ++++++++++++---------------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/test/data/chunker/0d_out_chunks.json b/test/data/chunker/0d_out_chunks.json index 6b1f306f..d3a29df3 100644 --- a/test/data/chunker/0d_out_chunks.json +++ b/test/data/chunker/0d_out_chunks.json @@ -259,7 +259,7 @@ } }, { - "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || - | - | - | - | - | - | - | - || | | TTS | Pages/s | Mem | TTS | Pages/s | Mem || Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |", + "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend |\n| - | - | - | - | - | - | - | - |\n| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem |\n| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |\n", "meta": { "doc_items": [ "#/tables/0" @@ -271,7 +271,7 @@ } }, { - "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |", + "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend |\n| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |", "meta": { "doc_items": [ "#/tables/0" @@ -638,7 +638,7 @@ } }, { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset.", + "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. With all these measures in place, experienced annotation staff managed to annotate a single page", "meta": { "doc_items": [ "#/texts/513" @@ -650,7 +650,7 @@ } }, { - "text": "With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", + "text": "in a typical timeframe of 20s to 60s, depending on its complexity.", "meta": { "doc_items": [ "#/texts/513" @@ -703,7 +703,7 @@ } }, { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture .", + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and", "meta": { "doc_items": [ "#/texts/523" @@ -715,7 +715,7 @@ } }, { - "text": "This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", + "text": "Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.\nof row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", "meta": { "doc_items": [ "#/texts/523", @@ -725,18 +725,7 @@ "#/texts/529", "#/texts/530", "#/texts/531", - "#/texts/532" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "of row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "doc_items": [ + "#/texts/532", "#/texts/533" ], "headings": [ @@ -770,7 +759,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n", "meta": { "doc_items": [ "#/tables/3" @@ -782,7 +771,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -794,7 +783,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n", "meta": { "doc_items": [ "#/tables/3" @@ -806,7 +795,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -818,7 +807,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -830,7 +819,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -842,7 +831,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -854,7 +843,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -866,7 +855,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -878,7 +867,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -890,7 +879,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -902,7 +891,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", "meta": { "doc_items": [ "#/tables/3" @@ -930,7 +919,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n", "meta": { "doc_items": [ "#/tables/4" @@ -942,7 +931,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -954,7 +943,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n", "meta": { "doc_items": [ "#/tables/4" @@ -966,7 +955,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -978,7 +967,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -990,7 +979,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1002,7 +991,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1014,7 +1003,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1026,7 +1015,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1038,7 +1027,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1050,7 +1039,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1062,7 +1051,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |\n3\n,\ngovernment offices,\nWe reviewed the col-\n,\nPage-\nTitle and\n.", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |\n3\n,\ngovernment offices,\nWe reviewed the col-\n,\nPage-\nTitle and\n.", "meta": { "doc_items": [ "#/tables/4",