Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 133 additions & 1 deletion docling_core/transforms/chunker/doc_chunk.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
"""Data model for chunk metadata."""

from __future__ import annotations

import logging
import re
import warnings
from copy import deepcopy
from typing import Annotated, ClassVar, Final, Literal, Optional

from pydantic import Field, StringConstraints, field_validator

from docling_core.search.package import VERSION_PATTERN
from docling_core.transforms.chunker import BaseChunk, BaseMeta
from docling_core.types.doc.document import DocItem, DocumentOrigin
from docling_core.transforms.serializer.common import DocSerializer
from docling_core.types.doc.document import DocItem, DoclingDocument, DocumentOrigin, InlineGroup, ListGroup, RefItem

_VERSION: Final = "1.0.0"

Expand Down Expand Up @@ -87,3 +92,130 @@ class DocChunk(BaseChunk):
"""Data model for document chunks."""

meta: DocMeta

def _get_top_containing_items(self, doc: DoclingDocument) -> list[DocItem] | None:
    """Get top-level document items that contain this chunk's items.

    For every item referenced by ``self.meta.doc_items``, walks up the
    ``parent`` references until it reaches a node whose parent is the
    document body, i.e. a direct child of ``doc.body``. The collected
    top-level nodes are de-duplicated by ``self_ref`` and returned in the
    order in which they appear in ``doc.body.children``.

    Items whose ancestor chain ends (``parent is None``) without passing
    through the body are skipped instead of raising ``AttributeError``.

    Args:
        doc: The DoclingDocument containing this chunk.

    Returns:
        List of top-level items in document order, or None if no items found.
    """
    # Loop-invariant: the reference every top-level node has as its parent.
    body_ref = doc.body.get_ref()

    top_items_by_ref = {}
    for doc_item in self.meta.doc_items:
        # Resolve through ``doc`` so the walk happens on the document's own tree.
        node = RefItem(cref=doc_item.self_ref).resolve(doc)

        # Climb until the node is a direct child of the body, or the chain ends.
        while node.parent is not None and node.parent != body_ref:
            node = node.parent.resolve(doc)

        if node.parent is None:
            # Reached a root that is not under the body; there is no
            # top-level body item to report for this chunk item.
            continue

        top_items_by_ref[node.self_ref] = node

    # Maintain the reading order as in the original document.
    ordered_items = [
        top_items_by_ref[child.cref]
        for child in doc.body.children
        if child.cref in top_items_by_ref
    ]
    return ordered_items or None

def expand_to_item(self, dl_doc: DoclingDocument, serializer: DocSerializer) -> DocChunk:
    """Expand chunk to include complete top-level document items.

    Expands the chunk to contain full top-level items (sections, tables, lists)
    rather than partial content. Every top-level item that contains any part of
    the original chunk is serialized in document order; the texts are
    concatenated, each followed by a newline.

    Serialization is best-effort per item: if serializing one top-level item
    fails, a warning is logged and that item contributes neither text nor
    doc items, so the returned text and metadata stay consistent.

    Args:
        dl_doc: The DoclingDocument containing this chunk.
        serializer: Serializer to convert document items to text.

    Returns:
        New DocChunk with expanded content and updated metadata, or the original
        chunk if no top-level items are found or the expansion yields no text.

    Note:
        - It is recommended to use same serializer as the original document
    """
    top_items = self._get_top_containing_items(dl_doc)
    if not top_items:
        _logger.warning(f"error in getting top items of {self}")
        return self

    text_parts = []
    all_doc_items = []

    for top_item in top_items:
        # Tuple form instead of ``A | B``: a runtime union inside isinstance()
        # raises TypeError on Python < 3.10.
        if not isinstance(top_item, (ListGroup, InlineGroup, DocItem)):
            continue

        try:
            ser_res = serializer.serialize(item=top_item)
            item_text = ser_res.text
            # Doc items that the serializer reports for this result.
            item_doc_items = ser_res.get_unique_doc_items()
        except Exception as e:
            # Deliberate best-effort: one failing item must not abort the expansion.
            _logger.warning(f"error in extracting text of {top_item}: {e}")
            continue

        # Commit text and doc items together, only after both were obtained.
        text_parts.append(item_text + "\n")
        all_doc_items.extend(item_doc_items)

    content = "".join(text_parts)
    if not content.strip():
        _logger.warning(f"expansion of {self} did not yield any text")
        return self

    # Copy the metadata so the original chunk is left untouched.
    meta = deepcopy(self.meta)
    meta.doc_items = all_doc_items
    return DocChunk(
        text=content,
        meta=meta,
    )

def expand_to_page(self, doc: DoclingDocument, serializer: DocSerializer) -> DocChunk:
    """Expand chunk to include all content from its pages.

    Collects the page numbers from the provenance entries of the chunk's doc
    items, then runs a deep copy of ``serializer`` with ``params.pages``
    restricted to those pages. The passed-in serializer is not mutated.

    Args:
        doc: The DoclingDocument containing this chunk. Only used here to
            check that the document has pages.
        serializer: Serializer to convert document content to text.
            NOTE(review): ``serialize()`` is called without arguments, so the
            output comes from whatever document the serializer holds -
            presumably the same as ``doc``; confirm at call sites.

    Returns:
        New DocChunk with all content from the chunk's pages and updated metadata,
        or the original chunk if expansion is not possible.

    Warns:
        UserWarning: If document has no pages or chunk items have no page provenance.

    Example:
        If a chunk spans pages 2-3, this expands it to include all content
        from both pages, not just the original chunk's items.

    Note:
        - It is recommended to use same serializer as the original document
    """
    # A set, because several items (or several provenance entries) may share a page.
    page_ids = {prov.page_no for item in self.meta.doc_items for prov in item.prov}

    if not doc.pages or not page_ids:
        warnings.warn(
            f"cannot expand to page the following chunk: {self}. \n Probably pagination was not supported in document conversion",
            stacklevel=2,  # attribute the warning to the caller of expand_to_page
        )
        return self

    page_serializer = deepcopy(serializer)  # avoid mutating the serializer
    page_serializer.params.pages = page_ids
    ser_res = page_serializer.serialize()

    # Copy the metadata so the original chunk is left untouched, then replace
    # its doc items with the ones reported by the serialization result.
    meta = deepcopy(self.meta)
    meta.doc_items = ser_res.get_unique_doc_items()
    return DocChunk(
        text=ser_res.text,
        meta=meta,
    )
2 changes: 1 addition & 1 deletion docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def get_header_and_body_lines(
the header row and separator row, and body_lines contains the data rows.
"""

lines = [line for line in table_text.split("\n") if line.strip()]
lines = [line for line in table_text.splitlines(True) if line.strip()]

if len(lines) < 2:
# Not enough lines for a proper markdown table (need at least header + separator)
Expand Down
Loading
Loading