Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,13 @@ def serialize(
table_text = ". ".join(table_text_parts)
parts.append(create_ser_result(text=table_text, span_source=item))

ftn_res = doc_serializer.serialize_footnotes(
item=item,
**kwargs,
)
if ftn_res.text:
parts.append(ftn_res)

text_res = "\n\n".join([r.text for r in parts])

return create_ser_result(text=text_res, span_source=parts)
Expand Down
71 changes: 61 additions & 10 deletions docling_core/transforms/serializer/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
PictureDataType,
PictureItem,
PictureMoleculeData,
RefItem,
Script,
TableAnnotationType,
TableItem,
Expand Down Expand Up @@ -202,6 +203,7 @@ class CommonParams(BaseModel):
include_formatting: bool = True
include_hyperlinks: bool = True
caption_delim: str = " "
footnote_delim: str = " "
use_legacy_annotations: bool = Field(
default=False,
description="Use legacy annotation serialization.",
Expand Down Expand Up @@ -316,6 +318,62 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
self._excluded_refs_cache[params_json] = refs
return refs

def _iter_visible_referenced_text_items(
    self,
    refs: Iterable[RefItem],
    *,
    excluded_refs: set[str],
) -> Iterable[TextItem]:
    """Lazily resolve ``refs`` and yield the text items that are not excluded.

    Each reference is resolved against ``self.doc``. A resolved item is
    yielded only when it is a ``TextItem`` and its ``self_ref`` is not in
    ``excluded_refs``; every other reference is skipped.
    """
    for reference in refs:
        resolved = reference.resolve(self.doc)
        if not isinstance(resolved, TextItem):
            continue
        if resolved.self_ref in excluded_refs:
            continue
        yield resolved

def _serialize_referenced_text_items(
    self,
    refs: Iterable[RefItem],
    **kwargs: Any,
) -> list[SerializationResult]:
    """Serialize every visible text item referenced by ``refs``.

    The excluded references are computed once from ``kwargs``. Each text item
    that passes ``_iter_visible_referenced_text_items`` is then handed directly
    to ``self.text_serializer`` with ``is_inline_scope=True``, and ``kwargs``
    is forwarded unchanged.

    Returns:
        One serialization result per visible item, in reference order.
    """
    excluded = self.get_excluded_refs(**kwargs)
    visible_items = self._iter_visible_referenced_text_items(
        refs,
        excluded_refs=excluded,
    )

    serialized: list[SerializationResult] = []
    for text_item in visible_items:
        serialized.append(
            self.text_serializer.serialize(
                item=text_item,
                doc_serializer=self,
                doc=self.doc,
                is_inline_scope=True,
                **kwargs,
            )
        )
    return serialized

def has_visible_footnotes(self, **kwargs: Any) -> bool:
    """Report whether any non-excluded floating item has a visible footnote.

    Returns ``False`` immediately when the merged parameters do not include
    ``DocItemLabel.FOOTNOTE`` among their labels. Otherwise the document is
    walked (pictures included, restricted to ``params.layers``) and the search
    stops at the first floating item that references at least one visible
    text item as a footnote.
    """
    params = self.params.merge_with_patch(patch=kwargs)
    if DocItemLabel.FOOTNOTE not in params.labels:
        return False

    excluded_refs = self.get_excluded_refs(**kwargs)
    doc_items = _iterate_items(
        doc=self.doc,
        traverse_pictures=True,
        layers=params.layers,
    )
    for item, _ in doc_items:
        if not isinstance(item, FloatingItem):
            continue
        if item.self_ref in excluded_refs:
            continue

        visible_footnotes = self._iter_visible_referenced_text_items(
            item.footnotes,
            excluded_refs=excluded_refs,
        )
        # Only existence matters: leave as soon as one footnote is produced.
        for _footnote in visible_footnotes:
            return True

    return False

@abstractmethod
def serialize_doc(
self,
Expand Down Expand Up @@ -621,18 +679,11 @@ def serialize_footnotes(
) -> SerializationResult:
"""Serialize the item's footnotes."""
params = self.params.merge_with_patch(patch=kwargs)
results: list[SerializationResult] = []
if DocItemLabel.FOOTNOTE in params.labels:
results = [
create_ser_result(text=it.text, span_source=it)
for ftn in item.footnotes
if isinstance(it := ftn.resolve(self.doc), TextItem)
and it.self_ref not in self.get_excluded_refs(**kwargs)
]
# FIXME: using the caption_delimiter for now ...
text_res = params.caption_delim.join([r.text for r in results])
text_res = self.post_process(text=text_res)
results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
text_res = params.footnote_delim.join([r.text for r in results])
else:
results = []
text_res = ""
return create_ser_result(text=text_res, span_source=results)

Expand Down
61 changes: 56 additions & 5 deletions docling_core/transforms/serializer/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,10 @@
create_ser_result,
)
from docling_core.transforms.serializer.html_styles import (
_get_css_for_footnotes,
_get_css_for_single_column,
_get_css_for_split_page,
_get_css_with_no_styling,
)
from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.base import ImageRefMode
Expand Down Expand Up @@ -363,10 +365,10 @@ def serialize(
**kwargs: Any,
) -> SerializationResult:
"""Serializes the passed table item to HTML."""
res_parts: list[SerializationResult] = []
table_parts: list[SerializationResult] = []
cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
if cap_res.text:
res_parts.append(cap_res)
table_parts.append(cap_res)

if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
body = ""
Expand Down Expand Up @@ -416,10 +418,18 @@ def serialize(

if body:
body = f"<tbody>{body}</tbody>"
res_parts.append(create_ser_result(text=body, span_source=span_source))
table_parts.append(create_ser_result(text=body, span_source=span_source))

res_parts: list[SerializationResult] = []
if table_parts:
table_text = "".join([r.text for r in table_parts])
res_parts.append(create_ser_result(text=f"<table>{table_text}</table>", span_source=table_parts))

ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
if ftn_res.text:
res_parts.append(ftn_res)

text_res = "".join([r.text for r in res_parts])
text_res = f"<table>{text_res}</table>" if text_res else ""

return create_ser_result(text=text_res, span_source=res_parts)

Expand Down Expand Up @@ -610,6 +620,10 @@ def get_img_row(imgb64: str, ind: int) -> str:
details_html = f"<details><summary>Meta</summary>{meta_res.text}</details>"
res_parts.append(create_ser_result(text=details_html, span_source=[meta_res]))

ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
if ftn_res.text:
res_parts.append(ftn_res)

text_res = "".join([r.text for r in res_parts])
if text_res:
text_res = f"<figure>{text_res}</figure>"
Expand Down Expand Up @@ -1210,6 +1224,39 @@ def serialize_captions(
text_res = f"<{tag}>{text_res}</{tag}>"
return create_ser_result(text=text_res, span_source=results)

@override
def serialize_footnotes(
    self,
    item: FloatingItem,
    **kwargs: Any,
) -> SerializationResult:
    """Render the item's footnotes as nested HTML ``<div>`` elements.

    An empty result is returned when footnotes are not among the enabled
    labels or when no referenced footnote is serialized. Otherwise each
    footnote becomes a ``<div class="footnote">``, with ``dir="rtl"`` added
    when the text of the first span's text item is detected as right-to-left.
    All of them are wrapped in one ``<div class="footnotes" role="note">``.
    """
    params = self.params.merge_with_patch(patch=kwargs)
    if DocItemLabel.FOOTNOTE not in params.labels:
        return create_ser_result()

    raw_results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
    if not raw_results:
        return create_ser_result()

    wrapped: list[SerializationResult] = []
    for raw in raw_results:
        dir_attr = ""
        spans = raw.spans
        if spans:
            source_item = spans[0].item
            if isinstance(source_item, TextItem):
                text_dir = get_text_direction(source_item.text)
                if text_dir == "rtl":
                    dir_attr = f' dir="{text_dir}"'

        footnote_html = f'<div class="footnote"{dir_attr}>{raw.text}</div>'
        wrapped.append(create_ser_result(text=footnote_html, span_source=[raw]))

    joined = "".join(part.text for part in wrapped)
    return create_ser_result(
        text=f'<div class="footnotes" role="note">{joined}</div>',
        span_source=wrapped,
    )

def _generate_head(self) -> str:
"""Generate the HTML head section with metadata and styles."""
params = self.params
Expand All @@ -1236,8 +1283,12 @@ def _generate_head(self) -> str:
head_parts.append(f"<style>\n{params.css_styles}\n</style>")
elif self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
head_parts.append(_get_css_for_split_page())
if self.has_visible_footnotes():
head_parts.append(_get_css_for_footnotes())
elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
head_parts.append(_get_css_for_single_column())
if self.has_visible_footnotes():
head_parts.append(_get_css_for_footnotes())
else:
raise ValueError(f"unknown output-style: {self.params.output_style}")

Expand All @@ -1250,7 +1301,7 @@ def _generate_head(self) -> str:

def _get_default_css(self) -> str:
    """Return default CSS styles for the HTML document.

    Returns:
        The ``<style>`` markup produced by ``_get_css_with_no_styling()``.
    """
    # Single source of truth: delegate to the shared styles helper instead of
    # keeping a duplicate literal next to it.
    return _get_css_with_no_styling()

@override
def requires_page_break(self):
Expand Down
43 changes: 37 additions & 6 deletions docling_core/transforms/serializer/html_styles.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,42 @@
"""HTML styles for different export modes."""


def _wrap_style(css: str, *, trailing_newline: bool = False) -> str:
    """Return ``css`` enclosed in a ``<style>`` element.

    When ``trailing_newline`` is true, a single newline is appended after the
    closing tag.
    """
    wrapped = "<style>" + css + "</style>"
    if trailing_newline:
        wrapped += "\n"
    return wrapped


def _get_css_with_no_styling() -> str:
    """Return an empty ``<style>`` element, i.e. a stylesheet with no rules.

    Returns:
        The string ``"<style></style>"``.
    """
    # Go through the shared wrapper so all stylesheet helpers build their tag
    # the same way; a second return after this one would never execute.
    return _wrap_style("")


# CSS rules for footnote markup: the `.footnotes` container block and the
# vertical spacing between consecutive `.footnote` entries.
_FOOTNOTE_CSS = """
.footnotes {
margin-top: 0.65em;
padding-top: 0.45em;
border-top: 1px solid #ddd;
color: #666;
font-size: 0.95em;
line-height: 1.5;
text-align: left;
}
.footnote + .footnote {
margin-top: 0.35em;
}
"""


def _get_css_for_footnotes() -> str:
    """Build the ``<style>`` element that carries the footnote rules."""
    footnote_rules = _FOOTNOTE_CSS
    return _wrap_style(footnote_rules)


def _get_css_for_split_page() -> str:
"""Return default CSS styles for the HTML document."""
return """<style>
return _wrap_style(
"""
html {
background-color: #e1e1e1;
font-family: Arial, sans-serif;
Expand Down Expand Up @@ -87,13 +115,15 @@ def _get_css_for_split_page() -> str:
word-wrap: break-word;
/*overflow-wrap: break-word;*/
}
</style>
"""
""",
trailing_newline=True,
)


def _get_css_for_single_column() -> str:
"""Return CSS styles for the single-column HTML document."""
return """<style>
return _wrap_style(
"""
html {
background-color: #f5f5f5;
font-family: Arial, sans-serif;
Expand Down Expand Up @@ -209,4 +239,5 @@ def _get_css_for_single_column() -> str:
color: #666;
margin-top: 0.5em;
}
</style>"""
"""
)
30 changes: 30 additions & 0 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,13 @@ def serialize(
if table_text:
res_parts.append(create_ser_result(text=table_text, span_source=item))

ftn_res = doc_serializer.serialize_footnotes(
item=item,
**kwargs,
)
if ftn_res.text:
res_parts.append(ftn_res)

text_res = "\n\n".join([r.text for r in res_parts])

return create_ser_result(text=text_res, span_source=res_parts)
Expand Down Expand Up @@ -621,6 +628,14 @@ def serialize(
md_table_content = temp_table.export_to_markdown(temp_doc)
if len(md_table_content) > 0:
res_parts.append(create_ser_result(text=md_table_content, span_source=item))

ftn_res = doc_serializer.serialize_footnotes(
item=item,
**kwargs,
)
if ftn_res.text:
res_parts.append(ftn_res)

text_res = "\n\n".join([r.text for r in res_parts if r.text])

return create_ser_result(text=text_res, span_source=res_parts)
Expand Down Expand Up @@ -911,6 +926,21 @@ def post_process(
)
return res

@override
def serialize_footnotes(
    self,
    item: FloatingItem,
    **kwargs: Any,
) -> SerializationResult:
    """Emit the item's footnotes as Markdown blocks separated by blank lines.

    When footnotes are not among the enabled labels, an empty result is
    returned. Otherwise every visible referenced footnote is serialized and
    the texts are joined with a blank line between them.
    """
    params = self.params.merge_with_patch(patch=kwargs)
    if DocItemLabel.FOOTNOTE in params.labels:
        footnote_results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
        joined = "\n\n".join(res.text for res in footnote_results)
        return create_ser_result(text=joined, span_source=footnote_results)

    return create_ser_result()

@override
def serialize_doc(
self,
Expand Down
25 changes: 25 additions & 0 deletions test/test_hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,31 @@ def test_triplet_table_serializer_single_column():
expected = "Country = Italy. Country = Canada. Country = Switzerland"
assert result.text == expected, f"Expected '{expected}', got '{result.text}'"


def test_triplet_table_serializer_includes_footnotes():
    """Regression: a table's footnote appears exactly once in the triplet output."""

    # A one-column table: a header row followed by a single data row.
    doc = DoclingDocument(name="table_footnotes")
    table_data = TableData(num_cols=1)
    for cell_text in ("Country", "Italy"):
        table_data.add_row([cell_text])
    doc.add_table(data=table_data)

    # The table is the first item yielded at this point; attach a footnote to it.
    first_entry = next(iter(doc.iterate_items()))
    table_item = first_entry[0]
    note = doc.add_text(label=DocItemLabel.FOOTNOTE, text="Country footnote")
    table_item.footnotes.append(note.get_ref())

    chunk_serializer = ChunkingDocSerializer(doc=doc)
    table_serializer = TripletTableSerializer()
    result = table_serializer.serialize(
        item=table_item,
        doc_serializer=chunk_serializer,
        doc=doc,
    )

    assert result.text == "Country = Italy\n\nCountry footnote"
    assert result.text.count("Country footnote") == 1


def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument):
doc = rich_table_doc

Expand Down
Loading
Loading