diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index 57e3671e..bc25cc6e 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -100,6 +100,13 @@ def serialize(
table_text = ". ".join(table_text_parts)
parts.append(create_ser_result(text=table_text, span_source=item))
+ ftn_res = doc_serializer.serialize_footnotes(
+ item=item,
+ **kwargs,
+ )
+ if ftn_res.text:
+ parts.append(ftn_res)
+
text_res = "\n\n".join([r.text for r in parts])
return create_ser_result(text=text_res, span_source=parts)
diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py
index 980509d6..9024d18b 100644
--- a/docling_core/transforms/serializer/common.py
+++ b/docling_core/transforms/serializer/common.py
@@ -52,6 +52,7 @@
PictureDataType,
PictureItem,
PictureMoleculeData,
+ RefItem,
Script,
TableAnnotationType,
TableItem,
@@ -202,6 +203,7 @@ class CommonParams(BaseModel):
include_formatting: bool = True
include_hyperlinks: bool = True
caption_delim: str = " "
+ footnote_delim: str = " "
use_legacy_annotations: bool = Field(
default=False,
description="Use legacy annotation serialization.",
@@ -316,6 +318,62 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
self._excluded_refs_cache[params_json] = refs
return refs
+ def _iter_visible_referenced_text_items(
+     self,
+     refs: Iterable[RefItem],
+     *,
+     excluded_refs: set[str],
+ ) -> Iterable[TextItem]:
+     """Lazily resolve ``refs`` and yield the text items that are not excluded.
+
+     Each reference is resolved against ``self.doc``. A target is yielded only
+     when it is a ``TextItem`` whose ``self_ref`` is absent from
+     ``excluded_refs``; any other target is skipped without error.
+     """
+     for ref in refs:
+         target = ref.resolve(self.doc)
+         if not isinstance(target, TextItem):
+             continue
+         if target.self_ref in excluded_refs:
+             continue
+         yield target
+
+ def _serialize_referenced_text_items(
+     self,
+     refs: Iterable[RefItem],
+     **kwargs: Any,
+ ) -> list[SerializationResult]:
+     """Serialize the visible text items that ``refs`` point to.
+
+     The excluded-reference set is looked up once from ``kwargs``. Every text
+     item that passes ``_iter_visible_referenced_text_items`` is handed to
+     ``self.text_serializer`` with ``is_inline_scope=True`` and the same
+     ``kwargs``. Results are returned in the order of ``refs``.
+     """
+     # NOTE(review): is_inline_scope=True presumably keeps the text serializer
+     # from skipping items that are normally emitted by their parent - confirm.
+     excluded = self.get_excluded_refs(**kwargs)
+     visible_items = self._iter_visible_referenced_text_items(
+         refs,
+         excluded_refs=excluded,
+     )
+     serialized: list[SerializationResult] = []
+     for text_item in visible_items:
+         serialized.append(
+             self.text_serializer.serialize(
+                 item=text_item,
+                 doc_serializer=self,
+                 doc=self.doc,
+                 is_inline_scope=True,
+                 **kwargs,
+             )
+         )
+     return serialized
+
+ def has_visible_footnotes(self, **kwargs: Any) -> bool:
+     """Report whether any non-excluded floating item has a visible footnote.
+
+     Returns ``False`` immediately when ``DocItemLabel.FOOTNOTE`` is not among
+     the effective ``labels`` (instance params patched with ``kwargs``).
+     Otherwise the document is walked (pictures included, restricted to the
+     effective ``layers``) and the search stops at the first floating item
+     that is not excluded and has at least one footnote reference surviving
+     ``_iter_visible_referenced_text_items``.
+     """
+     params = self.params.merge_with_patch(patch=kwargs)
+     if DocItemLabel.FOOTNOTE not in params.labels:
+         return False
+
+     excluded = self.get_excluded_refs(**kwargs)
+     walker = _iterate_items(
+         doc=self.doc,
+         traverse_pictures=True,
+         layers=params.layers,
+     )
+     for candidate, _ in walker:
+         if not isinstance(candidate, FloatingItem):
+             continue
+         if candidate.self_ref in excluded:
+             continue
+         # One surviving footnote is enough; no need to exhaust the iterator.
+         for _footnote in self._iter_visible_referenced_text_items(
+             candidate.footnotes,
+             excluded_refs=excluded,
+         ):
+             return True
+
+     return False
+
@abstractmethod
def serialize_doc(
self,
@@ -621,18 +679,11 @@ def serialize_footnotes(
) -> SerializationResult:
"""Serialize the item's footnotes."""
params = self.params.merge_with_patch(patch=kwargs)
- results: list[SerializationResult] = []
if DocItemLabel.FOOTNOTE in params.labels:
- results = [
- create_ser_result(text=it.text, span_source=it)
- for ftn in item.footnotes
- if isinstance(it := ftn.resolve(self.doc), TextItem)
- and it.self_ref not in self.get_excluded_refs(**kwargs)
- ]
- # FIXME: using the caption_delimiter for now ...
- text_res = params.caption_delim.join([r.text for r in results])
- text_res = self.post_process(text=text_res)
+ results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
+ text_res = params.footnote_delim.join([r.text for r in results])
else:
+ results = []
text_res = ""
return create_ser_result(text=text_res, span_source=results)
diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py
index 4a92e303..95a9a359 100644
--- a/docling_core/transforms/serializer/html.py
+++ b/docling_core/transforms/serializer/html.py
@@ -39,8 +39,10 @@
create_ser_result,
)
from docling_core.transforms.serializer.html_styles import (
+ _get_css_for_footnotes,
_get_css_for_single_column,
_get_css_for_split_page,
+ _get_css_with_no_styling,
)
from docling_core.transforms.visualizer.base import BaseVisualizer
from docling_core.types.doc.base import ImageRefMode
@@ -363,10 +365,10 @@ def serialize(
**kwargs: Any,
) -> SerializationResult:
"""Serializes the passed table item to HTML."""
- res_parts: list[SerializationResult] = []
+ table_parts: list[SerializationResult] = []
cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
if cap_res.text:
- res_parts.append(cap_res)
+ table_parts.append(cap_res)
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
body = ""
@@ -416,10 +418,18 @@ def serialize(
if body:
body = f"
{body} "
- res_parts.append(create_ser_result(text=body, span_source=span_source))
+ table_parts.append(create_ser_result(text=body, span_source=span_source))
+
+ res_parts: list[SerializationResult] = []
+ if table_parts:
+ table_text = "".join([r.text for r in table_parts])
+ res_parts.append(create_ser_result(text=f"", span_source=table_parts))
+
+ ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
+ if ftn_res.text:
+ res_parts.append(ftn_res)
text_res = "".join([r.text for r in res_parts])
- text_res = f"" if text_res else ""
return create_ser_result(text=text_res, span_source=res_parts)
@@ -610,6 +620,10 @@ def get_img_row(imgb64: str, ind: int) -> str:
details_html = f"Meta {meta_res.text} "
res_parts.append(create_ser_result(text=details_html, span_source=[meta_res]))
+ ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
+ if ftn_res.text:
+ res_parts.append(ftn_res)
+
text_res = "".join([r.text for r in res_parts])
if text_res:
text_res = f"{text_res} "
@@ -1210,6 +1224,39 @@ def serialize_captions(
text_res = f"<{tag}>{text_res}{tag}>"
return create_ser_result(text=text_res, span_source=results)
+ @override
+ def serialize_footnotes(
+     self,
+     item: FloatingItem,
+     **kwargs: Any,
+ ) -> SerializationResult:
+     """Serialize the item's footnotes as HTML.
+
+     Returns an empty result when ``DocItemLabel.FOOTNOTE`` is not among the
+     effective labels or when no footnote survives filtering. Otherwise each
+     serialized footnote is wrapped in its own ``footnote`` element (with
+     ``dir="rtl"`` when the source text is right-to-left) and all of them are
+     placed inside a single ``footnotes`` container.
+     """
+     params = self.params.merge_with_patch(patch=kwargs)
+     if DocItemLabel.FOOTNOTE not in params.labels:
+         return create_ser_result()
+
+     raw_results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
+     if not raw_results:
+         return create_ser_result()
+
+     results: list[SerializationResult] = []
+     for ser_res in raw_results:
+         dir_str = ""
+         # Direction is detected on the raw item text, not the serialized
+         # text, which may already carry markup.
+         if ser_res.spans and isinstance(ser_res.spans[0].item, TextItem):
+             text_dir = get_text_direction(ser_res.spans[0].item.text)
+             dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
+
+         # NOTE(review): the markup in this literal was empty in the reviewed
+         # text, leaving dir_str and ser_res.text unused. The class name matches
+         # the ".footnote" rule of the footnote stylesheet; the element name
+         # (div) is a reconstruction - confirm.
+         results.append(
+             create_ser_result(
+                 text=f'<div class="footnote"{dir_str}>{ser_res.text}</div>',
+                 span_source=[ser_res],
+             )
+         )
+
+     text_res = "".join([r.text for r in results])
+     # Container class matches the ".footnotes" rule of the footnote stylesheet.
+     text_res = f'<div class="footnotes">{text_res}</div>'
+     return create_ser_result(text=text_res, span_source=results)
+
def _generate_head(self) -> str:
"""Generate the HTML head section with metadata and styles."""
params = self.params
@@ -1236,8 +1283,12 @@ def _generate_head(self) -> str:
head_parts.append(f"")
elif self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
head_parts.append(_get_css_for_split_page())
+ if self.has_visible_footnotes():
+ head_parts.append(_get_css_for_footnotes())
elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
head_parts.append(_get_css_for_single_column())
+ if self.has_visible_footnotes():
+ head_parts.append(_get_css_for_footnotes())
else:
raise ValueError(f"unknown output-style: {self.params.output_style}")
@@ -1250,7 +1301,7 @@ def _generate_head(self) -> str:
def _get_default_css(self) -> str:
"""Return default CSS styles for the HTML document."""
- return ""
+ return _get_css_with_no_styling()
@override
def requires_page_break(self):
diff --git a/docling_core/transforms/serializer/html_styles.py b/docling_core/transforms/serializer/html_styles.py
index 3d721f01..59585c7e 100644
--- a/docling_core/transforms/serializer/html_styles.py
+++ b/docling_core/transforms/serializer/html_styles.py
@@ -1,14 +1,42 @@
"""HTML styles for different export modes."""
+def _wrap_style(css: str, *, trailing_newline: bool = False) -> str:
+    """Wrap CSS in a style tag.
+
+    Args:
+        css: Stylesheet text placed verbatim between the opening and closing
+            ``style`` tags.
+        trailing_newline: When true, a single newline is appended after the
+            closing tag.
+
+    Returns:
+        The ``<style>`` element as a string.
+    """
+    suffix = "\n" if trailing_newline else ""
+    return f"<style>{css}</style>{suffix}"
+
+
def _get_css_with_no_styling() -> str:
"""Return default CSS styles for the HTML document."""
- return ""
+ return _wrap_style("")
+
+
+_FOOTNOTE_CSS = """
+ .footnotes {
+ margin-top: 0.65em;
+ padding-top: 0.45em;
+ border-top: 1px solid #ddd;
+ color: #666;
+ font-size: 0.95em;
+ line-height: 1.5;
+ text-align: left;
+ }
+ .footnote + .footnote {
+ margin-top: 0.35em;
+ }
+"""
+
+
+def _get_css_for_footnotes() -> str:
+ """Return the floating-item footnote CSS (``_FOOTNOTE_CSS``) passed through ``_wrap_style``."""
+ return _wrap_style(_FOOTNOTE_CSS)
def _get_css_for_split_page() -> str:
"""Return default CSS styles for the HTML document."""
- return """
-"""
+""",
+ trailing_newline=True,
+ )
def _get_css_for_single_column() -> str:
"""Return CSS styles for the single-column HTML document."""
- return """"""
+"""
+ )
diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
index da547b21..4dd99d75 100644
--- a/docling_core/transforms/serializer/markdown.py
+++ b/docling_core/transforms/serializer/markdown.py
@@ -561,6 +561,13 @@ def serialize(
if table_text:
res_parts.append(create_ser_result(text=table_text, span_source=item))
+ ftn_res = doc_serializer.serialize_footnotes(
+ item=item,
+ **kwargs,
+ )
+ if ftn_res.text:
+ res_parts.append(ftn_res)
+
text_res = "\n\n".join([r.text for r in res_parts])
return create_ser_result(text=text_res, span_source=res_parts)
@@ -621,6 +628,14 @@ def serialize(
md_table_content = temp_table.export_to_markdown(temp_doc)
if len(md_table_content) > 0:
res_parts.append(create_ser_result(text=md_table_content, span_source=item))
+
+ ftn_res = doc_serializer.serialize_footnotes(
+ item=item,
+ **kwargs,
+ )
+ if ftn_res.text:
+ res_parts.append(ftn_res)
+
text_res = "\n\n".join([r.text for r in res_parts if r.text])
return create_ser_result(text=text_res, span_source=res_parts)
@@ -911,6 +926,21 @@ def post_process(
)
return res
+ @override
+ def serialize_footnotes(
+     self,
+     item: FloatingItem,
+     **kwargs: Any,
+ ) -> SerializationResult:
+     """Serialize the item's footnotes as blank-line-separated Markdown blocks.
+
+     An empty result is returned when ``DocItemLabel.FOOTNOTE`` is not among
+     the effective labels (instance params patched with ``kwargs``).
+     """
+     effective = self.params.merge_with_patch(patch=kwargs)
+     if DocItemLabel.FOOTNOTE in effective.labels:
+         footnote_results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
+         joined = "\n\n".join([res.text for res in footnote_results])
+         return create_ser_result(text=joined, span_source=footnote_results)
+
+     return create_ser_result()
+
@override
def serialize_doc(
self,
diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
index 9f29e759..b9d90cd4 100644
--- a/test/test_hierarchical_chunker.py
+++ b/test/test_hierarchical_chunker.py
@@ -180,6 +180,31 @@ def test_triplet_table_serializer_single_column():
expected = "Country = Italy. Country = Canada. Country = Switzerland"
assert result.text == expected, f"Expected '{expected}', got '{result.text}'"
+
+def test_triplet_table_serializer_includes_footnotes():
+ """Regression: table footnotes must be preserved in chunking serialization."""
+
+ # One-column table with a "Country" row followed by an "Italy" row.
+ doc = DoclingDocument(name="table_footnotes")
+ table_data = TableData(num_cols=1)
+ table_data.add_row(["Country"])
+ table_data.add_row(["Italy"])
+ doc.add_table(data=table_data)
+
+ # The table is the only item added so far, so it is the first one iterated.
+ table_item = next(iter(doc.iterate_items()))[0]
+ footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text="Country footnote")
+ table_item.footnotes.append(footnote.get_ref())
+
+ serializer = ChunkingDocSerializer(doc=doc)
+ result = TripletTableSerializer().serialize(
+ item=table_item,
+ doc_serializer=serializer,
+ doc=doc,
+ )
+
+ # Triplet text and footnote are expected as two blocks joined by a blank line.
+ assert result.text == "Country = Italy\n\nCountry footnote"
+ # The footnote must be emitted exactly once.
+ assert result.text.count("Country footnote") == 1
+
+
def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument):
doc = rich_table_doc
diff --git a/test/test_serialization.py b/test/test_serialization.py
index dff9cab6..9270b629 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -2,6 +2,7 @@
import threading
from pathlib import Path
+from unittest.mock import MagicMock, patch
import pytest
@@ -10,6 +11,7 @@
HTMLDocSerializer,
HTMLOutputStyle,
HTMLParams,
+ HTMLTableSerializer,
)
from docling_core.transforms.serializer.markdown import (
MarkdownDocSerializer,
@@ -24,6 +26,7 @@
from docling_core.types.doc.document import (
DescriptionAnnotation,
DoclingDocument,
+ Formatting,
RefItem,
RichTableCell,
TableCell,
@@ -56,6 +59,64 @@ def _normalize_quotes(s: str) -> str:
assert actual == expected
+def _build_table_with_footnote_doc(footnote_text: str) -> DoclingDocument:
+ """Build a document holding a 2x1 table ("Header" over "Value*") with one footnote attached."""
+ doc = DoclingDocument(name="table_footnotes")
+ table = doc.add_table(data=TableData(num_rows=2, num_cols=1))
+ # Row 0, column 0.
+ doc.add_table_cell(
+ table,
+ TableCell(
+ text="Header",
+ start_row_offset_idx=0,
+ end_row_offset_idx=1,
+ start_col_offset_idx=0,
+ end_col_offset_idx=1,
+ ),
+ )
+ # Row 1, column 0.
+ doc.add_table_cell(
+ table,
+ TableCell(
+ text="Value*",
+ start_row_offset_idx=1,
+ end_row_offset_idx=2,
+ start_col_offset_idx=0,
+ end_col_offset_idx=1,
+ ),
+ )
+ # The footnote is added as a regular text item and then referenced from the table.
+ footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text=footnote_text)
+ table.footnotes.append(footnote.get_ref())
+ return doc
+
+
+def _build_picture_with_footnote_doc(footnote_text: str) -> DoclingDocument:
+ """Build a document holding one picture that references a caption and a footnote."""
+ doc = DoclingDocument(name="picture_footnotes")
+ picture = doc.add_picture()
+ # Caption and footnote are added as text items, then referenced from the picture.
+ caption = doc.add_text(label=DocItemLabel.CAPTION, text="Picture caption")
+ footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text=footnote_text)
+ picture.captions.append(caption.get_ref())
+ picture.footnotes.append(footnote.get_ref())
+ return doc
+
+
+def _build_table_with_formatted_footnote_doc() -> DoclingDocument:
+ """Build the table-with-footnote document, with the footnote "bold link" set bold and hyperlinked."""
+ doc = _build_table_with_footnote_doc(footnote_text="bold link")
+ # Locate the first text item labelled as a footnote and decorate it in place.
+ footnote = next(text for text in doc.texts if text.label == DocItemLabel.FOOTNOTE)
+ footnote.formatting = Formatting(bold=True)
+ footnote.hyperlink = "https://example.com"
+ return doc
+
+
+def _build_table_with_multiple_formatted_footnotes_doc() -> DoclingDocument:
+ """Build a table document with two footnotes: "one" (bold, hyperlinked) and "two" (plain)."""
+ doc = _build_table_with_footnote_doc(footnote_text="one")
+ table = doc.tables[0]
+ # Decorate the first footnote in place.
+ first_footnote = next(text for text in doc.texts if text.label == DocItemLabel.FOOTNOTE)
+ first_footnote.formatting = Formatting(bold=True)
+ first_footnote.hyperlink = "https://one.example"
+
+ # The second footnote gets no formatting or hyperlink and is appended after the first.
+ second_footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text="two")
+ table.footnotes.append(second_footnote.get_ref())
+ return doc
+
+
# ===============================
# Markdown tests
# ===============================
@@ -177,6 +238,42 @@ def test_md_charts():
verify(exp_file=src.with_suffix(".gt.md"), actual=actual)
+def test_md_table_footnotes_are_serialized_once():
+ """Markdown export must emit a table footnote as its own block, exactly once."""
+ footnote_text = "Table footnote & more"
+ doc = _build_table_with_footnote_doc(footnote_text=footnote_text)
+
+ actual = doc.export_to_markdown()
+
+ # NOTE(review): the expected string contains "<unsafe>", which footnote_text
+ # above does not. The literals look mangled (markup or HTML entities possibly
+ # lost in transit) - confirm both strings against the upstream test.
+ assert "\n\nTable footnote <unsafe> & more" in actual
+ assert actual.count("Table footnote") == 1
+
+
+def test_md_picture_footnotes_are_serialized_once():
+ """Markdown export must emit a picture footnote after the caption, exactly once."""
+ footnote_text = "Picture footnote"
+ doc = _build_picture_with_footnote_doc(footnote_text=footnote_text)
+
+ actual = doc.export_to_markdown()
+
+ # NOTE(review): the middle operand searches for an empty string, whose index
+ # is always 0. The literal looks mangled (presumably an image placeholder
+ # was intended) - confirm against the upstream test.
+ assert actual.index("Picture caption") < actual.index("") < actual.index(footnote_text)
+ assert actual.count(footnote_text) == 1
+
+
+def test_md_footnotes_preserve_formatting_and_hyperlinks():
+ """A bold, hyperlinked footnote must keep both decorations in Markdown export."""
+ doc = _build_table_with_formatted_footnote_doc()
+
+ actual = doc.export_to_markdown()
+
+ # Bold markers are expected inside the link text.
+ assert "[**bold link**](https://example.com)" in actual
+
+
+def test_md_multiple_footnotes_are_separate_blocks():
+ """Two footnotes on one table must be separated by a blank line in Markdown export."""
+ doc = _build_table_with_multiple_formatted_footnotes_doc()
+
+ actual = doc.export_to_markdown()
+
+ # First footnote is bold and hyperlinked, second is plain; order follows the table's footnote list.
+ assert "[**one**](https://one.example)\n\ntwo" in actual
+
+
def test_md_inline_and_formatting():
src = Path("./test/data/doc/inline_and_formatting.yaml")
doc = DoclingDocument.load_from_yaml(src)
@@ -538,46 +635,44 @@ def test_md_traverse_pictures():
def test_html_table_serializer_get_header_and_body_lines():
"""Test HTMLTableSerializer.get_header_and_body_lines() method."""
- from docling_core.transforms.serializer.html import HTMLTableSerializer
- from unittest.mock import patch, MagicMock
-
+
serializer = HTMLTableSerializer()
-
+
# Test 1: Valid HTML with headers and data
valid_html = ""
headers, body = serializer.get_header_and_body_lines(table_text=valid_html)
assert len(headers) > 0, "Should have headers"
assert len(body) > 0, "Should have body rows"
-
+
# Test 2: Row without closing tag
# Parser will find the row, but when we search for it won't be found
no_close_tr = "Header Data1"
headers, body = serializer.get_header_and_body_lines(table_text=no_close_tr)
assert isinstance(headers, list)
assert isinstance(body, list)
-
+
# Test 3: Data rows with incomplete closing tags
# When collecting remaining rows, some tags are missing
incomplete_data = "H1 D1 D2"
headers, body = serializer.get_header_and_body_lines(table_text=incomplete_data)
assert isinstance(headers, list)
assert isinstance(body, list)
-
+
# Test 4: Force exception in parser
- with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class:
+ with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class:
mock_parser = MagicMock()
mock_parser.feed.side_effect = Exception("Parser error")
mock_parser_class.return_value = mock_parser
-
+
broken_html = " Header Data "
headers, body = serializer.get_header_and_body_lines(table_text=broken_html)
# Should use fallback logic
assert isinstance(headers, list)
assert isinstance(body, list)
-
+
# Test 5: Parser returns more rows than exist in HTML
# Mock parser to return extra rows that don't exist in the HTML
- with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class:
+ with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class:
mock_parser = MagicMock()
# Create fake row data - more rows than actually exist in HTML
mock_parser.rows = [
@@ -587,32 +682,32 @@ def test_html_table_serializer_get_header_and_body_lines():
{"th_cells": [], "td_cells": ["D1"]},
]
mock_parser_class.return_value = mock_parser
-
+
# HTML with only 2 rows, but parser claims 4
limited_html = "H1 H2 "
headers, body = serializer.get_header_and_body_lines(table_text=limited_html)
assert isinstance(headers, list)
assert isinstance(body, list)
-
+
# Test 6: Specific case for line 485 - row_start found but row_end not found
# Create HTML where parser finds a row, but the actual HTML has
- with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class:
+ with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class:
mock_parser = MagicMock()
# Parser reports a header row exists
mock_parser.rows = [
{"th_cells": ["Header"], "td_cells": []},
]
mock_parser_class.return_value = mock_parser
-
+
# But the HTML has
html_no_close = " Header"
headers, body = serializer.get_header_and_body_lines(table_text=html_no_close)
assert isinstance(headers, list)
assert isinstance(body, list)
-
+
# Test 7: Specific case for line 504 - data collection finds
# Create HTML where we start collecting data rows but encounter incomplete row
- with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class:
+ with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class:
mock_parser = MagicMock()
# Parser reports header then data rows
mock_parser.rows = [
@@ -620,13 +715,13 @@ def test_html_table_serializer_get_header_and_body_lines():
{"th_cells": [], "td_cells": ["D1"]}, # This triggers data collection
]
mock_parser_class.return_value = mock_parser
-
+
# HTML has complete header but incomplete data row
html_incomplete_data = " H D1 D2"
headers, body = serializer.get_header_and_body_lines(table_text=html_incomplete_data)
assert isinstance(headers, list)
assert isinstance(body, list)
-
+
# Test 8: Table with footer content
with_footer = " H D Footer content"
headers, body = serializer.get_header_and_body_lines(table_text=with_footer)
@@ -636,6 +731,58 @@ def test_html_table_serializer_get_header_and_body_lines():
assert "Footer" in str(body)
+def test_html_table_footnotes_are_serialized_once():
+ footnote_text = "Table footnote & more"
+ doc = _build_table_with_footnote_doc(footnote_text=footnote_text)
+
+ actual = doc.export_to_html()
+
+ assert '