diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 57e3671e..bc25cc6e 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -100,6 +100,13 @@ def serialize( table_text = ". ".join(table_text_parts) parts.append(create_ser_result(text=table_text, span_source=item)) + ftn_res = doc_serializer.serialize_footnotes( + item=item, + **kwargs, + ) + if ftn_res.text: + parts.append(ftn_res) + text_res = "\n\n".join([r.text for r in parts]) return create_ser_result(text=text_res, span_source=parts) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 980509d6..9024d18b 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -52,6 +52,7 @@ PictureDataType, PictureItem, PictureMoleculeData, + RefItem, Script, TableAnnotationType, TableItem, @@ -202,6 +203,7 @@ class CommonParams(BaseModel): include_formatting: bool = True include_hyperlinks: bool = True caption_delim: str = " " + footnote_delim: str = " " use_legacy_annotations: bool = Field( default=False, description="Use legacy annotation serialization.", @@ -316,6 +318,62 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]: self._excluded_refs_cache[params_json] = refs return refs + def _iter_visible_referenced_text_items( + self, + refs: Iterable[RefItem], + *, + excluded_refs: set[str], + ) -> Iterable[TextItem]: + """Yield referenced text items that survive the current serializer filters.""" + for ref in refs: + if isinstance(it := ref.resolve(self.doc), TextItem) and it.self_ref not in excluded_refs: + yield it + + def _serialize_referenced_text_items( + self, + refs: Iterable[RefItem], + **kwargs: Any, + ) -> list[SerializationResult]: + """Serialize referenced text items while bypassing the top-level skip path.""" + excluded_refs = self.get_excluded_refs(**kwargs) + return [ + self.text_serializer.serialize( + item=it, + doc_serializer=self, + doc=self.doc, + is_inline_scope=True, + **kwargs, + ) + for it in self._iter_visible_referenced_text_items( + refs, + excluded_refs=excluded_refs, + ) + ] + + def has_visible_footnotes(self, **kwargs: Any) -> bool: + """Whether the current serialization scope includes floating-item footnotes.""" + params = self.params.merge_with_patch(patch=kwargs) + if DocItemLabel.FOOTNOTE not in params.labels: + return False + + excluded_refs = self.get_excluded_refs(**kwargs) + for item, _ in _iterate_items( + doc=self.doc, + traverse_pictures=True, + layers=params.layers, + ): + if isinstance(item, FloatingItem) and item.self_ref not in excluded_refs: + if any( + True + for _ in self._iter_visible_referenced_text_items( + item.footnotes, + excluded_refs=excluded_refs, + ) + ): + return True + + return False + @abstractmethod def serialize_doc( self, @@ -621,18 +679,11 @@ def serialize_footnotes( ) -> SerializationResult: """Serialize the item's footnotes.""" params = self.params.merge_with_patch(patch=kwargs) - results: list[SerializationResult] = [] if DocItemLabel.FOOTNOTE in params.labels: - results = [ - create_ser_result(text=it.text, span_source=it) - for ftn in item.footnotes - if isinstance(it := ftn.resolve(self.doc), TextItem) - and it.self_ref not in self.get_excluded_refs(**kwargs) - ] - # FIXME: using the caption_delimiter for now ... - text_res = params.caption_delim.join([r.text for r in results]) - text_res = self.post_process(text=text_res) + results = self._serialize_referenced_text_items(item.footnotes, **kwargs) + text_res = params.footnote_delim.join([r.text for r in results]) else: + results = [] text_res = "" return create_ser_result(text=text_res, span_source=results) diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 4a92e303..95a9a359 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -39,8 +39,10 @@ create_ser_result, ) from docling_core.transforms.serializer.html_styles import ( + _get_css_for_footnotes, _get_css_for_single_column, _get_css_for_split_page, + _get_css_with_no_styling, ) from docling_core.transforms.visualizer.base import BaseVisualizer from docling_core.types.doc.base import ImageRefMode @@ -363,10 +365,10 @@ def serialize( **kwargs: Any, ) -> SerializationResult: """Serializes the passed table item to HTML.""" - res_parts: list[SerializationResult] = [] + table_parts: list[SerializationResult] = [] cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs) if cap_res.text: - res_parts.append(cap_res) + table_parts.append(cap_res) if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs): body = "" @@ -416,10 +418,18 @@ def serialize( if body: body = f"{body}" - res_parts.append(create_ser_result(text=body, span_source=span_source)) + table_parts.append(create_ser_result(text=body, span_source=span_source)) + + res_parts: list[SerializationResult] = [] + if table_parts: + table_text = "".join([r.text for r in table_parts]) + res_parts.append(create_ser_result(text=f"{table_text}
", span_source=table_parts)) + + ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs) + if ftn_res.text: + res_parts.append(ftn_res) text_res = "".join([r.text for r in res_parts]) - text_res = f"{text_res}
" if text_res else "" return create_ser_result(text=text_res, span_source=res_parts) @@ -610,6 +620,10 @@ def get_img_row(imgb64: str, ind: int) -> str: details_html = f"
Meta{meta_res.text}
" res_parts.append(create_ser_result(text=details_html, span_source=[meta_res])) + ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs) + if ftn_res.text: + res_parts.append(ftn_res) + text_res = "".join([r.text for r in res_parts]) if text_res: text_res = f"
{text_res}
" @@ -1210,6 +1224,39 @@ def serialize_captions( text_res = f"<{tag}>{text_res}" return create_ser_result(text=text_res, span_source=results) + @override + def serialize_footnotes( + self, + item: FloatingItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize the item's footnotes.""" + params = self.params.merge_with_patch(patch=kwargs) + if DocItemLabel.FOOTNOTE not in params.labels: + return create_ser_result() + + raw_results = self._serialize_referenced_text_items(item.footnotes, **kwargs) + if not raw_results: + return create_ser_result() + + results: list[SerializationResult] = [] + for ser_res in raw_results: + dir_str = "" + if ser_res.spans and isinstance(ser_res.spans[0].item, TextItem): + text_dir = get_text_direction(ser_res.spans[0].item.text) + dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else "" + + results.append( + create_ser_result( + text=f'
{ser_res.text}
', + span_source=[ser_res], + ) + ) + + text_res = "".join([r.text for r in results]) + text_res = f'
{text_res}
' + return create_ser_result(text=text_res, span_source=results) + def _generate_head(self) -> str: """Generate the HTML head section with metadata and styles.""" params = self.params @@ -1236,8 +1283,12 @@ def _generate_head(self) -> str: head_parts.append(f"") elif self.params.output_style == HTMLOutputStyle.SPLIT_PAGE: head_parts.append(_get_css_for_split_page()) + if self.has_visible_footnotes(): + head_parts.append(_get_css_for_footnotes()) elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN: head_parts.append(_get_css_for_single_column()) + if self.has_visible_footnotes(): + head_parts.append(_get_css_for_footnotes()) else: raise ValueError(f"unknown output-style: {self.params.output_style}") @@ -1250,7 +1301,7 @@ def _generate_head(self) -> str: def _get_default_css(self) -> str: """Return default CSS styles for the HTML document.""" - return "" + return _get_css_with_no_styling() @override def requires_page_break(self): diff --git a/docling_core/transforms/serializer/html_styles.py b/docling_core/transforms/serializer/html_styles.py index 3d721f01..59585c7e 100644 --- a/docling_core/transforms/serializer/html_styles.py +++ b/docling_core/transforms/serializer/html_styles.py @@ -1,14 +1,42 @@ """HTML styles for different export modes.""" +def _wrap_style(css: str, *, trailing_newline: bool = False) -> str: + """Wrap CSS in a style tag.""" + suffix = "\n" if trailing_newline else "" + return f"{suffix}" + + def _get_css_with_no_styling() -> str: """Return default CSS styles for the HTML document.""" - return "" + return _wrap_style("") + + +_FOOTNOTE_CSS = """ + .footnotes { + margin-top: 0.65em; + padding-top: 0.45em; + border-top: 1px solid #ddd; + color: #666; + font-size: 0.95em; + line-height: 1.5; + text-align: left; + } + .footnote + .footnote { + margin-top: 0.35em; + } +""" + + +def _get_css_for_footnotes() -> str: + """Return CSS styles for floating-item footnotes.""" + return _wrap_style(_FOOTNOTE_CSS) def _get_css_for_split_page() -> str: """Return default CSS styles for the HTML document.""" - return """ -""" +""", + trailing_newline=True, + ) def _get_css_for_single_column() -> str: """Return CSS styles for the single-column HTML document.""" - return """""" +""" + ) diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index da547b21..4dd99d75 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -561,6 +561,13 @@ def serialize( if table_text: res_parts.append(create_ser_result(text=table_text, span_source=item)) + ftn_res = doc_serializer.serialize_footnotes( + item=item, + **kwargs, + ) + if ftn_res.text: + res_parts.append(ftn_res) + text_res = "\n\n".join([r.text for r in res_parts]) return create_ser_result(text=text_res, span_source=res_parts) @@ -621,6 +628,14 @@ def serialize( md_table_content = temp_table.export_to_markdown(temp_doc) if len(md_table_content) > 0: res_parts.append(create_ser_result(text=md_table_content, span_source=item)) + + ftn_res = doc_serializer.serialize_footnotes( + item=item, + **kwargs, + ) + if ftn_res.text: + res_parts.append(ftn_res) + text_res = "\n\n".join([r.text for r in res_parts if r.text]) return create_ser_result(text=text_res, span_source=res_parts) @@ -911,6 +926,21 @@ def post_process( ) return res + @override + def serialize_footnotes( + self, + item: FloatingItem, + **kwargs: Any, + ) -> SerializationResult: + """Serialize footnotes as separate Markdown blocks.""" + params = self.params.merge_with_patch(patch=kwargs) + if DocItemLabel.FOOTNOTE not in params.labels: + return create_ser_result() + + results = self._serialize_referenced_text_items(item.footnotes, **kwargs) + text_res = "\n\n".join([r.text for r in results]) + return create_ser_result(text=text_res, span_source=results) + @override def serialize_doc( self, diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index 9f29e759..b9d90cd4 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -180,6 +180,31 @@ def test_triplet_table_serializer_single_column(): expected = "Country = Italy. Country = Canada. Country = Switzerland" assert result.text == expected, f"Expected '{expected}', got '{result.text}'" + +def test_triplet_table_serializer_includes_footnotes(): + """Regression: table footnotes must be preserved in chunking serialization.""" + + doc = DoclingDocument(name="table_footnotes") + table_data = TableData(num_cols=1) + table_data.add_row(["Country"]) + table_data.add_row(["Italy"]) + doc.add_table(data=table_data) + + table_item = next(iter(doc.iterate_items()))[0] + footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text="Country footnote") + table_item.footnotes.append(footnote.get_ref()) + + serializer = ChunkingDocSerializer(doc=doc) + result = TripletTableSerializer().serialize( + item=table_item, + doc_serializer=serializer, + doc=doc, + ) + + assert result.text == "Country = Italy\n\nCountry footnote" + assert result.text.count("Country footnote") == 1 + + def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument): doc = rich_table_doc diff --git a/test/test_serialization.py b/test/test_serialization.py index dff9cab6..9270b629 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -2,6 +2,7 @@ import threading from pathlib import Path +from unittest.mock import MagicMock, patch import pytest @@ -10,6 +11,7 @@ HTMLDocSerializer, HTMLOutputStyle, HTMLParams, + HTMLTableSerializer, ) from docling_core.transforms.serializer.markdown import ( MarkdownDocSerializer, @@ -24,6 +26,7 @@ from docling_core.types.doc.document import ( DescriptionAnnotation, DoclingDocument, + Formatting, RefItem, RichTableCell, TableCell, @@ -56,6 +59,64 @@ def _normalize_quotes(s: str) -> str: assert actual == expected +def _build_table_with_footnote_doc(footnote_text: str) -> DoclingDocument: + doc = DoclingDocument(name="table_footnotes") + table = doc.add_table(data=TableData(num_rows=2, num_cols=1)) + doc.add_table_cell( + table, + TableCell( + text="Header", + start_row_offset_idx=0, + end_row_offset_idx=1, + start_col_offset_idx=0, + end_col_offset_idx=1, + ), + ) + doc.add_table_cell( + table, + TableCell( + text="Value*", + start_row_offset_idx=1, + end_row_offset_idx=2, + start_col_offset_idx=0, + end_col_offset_idx=1, + ), + ) + footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text=footnote_text) + table.footnotes.append(footnote.get_ref()) + return doc + + +def _build_picture_with_footnote_doc(footnote_text: str) -> DoclingDocument: + doc = DoclingDocument(name="picture_footnotes") + picture = doc.add_picture() + caption = doc.add_text(label=DocItemLabel.CAPTION, text="Picture caption") + footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text=footnote_text) + picture.captions.append(caption.get_ref()) + picture.footnotes.append(footnote.get_ref()) + return doc + + +def _build_table_with_formatted_footnote_doc() -> DoclingDocument: + doc = _build_table_with_footnote_doc(footnote_text="bold link") + footnote = next(text for text in doc.texts if text.label == DocItemLabel.FOOTNOTE) + footnote.formatting = Formatting(bold=True) + footnote.hyperlink = "https://example.com" + return doc + + +def _build_table_with_multiple_formatted_footnotes_doc() -> DoclingDocument: + doc = _build_table_with_footnote_doc(footnote_text="one") + table = doc.tables[0] + first_footnote = next(text for text in doc.texts if text.label == DocItemLabel.FOOTNOTE) + first_footnote.formatting = Formatting(bold=True) + first_footnote.hyperlink = "https://one.example" + + second_footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text="two") + table.footnotes.append(second_footnote.get_ref()) + return doc + + # =============================== # Markdown tests # =============================== @@ -177,6 +238,42 @@ def test_md_charts(): verify(exp_file=src.with_suffix(".gt.md"), actual=actual) +def test_md_table_footnotes_are_serialized_once(): + footnote_text = "Table footnote & more" + doc = _build_table_with_footnote_doc(footnote_text=footnote_text) + + actual = doc.export_to_markdown() + + assert "\n\nTable footnote <unsafe> & more" in actual + assert actual.count("Table footnote") == 1 + + +def test_md_picture_footnotes_are_serialized_once(): + footnote_text = "Picture footnote" + doc = _build_picture_with_footnote_doc(footnote_text=footnote_text) + + actual = doc.export_to_markdown() + + assert actual.index("Picture caption") < actual.index("") < actual.index(footnote_text) + assert actual.count(footnote_text) == 1 + + +def test_md_footnotes_preserve_formatting_and_hyperlinks(): + doc = _build_table_with_formatted_footnote_doc() + + actual = doc.export_to_markdown() + + assert "[**bold link**](https://example.com)" in actual + + +def test_md_multiple_footnotes_are_separate_blocks(): + doc = _build_table_with_multiple_formatted_footnotes_doc() + + actual = doc.export_to_markdown() + + assert "[**one**](https://one.example)\n\ntwo" in actual + + def test_md_inline_and_formatting(): src = Path("./test/data/doc/inline_and_formatting.yaml") doc = DoclingDocument.load_from_yaml(src) @@ -538,46 +635,44 @@ def test_md_traverse_pictures(): def test_html_table_serializer_get_header_and_body_lines(): """Test HTMLTableSerializer.get_header_and_body_lines() method.""" - from docling_core.transforms.serializer.html import HTMLTableSerializer - from unittest.mock import patch, MagicMock - + serializer = HTMLTableSerializer() - + # Test 1: Valid HTML with headers and data valid_html = "
Header1Header2
Data1Data2
" headers, body = serializer.get_header_and_body_lines(table_text=valid_html) assert len(headers) > 0, "Should have headers" assert len(body) > 0, "Should have body rows" - + # Test 2: Row without closing tag # Parser will find the row, but when we search for it won't be found no_close_tr = "HeaderData1" headers, body = serializer.get_header_and_body_lines(table_text=no_close_tr) assert isinstance(headers, list) assert isinstance(body, list) - + # Test 3: Data rows with incomplete closing tags # When collecting remaining rows, some tags are missing incomplete_data = "H1D1D2" headers, body = serializer.get_header_and_body_lines(table_text=incomplete_data) assert isinstance(headers, list) assert isinstance(body, list) - + # Test 4: Force exception in parser - with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class: + with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class: mock_parser = MagicMock() mock_parser.feed.side_effect = Exception("Parser error") mock_parser_class.return_value = mock_parser - + broken_html = "HeaderData" headers, body = serializer.get_header_and_body_lines(table_text=broken_html) # Should use fallback logic assert isinstance(headers, list) assert isinstance(body, list) - + # Test 5: Parser returns more rows than exist in HTML # Mock parser to return extra rows that don't exist in the HTML - with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class: + with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class: mock_parser = MagicMock() # Create fake row data - more rows than actually exist in HTML mock_parser.rows = [ @@ -587,32 +682,32 @@ def test_html_table_serializer_get_header_and_body_lines(): {"th_cells": [], "td_cells": ["D1"]}, ] mock_parser_class.return_value = mock_parser - + # HTML with only 2 rows, but parser claims 4 limited_html = "H1H2" headers, body = serializer.get_header_and_body_lines(table_text=limited_html) assert isinstance(headers, list) assert isinstance(body, list) - + # Test 6: Specific case for line 485 - row_start found but row_end not found # Create HTML where parser finds a row, but the actual HTML has - with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class: + with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class: mock_parser = MagicMock() # Parser reports a header row exists mock_parser.rows = [ {"th_cells": ["Header"], "td_cells": []}, ] mock_parser_class.return_value = mock_parser - + # But the HTML has html_no_close = "Header" headers, body = serializer.get_header_and_body_lines(table_text=html_no_close) assert isinstance(headers, list) assert isinstance(body, list) - + # Test 7: Specific case for line 504 - data collection finds # Create HTML where we start collecting data rows but encounter incomplete row - with patch('docling_core.transforms.serializer.html._SimpleHTMLTableParser') as mock_parser_class: + with patch("docling_core.transforms.serializer.html._SimpleHTMLTableParser") as mock_parser_class: mock_parser = MagicMock() # Parser reports header then data rows mock_parser.rows = [ @@ -620,13 +715,13 @@ def test_html_table_serializer_get_header_and_body_lines(): {"th_cells": [], "td_cells": ["D1"]}, # This triggers data collection ] mock_parser_class.return_value = mock_parser - + # HTML has complete header but incomplete data row html_incomplete_data = "HD1D2" headers, body = serializer.get_header_and_body_lines(table_text=html_incomplete_data) assert isinstance(headers, list) assert isinstance(body, list) - + # Test 8: Table with footer content with_footer = "HDFooter content" headers, body = serializer.get_header_and_body_lines(table_text=with_footer) @@ -636,6 +731,58 @@ def test_html_table_serializer_get_header_and_body_lines(): assert "Footer" in str(body) +def test_html_table_footnotes_are_serialized_once(): + footnote_text = "Table footnote & more" + doc = _build_table_with_footnote_doc(footnote_text=footnote_text) + + actual = doc.export_to_html() + + assert '
' in actual + assert '
' in actual + assert ".footnotes {" in actual + assert "<unsafe> & more" in actual + assert actual.count("Table footnote") == 1 + assert 'class="footnotes" role="note" style=' not in actual + assert 'class="footnote" style=' not in actual + assert actual.index("") < actual.index('
' in actual + assert '
' in actual + assert "<unsafe> & more" in actual + assert actual.count("Picture footnote") == 1 + assert ".footnote + .footnote {" in actual + assert 'class="footnotes" role="note" style=' not in actual + assert 'class="footnote" style=' not in actual + assert actual.index("
") < actual.index('
") + + +def test_html_footnotes_preserve_formatting_and_hyperlinks(): + doc = _build_table_with_formatted_footnote_doc() + + actual = doc.export_to_html() + + assert '
bold link' in actual + + +def test_html_hidden_footnotes_do_not_inject_footnote_css(): + doc = _build_table_with_footnote_doc(footnote_text="hidden footnote") + + actual = HTMLDocSerializer( + doc=doc, + params=HTMLParams(labels=_DEFAULT_LABELS - {DocItemLabel.FOOTNOTE}), + ).serialize().text + + assert '