diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
index 903fcbbd..9b4686da 100644
--- a/docling_core/transforms/serializer/markdown.py
+++ b/docling_core/transforms/serializer/markdown.py
@@ -566,6 +566,13 @@ def serialize(
             if table_text:
                 res_parts.append(create_ser_result(text=table_text, span_source=item))
 
+        ftn_res = doc_serializer.serialize_footnotes(
+            item=item,
+            **kwargs,
+        )
+        if ftn_res.text:
+            res_parts.append(ftn_res)
+
         text_res = "\n\n".join([r.text for r in res_parts])
 
         return create_ser_result(text=text_res, span_source=res_parts)
@@ -626,6 +633,14 @@ def serialize(
                 md_table_content = temp_table.export_to_markdown(temp_doc)
                 if len(md_table_content) > 0:
                     res_parts.append(create_ser_result(text=md_table_content, span_source=item))
+
+        ftn_res = doc_serializer.serialize_footnotes(
+            item=item,
+            **kwargs,
+        )
+        if ftn_res.text:
+            res_parts.append(ftn_res)
+
         text_res = "\n\n".join([r.text for r in res_parts if r.text])
 
         return create_ser_result(text=text_res, span_source=res_parts)
@@ -842,6 +857,51 @@ class MarkdownDocSerializer(DocSerializer):
 
     params: MarkdownParams = MarkdownParams()
 
+    @override
+    def serialize_footnotes(
+        self,
+        item: FloatingItem,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the footnotes of an item as Markdown footnote definitions.
+
+        The first whitespace-delimited token of each footnote text becomes the
+        footnote identifier and the remainder its content, so ``"1 Some note"``
+        is rendered as ``[^1]: Some note``. Referenced items that do not resolve
+        to a ``TextItem`` and footnotes without non-whitespace text are skipped.
+
+        Args:
+            item: The item whose ``footnotes`` references are serialized.
+            **kwargs: Overrides merged into the serializer's ``MarkdownParams``.
+
+        Returns:
+            A result whose text holds one newline-terminated definition per
+            footnote; the text is empty when footnotes are excluded via
+            ``params.labels`` or when there is nothing to emit.
+        """
+        params: MarkdownParams = self.params.merge_with_patch(patch=kwargs)
+        results: list[SerializationResult] = []
+        if DocItemLabel.FOOTNOTE in params.labels:
+            for footnote in item.footnotes:
+                if not isinstance(ftn := footnote.resolve(self.doc), TextItem):
+                    continue
+
+                # Split on any run of whitespace so that leading blanks, tabs or
+                # newlines cannot produce an empty or padded identifier.
+                parts = ftn.text.split(maxsplit=1)
+                if not parts:
+                    continue
+
+                if len(parts) == 2:
+                    formatted_text = f"[^{parts[0]}]: {parts[1]}\n"
+                else:
+                    formatted_text = f"[^{parts[0]}]:\n"
+
+                results.append(create_ser_result(text=formatted_text, span_source=ftn))
+
+        text_res = "".join([r.text for r in results])
+        return create_ser_result(text=text_res, span_source=results)
+
     @override
     def serialize_bold(self, text: str, **kwargs: Any):
         """Apply Markdown-specific bold serialization."""
diff --git a/test/test_markdown_footnotes.py b/test/test_markdown_footnotes.py
new file mode 100644
index 00000000..864ab0b2
--- /dev/null
+++ b/test/test_markdown_footnotes.py
@@ -0,0 +1,159 @@
+from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
+from docling_core.types.doc import DocItemLabel, DoclingDocument, TableCell, TableData
+from docling_core.types.doc.document import (
+    CodeItem,
+    FieldHeadingItem,
+    FieldValueItem,
+    FormulaItem,
+    ListItem,
+    SectionHeaderItem,
+    TextItem,
+    TitleItem,
+)
+
+# Mock footnotes for pre-serialization
+numericFtnMock = "1 Note about data"
+wordFtnMock = "ID Note about data"
+idOnlyFtnMock = "ID"
+
+# Mock footnotes for post-serialization
+numericFtnSerialized = "[^1]: Note about data\n"
+wordFtnSerialized = "[^ID]: Note about data\n"
+idOnlyFtnSerialized = "[^ID]:\n"
+
+
+def test_table_with_footnotes_markdown():
+    doc = DoclingDocument(name="test")
+
+    table = doc.add_table(data=TableData())
+
+    # Test three types of footnotes on table
+    footnote1: TextItem = doc.add_text(label=DocItemLabel.FOOTNOTE, text=numericFtnMock)
+    footnote2: TextItem = doc.add_text(label=DocItemLabel.FOOTNOTE, text=wordFtnMock)
+    footnote3: TextItem = doc.add_text(label=DocItemLabel.FOOTNOTE, text=idOnlyFtnMock)
+
+    table.footnotes.append(footnote1.get_ref())
+    table.footnotes.append(footnote2.get_ref())
+    table.footnotes.append(footnote3.get_ref())
+
+    serializer = MarkdownDocSerializer(doc=doc)
+
+    result = serializer.serialize(item=table)
+
+    # Verify serialization result has formatted footnotes
+    assert numericFtnSerialized in result.text
+    assert wordFtnSerialized in result.text
+    assert idOnlyFtnSerialized in result.text
+
+
+def test_picture_with_footnotes_markdown():
+    doc = DoclingDocument(name="test")
+
+    picture = doc.add_picture()
+
+    # Test one footnote on picture
+    footnote1: TextItem = doc.add_text(label=DocItemLabel.FOOTNOTE, text=numericFtnMock)
+
+    picture.footnotes.append(footnote1.get_ref())
+
+    serializer = MarkdownDocSerializer(doc=doc)
+    result = serializer.serialize(item=picture)
+
+    # Verify serialization result has formatted footnote
+    assert numericFtnSerialized in result.text
+
+
+def test_footnotes_with_irregular_whitespace_markdown():
+    doc = DoclingDocument(name="test")
+
+    table = doc.add_table(data=TableData())
+
+    # Leading blanks and a tab separator must not leak into the identifier,
+    # and a whitespace-only footnote must not yield an empty "[^]:" entry
+    padded: TextItem = doc.add_text(label=DocItemLabel.FOOTNOTE, text="  1\tNote about data")
+    blank: TextItem = doc.add_text(label=DocItemLabel.FOOTNOTE, text="   ")
+
+    table.footnotes.append(padded.get_ref())
+    table.footnotes.append(blank.get_ref())
+
+    serializer = MarkdownDocSerializer(doc=doc)
+
+    result = serializer.serialize_footnotes(item=table)
+
+    # Only the padded footnote is emitted, normalized to the regular form
+    assert result.text == numericFtnSerialized
+
+
+def test_table_export_to_markdown_with_footnotes():
+    doc = DoclingDocument(name="test")
+
+    # Create a table
+    table_data = TableData(
+        num_rows=2,
+        num_cols=2,
+        table_cells=[
+            TableCell(
+                text="Header 1",
+                row_span=1,
+                col_span=1,
+                start_row_offset_idx=0,
+                end_row_offset_idx=0,
+                start_col_offset_idx=0,
+                end_col_offset_idx=0,
+                column_header=True,
+            ),
+            TableCell(
+                text="Header 2",
+                row_span=1,
+                col_span=1,
+                start_row_offset_idx=0,
+                end_row_offset_idx=0,
+                start_col_offset_idx=1,
+                end_col_offset_idx=1,
+                column_header=True,
+            ),
+            TableCell(
+                text="Data 1",
+                row_span=1,
+                col_span=1,
+                start_row_offset_idx=1,
+                end_row_offset_idx=1,
+                start_col_offset_idx=0,
+                end_col_offset_idx=0,
+            ),
+            TableCell(
+                text="Data 2",
+                row_span=1,
+                col_span=1,
+                start_row_offset_idx=1,
+                end_row_offset_idx=1,
+                start_col_offset_idx=1,
+                end_col_offset_idx=1,
+            ),
+        ],
+    )
+
+    table = doc.add_table(data=table_data)
+
+    caption = doc.add_text(label=DocItemLabel.CAPTION, text="Table 1: Sample Data")
+    table.captions.append(caption.get_ref())
+
+    # Test one footnote on table
+    footnote1 = doc.add_text(label=DocItemLabel.FOOTNOTE, text=numericFtnMock)
+
+    table.footnotes.append(footnote1.get_ref())
+
+    markdown = table.export_to_markdown(doc)
+
+    # Test Table is in exported markdown
+    assert "Table 1: Sample Data" in markdown
+
+    # Test Footnote is in exported markdown
+    assert numericFtnSerialized in markdown
+
+
+if __name__ == "__main__":
+    test_table_with_footnotes_markdown()
+    test_picture_with_footnotes_markdown()
+    test_footnotes_with_irregular_whitespace_markdown()
+    test_table_export_to_markdown_with_footnotes()