Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,13 @@ def serialize(
if table_text:
res_parts.append(create_ser_result(text=table_text, span_source=item))

ftn_res = doc_serializer.serialize_footnotes(
item=item,
**kwargs,
)
if ftn_res.text:
res_parts.append(ftn_res)

text_res = "\n\n".join([r.text for r in res_parts])

return create_ser_result(text=text_res, span_source=res_parts)
Expand Down Expand Up @@ -626,6 +633,14 @@ def serialize(
md_table_content = temp_table.export_to_markdown(temp_doc)
if len(md_table_content) > 0:
res_parts.append(create_ser_result(text=md_table_content, span_source=item))

ftn_res = doc_serializer.serialize_footnotes(
item=item,
**kwargs,
)
if ftn_res.text:
res_parts.append(ftn_res)

text_res = "\n\n".join([r.text for r in res_parts if r.text])

return create_ser_result(text=text_res, span_source=res_parts)
Expand Down Expand Up @@ -842,6 +857,34 @@ class MarkdownDocSerializer(DocSerializer):

params: MarkdownParams = MarkdownParams()

@override
def serialize_footnotes(
    self,
    item: FloatingItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize the footnotes of ``item`` as Markdown footnote definitions.

    Each footnote reference on ``item`` that resolves to a ``TextItem`` is
    rendered on its own line as ``[^<id>]: <text>``, where ``<id>`` is the
    first whitespace-delimited token of the footnote text and ``<text>`` is
    the remainder. A footnote consisting of a single token is rendered as
    ``[^<id>]:``.

    Args:
        item: The floating item whose ``footnotes`` references are serialized.
        **kwargs: Overrides merged into ``self.params`` for this call.

    Returns:
        A serialization result whose text is the concatenation of all
        formatted footnote lines. The text is empty when
        ``DocItemLabel.FOOTNOTE`` is not among the enabled labels or when no
        footnote produced output.
    """
    params: MarkdownParams = self.params.merge_with_patch(patch=kwargs)
    results: list[SerializationResult] = []

    if DocItemLabel.FOOTNOTE in params.labels:
        for footnote in item.footnotes:
            ftn = footnote.resolve(self.doc)
            if not isinstance(ftn, TextItem):
                continue

            # Split on any run of whitespace (not just a single space) so
            # that leading blanks, tabs or newlines between the identifier
            # and the note body do not end up in the footnote label.
            parts = ftn.text.split(maxsplit=1)
            if not parts:
                # Empty or whitespace-only footnote: there is no identifier,
                # and emitting "[^]:" would be a malformed definition.
                continue

            if len(parts) == 2:
                formatted_text = f"[^{parts[0]}]: {parts[1]}\n"
            else:
                formatted_text = f"[^{parts[0]}]:\n"

            results.append(create_ser_result(text=formatted_text, span_source=ftn))

    # With no results this is simply the empty string.
    text_res = "".join(r.text for r in results)

    return create_ser_result(text=text_res, span_source=results)

@override
def serialize_bold(self, text: str, **kwargs: Any):
"""Apply Markdown-specific bold serialization."""
Expand Down
137 changes: 137 additions & 0 deletions test/test_markdown_footnotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from docling_core.transforms.serializer.markdown import MarkdownDocSerializer
from docling_core.types.doc import DocItemLabel, DoclingDocument, TableCell, TableData
from docling_core.types.doc.document import (
CodeItem,
FieldHeadingItem,
FieldValueItem,
FormulaItem,
ListItem,
SectionHeaderItem,
TextItem,
TitleItem,
)

# Footnote fixtures, declared as (raw text, expected Markdown) pairs.
# The raw text is what gets stored on the footnote item; the expected value
# is the footnote definition line the tests look for in the serialized output.

# Numeric identifier followed by a note body.
numericFtnMock, numericFtnSerialized = (
    "1 Note about data",
    "[^1]: Note about data\n",
)

# Word identifier followed by a note body.
wordFtnMock, wordFtnSerialized = (
    "ID Note about data",
    "[^ID]: Note about data\n",
)

# Identifier only, without any note body.
idOnlyFtnMock, idOnlyFtnSerialized = (
    "ID",
    "[^ID]:\n",
)


def test_table_with_footnotes_markdown():
    """Serialize a table carrying three footnotes and check each one is emitted."""
    doc = DoclingDocument(name="test")
    table = doc.add_table(data=TableData())

    # One footnote per supported shape: numeric id, word id, id without body.
    footnotes: list[TextItem] = [
        doc.add_text(label=DocItemLabel.FOOTNOTE, text=raw_text)
        for raw_text in (numericFtnMock, wordFtnMock, idOnlyFtnMock)
    ]
    for footnote in footnotes:
        table.footnotes.append(footnote.get_ref())

    result = MarkdownDocSerializer(doc=doc).serialize(item=table)

    # Every footnote must show up in its formatted form.
    serialized_text = result.text
    for expected in (numericFtnSerialized, wordFtnSerialized, idOnlyFtnSerialized):
        assert expected in serialized_text


def test_picture_with_footnotes_markdown():
    """Serialize a picture carrying one footnote and check it is emitted."""
    doc = DoclingDocument(name="test")
    picture = doc.add_picture()

    # Attach a single numeric-id footnote to the picture.
    footnote: TextItem = doc.add_text(
        label=DocItemLabel.FOOTNOTE,
        text=numericFtnMock,
    )
    picture.footnotes.append(footnote.get_ref())

    result = MarkdownDocSerializer(doc=doc).serialize(item=picture)

    # The formatted footnote definition must be part of the output.
    assert numericFtnSerialized in result.text


def test_table_export_to_markdown_with_footnotes():
    """Export a captioned 2x2 table with one footnote via ``export_to_markdown``."""

    def _cell(text, row, col, **extra):
        # Build a 1x1 cell anchored at (row, col).
        # NOTE(review): the end offsets equal the start offsets here; if the
        # table grid treats end offsets as exclusive these cells span nothing
        # - confirm against the TableData implementation.
        return TableCell(
            text=text,
            row_span=1,
            col_span=1,
            start_row_offset_idx=row,
            end_row_offset_idx=row,
            start_col_offset_idx=col,
            end_col_offset_idx=col,
            **extra,
        )

    # Header row followed by a single data row.
    table_data = TableData(
        num_rows=2,
        num_cols=2,
        table_cells=[
            _cell("Header 1", 0, 0, column_header=True),
            _cell("Header 2", 0, 1, column_header=True),
            _cell("Data 1", 1, 0),
            _cell("Data 2", 1, 1),
        ],
    )

    doc = DoclingDocument(name="test")
    table = doc.add_table(data=table_data)

    # Attach a caption to the table.
    caption = doc.add_text(label=DocItemLabel.CAPTION, text="Table 1: Sample Data")
    table.captions.append(caption.get_ref())

    # Attach one footnote to the table.
    footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text=numericFtnMock)
    table.footnotes.append(footnote.get_ref())

    markdown = table.export_to_markdown(doc)

    # The caption is part of the exported Markdown ...
    assert "Table 1: Sample Data" in markdown

    # ... and so is the formatted footnote definition.
    assert numericFtnSerialized in markdown


if __name__ == "__main__":
    # Allow running the footnote tests directly, without a test runner.
    for _run_test in (
        test_table_with_footnotes_markdown,
        test_picture_with_footnotes_markdown,
        test_table_export_to_markdown_with_footnotes,
    ):
        _run_test()
Loading