Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions docling_core/transforms/serializer/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,27 @@ class OrigListItemMarkerMode(str, Enum):
AUTO = "auto"


class ImageAltTextMode(str, Enum):
    """Select where the alt text of a Markdown image comes from."""

    # Always emit the fixed alt text "Image".
    STATIC = "static"
    # Take the alt text from the picture's caption.
    CAPTION = "caption"
    # Take the alt text from the picture's description metadata.
    DESCRIPTION = "description"


class MarkdownParams(CommonParams):
"""Markdown-specific serialization parameters."""

layers: set[ContentLayer] = {ContentLayer.BODY}
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
image_placeholder: str = "<!-- image -->"
image_alt_mode: ImageAltTextMode = Field(
default=ImageAltTextMode.STATIC,
description=(
"Mode for image alt text: 'static' uses 'Image', "
"'caption' uses caption text, 'description' uses AI-generated description."
),
)
enable_chart_tables: bool = True
indent: int = 4
wrap_width: Optional[PositiveInt] = None
Expand Down Expand Up @@ -473,6 +488,7 @@ def serialize(
doc=doc,
image_mode=params.image_mode,
image_placeholder=params.image_placeholder,
image_alt_mode=params.image_alt_mode,
)
if img_res.text:
res_parts.append(img_res)
Expand Down Expand Up @@ -502,19 +518,39 @@ def serialize(

return create_ser_result(text=text_res, span_source=res_parts)

def _get_alt_text(
    self,
    item: PictureItem,
    doc: DoclingDocument,
    alt_mode: ImageAltTextMode,
) -> str:
    """Get Markdown-safe alt text for an image, falling back to 'Image'.

    Args:
        item: Picture whose alt text is requested.
        doc: Document used to resolve the picture's caption reference.
        alt_mode: Source of the alt text. ``DESCRIPTION`` reads
            ``item.meta.description.text``; ``CAPTION`` reads the text of
            the first resolved caption; any other mode uses the fallback.

    Returns:
        The selected text, normalized so it can be placed between the
        brackets of ``![...](...)``: whitespace runs (including line
        breaks) are collapsed to single spaces and backslashes and square
        brackets are backslash-escaped. ``"Image"`` is returned when the
        selected source is missing, empty, or only whitespace.
    """
    raw_text = ""
    if alt_mode == ImageAltTextMode.DESCRIPTION:
        if item.meta and item.meta.description:
            raw_text = item.meta.description.text or ""
    elif alt_mode == ImageAltTextMode.CAPTION:
        if item.captions:
            cap = item.captions[0].resolve(doc)
            # The resolved item is not guaranteed to carry a text attribute.
            raw_text = getattr(cap, "text", "") or ""

    # Line breaks inside the alt text would split the image across lines or
    # paragraphs; collapsing whitespace also turns blank text into "".
    alt_text = " ".join(raw_text.split())

    # An unescaped "]" would close the image description early. Backslashes
    # are escaped first so existing ones cannot combine with the escapes
    # added for the brackets.
    for char in ("\\", "[", "]"):
        alt_text = alt_text.replace(char, "\\" + char)

    return alt_text or "Image"

def _serialize_image_part(
self,
item: PictureItem,
doc: DoclingDocument,
image_mode: ImageRefMode,
image_placeholder: str,
image_alt_mode: ImageAltTextMode = ImageAltTextMode.STATIC,
**kwargs: Any,
) -> SerializationResult:
error_response = (
"<!-- 🖼️❌ Image not available. "
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
" -->"
)
alt_text = self._get_alt_text(item=item, doc=doc, alt_mode=image_alt_mode)

if image_mode == ImageRefMode.PLACEHOLDER:
text_res = image_placeholder
elif image_mode == ImageRefMode.EMBEDDED:
Expand All @@ -524,15 +560,15 @@ def _serialize_image_part(
and isinstance(item.image.uri, AnyUrl)
and item.image.uri.scheme == "data"
):
text = f"![Image]({item.image.uri})"
text = f"![{alt_text}]({item.image.uri})"
text_res = text
else:
# get the item.image._pil or crop it out of the page-image
img = item.get_image(doc=doc)

if img is not None:
imgb64 = item._image_to_base64(img)
text = f"![Image](data:image/png;base64,{imgb64})"
text = f"![{alt_text}](data:image/png;base64,{imgb64})"

text_res = text
else:
Expand All @@ -543,7 +579,7 @@ def _serialize_image_part(
):
text_res = image_placeholder
else:
text_res = f"![Image]({str(item.image.uri)})"
text_res = f"![{alt_text}]({str(item.image.uri)})"
else:
text_res = image_placeholder

Expand Down
205 changes: 205 additions & 0 deletions test/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
HTMLParams,
)
from docling_core.transforms.serializer.markdown import (
ImageAltTextMode,
MarkdownDocSerializer,
MarkdownParams,
OrigListItemMarkerMode,
Expand All @@ -19,7 +20,9 @@
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import (
DescriptionAnnotation,
DescriptionMetaField,
DoclingDocument,
PictureMeta,
TableCell,
TableData,
)
Expand Down Expand Up @@ -338,6 +341,208 @@ def test_md_single_row_table():
verify(exp_file=exp_file, actual=actual)


# ===============================
# Image Alt Text Mode tests
# ===============================


def test_md_image_alt_mode_static():
    """STATIC mode keeps the generic "Image" alt text for an embedded picture."""
    from PIL import Image as PILImage

    from docling_core.types.doc import ImageRef

    document = DoclingDocument(name="test_alt_static")
    caption_item = document.add_text(
        label=DocItemLabel.CAPTION, text="My figure caption"
    )
    gray_square = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
    picture = document.add_picture(
        caption=caption_item,
        image=ImageRef.from_pil(image=gray_square, dpi=72),
    )
    picture.meta = PictureMeta(
        description=DescriptionMetaField(text="AI-generated description of the image")
    )

    md_params = MarkdownParams(
        image_mode=ImageRefMode.EMBEDDED,
        image_alt_mode=ImageAltTextMode.STATIC,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # The static alt text must be used ...
    assert "![Image](data:image/png;base64," in markdown
    # ... and neither the description nor the caption may appear as alt text.
    for unexpected in ("![AI-generated description", "![My figure caption]"):
        assert unexpected not in markdown


def test_md_image_alt_mode_description():
    """DESCRIPTION mode uses the picture's description metadata as alt text."""
    from PIL import Image as PILImage

    from docling_core.types.doc import ImageRef

    document = DoclingDocument(name="test_alt_description")
    caption_item = document.add_text(
        label=DocItemLabel.CAPTION, text="My figure caption"
    )
    gray_square = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
    picture = document.add_picture(
        caption=caption_item,
        image=ImageRef.from_pil(image=gray_square, dpi=72),
    )
    picture.meta = PictureMeta(
        description=DescriptionMetaField(text="AI-generated description of the image")
    )

    md_params = MarkdownParams(
        image_mode=ImageRefMode.EMBEDDED,
        image_alt_mode=ImageAltTextMode.DESCRIPTION,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # The description replaces the generic alt text entirely.
    expected = "![AI-generated description of the image](data:image/png;base64,"
    assert expected in markdown
    assert "![Image](" not in markdown


def test_md_image_alt_mode_caption():
    """CAPTION mode uses the picture's caption text as alt text."""
    from PIL import Image as PILImage

    from docling_core.types.doc import ImageRef

    document = DoclingDocument(name="test_alt_caption")
    caption_item = document.add_text(
        label=DocItemLabel.CAPTION, text="My figure caption"
    )
    gray_square = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
    picture = document.add_picture(
        caption=caption_item,
        image=ImageRef.from_pil(image=gray_square, dpi=72),
    )
    picture.meta = PictureMeta(
        description=DescriptionMetaField(text="AI-generated description of the image")
    )

    md_params = MarkdownParams(
        image_mode=ImageRefMode.EMBEDDED,
        image_alt_mode=ImageAltTextMode.CAPTION,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # The caption replaces the generic alt text entirely.
    assert "![My figure caption](data:image/png;base64," in markdown
    assert "![Image](" not in markdown


def test_md_image_alt_mode_description_fallback():
    """DESCRIPTION mode falls back to "Image" when no description is set."""
    from PIL import Image as PILImage

    from docling_core.types.doc import ImageRef

    document = DoclingDocument(name="test_alt_fallback")
    caption_item = document.add_text(
        label=DocItemLabel.CAPTION, text="My figure caption"
    )
    gray_square = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
    # The picture deliberately gets no meta/description.
    document.add_picture(
        caption=caption_item,
        image=ImageRef.from_pil(image=gray_square, dpi=72),
    )

    md_params = MarkdownParams(
        image_mode=ImageRefMode.EMBEDDED,
        image_alt_mode=ImageAltTextMode.DESCRIPTION,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # With nothing to describe the picture, the generic alt text is expected.
    assert "![Image](data:image/png;base64," in markdown


def test_md_image_alt_mode_caption_fallback():
    """CAPTION mode falls back to "Image" when the picture has no caption."""
    from PIL import Image as PILImage

    from docling_core.types.doc import ImageRef

    document = DoclingDocument(name="test_alt_caption_fallback")
    gray_square = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
    # The picture deliberately gets no caption.
    document.add_picture(image=ImageRef.from_pil(image=gray_square, dpi=72))

    md_params = MarkdownParams(
        image_mode=ImageRefMode.EMBEDDED,
        image_alt_mode=ImageAltTextMode.CAPTION,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # With no caption available, the generic alt text is expected.
    assert "![Image](data:image/png;base64," in markdown


def test_md_image_alt_mode_with_embedded():
    """DESCRIPTION alt text is applied when the image is embedded as a data URI."""
    from PIL import Image as PILImage

    from docling_core.types.doc import ImageRef

    document = DoclingDocument(name="test_alt_embedded")
    caption_item = document.add_text(
        label=DocItemLabel.CAPTION, text="Embedded figure"
    )
    gray_square = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
    picture = document.add_picture(
        caption=caption_item,
        image=ImageRef.from_pil(image=gray_square, dpi=72),
    )
    picture.meta = PictureMeta(
        description=DescriptionMetaField(text="Description for embedded image")
    )

    md_params = MarkdownParams(
        image_mode=ImageRefMode.EMBEDDED,
        image_alt_mode=ImageAltTextMode.DESCRIPTION,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # The description must precede the base64 data URI as the alt text.
    assert "![Description for embedded image](data:image/png;base64," in markdown


def test_md_image_alt_mode_with_referenced():
    """DESCRIPTION alt text is applied when the image is referenced by path."""
    from docling_core.types.doc import ImageRef, Size

    document = DoclingDocument(name="test_alt_referenced")
    caption_item = document.add_text(
        label=DocItemLabel.CAPTION, text="Referenced figure"
    )
    # Reference the image by a relative file path; the remaining fields are
    # supplied explicitly because no pixel data backs this ImageRef.
    file_image = ImageRef(
        uri="images/figure1.png",
        mimetype="image/png",
        dpi=72,
        size=Size(width=100, height=100),
    )
    picture = document.add_picture(caption=caption_item, image=file_image)
    picture.meta = PictureMeta(
        description=DescriptionMetaField(text="Description for referenced image")
    )

    md_params = MarkdownParams(
        image_mode=ImageRefMode.REFERENCED,
        image_alt_mode=ImageAltTextMode.DESCRIPTION,
    )
    markdown = MarkdownDocSerializer(doc=document, params=md_params).serialize().text

    # The path separator may vary by platform, so the directory prefix and
    # the file name are checked separately.
    assert "![Description for referenced image](images" in markdown
    assert "figure1.png)" in markdown


# ===============================
# HTML tests
# ===============================
Expand Down
Loading