diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py
index 0702241b..19d474e9 100644
--- a/docling_core/transforms/serializer/markdown.py
+++ b/docling_core/transforms/serializer/markdown.py
@@ -77,12 +77,27 @@ class OrigListItemMarkerMode(str, Enum):
     AUTO = "auto"
 
 
+class ImageAltTextMode(str, Enum):
+    """Mode for image alt text in markdown output."""
+
+    STATIC = "static"
+    CAPTION = "caption"
+    DESCRIPTION = "description"
+
+
 class MarkdownParams(CommonParams):
     """Markdown-specific serialization parameters."""
 
     layers: set[ContentLayer] = {ContentLayer.BODY}
     image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
     image_placeholder: str = ""
+    image_alt_mode: ImageAltTextMode = Field(
+        default=ImageAltTextMode.STATIC,
+        description=(
+            "Mode for image alt text: 'static' uses 'Image', "
+            "'caption' uses caption text, 'description' uses AI-generated description."
+        ),
+    )
     enable_chart_tables: bool = True
     indent: int = 4
     wrap_width: Optional[PositiveInt] = None
@@ -473,6 +488,7 @@ def serialize(
                 doc=doc,
                 image_mode=params.image_mode,
                 image_placeholder=params.image_placeholder,
+                image_alt_mode=params.image_alt_mode,
             )
             if img_res.text:
                 res_parts.append(img_res)
@@ -502,12 +518,40 @@ def serialize(
 
         return create_ser_result(text=text_res, span_source=res_parts)
 
+    def _get_alt_text(
+        self,
+        item: PictureItem,
+        doc: DoclingDocument,
+        alt_mode: ImageAltTextMode,
+    ) -> str:
+        """Get alt text for an image based on the mode, falling back to 'Image'.
+
+        The text is flattened to one line and its backslashes and square
+        brackets are escaped so that it cannot terminate the ``![...]`` image
+        description early. Empty or missing text falls back to 'Image'.
+        """
+        alt_text = ""
+        if alt_mode == ImageAltTextMode.DESCRIPTION:
+            if item.meta and item.meta.description:
+                alt_text = item.meta.description.text or ""
+        elif alt_mode == ImageAltTextMode.CAPTION:
+            if item.captions:
+                cap = item.captions[0].resolve(doc)
+                alt_text = getattr(cap, "text", "") or ""
+        # Collapse all whitespace: a blank line inside the description would
+        # otherwise end the paragraph and break the image syntax.
+        alt_text = " ".join(alt_text.split())
+        for char in ("\\", "[", "]"):
+            alt_text = alt_text.replace(char, f"\\{char}")
+        return alt_text or "Image"
+
     def _serialize_image_part(
         self,
         item: PictureItem,
         doc: DoclingDocument,
         image_mode: ImageRefMode,
         image_placeholder: str,
+        image_alt_mode: ImageAltTextMode = ImageAltTextMode.STATIC,
         **kwargs: Any,
     ) -> SerializationResult:
         error_response = (
@@ -515,6 +559,8 @@ def _serialize_image_part(
             "Please use `PdfPipelineOptions(generate_picture_images=True)`"
             " -->"
         )
+        alt_text = self._get_alt_text(item=item, doc=doc, alt_mode=image_alt_mode)
+
         if image_mode == ImageRefMode.PLACEHOLDER:
             text_res = image_placeholder
         elif image_mode == ImageRefMode.EMBEDDED:
@@ -524,7 +570,7 @@ def _serialize_image_part(
                 and isinstance(item.image.uri, AnyUrl)
                 and item.image.uri.scheme == "data"
             ):
-                text = f"![Image]({item.image.uri})"
+                text = f"![{alt_text}]({item.image.uri})"
                 text_res = text
             else:
                 # get the item.image._pil or crop it out of the page-image
@@ -532,7 +578,7 @@ def _serialize_image_part(
 
                 if img is not None:
                     imgb64 = item._image_to_base64(img)
-                    text = f"![Image](data:image/png;base64,{imgb64})"
+                    text = f"![{alt_text}](data:image/png;base64,{imgb64})"
 
                     text_res = text
                 else:
@@ -543,7 +589,7 @@ def _serialize_image_part(
             ):
                 text_res = image_placeholder
             else:
-                text_res = f"![Image]({str(item.image.uri)})"
+                text_res = f"![{alt_text}]({str(item.image.uri)})"
         else:
             text_res = image_placeholder
 
diff --git a/test/test_serialization.py b/test/test_serialization.py
index dd629130..2b68bad1 100644
--- a/test/test_serialization.py
+++ b/test/test_serialization.py
@@ -11,6 +11,7 @@
     HTMLParams,
 )
 from docling_core.transforms.serializer.markdown import (
+    ImageAltTextMode,
     MarkdownDocSerializer,
     MarkdownParams,
     OrigListItemMarkerMode,
@@ -19,7 +20,9 @@
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.document import (
     DescriptionAnnotation,
+    DescriptionMetaField,
     DoclingDocument,
+    PictureMeta,
     TableCell,
     TableData,
 )
@@ -338,6 +341,208 @@ def test_md_single_row_table():
     verify(exp_file=exp_file, actual=actual)
 
 
+# ===============================
+# Image Alt Text Mode tests
+# ===============================
+
+
+def test_md_image_alt_mode_static():
+    from PIL import Image as PILImage
+
+    from docling_core.types.doc import ImageRef
+
+    doc = DoclingDocument(name="test_alt_static")
+    cap = doc.add_text(label=DocItemLabel.CAPTION, text="My figure caption")
+    fig_image = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
+    pic = doc.add_picture(
+        caption=cap,
+        image=ImageRef.from_pil(image=fig_image, dpi=72),
+    )
+    pic.meta = PictureMeta(
+        description=DescriptionMetaField(text="AI-generated description of the image")
+    )
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.EMBEDDED,
+            image_alt_mode=ImageAltTextMode.STATIC,
+        ),
+    )
+    result = ser.serialize().text
+    # With STATIC mode, alt text should be "Image"
+    assert "![Image](data:image/png;base64," in result
+    assert "![AI-generated description" not in result
+    assert "![My figure caption]" not in result
+
+
+def test_md_image_alt_mode_description():
+    from PIL import Image as PILImage
+
+    from docling_core.types.doc import ImageRef
+
+    doc = DoclingDocument(name="test_alt_description")
+    cap = doc.add_text(label=DocItemLabel.CAPTION, text="My figure caption")
+    fig_image = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
+    pic = doc.add_picture(
+        caption=cap,
+        image=ImageRef.from_pil(image=fig_image, dpi=72),
+    )
+    pic.meta = PictureMeta(
+        description=DescriptionMetaField(text="AI-generated description of the image")
+    )
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.EMBEDDED,
+            image_alt_mode=ImageAltTextMode.DESCRIPTION,
+        ),
+    )
+    result = ser.serialize().text
+    # With DESCRIPTION mode, alt text should be the AI-generated description
+    assert "![AI-generated description of the image](data:image/png;base64," in result
+    assert "![Image](" not in result
+
+
+def test_md_image_alt_mode_caption():
+    from PIL import Image as PILImage
+
+    from docling_core.types.doc import ImageRef
+
+    doc = DoclingDocument(name="test_alt_caption")
+    cap = doc.add_text(label=DocItemLabel.CAPTION, text="My figure caption")
+    fig_image = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
+    pic = doc.add_picture(
+        caption=cap,
+        image=ImageRef.from_pil(image=fig_image, dpi=72),
+    )
+    pic.meta = PictureMeta(
+        description=DescriptionMetaField(text="AI-generated description of the image")
+    )
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.EMBEDDED,
+            image_alt_mode=ImageAltTextMode.CAPTION,
+        ),
+    )
+    result = ser.serialize().text
+    # With CAPTION mode, alt text should be the caption
+    assert "![My figure caption](data:image/png;base64," in result
+    assert "![Image](" not in result
+
+
+def test_md_image_alt_mode_description_fallback():
+    from PIL import Image as PILImage
+
+    from docling_core.types.doc import ImageRef
+
+    doc = DoclingDocument(name="test_alt_fallback")
+    cap = doc.add_text(label=DocItemLabel.CAPTION, text="My figure caption")
+    fig_image = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
+    # No meta/description set
+    doc.add_picture(
+        caption=cap,
+        image=ImageRef.from_pil(image=fig_image, dpi=72),
+    )
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.EMBEDDED,
+            image_alt_mode=ImageAltTextMode.DESCRIPTION,
+        ),
+    )
+    result = ser.serialize().text
+    # Without description, should fall back to "Image"
+    assert "![Image](data:image/png;base64," in result
+
+
+def test_md_image_alt_mode_caption_fallback():
+    from PIL import Image as PILImage
+
+    from docling_core.types.doc import ImageRef
+
+    doc = DoclingDocument(name="test_alt_caption_fallback")
+    fig_image = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
+    # No caption
+    doc.add_picture(image=ImageRef.from_pil(image=fig_image, dpi=72))
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.EMBEDDED,
+            image_alt_mode=ImageAltTextMode.CAPTION,
+        ),
+    )
+    result = ser.serialize().text
+    # Without caption, should fall back to "Image"
+    assert "![Image](data:image/png;base64," in result
+
+
+def test_md_image_alt_mode_with_embedded():
+    from PIL import Image as PILImage
+
+    from docling_core.types.doc import ImageRef
+
+    doc = DoclingDocument(name="test_alt_embedded")
+    cap = doc.add_text(label=DocItemLabel.CAPTION, text="Embedded figure")
+    fig_image = PILImage.new(mode="RGB", size=(10, 10), color=(128, 128, 128))
+    pic = doc.add_picture(
+        caption=cap,
+        image=ImageRef.from_pil(image=fig_image, dpi=72),
+    )
+    pic.meta = PictureMeta(
+        description=DescriptionMetaField(text="Description for embedded image")
+    )
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.EMBEDDED,
+            image_alt_mode=ImageAltTextMode.DESCRIPTION,
+        ),
+    )
+    result = ser.serialize().text
+    # With DESCRIPTION mode and EMBEDDED, alt text should be the description
+    assert "![Description for embedded image](data:image/png;base64," in result
+
+
+def test_md_image_alt_mode_with_referenced():
+    from docling_core.types.doc import ImageRef, Size
+
+    doc = DoclingDocument(name="test_alt_referenced")
+    cap = doc.add_text(label=DocItemLabel.CAPTION, text="Referenced figure")
+    # Create an ImageRef with a file path URI and required fields
+    pic = doc.add_picture(
+        caption=cap,
+        image=ImageRef(
+            uri="images/figure1.png",
+            mimetype="image/png",
+            dpi=72,
+            size=Size(width=100, height=100),
+        ),
+    )
+    pic.meta = PictureMeta(
+        description=DescriptionMetaField(text="Description for referenced image")
+    )
+
+    ser = MarkdownDocSerializer(
+        doc=doc,
+        params=MarkdownParams(
+            image_mode=ImageRefMode.REFERENCED,
+            image_alt_mode=ImageAltTextMode.DESCRIPTION,
+        ),
+    )
+    result = ser.serialize().text
+    # With DESCRIPTION mode and REFERENCED, alt text should be the description
+    # Note: Path separator may vary by platform, so check for both
+    assert "![Description for referenced image](images" in result
+    assert "figure1.png)" in result
+
+
 # ===============================
 # HTML tests
 # ===============================