From 000ccc55c55d32f1e561939bade6822be816076a Mon Sep 17 00:00:00 2001 From: Vittorio Pippi Date: Fri, 20 Mar 2026 14:16:48 +0100 Subject: [PATCH 1/4] feat: Add HANDWRITTEN_TEXT label support Add full integration for the HANDWRITTEN_TEXT document item label. Changes: - tokens.py: Add HANDWRITTEN_TEXT to DocumentToken enum and mapping - document.py: Add HANDWRITTEN_TEXT to DEFAULT_EXPORT_LABELS Also adds test/test_handwritten_text_label.py for integration tests. --- docling_core/types/doc/document.py | 2 + docling_core/types/doc/tokens.py | 2 + test/test_handwritten_text_label.py | 264 ++++++++++++++++++++++++++++ 3 files changed, 268 insertions(+) create mode 100644 test/test_handwritten_text_label.py diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 139dc381..d705018a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -96,6 +96,7 @@ DocItemLabel.FIELD_HEADING, DocItemLabel.FIELD_HINT, DocItemLabel.MARKER, + DocItemLabel.HANDWRITTEN_TEXT, } DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy() @@ -1692,6 +1693,7 @@ class TextItem(DocItem): DocItemLabel.FIELD_KEY, DocItemLabel.FIELD_HINT, DocItemLabel.MARKER, + DocItemLabel.HANDWRITTEN_TEXT, ] orig: str # untreated representation diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py index e172105d..5da137cd 100644 --- a/docling_core/types/doc/tokens.py +++ b/docling_core/types/doc/tokens.py @@ -201,6 +201,7 @@ class DocumentToken(str, Enum): PARAGRAPH = "paragraph" REFERENCE = "reference" + HANDWRITTEN_TEXT = "handwritten_text" @classmethod def get_special_tokens( @@ -253,6 +254,7 @@ def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> st DocItemLabel.PARAGRAPH: DocumentToken.PARAGRAPH, DocItemLabel.REFERENCE: DocumentToken.REFERENCE, DocItemLabel.CHART: DocumentToken.CHART, + DocItemLabel.HANDWRITTEN_TEXT: DocumentToken.HANDWRITTEN_TEXT, } res: str diff --git 
a/test/test_handwritten_text_label.py b/test/test_handwritten_text_label.py new file mode 100644 index 00000000..1092a9af --- /dev/null +++ b/test/test_handwritten_text_label.py @@ -0,0 +1,264 @@ +"""Test HANDWRITTEN_TEXT label integration. + +This module tests that the HANDWRITTEN_TEXT label is properly integrated +across all required components: +1. DocTags serialization (tokens.py mapping) +2. Export labels (document.py DEFAULT_EXPORT_LABELS) +3. Markdown export +4. Token mapping completeness +""" + +import pytest + +from docling_core.types.doc.document import ( + DEFAULT_EXPORT_LABELS, + DOCUMENT_TOKENS_EXPORT_LABELS, + DoclingDocument, +) +from docling_core.types.doc.labels import DocItemLabel +from docling_core.types.doc.tokens import DocumentToken + + +class TestHandwrittenTextLabelIntegration: + """Test suite for HANDWRITTEN_TEXT label integration.""" + + def test_label_exists_in_enum(self): + """Test that HANDWRITTEN_TEXT exists in DocItemLabel enum.""" + assert hasattr(DocItemLabel, "HANDWRITTEN_TEXT") + assert DocItemLabel.HANDWRITTEN_TEXT.value == "handwritten_text" + + def test_label_has_color(self): + """Test that HANDWRITTEN_TEXT has a color mapping.""" + color = DocItemLabel.get_color(DocItemLabel.HANDWRITTEN_TEXT) + assert isinstance(color, tuple) + assert len(color) == 3 + assert all(0 <= c <= 255 for c in color) + + def test_document_token_exists(self): + """Test that HANDWRITTEN_TEXT exists in DocumentToken enum.""" + assert hasattr(DocumentToken, "HANDWRITTEN_TEXT") + assert DocumentToken.HANDWRITTEN_TEXT.value == "handwritten_text" + + def test_token_mapping_exists(self): + """Test that HANDWRITTEN_TEXT has a token mapping in create_token_name_from_doc_item_label. + + Without this mapping, RuntimeError would be raised during DocTags serialization. 
+ """ + # This should not raise RuntimeError + token_name = DocumentToken.create_token_name_from_doc_item_label( + DocItemLabel.HANDWRITTEN_TEXT.value + ) + assert token_name == "handwritten_text" + + def test_label_in_default_export_labels(self): + """Test that HANDWRITTEN_TEXT is in DEFAULT_EXPORT_LABELS. + + Without this, HANDWRITTEN_TEXT content would be silently omitted from exports. + """ + assert DocItemLabel.HANDWRITTEN_TEXT in DEFAULT_EXPORT_LABELS + + def test_label_in_document_tokens_export_labels(self): + """Test that HANDWRITTEN_TEXT is in DOCUMENT_TOKENS_EXPORT_LABELS. + + This set is used by markdown and other serializers. + """ + assert DocItemLabel.HANDWRITTEN_TEXT in DOCUMENT_TOKENS_EXPORT_LABELS + + +class TestHandwrittenTextDocTagsSerialization: + """Test DocTags serialization with HANDWRITTEN_TEXT label.""" + + def test_doctags_serialization_succeeds(self): + """Test that DocTags serialization works with HANDWRITTEN_TEXT. + + This would raise RuntimeError if token mapping is missing. 
+ """ + doc = DoclingDocument(name="test_doctags") + doc.add_text( + label=DocItemLabel.HANDWRITTEN_TEXT, + text="This is handwritten text", + ) + + # This should not raise RuntimeError + doctags_output = doc.export_to_doctags() + + assert "<handwritten_text>" in doctags_output + assert "</handwritten_text>" in doctags_output + assert "This is handwritten text" in doctags_output + + def test_doctags_with_multiple_handwritten_items(self): + """Test DocTags serialization with multiple HANDWRITTEN_TEXT items.""" + doc = DoclingDocument(name="test_multiple") + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="First handwritten") + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="Second handwritten") + doc.add_text(label=DocItemLabel.TEXT, text="Regular text") + + doctags_output = doc.export_to_doctags() + + assert doctags_output.count("<handwritten_text>") == 2 + assert doctags_output.count("</handwritten_text>") == 2 + assert "First handwritten" in doctags_output + assert "Second handwritten" in doctags_output + + +class TestHandwrittenTextMarkdownExport: + """Test Markdown export with HANDWRITTEN_TEXT label.""" + + def test_markdown_export_includes_handwritten_text(self): + """Test that HANDWRITTEN_TEXT content appears in markdown export. + + Without DEFAULT_EXPORT_LABELS inclusion, this content would be silently omitted. + """ + doc = DoclingDocument(name="test_markdown") + doc.add_text(label=DocItemLabel.TEXT, text="Regular text.") + doc.add_text( + label=DocItemLabel.HANDWRITTEN_TEXT, + text="Handwritten content here.", + ) + + markdown = doc.export_to_markdown() + + assert "Regular text." in markdown + assert "Handwritten content here." 
in markdown + + def test_markdown_export_preserves_order(self): + """Test that HANDWRITTEN_TEXT items maintain their order in markdown export.""" + doc = DoclingDocument(name="test_order") + doc.add_text(label=DocItemLabel.TEXT, text="First") + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="Second (handwritten)") + doc.add_text(label=DocItemLabel.TEXT, text="Third") + + markdown = doc.export_to_markdown() + + first_pos = markdown.find("First") + second_pos = markdown.find("Second (handwritten)") + third_pos = markdown.find("Third") + + assert first_pos < second_pos < third_pos + + +class TestHandwrittenTextPlainTextExport: + """Test plain text export with HANDWRITTEN_TEXT label.""" + + def test_plain_text_export_includes_handwritten_text(self): + """Test that HANDWRITTEN_TEXT content appears in plain text export.""" + doc = DoclingDocument(name="test_plain") + doc.add_text(label=DocItemLabel.TEXT, text="Normal text.") + doc.add_text( + label=DocItemLabel.HANDWRITTEN_TEXT, + text="Handwritten note.", + ) + + plain_text = doc.export_to_text() + + assert "Normal text." in plain_text + assert "Handwritten note." in plain_text + + +class TestHandwrittenTextHtmlExport: + """Test HTML export with HANDWRITTEN_TEXT label.""" + + def test_html_export_includes_handwritten_text(self): + """Test that HANDWRITTEN_TEXT content appears in HTML export.""" + doc = DoclingDocument(name="test_html") + doc.add_text(label=DocItemLabel.TEXT, text="Regular paragraph.") + doc.add_text( + label=DocItemLabel.HANDWRITTEN_TEXT, + text="Handwritten section.", + ) + + html = doc.export_to_html() + + assert "Regular paragraph." in html + assert "Handwritten section." 
in html + + +class TestHandwrittenTextDocumentOperations: + """Test document operations with HANDWRITTEN_TEXT label.""" + + def test_add_text_with_handwritten_label(self): + """Test adding text with HANDWRITTEN_TEXT label.""" + doc = DoclingDocument(name="test_add") + item = doc.add_text( + label=DocItemLabel.HANDWRITTEN_TEXT, + text="My handwritten note", + ) + + assert item.label == DocItemLabel.HANDWRITTEN_TEXT + assert item.text == "My handwritten note" + + def test_iterate_items_includes_handwritten_text(self): + """Test that iterate_items includes HANDWRITTEN_TEXT items.""" + doc = DoclingDocument(name="test_iterate") + doc.add_text(label=DocItemLabel.TEXT, text="Normal") + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="Handwritten") + + labels = [ + item.label + for item, _ in doc.iterate_items() + if hasattr(item, "label") + ] + + assert DocItemLabel.TEXT in labels + assert DocItemLabel.HANDWRITTEN_TEXT in labels + + def test_json_roundtrip_preserves_handwritten_label(self, tmp_path): + """Test that JSON save/load preserves HANDWRITTEN_TEXT label.""" + doc = DoclingDocument(name="test_roundtrip") + doc.add_text( + label=DocItemLabel.HANDWRITTEN_TEXT, + text="Preserved handwritten text", + ) + + json_path = tmp_path / "test.json" + doc.save_as_json(json_path) + + loaded_doc = DoclingDocument.load_from_json(json_path) + + # Find the handwritten text item + found = False + for item, _ in loaded_doc.iterate_items(): + if hasattr(item, "label") and item.label == DocItemLabel.HANDWRITTEN_TEXT: + assert item.text == "Preserved handwritten text" + found = True + break + + assert found, "HANDWRITTEN_TEXT item not found after JSON roundtrip" + + +class TestTokenMappingCompleteness: + """Test that all expected labels have token mappings.""" + + # Labels that are expected to NOT have direct token mappings + # (they're handled specially or are container types) + SPECIAL_CASE_LABELS = { + DocItemLabel.SECTION_HEADER, # Handled with level suffix + 
DocItemLabel.GRADING_SCALE, + DocItemLabel.EMPTY_VALUE, + DocItemLabel.FIELD_REGION, + DocItemLabel.FIELD_HEADING, + DocItemLabel.FIELD_ITEM, + DocItemLabel.FIELD_KEY, + DocItemLabel.FIELD_VALUE, + DocItemLabel.FIELD_HINT, + DocItemLabel.MARKER, + } + + def test_all_text_labels_have_token_mappings(self): + """Test that all text-type labels have token mappings. + + This proactively identifies any labels that would cause RuntimeError. + """ + missing_mappings = [] + + for label in DocItemLabel: + if label in self.SPECIAL_CASE_LABELS: + continue + try: + DocumentToken.create_token_name_from_doc_item_label(label.value) + except RuntimeError: + missing_mappings.append(label) + + assert not missing_mappings, ( + f"Labels missing token mappings: {[l.name for l in missing_mappings]}" + ) From f3d2d136ae647ca0a7e4e3d5b92bc5d5cbc1e7f4 Mon Sep 17 00:00:00 2001 From: Vittorio Pippi Date: Fri, 20 Mar 2026 14:24:37 +0100 Subject: [PATCH 2/4] DCO Remediation Commit for Vittorio Pippi I, Vittorio Pippi , hereby add my Signed-off-by to this commit: acbdfa3902e74da44a5844ad0cecab8657da4904 Signed-off-by: Vittorio Pippi From 336bce9e38f4b1c996c174a2c42f01db4cacf6d8 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Wed, 25 Mar 2026 16:47:31 +0100 Subject: [PATCH 3/4] add handwriting support to Doclang Signed-off-by: Panos Vagenas --- docling_core/experimental/doclang.py | 16 +- docs/DoclingDocument.json | 3 +- test/conftest.py | 30 +++ test/data/doc/handwritten_text.gt.dclg.xml | 24 ++ test/test_handwritten_text_label.py | 264 --------------------- test/test_serialization_doclang.py | 6 + 6 files changed, 76 insertions(+), 267 deletions(-) create mode 100644 test/data/doc/handwritten_text.gt.dclg.xml delete mode 100644 test/test_handwritten_text_label.py diff --git a/docling_core/experimental/doclang.py b/docling_core/experimental/doclang.py index 09314e7d..27b17085 100644 --- a/docling_core/experimental/doclang.py +++ b/docling_core/experimental/doclang.py @@ -392,6 +392,7 @@ class 
DoclangToken(str, Enum): STRIKETHROUGH = "strikethrough" SUPERSCRIPT = "superscript" SUBSCRIPT = "subscript" + HANDWRITING = "handwriting" # Formatting self-closing RTL = "rtl" @@ -611,6 +612,7 @@ class DoclangVocabulary(BaseModel): DoclangToken.STRIKETHROUGH: DoclangCategory.FORMATTING, DoclangToken.SUPERSCRIPT: DoclangCategory.FORMATTING, DoclangToken.SUBSCRIPT: DoclangCategory.FORMATTING, + DoclangToken.HANDWRITING: DoclangCategory.FORMATTING, DoclangToken.RTL: DoclangCategory.FORMATTING, DoclangToken.BR: DoclangCategory.FORMATTING, # Structural @@ -1547,6 +1549,8 @@ def _serialize_single_item( formatting=item.formatting, hyperlink=item.hyperlink, ) + if item.label == DocItemLabel.HANDWRITTEN_TEXT: + text_part = _wrap(text=text_part, wrap_tag=DoclangToken.HANDWRITING.value) if text_part: parts.append(text_part) @@ -1564,7 +1568,9 @@ def _serialize_single_item( parts.append(ftn_text) text_res = "".join(parts) - if wrap_open_token is not None and not (is_inline_scope and item.label == DocItemLabel.TEXT): + if wrap_open_token is not None and not ( + is_inline_scope and item.label in {DocItemLabel.TEXT, DocItemLabel.HANDWRITTEN_TEXT} + ): if text_res or not params.suppress_empty_elements: text_res = _wrap_token(text=text_res, open_token=wrap_open_token) return create_ser_result(text=text_res, span_source=item) @@ -2394,6 +2400,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]: DoclangToken.STRIKETHROUGH.value, DoclangToken.SUBSCRIPT.value, DoclangToken.SUPERSCRIPT.value, + DoclangToken.HANDWRITING.value, DoclangToken.CONTENT.value, }: return None @@ -2476,8 +2483,13 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona formatting.script = Script.SUB elif is_superscript: formatting.script = Script.SUPER + label = text_label_map[nm] + if nm == DoclangToken.TEXT.value and any( + c.tagName == DoclangToken.HANDWRITING.value for c in element_children + ): + label = DocItemLabel.HANDWRITTEN_TEXT item = 
doc.add_text( - label=text_label_map[nm], + label=label, text=text, parent=parent, prov=(prov_list[0] if prov_list else None), diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 100601a5..42abf020 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -3431,7 +3431,8 @@ "empty_value", "field_key", "field_hint", - "marker" + "marker", + "handwritten_text" ], "title": "Label", "type": "string" diff --git a/test/conftest.py b/test/conftest.py index a26f7558..9ee385cb 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -24,6 +24,8 @@ TableCell, TableData, ) +from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size +from docling_core.types.doc.document import ProvenanceItem # factored out of fixture to simplify IDE-level debugging @@ -465,3 +467,31 @@ def rich_table_doc(_rich_table_doc: DoclingDocument) -> DoclingDocument: """Copy of a rich table document for each test function.""" return _rich_table_doc.model_copy(deep=True) + +@pytest.fixture(scope="session") +def _doc_with_handwritten() -> DoclingDocument: + """Fixture for a document with handwritten text to be reused across the test session.""" + doc = DoclingDocument(name="") + doc.add_page(page_no=1, size=Size(width=100, height=100), image=None) + prov = ProvenanceItem( + page_no=1, + bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT), + charspan=(0, 2), + ) + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="My hand-written note") + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="My hand-written note (with prov)", prov=prov) + + inl_text = doc.add_text(label=DocItemLabel.TEXT, text="", prov=prov) + inline = doc.add_inline_group(parent=inl_text) + doc.add_text(label=DocItemLabel.TEXT, text="Check ", parent=inline) + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="out", parent=inline) + doc.add_text(label=DocItemLabel.TEXT, text=" these", parent=inline) + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text=" 
hand-written spans", parent=inline) + + return doc + + +@pytest.fixture(scope="function") +def doc_with_handwritten(_doc_with_handwritten: DoclingDocument) -> DoclingDocument: + """Copy of a document with handwritten text for each test function.""" + return _doc_with_handwritten.model_copy(deep=True) diff --git a/test/data/doc/handwritten_text.gt.dclg.xml b/test/data/doc/handwritten_text.gt.dclg.xml new file mode 100644 index 00000000..78ec863b --- /dev/null +++ b/test/data/doc/handwritten_text.gt.dclg.xml @@ -0,0 +1,24 @@ + + + My hand-written note + + + + + + + My hand-written note (with prov) + + + + + + + Check + out + these + + hand-written spans + + + diff --git a/test/test_handwritten_text_label.py b/test/test_handwritten_text_label.py deleted file mode 100644 index 1092a9af..00000000 --- a/test/test_handwritten_text_label.py +++ /dev/null @@ -1,264 +0,0 @@ -"""Test HANDWRITTEN_TEXT label integration. - -This module tests that the HANDWRITTEN_TEXT label is properly integrated -across all required components: -1. DocTags serialization (tokens.py mapping) -2. Export labels (document.py DEFAULT_EXPORT_LABELS) -3. Markdown export -4. 
Token mapping completeness -""" - -import pytest - -from docling_core.types.doc.document import ( - DEFAULT_EXPORT_LABELS, - DOCUMENT_TOKENS_EXPORT_LABELS, - DoclingDocument, -) -from docling_core.types.doc.labels import DocItemLabel -from docling_core.types.doc.tokens import DocumentToken - - -class TestHandwrittenTextLabelIntegration: - """Test suite for HANDWRITTEN_TEXT label integration.""" - - def test_label_exists_in_enum(self): - """Test that HANDWRITTEN_TEXT exists in DocItemLabel enum.""" - assert hasattr(DocItemLabel, "HANDWRITTEN_TEXT") - assert DocItemLabel.HANDWRITTEN_TEXT.value == "handwritten_text" - - def test_label_has_color(self): - """Test that HANDWRITTEN_TEXT has a color mapping.""" - color = DocItemLabel.get_color(DocItemLabel.HANDWRITTEN_TEXT) - assert isinstance(color, tuple) - assert len(color) == 3 - assert all(0 <= c <= 255 for c in color) - - def test_document_token_exists(self): - """Test that HANDWRITTEN_TEXT exists in DocumentToken enum.""" - assert hasattr(DocumentToken, "HANDWRITTEN_TEXT") - assert DocumentToken.HANDWRITTEN_TEXT.value == "handwritten_text" - - def test_token_mapping_exists(self): - """Test that HANDWRITTEN_TEXT has a token mapping in create_token_name_from_doc_item_label. - - Without this mapping, RuntimeError would be raised during DocTags serialization. - """ - # This should not raise RuntimeError - token_name = DocumentToken.create_token_name_from_doc_item_label( - DocItemLabel.HANDWRITTEN_TEXT.value - ) - assert token_name == "handwritten_text" - - def test_label_in_default_export_labels(self): - """Test that HANDWRITTEN_TEXT is in DEFAULT_EXPORT_LABELS. - - Without this, HANDWRITTEN_TEXT content would be silently omitted from exports. - """ - assert DocItemLabel.HANDWRITTEN_TEXT in DEFAULT_EXPORT_LABELS - - def test_label_in_document_tokens_export_labels(self): - """Test that HANDWRITTEN_TEXT is in DOCUMENT_TOKENS_EXPORT_LABELS. - - This set is used by markdown and other serializers. 
- """ - assert DocItemLabel.HANDWRITTEN_TEXT in DOCUMENT_TOKENS_EXPORT_LABELS - - -class TestHandwrittenTextDocTagsSerialization: - """Test DocTags serialization with HANDWRITTEN_TEXT label.""" - - def test_doctags_serialization_succeeds(self): - """Test that DocTags serialization works with HANDWRITTEN_TEXT. - - This would raise RuntimeError if token mapping is missing. - """ - doc = DoclingDocument(name="test_doctags") - doc.add_text( - label=DocItemLabel.HANDWRITTEN_TEXT, - text="This is handwritten text", - ) - - # This should not raise RuntimeError - doctags_output = doc.export_to_doctags() - - assert "<handwritten_text>" in doctags_output - assert "</handwritten_text>" in doctags_output - assert "This is handwritten text" in doctags_output - - def test_doctags_with_multiple_handwritten_items(self): - """Test DocTags serialization with multiple HANDWRITTEN_TEXT items.""" - doc = DoclingDocument(name="test_multiple") - doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="First handwritten") - doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="Second handwritten") - doc.add_text(label=DocItemLabel.TEXT, text="Regular text") - - doctags_output = doc.export_to_doctags() - - assert doctags_output.count("<handwritten_text>") == 2 - assert doctags_output.count("</handwritten_text>") == 2 - assert "First handwritten" in doctags_output - assert "Second handwritten" in doctags_output - - -class TestHandwrittenTextMarkdownExport: - """Test Markdown export with HANDWRITTEN_TEXT label.""" - - def test_markdown_export_includes_handwritten_text(self): - """Test that HANDWRITTEN_TEXT content appears in markdown export. - - Without DEFAULT_EXPORT_LABELS inclusion, this content would be silently omitted. - """ - doc = DoclingDocument(name="test_markdown") - doc.add_text(label=DocItemLabel.TEXT, text="Regular text.") - doc.add_text( - label=DocItemLabel.HANDWRITTEN_TEXT, - text="Handwritten content here.", - ) - - markdown = doc.export_to_markdown() - - assert "Regular text." in markdown - assert "Handwritten content here." 
in markdown - - def test_markdown_export_preserves_order(self): - """Test that HANDWRITTEN_TEXT items maintain their order in markdown export.""" - doc = DoclingDocument(name="test_order") - doc.add_text(label=DocItemLabel.TEXT, text="First") - doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="Second (handwritten)") - doc.add_text(label=DocItemLabel.TEXT, text="Third") - - markdown = doc.export_to_markdown() - - first_pos = markdown.find("First") - second_pos = markdown.find("Second (handwritten)") - third_pos = markdown.find("Third") - - assert first_pos < second_pos < third_pos - - -class TestHandwrittenTextPlainTextExport: - """Test plain text export with HANDWRITTEN_TEXT label.""" - - def test_plain_text_export_includes_handwritten_text(self): - """Test that HANDWRITTEN_TEXT content appears in plain text export.""" - doc = DoclingDocument(name="test_plain") - doc.add_text(label=DocItemLabel.TEXT, text="Normal text.") - doc.add_text( - label=DocItemLabel.HANDWRITTEN_TEXT, - text="Handwritten note.", - ) - - plain_text = doc.export_to_text() - - assert "Normal text." in plain_text - assert "Handwritten note." in plain_text - - -class TestHandwrittenTextHtmlExport: - """Test HTML export with HANDWRITTEN_TEXT label.""" - - def test_html_export_includes_handwritten_text(self): - """Test that HANDWRITTEN_TEXT content appears in HTML export.""" - doc = DoclingDocument(name="test_html") - doc.add_text(label=DocItemLabel.TEXT, text="Regular paragraph.") - doc.add_text( - label=DocItemLabel.HANDWRITTEN_TEXT, - text="Handwritten section.", - ) - - html = doc.export_to_html() - - assert "Regular paragraph." in html - assert "Handwritten section." 
in html - - -class TestHandwrittenTextDocumentOperations: - """Test document operations with HANDWRITTEN_TEXT label.""" - - def test_add_text_with_handwritten_label(self): - """Test adding text with HANDWRITTEN_TEXT label.""" - doc = DoclingDocument(name="test_add") - item = doc.add_text( - label=DocItemLabel.HANDWRITTEN_TEXT, - text="My handwritten note", - ) - - assert item.label == DocItemLabel.HANDWRITTEN_TEXT - assert item.text == "My handwritten note" - - def test_iterate_items_includes_handwritten_text(self): - """Test that iterate_items includes HANDWRITTEN_TEXT items.""" - doc = DoclingDocument(name="test_iterate") - doc.add_text(label=DocItemLabel.TEXT, text="Normal") - doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="Handwritten") - - labels = [ - item.label - for item, _ in doc.iterate_items() - if hasattr(item, "label") - ] - - assert DocItemLabel.TEXT in labels - assert DocItemLabel.HANDWRITTEN_TEXT in labels - - def test_json_roundtrip_preserves_handwritten_label(self, tmp_path): - """Test that JSON save/load preserves HANDWRITTEN_TEXT label.""" - doc = DoclingDocument(name="test_roundtrip") - doc.add_text( - label=DocItemLabel.HANDWRITTEN_TEXT, - text="Preserved handwritten text", - ) - - json_path = tmp_path / "test.json" - doc.save_as_json(json_path) - - loaded_doc = DoclingDocument.load_from_json(json_path) - - # Find the handwritten text item - found = False - for item, _ in loaded_doc.iterate_items(): - if hasattr(item, "label") and item.label == DocItemLabel.HANDWRITTEN_TEXT: - assert item.text == "Preserved handwritten text" - found = True - break - - assert found, "HANDWRITTEN_TEXT item not found after JSON roundtrip" - - -class TestTokenMappingCompleteness: - """Test that all expected labels have token mappings.""" - - # Labels that are expected to NOT have direct token mappings - # (they're handled specially or are container types) - SPECIAL_CASE_LABELS = { - DocItemLabel.SECTION_HEADER, # Handled with level suffix - 
DocItemLabel.GRADING_SCALE, - DocItemLabel.EMPTY_VALUE, - DocItemLabel.FIELD_REGION, - DocItemLabel.FIELD_HEADING, - DocItemLabel.FIELD_ITEM, - DocItemLabel.FIELD_KEY, - DocItemLabel.FIELD_VALUE, - DocItemLabel.FIELD_HINT, - DocItemLabel.MARKER, - } - - def test_all_text_labels_have_token_mappings(self): - """Test that all text-type labels have token mappings. - - This proactively identifies any labels that would cause RuntimeError. - """ - missing_mappings = [] - - for label in DocItemLabel: - if label in self.SPECIAL_CASE_LABELS: - continue - try: - DocumentToken.create_token_name_from_doc_item_label(label.value) - except RuntimeError: - missing_mappings.append(label) - - assert not missing_mappings, ( - f"Labels missing token mappings: {[l.name for l in missing_mappings]}" - ) diff --git a/test/test_serialization_doclang.py b/test/test_serialization_doclang.py index 5adadf51..124c52b4 100644 --- a/test/test_serialization_doclang.py +++ b/test/test_serialization_doclang.py @@ -474,6 +474,12 @@ def _create_content_filtering_doc(inp_doc: DoclingDocument): return doc +def test_handwritten_text_label(doc_with_handwritten: DoclingDocument): + result = doc_with_handwritten.export_to_doclang() + exp_file = Path("./test/data/doc/handwritten_text.gt.dclg.xml") + verify(exp_file=exp_file, actual=result) + + def test_content_allow_all_types(sample_doc: DoclingDocument): doc = _create_content_filtering_doc(sample_doc) serializer = DoclangDocSerializer( From 1ec57f869cd2fda904cef49c1250c67a29b746c0 Mon Sep 17 00:00:00 2001 From: Vittorio Pippi Date: Wed, 25 Mar 2026 16:57:44 +0100 Subject: [PATCH 4/4] DCO Remediation Commit for Vittorio Pippi I, Vittorio Pippi , hereby add my Signed-off-by to this commit: 000ccc55c55d32f1e561939bade6822be816076a Signed-off-by: Vittorio Pippi