diff --git a/docling_core/experimental/doclang.py b/docling_core/experimental/doclang.py index 09314e7d..27b17085 100644 --- a/docling_core/experimental/doclang.py +++ b/docling_core/experimental/doclang.py @@ -392,6 +392,7 @@ class DoclangToken(str, Enum): STRIKETHROUGH = "strikethrough" SUPERSCRIPT = "superscript" SUBSCRIPT = "subscript" + HANDWRITING = "handwriting" # Formatting self-closing RTL = "rtl" @@ -611,6 +612,7 @@ class DoclangVocabulary(BaseModel): DoclangToken.STRIKETHROUGH: DoclangCategory.FORMATTING, DoclangToken.SUPERSCRIPT: DoclangCategory.FORMATTING, DoclangToken.SUBSCRIPT: DoclangCategory.FORMATTING, + DoclangToken.HANDWRITING: DoclangCategory.FORMATTING, DoclangToken.RTL: DoclangCategory.FORMATTING, DoclangToken.BR: DoclangCategory.FORMATTING, # Structural @@ -1547,6 +1549,8 @@ def _serialize_single_item( formatting=item.formatting, hyperlink=item.hyperlink, ) + if item.label == DocItemLabel.HANDWRITTEN_TEXT: + text_part = _wrap(text=text_part, wrap_tag=DoclangToken.HANDWRITING.value) if text_part: parts.append(text_part) @@ -1564,7 +1568,9 @@ def _serialize_single_item( parts.append(ftn_text) text_res = "".join(parts) - if wrap_open_token is not None and not (is_inline_scope and item.label == DocItemLabel.TEXT): + if wrap_open_token is not None and not ( + is_inline_scope and item.label in {DocItemLabel.TEXT, DocItemLabel.HANDWRITTEN_TEXT} + ): if text_res or not params.suppress_empty_elements: text_res = _wrap_token(text=text_res, open_token=wrap_open_token) return create_ser_result(text=text_res, span_source=item) @@ -2394,6 +2400,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]: DoclangToken.STRIKETHROUGH.value, DoclangToken.SUBSCRIPT.value, DoclangToken.SUPERSCRIPT.value, + DoclangToken.HANDWRITING.value, DoclangToken.CONTENT.value, }: return None @@ -2476,8 +2483,13 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona formatting.script = Script.SUB elif is_superscript: 
formatting.script = Script.SUPER + label = text_label_map[nm] + if nm == DoclangToken.TEXT.value and any( + c.tagName == DoclangToken.HANDWRITING.value for c in element_children + ): + label = DocItemLabel.HANDWRITTEN_TEXT item = doc.add_text( - label=text_label_map[nm], + label=label, text=text, parent=parent, prov=(prov_list[0] if prov_list else None), diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 139dc381..d705018a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -96,6 +96,7 @@ DocItemLabel.FIELD_HEADING, DocItemLabel.FIELD_HINT, DocItemLabel.MARKER, + DocItemLabel.HANDWRITTEN_TEXT, } DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy() @@ -1692,6 +1693,7 @@ class TextItem(DocItem): DocItemLabel.FIELD_KEY, DocItemLabel.FIELD_HINT, DocItemLabel.MARKER, + DocItemLabel.HANDWRITTEN_TEXT, ] orig: str # untreated representation diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py index e172105d..5da137cd 100644 --- a/docling_core/types/doc/tokens.py +++ b/docling_core/types/doc/tokens.py @@ -201,6 +201,7 @@ class DocumentToken(str, Enum): PARAGRAPH = "paragraph" REFERENCE = "reference" + HANDWRITTEN_TEXT = "handwritten_text" @classmethod def get_special_tokens( @@ -253,6 +254,7 @@ def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> st DocItemLabel.PARAGRAPH: DocumentToken.PARAGRAPH, DocItemLabel.REFERENCE: DocumentToken.REFERENCE, DocItemLabel.CHART: DocumentToken.CHART, + DocItemLabel.HANDWRITTEN_TEXT: DocumentToken.HANDWRITTEN_TEXT, } res: str diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 100601a5..42abf020 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -3431,7 +3431,8 @@ "empty_value", "field_key", "field_hint", - "marker" + "marker", + "handwritten_text" ], "title": "Label", "type": "string" diff --git a/test/conftest.py b/test/conftest.py index a26f7558..9ee385cb 
100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -24,6 +24,8 @@ TableCell, TableData, ) +from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size +from docling_core.types.doc.document import ProvenanceItem # factored out of fixture to simplify IDE-level debugging @@ -465,3 +467,31 @@ def rich_table_doc(_rich_table_doc: DoclingDocument) -> DoclingDocument: """Copy of a rich table document for each test function.""" return _rich_table_doc.model_copy(deep=True) + +@pytest.fixture(scope="session") +def _doc_with_handwritten() -> DoclingDocument: + """Session-scoped doc: two standalone handwritten items plus an inline group mixing plain and handwritten spans.""" + doc = DoclingDocument(name="") + doc.add_page(page_no=1, size=Size(width=100, height=100), image=None) + prov = ProvenanceItem( + page_no=1, + bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT), + charspan=(0, 2),  # NOTE(review): matches neither attached text's length; confirm it is intentional + ) + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="My hand-written note") + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="My hand-written note (with prov)", prov=prov) + + inl_text = doc.add_text(label=DocItemLabel.TEXT, text="", prov=prov)  # empty-text host for the inline group + inline = doc.add_inline_group(parent=inl_text) + doc.add_text(label=DocItemLabel.TEXT, text="Check ", parent=inline) + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text="out", parent=inline) + doc.add_text(label=DocItemLabel.TEXT, text=" these", parent=inline) + doc.add_text(label=DocItemLabel.HANDWRITTEN_TEXT, text=" hand-written spans", parent=inline) + + return doc + + +@pytest.fixture(scope="function") +def doc_with_handwritten(_doc_with_handwritten: DoclingDocument) -> DoclingDocument: + """Deep copy of the session-scoped handwritten-text document, made fresh for each test function.""" + return _doc_with_handwritten.model_copy(deep=True) diff --git a/test/data/doc/handwritten_text.gt.dclg.xml b/test/data/doc/handwritten_text.gt.dclg.xml new file mode 100644 index 00000000..78ec863b --- /dev/null +++ b/test/data/doc/handwritten_text.gt.dclg.xml
@@ -0,0 +1,24 @@ + + + My hand-written note + + + + + + + My hand-written note (with prov) + + + + + + + Check + out + these + + hand-written spans + + + diff --git a/test/test_serialization_doclang.py b/test/test_serialization_doclang.py index 5adadf51..124c52b4 100644 --- a/test/test_serialization_doclang.py +++ b/test/test_serialization_doclang.py @@ -474,6 +474,12 @@ def _create_content_filtering_doc(inp_doc: DoclingDocument): return doc +def test_handwritten_text_label(doc_with_handwritten: DoclingDocument): + result = doc_with_handwritten.export_to_doclang() + exp_file = Path("./test/data/doc/handwritten_text.gt.dclg.xml") + verify(exp_file=exp_file, actual=result) + + def test_content_allow_all_types(sample_doc: DoclingDocument): doc = _create_content_filtering_doc(sample_doc) serializer = DoclangDocSerializer(