Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions docling_core/experimental/doclang.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ class DoclangToken(str, Enum):
STRIKETHROUGH = "strikethrough"
SUPERSCRIPT = "superscript"
SUBSCRIPT = "subscript"
HANDWRITING = "handwriting"

# Formatting self-closing
RTL = "rtl"
Expand Down Expand Up @@ -611,6 +612,7 @@ class DoclangVocabulary(BaseModel):
DoclangToken.STRIKETHROUGH: DoclangCategory.FORMATTING,
DoclangToken.SUPERSCRIPT: DoclangCategory.FORMATTING,
DoclangToken.SUBSCRIPT: DoclangCategory.FORMATTING,
DoclangToken.HANDWRITING: DoclangCategory.FORMATTING,
DoclangToken.RTL: DoclangCategory.FORMATTING,
DoclangToken.BR: DoclangCategory.FORMATTING,
# Structural
Expand Down Expand Up @@ -1547,6 +1549,8 @@ def _serialize_single_item(
formatting=item.formatting,
hyperlink=item.hyperlink,
)
if item.label == DocItemLabel.HANDWRITTEN_TEXT:
text_part = _wrap(text=text_part, wrap_tag=DoclangToken.HANDWRITING.value)

if text_part:
parts.append(text_part)
Expand All @@ -1564,7 +1568,9 @@ def _serialize_single_item(
parts.append(ftn_text)

text_res = "".join(parts)
if wrap_open_token is not None and not (is_inline_scope and item.label == DocItemLabel.TEXT):
if wrap_open_token is not None and not (
is_inline_scope and item.label in {DocItemLabel.TEXT, DocItemLabel.HANDWRITTEN_TEXT}
):
if text_res or not params.suppress_empty_elements:
text_res = _wrap_token(text=text_res, open_token=wrap_open_token)
return create_ser_result(text=text_res, span_source=item)
Expand Down Expand Up @@ -2394,6 +2400,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]:
DoclangToken.STRIKETHROUGH.value,
DoclangToken.SUBSCRIPT.value,
DoclangToken.SUPERSCRIPT.value,
DoclangToken.HANDWRITING.value,
DoclangToken.CONTENT.value,
}:
return None
Expand Down Expand Up @@ -2476,8 +2483,13 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
formatting.script = Script.SUB
elif is_superscript:
formatting.script = Script.SUPER
label = text_label_map[nm]
if nm == DoclangToken.TEXT.value and any(
c.tagName == DoclangToken.HANDWRITING.value for c in element_children
):
label = DocItemLabel.HANDWRITTEN_TEXT
item = doc.add_text(
label=text_label_map[nm],
label=label,
text=text,
parent=parent,
prov=(prov_list[0] if prov_list else None),
Expand Down
2 changes: 2 additions & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
DocItemLabel.FIELD_HEADING,
DocItemLabel.FIELD_HINT,
DocItemLabel.MARKER,
DocItemLabel.HANDWRITTEN_TEXT,
}

DOCUMENT_TOKENS_EXPORT_LABELS = DEFAULT_EXPORT_LABELS.copy()
Expand Down Expand Up @@ -1692,6 +1693,7 @@ class TextItem(DocItem):
DocItemLabel.FIELD_KEY,
DocItemLabel.FIELD_HINT,
DocItemLabel.MARKER,
DocItemLabel.HANDWRITTEN_TEXT,
]

orig: str # untreated representation
Expand Down
2 changes: 2 additions & 0 deletions docling_core/types/doc/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ class DocumentToken(str, Enum):

PARAGRAPH = "paragraph"
REFERENCE = "reference"
HANDWRITTEN_TEXT = "handwritten_text"

@classmethod
def get_special_tokens(
Expand Down Expand Up @@ -253,6 +254,7 @@ def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> st
DocItemLabel.PARAGRAPH: DocumentToken.PARAGRAPH,
DocItemLabel.REFERENCE: DocumentToken.REFERENCE,
DocItemLabel.CHART: DocumentToken.CHART,
DocItemLabel.HANDWRITTEN_TEXT: DocumentToken.HANDWRITTEN_TEXT,
}

res: str
Expand Down
3 changes: 2 additions & 1 deletion docs/DoclingDocument.json
Original file line number Diff line number Diff line change
Expand Up @@ -3431,7 +3431,8 @@
"empty_value",
"field_key",
"field_hint",
"marker"
"marker",
"handwritten_text"
],
"title": "Label",
"type": "string"
Expand Down
30 changes: 30 additions & 0 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
TableCell,
TableData,
)
from docling_core.types.doc.base import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.document import ProvenanceItem


# factored out of fixture to simplify IDE-level debugging
Expand Down Expand Up @@ -465,3 +467,31 @@ def rich_table_doc(_rich_table_doc: DoclingDocument) -> DoclingDocument:
"""Copy of a rich table document for each test function."""

return _rich_table_doc.model_copy(deep=True)

@pytest.fixture(scope="session")
def _doc_with_handwritten() -> DoclingDocument:
    """Build, once per test session, a document that mixes handwritten and regular text.

    Contents, in insertion order:

    1. a handwritten text item without provenance,
    2. a handwritten text item with provenance,
    3. an empty regular text item (with provenance) that parents an inline group
       whose spans alternate between regular and handwritten text.
    """
    document = DoclingDocument(name="")
    document.add_page(page_no=1, size=Size(width=100, height=100), image=None)

    # One provenance object is shared by every item that carries provenance.
    bbox = BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT)
    provenance = ProvenanceItem(page_no=1, bbox=bbox, charspan=(0, 2))

    # Standalone handwritten items.
    document.add_text(
        label=DocItemLabel.HANDWRITTEN_TEXT,
        text="My hand-written note",
    )
    document.add_text(
        label=DocItemLabel.HANDWRITTEN_TEXT,
        text="My hand-written note (with prov)",
        prov=provenance,
    )

    # Inline group: spans are added in reading order under a single text container.
    container = document.add_text(label=DocItemLabel.TEXT, text="", prov=provenance)
    inline_group = document.add_inline_group(parent=container)
    inline_spans = (
        (DocItemLabel.TEXT, "Check "),
        (DocItemLabel.HANDWRITTEN_TEXT, "out"),
        (DocItemLabel.TEXT, " these"),
        (DocItemLabel.HANDWRITTEN_TEXT, " hand-written spans"),
    )
    for span_label, span_text in inline_spans:
        document.add_text(label=span_label, text=span_text, parent=inline_group)

    return document


@pytest.fixture(scope="function")
def doc_with_handwritten(_doc_with_handwritten: DoclingDocument) -> DoclingDocument:
    """Hand each test function its own deep copy of the session-scoped handwritten-text document."""
    per_test_copy = _doc_with_handwritten.model_copy(deep=True)
    return per_test_copy
24 changes: 24 additions & 0 deletions test/data/doc/handwritten_text.gt.dclg.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<doclang version="1.0.0">
<text>
<handwriting>My hand-written note</handwriting>
</text>
<text>
<location value="5"/>
<location value="492"/>
<location value="15"/>
<location value="502"/>
<handwriting>My hand-written note (with prov)</handwriting>
</text>
<text>
<location value="5"/>
<location value="492"/>
<location value="15"/>
<location value="502"/>
<content>Check </content>
<handwriting>out</handwriting>
<content> these</content>
<handwriting>
<content> hand-written spans</content>
</handwriting>
</text>
</doclang>
6 changes: 6 additions & 0 deletions test/test_serialization_doclang.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,12 @@ def _create_content_filtering_doc(inp_doc: DoclingDocument):
return doc


def test_handwritten_text_label(doc_with_handwritten: DoclingDocument):
    """Check the Doclang export of the handwritten-text document against the expected file."""
    exported = doc_with_handwritten.export_to_doclang()
    expected_path = Path("./test/data/doc/handwritten_text.gt.dclg.xml")
    verify(actual=exported, exp_file=expected_path)


def test_content_allow_all_types(sample_doc: DoclingDocument):
doc = _create_content_filtering_doc(sample_doc)
serializer = DoclangDocSerializer(
Expand Down
Loading