From 65b04937deed0a9b7977b7d3577bdd0af063cf7a Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Fri, 27 Mar 2026 16:40:22 +0100 Subject: [PATCH] fix(Doclang): improve checkbox serialization & deserialization Wrap in `<text>` if needed. Signed-off-by: Panos Vagenas --- docling_core/experimental/doclang.py | 42 ++++++++++++++----- test/data/doc/checkboxes.out.dclg.xml | 12 ++++-- .../output.dclg.xml | 42 +++++++++++-------- .../output_no_content.dclg.xml | 36 +++++++++------- .../output.dclg.xml | 28 +++++++------ .../output_no_content.dclg.xml | 24 ++++++----- 6 files changed, 115 insertions(+), 69 deletions(-) diff --git a/docling_core/experimental/doclang.py b/docling_core/experimental/doclang.py index 27b17085..e0155b03 100644 --- a/docling_core/experimental/doclang.py +++ b/docling_core/experimental/doclang.py @@ -1457,7 +1457,6 @@ def _serialize_single_item( # list items, this maps to and keeps the text serializer # free of type-based special casing. wrap_open_token: Optional[str] - selected_token: str = "" tok: DoclangToken | None = None if isinstance(item, SectionHeaderItem): wrap_open_token = DoclangVocabulary.create_heading_token(level=item.level) @@ -1474,11 +1473,12 @@ def _serialize_single_item( DocItemLabel.CHECKBOX_SELECTED, DocItemLabel.CHECKBOX_UNSELECTED, ]: - tok = DoclangToken.TEXT - wrap_open_token = None - selected_token = DoclangVocabulary.create_checkbox_token( - selected=(item.label == DocItemLabel.CHECKBOX_SELECTED) - ) + if item.parent and isinstance((parent_item := item.parent.resolve(doc)), TextItem) and not parent_item.text: + # skip re-wrapping if already in a text item + wrap_open_token = None + else: + tok = DoclangToken.TEXT + wrap_open_token = f"<{tok.value}>" elif isinstance(item, TextItem) and ( tok := { DocItemLabel.FIELD_KEY: DoclangToken.FIELD_KEY, @@ -1521,9 +1521,6 @@ def _serialize_single_item( if loc: parts.append(loc) - if selected_token: - parts.append(selected_token) - if item.meta: meta_res = 
doc_serializer.serialize_meta(item=item, **kwargs) if meta_res.text: @@ -1551,6 +1548,12 @@ def _serialize_single_item( ) if item.label == DocItemLabel.HANDWRITTEN_TEXT: text_part = _wrap(text=text_part, wrap_tag=DoclangToken.HANDWRITING.value) + elif item.label in [DocItemLabel.CHECKBOX_SELECTED, DocItemLabel.CHECKBOX_UNSELECTED]: + # Add checkbox token before the text + checkbox_token = DoclangVocabulary.create_checkbox_token( + selected=(item.label == DocItemLabel.CHECKBOX_SELECTED) + ) + text_part = checkbox_token + text_part if text_part: parts.append(text_part) @@ -1569,7 +1572,14 @@ def _serialize_single_item( text_res = "".join(parts) if wrap_open_token is not None and not ( - is_inline_scope and item.label in {DocItemLabel.TEXT, DocItemLabel.HANDWRITTEN_TEXT} + is_inline_scope + and item.label + in { + DocItemLabel.TEXT, + DocItemLabel.HANDWRITTEN_TEXT, + DocItemLabel.CHECKBOX_SELECTED, + DocItemLabel.CHECKBOX_UNSELECTED, + } ): if text_res or not params.suppress_empty_elements: text_res = _wrap_token(text=text_res, open_token=wrap_open_token) @@ -2401,6 +2411,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]: DoclangToken.SUBSCRIPT.value, DoclangToken.SUPERSCRIPT.value, DoclangToken.HANDWRITING.value, + DoclangToken.CHECKBOX.value, DoclangToken.CONTENT.value, }: return None @@ -2488,6 +2499,17 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona c.tagName == DoclangToken.HANDWRITING.value for c in element_children ): label = DocItemLabel.HANDWRITTEN_TEXT + elif nm == DoclangToken.TEXT.value: + # Check for checkbox elements with class attribute + for c in element_children: + if c.tagName == DoclangToken.CHECKBOX.value: + checkbox_class = c.getAttribute(DoclangAttributeKey.CLASS.value) + if checkbox_class == DoclangAttributeValue.SELECTED.value: + label = DocItemLabel.CHECKBOX_SELECTED + break + elif checkbox_class == DoclangAttributeValue.UNSELECTED.value: + label = 
DocItemLabel.CHECKBOX_UNSELECTED + break item = doc.add_text( label=label, text=text, diff --git a/test/data/doc/checkboxes.out.dclg.xml b/test/data/doc/checkboxes.out.dclg.xml index 94195249..0485e203 100644 --- a/test/data/doc/checkboxes.out.dclg.xml +++ b/test/data/doc/checkboxes.out.dclg.xml @@ -1,6 +1,10 @@ - - TODO - - DONE + + + TODO + + + + DONE + diff --git a/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output.dclg.xml b/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output.dclg.xml index de48fd7b..a0b745be 100644 --- a/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output.dclg.xml +++ b/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output.dclg.xml @@ -20,12 +20,14 @@ Form 10-K - - - - - - ☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 + + + + + + + ☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 + @@ -33,12 +35,14 @@ FOR THE FISCAL YEAR ENDED DECEMBER 31, 2020 - - - - - - TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR THE TRANSITION PERIOD FROM TO + + + + + + + TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR THE TRANSITION PERIOD FROM TO + @@ -370,12 +374,14 @@ - - - - - - Large accelerated filer ☑ + + + + + + + Large accelerated filer ☑ + diff --git a/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output_no_content.dclg.xml b/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output_no_content.dclg.xml index a3132e22..1bf6dfab 100644 --- a/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output_no_content.dclg.xml +++ b/test/data/doc/kv/1eac20e5ac5fac655a611343f86927d6a76277e170430c1eba741585437a2e90/output_no_content.dclg.xml @@ -17,22 +17,26 @@ - - 
- - - + + + + + + + - - - - - + + + + + + + @@ -327,11 +331,13 @@ - - - - - + + + + + + + diff --git a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml index de48bc40..c99862d6 100644 --- a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml +++ b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml @@ -181,12 +181,14 @@ - - - - - - Further documents are listed in the continuation of Box O. + + + + + + + Further documents are listed in the continuation of Box O. + @@ -304,12 +306,14 @@ : (+31-70) 340-3016 - - - - - - X See patent family annex. + + + + + + + X See patent family annex. + diff --git a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml index b94dc6ca..7e9b1f5f 100644 --- a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml +++ b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml @@ -154,11 +154,13 @@ - - - - - + + + + + + + @@ -265,11 +267,13 @@ - - - - - + + + + + + +