Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions docling_core/experimental/doclang.py
Original file line number Diff line number Diff line change
Expand Up @@ -1457,7 +1457,6 @@ def _serialize_single_item(
# list items, this maps to <list_text> and keeps the text serializer
# free of type-based special casing.
wrap_open_token: Optional[str]
selected_token: str = ""
tok: DoclangToken | None = None
if isinstance(item, SectionHeaderItem):
wrap_open_token = DoclangVocabulary.create_heading_token(level=item.level)
Expand All @@ -1474,11 +1473,12 @@ def _serialize_single_item(
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.CHECKBOX_UNSELECTED,
]:
tok = DoclangToken.TEXT
wrap_open_token = None
selected_token = DoclangVocabulary.create_checkbox_token(
selected=(item.label == DocItemLabel.CHECKBOX_SELECTED)
)
if item.parent and isinstance((parent_item := item.parent.resolve(doc)), TextItem) and not parent_item.text:
# skip re-wrapping if already in a text item
wrap_open_token = None
else:
tok = DoclangToken.TEXT
wrap_open_token = f"<{tok.value}>"
elif isinstance(item, TextItem) and (
tok := {
DocItemLabel.FIELD_KEY: DoclangToken.FIELD_KEY,
Expand Down Expand Up @@ -1521,9 +1521,6 @@ def _serialize_single_item(
if loc:
parts.append(loc)

if selected_token:
parts.append(selected_token)

if item.meta:
meta_res = doc_serializer.serialize_meta(item=item, **kwargs)
if meta_res.text:
Expand Down Expand Up @@ -1551,6 +1548,12 @@ def _serialize_single_item(
)
if item.label == DocItemLabel.HANDWRITTEN_TEXT:
text_part = _wrap(text=text_part, wrap_tag=DoclangToken.HANDWRITING.value)
elif item.label in [DocItemLabel.CHECKBOX_SELECTED, DocItemLabel.CHECKBOX_UNSELECTED]:
# Add checkbox token before the text
checkbox_token = DoclangVocabulary.create_checkbox_token(
selected=(item.label == DocItemLabel.CHECKBOX_SELECTED)
)
text_part = checkbox_token + text_part

if text_part:
parts.append(text_part)
Expand All @@ -1569,7 +1572,14 @@ def _serialize_single_item(

text_res = "".join(parts)
if wrap_open_token is not None and not (
is_inline_scope and item.label in {DocItemLabel.TEXT, DocItemLabel.HANDWRITTEN_TEXT}
is_inline_scope
and item.label
in {
DocItemLabel.TEXT,
DocItemLabel.HANDWRITTEN_TEXT,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.CHECKBOX_UNSELECTED,
}
):
if text_res or not params.suppress_empty_elements:
text_res = _wrap_token(text=text_res, open_token=wrap_open_token)
Expand Down Expand Up @@ -2401,6 +2411,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]:
DoclangToken.SUBSCRIPT.value,
DoclangToken.SUPERSCRIPT.value,
DoclangToken.HANDWRITING.value,
DoclangToken.CHECKBOX.value,
DoclangToken.CONTENT.value,
}:
return None
Expand Down Expand Up @@ -2488,6 +2499,17 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
c.tagName == DoclangToken.HANDWRITING.value for c in element_children
):
label = DocItemLabel.HANDWRITTEN_TEXT
elif nm == DoclangToken.TEXT.value:
# Check for checkbox elements with class attribute
for c in element_children:
if c.tagName == DoclangToken.CHECKBOX.value:
checkbox_class = c.getAttribute(DoclangAttributeKey.CLASS.value)
if checkbox_class == DoclangAttributeValue.SELECTED.value:
label = DocItemLabel.CHECKBOX_SELECTED
break
elif checkbox_class == DoclangAttributeValue.UNSELECTED.value:
label = DocItemLabel.CHECKBOX_UNSELECTED
break
item = doc.add_text(
label=label,
text=text,
Expand Down
12 changes: 8 additions & 4 deletions test/data/doc/checkboxes.out.dclg.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
<doclang version="1.0.0">
<checkbox class="unselected"/>
TODO
<checkbox class="selected"/>
DONE
<text>
<checkbox class="unselected"/>
TODO
</text>
<text>
<checkbox class="selected"/>
DONE
</text>
</doclang>
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,29 @@
<location value="51"/>
Form 10-K
</heading>
<location value="23"/>
<location value="58"/>
<location value="363"/>
<location value="62"/>
<checkbox class="selected"/>
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
<text>
<location value="23"/>
<location value="58"/>
<location value="363"/>
<location value="62"/>
<checkbox class="selected"/>
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
</text>
<text>
<location value="32"/>
<location value="62"/>
<location value="247"/>
<location value="68"/>
FOR THE FISCAL YEAR ENDED DECEMBER 31, 2020
</text>
<location value="32"/>
<location value="73"/>
<location value="489"/>
<location value="81"/>
<checkbox class="unselected"/>
TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR THE TRANSITION PERIOD FROM TO
<text>
<location value="32"/>
<location value="73"/>
<location value="489"/>
<location value="81"/>
<checkbox class="unselected"/>
TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR THE TRANSITION PERIOD FROM TO
</text>
<text>
<location value="15"/>
<location value="80"/>
Expand Down Expand Up @@ -370,12 +374,14 @@
<location value="499"/>
<location value="263"/>
<![CDATA[Indicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company . See the definitions of "large accelerated filer," "accelerated filer," "smaller reporting company" and "emerging growth company" in Rule 12b-2 of the Exchange Act.]]> </text>
<location value="30"/>
<location value="266"/>
<location value="120"/>
<location value="270"/>
<checkbox class="selected"/>
Large accelerated filer ☑
<text>
<location value="30"/>
<location value="266"/>
<location value="120"/>
<location value="270"/>
<checkbox class="selected"/>
Large accelerated filer ☑
</text>
<field_region>
<field_item>
<value class="read_only">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,26 @@
<location value="287"/>
<location value="51"/>
</heading>
<location value="23"/>
<location value="58"/>
<location value="363"/>
<location value="62"/>
<checkbox class="selected"/>
<text>
<location value="23"/>
<location value="58"/>
<location value="363"/>
<location value="62"/>
<checkbox class="selected"/>
</text>
<text>
<location value="32"/>
<location value="62"/>
<location value="247"/>
<location value="68"/>
</text>
<location value="32"/>
<location value="73"/>
<location value="489"/>
<location value="81"/>
<checkbox class="unselected"/>
<text>
<location value="32"/>
<location value="73"/>
<location value="489"/>
<location value="81"/>
<checkbox class="unselected"/>
</text>
<text>
<location value="15"/>
<location value="80"/>
Expand Down Expand Up @@ -327,11 +331,13 @@
<location value="499"/>
<location value="263"/>
</text>
<location value="30"/>
<location value="266"/>
<location value="120"/>
<location value="270"/>
<checkbox class="selected"/>
<text>
<location value="30"/>
<location value="266"/>
<location value="120"/>
<location value="270"/>
<checkbox class="selected"/>
</text>
<field_region>
<field_item>
<value class="read_only">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,14 @@
<nl/>
</otsl>
</floating_group>
<location value="52"/>
<location value="345"/>
<location value="227"/>
<location value="355"/>
<checkbox class="unselected"/>
Further documents are listed in the continuation of Box O.
<text>
<location value="52"/>
<location value="345"/>
<location value="227"/>
<location value="355"/>
<checkbox class="unselected"/>
Further documents are listed in the continuation of Box O.
</text>
<text>
<location value="53"/>
<location value="359"/>
Expand Down Expand Up @@ -304,12 +306,14 @@
: (+31-70) 340-3016
</value>
</field_item>
<location value="273"/>
<location value="345"/>
<location value="358"/>
<location value="354"/>
<checkbox class="selected"/>
X See patent family annex.
<text>
<location value="273"/>
<location value="345"/>
<location value="358"/>
<location value="354"/>
<checkbox class="selected"/>
X See patent family annex.
</text>
<text>
<location value="269"/>
<location value="363"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,13 @@
<nl/>
</otsl>
</floating_group>
<location value="52"/>
<location value="345"/>
<location value="227"/>
<location value="355"/>
<checkbox class="unselected"/>
<text>
<location value="52"/>
<location value="345"/>
<location value="227"/>
<location value="355"/>
<checkbox class="unselected"/>
</text>
<text>
<location value="53"/>
<location value="359"/>
Expand Down Expand Up @@ -265,11 +267,13 @@
<location value="479"/>
</value>
</field_item>
<location value="273"/>
<location value="345"/>
<location value="358"/>
<location value="354"/>
<checkbox class="selected"/>
<text>
<location value="273"/>
<location value="345"/>
<location value="358"/>
<location value="354"/>
<checkbox class="selected"/>
</text>
<text>
<location value="269"/>
<location value="363"/>
Expand Down
Loading