diff --git a/docling_core/experimental/doclang.py b/docling_core/experimental/doclang.py index 27b17085..ff42e998 100644 --- a/docling_core/experimental/doclang.py +++ b/docling_core/experimental/doclang.py @@ -38,6 +38,7 @@ BaseMeta, BoundingBox, CodeItem, + ContentLayer, DescriptionMetaField, DocItem, DoclingDocument, @@ -352,6 +353,7 @@ class DoclangToken(str, Enum): # Geometric and temporal LOCATION = "location" + LAYER = "layer" HOUR = "hour" MINUTE = "minute" SECOND = "second" @@ -469,6 +471,7 @@ class DoclangVocabulary(BaseModel): DoclangAttributeKey.VALUE, DoclangAttributeKey.RESOLUTION, }, + DoclangToken.LAYER: {DoclangAttributeKey.CLASS}, DoclangToken.HOUR: {DoclangAttributeKey.VALUE}, DoclangToken.MINUTE: {DoclangAttributeKey.VALUE}, DoclangToken.SECOND: {DoclangAttributeKey.VALUE}, @@ -543,6 +546,7 @@ class DoclangVocabulary(BaseModel): DoclangToken.PAGE_BREAK, DoclangToken.TIME_BREAK, DoclangToken.LOCATION, + DoclangToken.LAYER, DoclangToken.HOUR, DoclangToken.MINUTE, DoclangToken.SECOND, @@ -577,6 +581,7 @@ class DoclangVocabulary(BaseModel): DoclangToken.TIME_BREAK: DoclangCategory.SPECIAL, # Geometric DoclangToken.LOCATION: DoclangCategory.GEOMETRIC, + DoclangToken.LAYER: DoclangCategory.GEOMETRIC, # Temporal DoclangToken.HOUR: DoclangCategory.TEMPORAL, DoclangToken.MINUTE: DoclangCategory.TEMPORAL, @@ -970,6 +975,13 @@ class WrapMode(str, Enum): WRAP_WHEN_NEEDED = "wrap_when_needed" # wrap text only if it has leading or trailing whitespace +class LayerMode(str, Enum): + """Layer mode for Doclang output.""" + + ALWAYS = "always" # always include layer element + MINIMAL = "minimal" # include layer element only when it differs from default + + class ContentType(str, Enum): """Content type for Doclang output.""" @@ -991,6 +1003,9 @@ class ContentType(str, Enum): class DoclangParams(CommonParams): """Doclang-specific serialization parameters independent of Doclang.""" + # Override parent's layers to default to all ContentLayers + layers: set[ContentLayer] = set(ContentLayer) + # Geometry & content controls (aligned with Doclang defaults) xsize: int = DOCLANG_DFLT_RESOLUTION ysize: int = DOCLANG_DFLT_RESOLUTION @@ -1007,6 +1022,9 @@ class DoclangParams(CommonParams): # types of content to serialize (only relevant if show_content is True): content_types: set[ContentType] = _DEFAULT_CONTENT_TYPES + # Layer mode + layer_mode: LayerMode = LayerMode.MINIMAL + # Doclang formatting do_self_closing: bool = True pretty_indentation: Optional[str] = 2 * " " # None means minimized serialization, "" means no indentation @@ -1021,6 +1039,22 @@ class DoclangParams(CommonParams): image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER +def _create_layer_token( + *, + item: DocItem, + params: DoclangParams, +) -> str: + """Create `` token for an item's content layer if needed.""" + if params.layer_mode == LayerMode.ALWAYS or ( + params.layer_mode == LayerMode.MINIMAL and item.content_layer != ContentLayer.BODY + ): + return DoclangVocabulary.create_selfclosing_token( + token=DoclangToken.LAYER, + attrs={DoclangAttributeKey.CLASS: item.content_layer.value}, + ) + return "" + + def _get_delim(*, params: DoclangParams) -> str: """Return record delimiter based on DoclangSerializationMode.""" return "" if params.pretty_indentation is None else "\n" @@ -1521,6 +1555,9 @@ def _serialize_single_item( if loc: parts.append(loc) + if layer_token := _create_layer_token(item=item, params=params): + parts.append(layer_token) + if selected_token: parts.append(selected_token) @@ -1688,6 +1725,9 @@ def serialize( if params.add_location: body += _create_location_tokens_for_item(item=item, doc=doc, xres=params.xsize, yres=params.ysize) + if layer_token := _create_layer_token(item=item, params=params): + body += layer_token + uri: Optional[str] = None if params.image_mode in [ImageRefMode.REFERENCED, ImageRefMode.EMBEDDED] and item.image and item.image.uri: uri = str(item.image.uri) @@ -1888,6 +1928,9 @@ def serialize( if params.add_location: body += _create_location_tokens_for_item(item=item, doc=doc, xres=params.xsize, yres=params.ysize) + if layer_token := _create_layer_token(item=item, params=params): + body += layer_token + if ContentType.TABLE in params.content_types: otsl_text = self._emit_otsl( item=item, @@ -2008,6 +2051,9 @@ def serialize( loc_str = _create_location_tokens_for_item(item=item, doc=doc, xres=params.xsize, yres=params.ysize) if loc_str: parts.append(create_ser_result(text=loc_str, span_source=item)) + if is_fri: + if layer_token := _create_layer_token(item=item, params=params): + parts.append(create_ser_result(text=layer_token, span_source=item)) parts.extend(doc_serializer.get_parts(item=item, **kwargs)) text_res = delim.join([p.text for p in parts if p.text]) tok = DoclangToken.FIELD_REGION if is_fri else DoclangToken.FIELD_ITEM @@ -2104,6 +2150,9 @@ def serialize_captions( item=cap, doc=self.doc, xres=params.xsize, yres=params.ysize ) results.append(create_ser_result(text=loc_txt)) + + if layer_token := _create_layer_token(item=cap, params=params): + results.append(create_ser_result(text=layer_token)) if cap_res.text and ContentType.REF_CAPTION in params.content_types: cap_res.text = _escape_text(cap_res.text, params) results.append(cap_res) @@ -2130,11 +2179,13 @@ def serialize_footnotes( item=ftn, doc=self.doc, xres=params.xsize, yres=params.ysize ) + layer_token = _create_layer_token(item=ftn, params=params) + content = "" if ftn.text and ContentType.REF_FOOTNOTE in params.content_types: content = _escape_text(ftn.text, params) - text_res = f"{location}{content}" + text_res = f"{location}{layer_token}{content}" if text_res: text_res = _wrap(text_res, wrap_tag=DoclangToken.FOOTNOTE.value) results.append(create_ser_result(text=text_res)) @@ -2378,6 +2429,7 @@ def _walk_children(self, *, doc: DoclingDocument, el: Element, parent: Optional[ DoclangToken.HEAD.value, DoclangToken.META.value, DoclangToken.LOCATION.value, + DoclangToken.LAYER.value, }: continue self._dispatch_element(doc=doc, el=node, parent=parent) @@ -2393,6 +2445,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]: if isinstance(el, Element): if el.tagName not in { DoclangToken.LOCATION.value, + DoclangToken.LAYER.value, DoclangToken.BR.value, DoclangToken.BOLD.value, DoclangToken.ITALIC.value, @@ -2416,7 +2469,9 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]: def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optional[NodeItem]) -> None: """Parse text-like tokens (title, text, caption, footnotes, code, formula).""" element_children = [ - node for node in el.childNodes if isinstance(node, Element) and node.tagName != DoclangToken.LOCATION.value + node + for node in el.childNodes + if isinstance(node, Element) and node.tagName not in {DoclangToken.LOCATION.value, DoclangToken.LAYER.value} ] if len(element_children) > 1 or self._get_children_simple_text_block(el) is None: @@ -2424,6 +2479,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona return prov_list = self._extract_provenance(doc=doc, el=el) + content_layer = self._extract_layer(el=el) text, formatting = self._extract_text_with_formatting(el) if not text: return @@ -2440,6 +2496,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona code_language=lang_label, parent=parent, prov=(prov_list[0] if prov_list else None), + content_layer=content_layer, ) for p in prov_list[1:]: item.prov.append(p) @@ -2494,6 +2551,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona parent=parent, prov=(prov_list[0] if prov_list else None), formatting=formatting, + content_layer=content_layer, ) for p in prov_list[1:]: item.prov.append(p) @@ -2504,6 +2562,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona parent=parent, prov=(prov_list[0] if prov_list else None), formatting=formatting, + content_layer=content_layer, ) for p in prov_list[1:]: item.prov.append(p) @@ -2549,6 +2608,7 @@ def _parse_heading(self, *, doc: DoclingDocument, el: Element, parent: Optional[ level = 1 # Extract provenance from heading token (if any) prov_list = self._extract_provenance(doc=doc, el=el) + content_layer = self._extract_layer(el=el) text = self._get_text(el) text_stripped = text.strip() if text_stripped: @@ -2557,6 +2617,7 @@ def _parse_heading(self, *, doc: DoclingDocument, el: Element, parent: Optional[ level=level, parent=parent, prov=(prov_list[0] if prov_list else None), + content_layer=content_layer, ) for p in prov_list[1:]: item.prov.append(p) @@ -2672,13 +2733,15 @@ def _parse_table_group(self, *, doc: DoclingDocument, el: Element, parent: Optio return # Extract table provenance from leading tokens tbl_provs = self._extract_provenance(doc=doc, el=otsl_el) - # Get inner XML excluding location tokens (work directly with parsed DOM) - inner = self._inner_xml(otsl_el, exclude_tags={"location"}) + content_layer = self._extract_layer(el=otsl_el) + # Get inner XML excluding location and layer tokens (work directly with parsed DOM) + inner = self._inner_xml(otsl_el, exclude_tags={"location", "layer"}) tbl = doc.add_table( data=TableData(), caption=caption, parent=parent, prov=(tbl_provs[0] if tbl_provs else None), + content_layer=content_layer, ) tbl_content = _wrap(text=inner, wrap_tag=DoclangToken.OTSL.value) td = self._parse_otsl_table_content(otsl_content=tbl_content, doc=doc, parent=tbl) @@ -2693,17 +2756,20 @@ def _parse_picture_group(self, *, doc: DoclingDocument, el: Element, parent: Opt caption = self._extract_caption(doc=doc, el=el) footnotes = self._extract_footnotes(doc=doc, el=el) - # Extract provenance from the block (locations appear inside it) + # Extract provenance and layer from the block (locations and layer appear inside it) prov_list: list[ProvenanceItem] = [] + content_layer: Optional[ContentLayer] = None picture_el = self._first_child(el, DoclangToken.PICTURE.value) if picture_el is not None: prov_list = self._extract_provenance(doc=doc, el=picture_el) + content_layer = self._extract_layer(el=picture_el) # Create the picture item first, attach caption and provenance pic = doc.add_picture( caption=caption, parent=parent, prov=(prov_list[0] if prov_list else None), + content_layer=content_layer, ) for p in prov_list[1:]: pic.prov.append(p) @@ -3102,3 +3168,14 @@ def _extract_provenance(self, *, doc: DoclingDocument, el: Element) -> list[Prov res_for_group = None return provs + + def _extract_layer(self, *, el: Element) -> Optional[ContentLayer]: + """Extract content layer from token if present.""" + for node in el.childNodes: + if isinstance(node, Element) and node.tagName == DoclangToken.LAYER.value: + if layer_value := node.getAttribute(DoclangAttributeKey.CLASS.value): + try: + return ContentLayer(layer_value) + except ValueError: + pass + return None diff --git a/test/conftest.py b/test/conftest.py index 9ee385cb..3e99ec43 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -495,3 +495,56 @@ def _doc_with_handwritten() -> DoclingDocument: def doc_with_handwritten(_doc_with_handwritten: DoclingDocument) -> DoclingDocument: """Copy of a document with handwritten text for each test function.""" return _doc_with_handwritten.model_copy(deep=True) + + +@pytest.fixture(scope="session") +def _doc_with_layers() -> DoclingDocument: + """Fixture for a document with different content layers to be reused across the test session.""" + from docling_core.types.doc.document import ContentLayer + + doc = DoclingDocument(name="") + doc.add_page(page_no=1, size=Size(width=100, height=100), image=None) + + # Add page header with furniture layer + doc.add_text( + label=DocItemLabel.PAGE_HEADER, + text="Page Header", + prov=ProvenanceItem( + page_no=1, + bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT), + charspan=(0, 11), + ), + content_layer=ContentLayer.FURNITURE, + ) + + # Add regular text with body layer (default) + doc.add_text( + label=DocItemLabel.TEXT, + text="Main body content", + prov=ProvenanceItem( + page_no=1, + bbox=BoundingBox.from_tuple((5, 6, 7, 8), origin=CoordOrigin.BOTTOMLEFT), + charspan=(0, 17), + ), + content_layer=ContentLayer.BODY, + ) + + # Add page footer with furniture layer + doc.add_text( + label=DocItemLabel.PAGE_FOOTER, + text="Page Footer", + prov=ProvenanceItem( + page_no=1, + bbox=BoundingBox.from_tuple((9, 10, 11, 12), origin=CoordOrigin.BOTTOMLEFT), + charspan=(0, 11), + ), + content_layer=ContentLayer.FURNITURE, + ) + + return doc + + +@pytest.fixture(scope="function") +def doc_with_layers(_doc_with_layers: DoclingDocument) -> DoclingDocument: + """Copy of a document with different content layers for each test function.""" + return _doc_with_layers.model_copy(deep=True) diff --git a/test/data/doc/barchart.out.dclg.xml b/test/data/doc/barchart.out.dclg.xml index 3a116d28..00723add 100644 --- a/test/data/doc/barchart.out.dclg.xml +++ b/test/data/doc/barchart.out.dclg.xml @@ -4,6 +4,7 @@ + Probability, Combinatorics and Control diff --git a/test/data/doc/ddoc_0.v0.gt.dclg.xml b/test/data/doc/ddoc_0.v0.gt.dclg.xml index 18c1eac8..7bcca750 100644 --- a/test/data/doc/ddoc_0.v0.gt.dclg.xml +++ b/test/data/doc/ddoc_0.v0.gt.dclg.xml @@ -4,6 +4,7 @@ + ndbinfo_select_all - Select From ndbinfo Tables @@ -237,6 +238,7 @@ + Print program argument list and exit. @@ -244,6 +246,7 @@ + 4253 diff --git a/test/data/doc/ddoc_0.v1.gt.dclg.xml b/test/data/doc/ddoc_0.v1.gt.dclg.xml index 89402de9..8c3717ef 100644 --- a/test/data/doc/ddoc_0.v1.gt.dclg.xml +++ b/test/data/doc/ddoc_0.v1.gt.dclg.xml @@ -4,6 +4,7 @@ + @@ -195,11 +196,13 @@ + + diff --git a/test/data/doc/ddoc_0.v2.gt.dclg.xml b/test/data/doc/ddoc_0.v2.gt.dclg.xml index 3132774f..8ba8bc0f 100644 --- a/test/data/doc/ddoc_0.v2.gt.dclg.xml +++ b/test/data/doc/ddoc_0.v2.gt.dclg.xml @@ -1 +1 @@ - + diff --git a/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml b/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml index 4348c987..31017481 100644 --- a/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml +++ b/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml @@ -4,6 +4,7 @@ + 13040.19 @@ -11,6 +12,7 @@ + - 16 - @@ -22,6 +24,7 @@ + diff --git a/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml b/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml index bcdfc57e..05ba49d8 100644 --- a/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml +++ b/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml @@ -4,12 +4,14 @@ + + @@ -17,6 +19,7 @@ + diff --git a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml index 753247cc..6a74414c 100644 --- a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml +++ b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml @@ -15,6 +15,7 @@ + Index @@ -22,6 +23,7 @@ + 477 diff --git a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml index 6c6ee685..81500116 100644 --- a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml +++ b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml @@ -14,11 +14,13 @@ + + diff --git a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml index 70dd20c8..7da22882 100644 --- a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml +++ b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml @@ -457,6 +457,7 @@ + 31 @@ -464,6 +465,7 @@ + FAA Chart Users Guide VFR Symbology - Sectional and Terminal Area Charts diff --git a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml index 68b5c262..584cde07 100644 --- a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml +++ b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml @@ -376,11 +376,13 @@ + + diff --git a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml index face4c7d..f7123a21 100644 --- a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml +++ b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml @@ -350,6 +350,7 @@ + EP3800018A1 diff --git a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml index d16373c6..45a7be33 100644 --- a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml +++ b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml @@ -312,5 +312,6 @@ + diff --git a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml index 1adde255..dce1bb06 100644 --- a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml +++ b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml @@ -608,6 +608,7 @@ + (57) diff --git a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml index 2d4c8df0..e331a754 100644 --- a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml +++ b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml @@ -500,5 +500,6 @@ + diff --git a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml index c80e0e18..fe80df75 100644 --- a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml +++ b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml @@ -1398,6 +1398,7 @@ + ANNEX VII @@ -1405,6 +1406,7 @@ + 96 @@ -1412,6 +1414,7 @@ + FR diff --git a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml index 6f658ec5..53969f0b 100644 --- a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml +++ b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml @@ -1249,17 +1249,20 @@ + + + diff --git a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml index de49aedc..7c48dad6 100644 --- a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml +++ b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml @@ -4,6 +4,7 @@ + Michigan Department of Treasury (Rev. 05-19), Page 1 of 2 @@ -11,6 +12,7 @@ + Issued under authority of Public Act 281 of 1967 , as amended . @@ -1606,6 +1608,7 @@ + + 0000 2019 05 01 27 1 @@ -1613,6 +1616,7 @@ + Continue on page 2 . This form cannot be processed if page 2 is not completed and included. diff --git a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml index 7114986b..3e387beb 100644 --- a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml +++ b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml @@ -4,12 +4,14 @@ + + @@ -1428,11 +1430,13 @@ + + diff --git a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml index de48bc40..d9dca05c 100644 --- a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml +++ b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml @@ -372,6 +372,7 @@ + Form PCT/ISA/210 (second sheet) (April 2005) @@ -379,6 +380,7 @@ + 2 diff --git a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml index b94dc6ca..2797eb48 100644 --- a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml +++ b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml @@ -328,11 +328,13 @@ + + diff --git a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml index c80e0e18..fe80df75 100644 --- a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml +++ b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml @@ -1398,6 +1398,7 @@ + ANNEX VII @@ -1405,6 +1406,7 @@ + 96 @@ -1412,6 +1414,7 @@ + FR diff --git a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml index 6f658ec5..53969f0b 100644 --- a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml +++ b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml @@ -1249,17 +1249,20 @@ + + + diff --git a/test/data/doc/layer_always_mode.dclg.xml b/test/data/doc/layer_always_mode.dclg.xml new file mode 100644 index 00000000..e39088e7 --- /dev/null +++ b/test/data/doc/layer_always_mode.dclg.xml @@ -0,0 +1,26 @@ + + + + + + + + Page Header + + + + + + + + Main body content + + + + + + + + Page Footer + + diff --git a/test/data/doc/layer_minimal_mode.dclg.xml b/test/data/doc/layer_minimal_mode.dclg.xml new file mode 100644 index 00000000..509f5798 --- /dev/null +++ b/test/data/doc/layer_minimal_mode.dclg.xml @@ -0,0 +1,25 @@ + + + + + + + + Page Header + + + + + + + Main body content + + + + + + + + Page Footer + + diff --git a/test/data/doc/layer_only_body.dclg.xml b/test/data/doc/layer_only_body.dclg.xml new file mode 100644 index 00000000..4d99b331 --- /dev/null +++ b/test/data/doc/layer_only_body.dclg.xml @@ -0,0 +1,9 @@ + + + + + + + Main body content + + diff --git a/test/test_deserializer_doclang.py b/test/test_deserializer_doclang.py index e6078332..c394f153 100644 --- a/test/test_deserializer_doclang.py +++ b/test/test_deserializer_doclang.py @@ -1236,3 +1236,33 @@ def test_picture_tabular_chart_content_cdata_cells(): assert doc.pictures[0].meta.tabular_chart.chart_data.grid[0][1].text == "Player expenses in million U.S. dollars" assert doc.pictures[0].meta.tabular_chart.chart_data.grid[1][0].text == "19/20" assert doc.pictures[0].meta.tabular_chart.chart_data.grid[1][1].text == "111" + + + +def test_roundtrip_with_layers(): + """Test roundtrip with content layers.""" + from docling_core.types.doc import ContentLayer + + doc = DoclingDocument(name="t") + # Add items with different layers + doc.add_text(label=DocItemLabel.PAGE_HEADER, text="Header", content_layer=ContentLayer.FURNITURE) + doc.add_text(label=DocItemLabel.TEXT, text="Body text", content_layer=ContentLayer.BODY) + doc.add_text(label=DocItemLabel.PAGE_FOOTER, text="Footer", content_layer=ContentLayer.FURNITURE) + + # Serialize with ALWAYS mode to ensure layers are included + from docling_core.experimental.doclang import LayerMode + ser = DoclangDocSerializer( + doc=doc, + params=DoclangParams(layer_mode=LayerMode.ALWAYS), + ) + dt = ser.serialize().text + + # Deserialize + doc2 = _deserialize(dt) + + # Verify layers are preserved + assert len(doc2.body.children) == 3 + items = [doc2.body.children[i].resolve(doc2) for i in range(3)] + assert items[0].content_layer == ContentLayer.FURNITURE + assert items[1].content_layer == ContentLayer.BODY + assert items[2].content_layer == ContentLayer.FURNITURE diff --git a/test/test_serialization_doclang.py b/test/test_serialization_doclang.py index 124c52b4..3aa11e93 100644 --- a/test/test_serialization_doclang.py +++ b/test/test_serialization_doclang.py @@ -12,6 +12,7 @@ DoclangDocSerializer, DoclangParams, DoclangVocabulary, + LayerMode, WrapMode, ) from docling_core.types.doc import ( @@ -34,7 +35,7 @@ TabularChartMetaField, ) from docling_core.types.doc.base import ImageRefMode -from docling_core.types.doc.document import GraphCell, GraphData, GraphLink, ImageRef, RichTableCell, TableCell +from docling_core.types.doc.document import ContentLayer, GraphCell, GraphData, GraphLink, ImageRef, RichTableCell, TableCell from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel from test.test_serialization import verify from test.test_data_gen_flag import GEN_TEST_DATA @@ -1465,3 +1466,36 @@ def test_suppress_empty_picture_with_nonempty_caption(): result = serialize_doclang(doc, params=params) assert "