diff --git a/docling_core/experimental/doclang.py b/docling_core/experimental/doclang.py
index 27b17085..ff42e998 100644
--- a/docling_core/experimental/doclang.py
+++ b/docling_core/experimental/doclang.py
@@ -38,6 +38,7 @@
BaseMeta,
BoundingBox,
CodeItem,
+ ContentLayer,
DescriptionMetaField,
DocItem,
DoclingDocument,
@@ -352,6 +353,7 @@ class DoclangToken(str, Enum):
# Geometric and temporal
LOCATION = "location"
+ LAYER = "layer"
HOUR = "hour"
MINUTE = "minute"
SECOND = "second"
@@ -469,6 +471,7 @@ class DoclangVocabulary(BaseModel):
DoclangAttributeKey.VALUE,
DoclangAttributeKey.RESOLUTION,
},
+ DoclangToken.LAYER: {DoclangAttributeKey.CLASS},
DoclangToken.HOUR: {DoclangAttributeKey.VALUE},
DoclangToken.MINUTE: {DoclangAttributeKey.VALUE},
DoclangToken.SECOND: {DoclangAttributeKey.VALUE},
@@ -543,6 +546,7 @@ class DoclangVocabulary(BaseModel):
DoclangToken.PAGE_BREAK,
DoclangToken.TIME_BREAK,
DoclangToken.LOCATION,
+ DoclangToken.LAYER,
DoclangToken.HOUR,
DoclangToken.MINUTE,
DoclangToken.SECOND,
@@ -577,6 +581,7 @@ class DoclangVocabulary(BaseModel):
DoclangToken.TIME_BREAK: DoclangCategory.SPECIAL,
# Geometric
DoclangToken.LOCATION: DoclangCategory.GEOMETRIC,
+ DoclangToken.LAYER: DoclangCategory.GEOMETRIC,
# Temporal
DoclangToken.HOUR: DoclangCategory.TEMPORAL,
DoclangToken.MINUTE: DoclangCategory.TEMPORAL,
@@ -970,6 +975,13 @@ class WrapMode(str, Enum):
WRAP_WHEN_NEEDED = "wrap_when_needed" # wrap text only if it has leading or trailing whitespace
+class LayerMode(str, Enum):
+ """Controls when a layer element is emitted for an item in Doclang output."""
+
+ ALWAYS = "always" # emit a layer element for every item, whatever its content layer
+ MINIMAL = "minimal" # emit a layer element only when the item's content layer is not ContentLayer.BODY
+
+
class ContentType(str, Enum):
"""Content type for Doclang output."""
@@ -991,6 +1003,9 @@ class ContentType(str, Enum):
class DoclangParams(CommonParams):
"""Doclang-specific serialization parameters independent of Doclang."""
+ # Override parent's layers to default to all ContentLayers
+ layers: set[ContentLayer] = set(ContentLayer)
+
# Geometry & content controls (aligned with Doclang defaults)
xsize: int = DOCLANG_DFLT_RESOLUTION
ysize: int = DOCLANG_DFLT_RESOLUTION
@@ -1007,6 +1022,9 @@ class DoclangParams(CommonParams):
# types of content to serialize (only relevant if show_content is True):
content_types: set[ContentType] = _DEFAULT_CONTENT_TYPES
+ # Layer mode
+ layer_mode: LayerMode = LayerMode.MINIMAL
+
# Doclang formatting
do_self_closing: bool = True
pretty_indentation: Optional[str] = 2 * " " # None means minimized serialization, "" means no indentation
@@ -1021,6 +1039,22 @@ class DoclangParams(CommonParams):
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER
+def _create_layer_token(
+ *,
+ item: DocItem,
+ params: DoclangParams,
+) -> str:
+ """Create the `<layer>` token for an item's content layer, or return an empty string when it is not needed."""
+ if params.layer_mode == LayerMode.ALWAYS or (
+ params.layer_mode == LayerMode.MINIMAL and item.content_layer != ContentLayer.BODY
+ ):
+ return DoclangVocabulary.create_selfclosing_token(
+ token=DoclangToken.LAYER,
+ attrs={DoclangAttributeKey.CLASS: item.content_layer.value},
+ )
+ return ""
+
+
def _get_delim(*, params: DoclangParams) -> str:
"""Return record delimiter based on DoclangSerializationMode."""
return "" if params.pretty_indentation is None else "\n"
@@ -1521,6 +1555,9 @@ def _serialize_single_item(
if loc:
parts.append(loc)
+ if layer_token := _create_layer_token(item=item, params=params):
+ parts.append(layer_token)
+
if selected_token:
parts.append(selected_token)
@@ -1688,6 +1725,9 @@ def serialize(
if params.add_location:
body += _create_location_tokens_for_item(item=item, doc=doc, xres=params.xsize, yres=params.ysize)
+ if layer_token := _create_layer_token(item=item, params=params):
+ body += layer_token
+
uri: Optional[str] = None
if params.image_mode in [ImageRefMode.REFERENCED, ImageRefMode.EMBEDDED] and item.image and item.image.uri:
uri = str(item.image.uri)
@@ -1888,6 +1928,9 @@ def serialize(
if params.add_location:
body += _create_location_tokens_for_item(item=item, doc=doc, xres=params.xsize, yres=params.ysize)
+ if layer_token := _create_layer_token(item=item, params=params):
+ body += layer_token
+
if ContentType.TABLE in params.content_types:
otsl_text = self._emit_otsl(
item=item,
@@ -2008,6 +2051,9 @@ def serialize(
loc_str = _create_location_tokens_for_item(item=item, doc=doc, xres=params.xsize, yres=params.ysize)
if loc_str:
parts.append(create_ser_result(text=loc_str, span_source=item))
+ if is_fri:
+ if layer_token := _create_layer_token(item=item, params=params):
+ parts.append(create_ser_result(text=layer_token, span_source=item))
parts.extend(doc_serializer.get_parts(item=item, **kwargs))
text_res = delim.join([p.text for p in parts if p.text])
tok = DoclangToken.FIELD_REGION if is_fri else DoclangToken.FIELD_ITEM
@@ -2104,6 +2150,9 @@ def serialize_captions(
item=cap, doc=self.doc, xres=params.xsize, yres=params.ysize
)
results.append(create_ser_result(text=loc_txt))
+
+ if layer_token := _create_layer_token(item=cap, params=params):
+ results.append(create_ser_result(text=layer_token))
if cap_res.text and ContentType.REF_CAPTION in params.content_types:
cap_res.text = _escape_text(cap_res.text, params)
results.append(cap_res)
@@ -2130,11 +2179,13 @@ def serialize_footnotes(
item=ftn, doc=self.doc, xres=params.xsize, yres=params.ysize
)
+ layer_token = _create_layer_token(item=ftn, params=params)
+
content = ""
if ftn.text and ContentType.REF_FOOTNOTE in params.content_types:
content = _escape_text(ftn.text, params)
- text_res = f"{location}{content}"
+ text_res = f"{location}{layer_token}{content}"
if text_res:
text_res = _wrap(text_res, wrap_tag=DoclangToken.FOOTNOTE.value)
results.append(create_ser_result(text=text_res))
@@ -2378,6 +2429,7 @@ def _walk_children(self, *, doc: DoclingDocument, el: Element, parent: Optional[
DoclangToken.HEAD.value,
DoclangToken.META.value,
DoclangToken.LOCATION.value,
+ DoclangToken.LAYER.value,
}:
continue
self._dispatch_element(doc=doc, el=node, parent=parent)
@@ -2393,6 +2445,7 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]:
if isinstance(el, Element):
if el.tagName not in {
DoclangToken.LOCATION.value,
+ DoclangToken.LAYER.value,
DoclangToken.BR.value,
DoclangToken.BOLD.value,
DoclangToken.ITALIC.value,
@@ -2416,7 +2469,9 @@ def _get_children_simple_text_block(self, element: Element) -> Optional[str]:
def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optional[NodeItem]) -> None:
"""Parse text-like tokens (title, text, caption, footnotes, code, formula)."""
element_children = [
- node for node in el.childNodes if isinstance(node, Element) and node.tagName != DoclangToken.LOCATION.value
+ node
+ for node in el.childNodes
+ if isinstance(node, Element) and node.tagName not in {DoclangToken.LOCATION.value, DoclangToken.LAYER.value}
]
if len(element_children) > 1 or self._get_children_simple_text_block(el) is None:
@@ -2424,6 +2479,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
return
prov_list = self._extract_provenance(doc=doc, el=el)
+ content_layer = self._extract_layer(el=el)
text, formatting = self._extract_text_with_formatting(el)
if not text:
return
@@ -2440,6 +2496,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
code_language=lang_label,
parent=parent,
prov=(prov_list[0] if prov_list else None),
+ content_layer=content_layer,
)
for p in prov_list[1:]:
item.prov.append(p)
@@ -2494,6 +2551,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
parent=parent,
prov=(prov_list[0] if prov_list else None),
formatting=formatting,
+ content_layer=content_layer,
)
for p in prov_list[1:]:
item.prov.append(p)
@@ -2504,6 +2562,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
parent=parent,
prov=(prov_list[0] if prov_list else None),
formatting=formatting,
+ content_layer=content_layer,
)
for p in prov_list[1:]:
item.prov.append(p)
@@ -2549,6 +2608,7 @@ def _parse_heading(self, *, doc: DoclingDocument, el: Element, parent: Optional[
level = 1
# Extract provenance from heading token (if any)
prov_list = self._extract_provenance(doc=doc, el=el)
+ content_layer = self._extract_layer(el=el)
text = self._get_text(el)
text_stripped = text.strip()
if text_stripped:
@@ -2557,6 +2617,7 @@ def _parse_heading(self, *, doc: DoclingDocument, el: Element, parent: Optional[
level=level,
parent=parent,
prov=(prov_list[0] if prov_list else None),
+ content_layer=content_layer,
)
for p in prov_list[1:]:
item.prov.append(p)
@@ -2672,13 +2733,15 @@ def _parse_table_group(self, *, doc: DoclingDocument, el: Element, parent: Optio
return
# Extract table provenance from leading tokens
tbl_provs = self._extract_provenance(doc=doc, el=otsl_el)
- # Get inner XML excluding location tokens (work directly with parsed DOM)
- inner = self._inner_xml(otsl_el, exclude_tags={"location"})
+ content_layer = self._extract_layer(el=otsl_el)
+ # Get inner XML excluding location and layer tokens (work directly with parsed DOM)
+ inner = self._inner_xml(otsl_el, exclude_tags={"location", "layer"})
tbl = doc.add_table(
data=TableData(),
caption=caption,
parent=parent,
prov=(tbl_provs[0] if tbl_provs else None),
+ content_layer=content_layer,
)
tbl_content = _wrap(text=inner, wrap_tag=DoclangToken.OTSL.value)
td = self._parse_otsl_table_content(otsl_content=tbl_content, doc=doc, parent=tbl)
@@ -2693,17 +2756,20 @@ def _parse_picture_group(self, *, doc: DoclingDocument, el: Element, parent: Opt
caption = self._extract_caption(doc=doc, el=el)
footnotes = self._extract_footnotes(doc=doc, el=el)
- # Extract provenance from the block (locations appear inside it)
+ # Extract provenance and layer from the block (locations and layer appear inside it)
prov_list: list[ProvenanceItem] = []
+ content_layer: Optional[ContentLayer] = None
picture_el = self._first_child(el, DoclangToken.PICTURE.value)
if picture_el is not None:
prov_list = self._extract_provenance(doc=doc, el=picture_el)
+ content_layer = self._extract_layer(el=picture_el)
# Create the picture item first, attach caption and provenance
pic = doc.add_picture(
caption=caption,
parent=parent,
prov=(prov_list[0] if prov_list else None),
+ content_layer=content_layer,
)
for p in prov_list[1:]:
pic.prov.append(p)
@@ -3102,3 +3168,14 @@ def _extract_provenance(self, *, doc: DoclingDocument, el: Element) -> list[Prov
res_for_group = None
return provs
+
+ def _extract_layer(self, *, el: Element) -> Optional[ContentLayer]:
+ """Return the content layer from the first direct-child layer element with a valid value, else None."""
+ for node in el.childNodes:
+ if isinstance(node, Element) and node.tagName == DoclangToken.LAYER.value:
+ if layer_value := node.getAttribute(DoclangAttributeKey.CLASS.value):
+ try:
+ return ContentLayer(layer_value)
+ except ValueError:
+ pass # not a known ContentLayer value: skip it and keep scanning the remaining children
+ return None
diff --git a/test/conftest.py b/test/conftest.py
index 9ee385cb..3e99ec43 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -495,3 +495,56 @@ def _doc_with_handwritten() -> DoclingDocument:
def doc_with_handwritten(_doc_with_handwritten: DoclingDocument) -> DoclingDocument:
"""Copy of a document with handwritten text for each test function."""
return _doc_with_handwritten.model_copy(deep=True)
+
+
+@pytest.fixture(scope="session")
+def _doc_with_layers() -> DoclingDocument:
+ """Session-scoped one-page document holding a furniture header, a body text and a furniture footer."""
+ from docling_core.types.doc.document import ContentLayer
+
+ doc = DoclingDocument(name="")
+ doc.add_page(page_no=1, size=Size(width=100, height=100), image=None)
+
+ # Page header, explicitly placed in the FURNITURE layer
+ doc.add_text(
+ label=DocItemLabel.PAGE_HEADER,
+ text="Page Header",
+ prov=ProvenanceItem(
+ page_no=1,
+ bbox=BoundingBox.from_tuple((1, 2, 3, 4), origin=CoordOrigin.BOTTOMLEFT),
+ charspan=(0, 11),
+ ),
+ content_layer=ContentLayer.FURNITURE,
+ )
+
+ # Regular text, explicitly placed in the BODY layer
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ text="Main body content",
+ prov=ProvenanceItem(
+ page_no=1,
+ bbox=BoundingBox.from_tuple((5, 6, 7, 8), origin=CoordOrigin.BOTTOMLEFT),
+ charspan=(0, 17),
+ ),
+ content_layer=ContentLayer.BODY,
+ )
+
+ # Page footer, explicitly placed in the FURNITURE layer
+ doc.add_text(
+ label=DocItemLabel.PAGE_FOOTER,
+ text="Page Footer",
+ prov=ProvenanceItem(
+ page_no=1,
+ bbox=BoundingBox.from_tuple((9, 10, 11, 12), origin=CoordOrigin.BOTTOMLEFT),
+ charspan=(0, 11),
+ ),
+ content_layer=ContentLayer.FURNITURE,
+ )
+
+ return doc
+
+
+@pytest.fixture(scope="function")
+def doc_with_layers(_doc_with_layers: DoclingDocument) -> DoclingDocument:
+ """Deep copy of the session-scoped document with content layers, made anew for each test function."""
+ return _doc_with_layers.model_copy(deep=True)
diff --git a/test/data/doc/barchart.out.dclg.xml b/test/data/doc/barchart.out.dclg.xml
index 3a116d28..00723add 100644
--- a/test/data/doc/barchart.out.dclg.xml
+++ b/test/data/doc/barchart.out.dclg.xml
@@ -4,6 +4,7 @@
+
Probability, Combinatorics and Control
diff --git a/test/data/doc/ddoc_0.v0.gt.dclg.xml b/test/data/doc/ddoc_0.v0.gt.dclg.xml
index 18c1eac8..7bcca750 100644
--- a/test/data/doc/ddoc_0.v0.gt.dclg.xml
+++ b/test/data/doc/ddoc_0.v0.gt.dclg.xml
@@ -4,6 +4,7 @@
+
ndbinfo_select_all - Select From ndbinfo Tables
@@ -237,6 +238,7 @@
+
Print program argument list and exit.
@@ -244,6 +246,7 @@
+
4253
diff --git a/test/data/doc/ddoc_0.v1.gt.dclg.xml b/test/data/doc/ddoc_0.v1.gt.dclg.xml
index 89402de9..8c3717ef 100644
--- a/test/data/doc/ddoc_0.v1.gt.dclg.xml
+++ b/test/data/doc/ddoc_0.v1.gt.dclg.xml
@@ -4,6 +4,7 @@
+
@@ -195,11 +196,13 @@
+
+
diff --git a/test/data/doc/ddoc_0.v2.gt.dclg.xml b/test/data/doc/ddoc_0.v2.gt.dclg.xml
index 3132774f..8ba8bc0f 100644
--- a/test/data/doc/ddoc_0.v2.gt.dclg.xml
+++ b/test/data/doc/ddoc_0.v2.gt.dclg.xml
@@ -1 +1 @@
-
+
diff --git a/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml b/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml
index 4348c987..31017481 100644
--- a/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml
+++ b/test/data/doc/doclang_ref/002a_table_order/output.dclg.xml
@@ -4,6 +4,7 @@
+
13040.19
@@ -11,6 +12,7 @@
+
- 16 -
@@ -22,6 +24,7 @@
+
diff --git a/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml b/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml
index bcdfc57e..05ba49d8 100644
--- a/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml
+++ b/test/data/doc/doclang_ref/002a_table_order/output_no_content.dclg.xml
@@ -4,12 +4,14 @@
+
+
@@ -17,6 +19,7 @@
+
diff --git a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml
index 753247cc..6a74414c 100644
--- a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml
+++ b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output.dclg.xml
@@ -15,6 +15,7 @@
+
Index
@@ -22,6 +23,7 @@
+
477
diff --git a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml
index 6c6ee685..81500116 100644
--- a/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml
+++ b/test/data/doc/kv/01d07afe1cb54ecd23eedfe4d91b81dd88e61bf4e0dbe2467784db4177a6c691/output_no_content.dclg.xml
@@ -14,11 +14,13 @@
+
+
diff --git a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml
index 70dd20c8..7da22882 100644
--- a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml
+++ b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output.dclg.xml
@@ -457,6 +457,7 @@
+
31
@@ -464,6 +465,7 @@
+
FAA Chart Users Guide VFR Symbology - Sectional and Terminal Area Charts
diff --git a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml
index 68b5c262..584cde07 100644
--- a/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml
+++ b/test/data/doc/kv/08212053e2db1a70dd60a4f85650ceb33d7519af34f502e3ac894389d76663d6/output_no_content.dclg.xml
@@ -376,11 +376,13 @@
+
+
diff --git a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml
index face4c7d..f7123a21 100644
--- a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml
+++ b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output.dclg.xml
@@ -350,6 +350,7 @@
+
EP3800018A1
diff --git a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml
index d16373c6..45a7be33 100644
--- a/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml
+++ b/test/data/doc/kv/0cb12d33b02867dc8708f4877480743533f1248683091188000d25456ba12d73/output_no_content.dclg.xml
@@ -312,5 +312,6 @@
+
diff --git a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml
index 1adde255..dce1bb06 100644
--- a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml
+++ b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output.dclg.xml
@@ -608,6 +608,7 @@
+
(57)
diff --git a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml
index 2d4c8df0..e331a754 100644
--- a/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml
+++ b/test/data/doc/kv/167f6658cd410df8d4d14acc53e8c8f509e94c44b8005e6b76de8d17329363a7/output_no_content.dclg.xml
@@ -500,5 +500,6 @@
+
diff --git a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml
index c80e0e18..fe80df75 100644
--- a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml
+++ b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml
@@ -1398,6 +1398,7 @@
+
ANNEX VII
@@ -1405,6 +1406,7 @@
+
96
@@ -1412,6 +1414,7 @@
+
FR
diff --git a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml
index 6f658ec5..53969f0b 100644
--- a/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml
+++ b/test/data/doc/kv/587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml
@@ -1249,17 +1249,20 @@
+
+
+
diff --git a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml
index de49aedc..7c48dad6 100644
--- a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml
+++ b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output.dclg.xml
@@ -4,6 +4,7 @@
+
Michigan Department of Treasury (Rev. 05-19), Page 1 of 2
@@ -11,6 +12,7 @@
+
Issued under authority of Public Act 281 of 1967 , as amended .
@@ -1606,6 +1608,7 @@
+
+ 0000 2019 05 01 27 1
@@ -1613,6 +1616,7 @@
+
Continue on page 2 . This form cannot be processed if page 2 is not completed and included.
diff --git a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml
index 7114986b..3e387beb 100644
--- a/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml
+++ b/test/data/doc/kv/ba4120cada21304563625490e9ad13911e96114d3f07df056a6bf62397a859e1/output_no_content.dclg.xml
@@ -4,12 +4,14 @@
+
+
@@ -1428,11 +1430,13 @@
+
+
diff --git a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml
index de48bc40..d9dca05c 100644
--- a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml
+++ b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output.dclg.xml
@@ -372,6 +372,7 @@
+
Form PCT/ISA/210 (second sheet) (April 2005)
@@ -379,6 +380,7 @@
+
2
diff --git a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml
index b94dc6ca..2797eb48 100644
--- a/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml
+++ b/test/data/doc/kv/fedf68d45c1acd279f7c34fd5b4cbae80677138bdc1d03b2869466e0cf4e89e9/output_no_content.dclg.xml
@@ -328,11 +328,13 @@
+
+
diff --git a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml
index c80e0e18..fe80df75 100644
--- a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml
+++ b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output.dclg.xml
@@ -1398,6 +1398,7 @@
+
ANNEX VII
@@ -1405,6 +1406,7 @@
+
96
@@ -1412,6 +1414,7 @@
+
FR
diff --git a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml
index 6f658ec5..53969f0b 100644
--- a/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml
+++ b/test/data/doc/kv/non_seq_cell_ids_587558a074858891bdf1d625a9e9fce4ea116ac61bd276d86a069697744b62a4/output_no_content.dclg.xml
@@ -1249,17 +1249,20 @@
+
+
+
diff --git a/test/data/doc/layer_always_mode.dclg.xml b/test/data/doc/layer_always_mode.dclg.xml
new file mode 100644
index 00000000..e39088e7
--- /dev/null
+++ b/test/data/doc/layer_always_mode.dclg.xml
@@ -0,0 +1,26 @@
+
+
+
+
+
+
+
+ Page Header
+
+
+
+
+
+
+
+ Main body content
+
+
+
+
+
+
+
+ Page Footer
+
+
diff --git a/test/data/doc/layer_minimal_mode.dclg.xml b/test/data/doc/layer_minimal_mode.dclg.xml
new file mode 100644
index 00000000..509f5798
--- /dev/null
+++ b/test/data/doc/layer_minimal_mode.dclg.xml
@@ -0,0 +1,25 @@
+
+
+
+
+
+
+
+ Page Header
+
+
+
+
+
+
+ Main body content
+
+
+
+
+
+
+
+ Page Footer
+
+
diff --git a/test/data/doc/layer_only_body.dclg.xml b/test/data/doc/layer_only_body.dclg.xml
new file mode 100644
index 00000000..4d99b331
--- /dev/null
+++ b/test/data/doc/layer_only_body.dclg.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+ Main body content
+
+
diff --git a/test/test_deserializer_doclang.py b/test/test_deserializer_doclang.py
index e6078332..c394f153 100644
--- a/test/test_deserializer_doclang.py
+++ b/test/test_deserializer_doclang.py
@@ -1236,3 +1236,33 @@ def test_picture_tabular_chart_content_cdata_cells():
assert doc.pictures[0].meta.tabular_chart.chart_data.grid[0][1].text == "Player expenses in million U.S. dollars"
assert doc.pictures[0].meta.tabular_chart.chart_data.grid[1][0].text == "19/20"
assert doc.pictures[0].meta.tabular_chart.chart_data.grid[1][1].text == "111"
+
+
+
+def test_roundtrip_with_layers():
+ """Content layers survive a Doclang serialize/deserialize round trip."""
+ from docling_core.types.doc import ContentLayer
+
+ doc = DoclingDocument(name="t")
+ # One BODY item placed between two FURNITURE items
+ doc.add_text(label=DocItemLabel.PAGE_HEADER, text="Header", content_layer=ContentLayer.FURNITURE)
+ doc.add_text(label=DocItemLabel.TEXT, text="Body text", content_layer=ContentLayer.BODY)
+ doc.add_text(label=DocItemLabel.PAGE_FOOTER, text="Footer", content_layer=ContentLayer.FURNITURE)
+
+ # ALWAYS mode emits a layer token for every item, including the BODY one
+ from docling_core.experimental.doclang import LayerMode
+ ser = DoclangDocSerializer(
+ doc=doc,
+ params=DoclangParams(layer_mode=LayerMode.ALWAYS),
+ )
+ dt = ser.serialize().text
+
+ # Parse the serialized text back into a document
+ doc2 = _deserialize(dt)
+
+ # Each top-level item must come back with the layer it was created with
+ assert len(doc2.body.children) == 3
+ items = [doc2.body.children[i].resolve(doc2) for i in range(3)]
+ assert items[0].content_layer == ContentLayer.FURNITURE
+ assert items[1].content_layer == ContentLayer.BODY
+ assert items[2].content_layer == ContentLayer.FURNITURE
diff --git a/test/test_serialization_doclang.py b/test/test_serialization_doclang.py
index 124c52b4..3aa11e93 100644
--- a/test/test_serialization_doclang.py
+++ b/test/test_serialization_doclang.py
@@ -12,6 +12,7 @@
DoclangDocSerializer,
DoclangParams,
DoclangVocabulary,
+ LayerMode,
WrapMode,
)
from docling_core.types.doc import (
@@ -34,7 +35,7 @@
TabularChartMetaField,
)
from docling_core.types.doc.base import ImageRefMode
-from docling_core.types.doc.document import GraphCell, GraphData, GraphLink, ImageRef, RichTableCell, TableCell
+from docling_core.types.doc.document import ContentLayer, GraphCell, GraphData, GraphLink, ImageRef, RichTableCell, TableCell
from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
from test.test_serialization import verify
from test.test_data_gen_flag import GEN_TEST_DATA
@@ -1465,3 +1466,36 @@ def test_suppress_empty_picture_with_nonempty_caption():
result = serialize_doclang(doc, params=params)
assert "