Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -4562,6 +4562,30 @@ def _append_item_copies(
for item in node_items:
item_copy = item.model_copy(deep=True)

# handle DocItem pointers (comments)
if isinstance(item, DocItem):
if item.comments:
if isinstance(item_copy, DocItem):
item_copy.comments = self._copy_and_reindex_refs(item.comments, doc=doc, parent_ref=parent_ref)
Comment on lines +4566 to +4569
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For readability, can we keep it more compact? (this one and the other similar patterns)

Suggested change
if isinstance(item, DocItem):
if item.comments:
if isinstance(item_copy, DocItem):
item_copy.comments = self._copy_and_reindex_refs(item.comments, doc=doc, parent_ref=parent_ref)
if isinstance(item, DocItem) and item.comments and isinstance(item_copy, DocItem):
item_copy.comments = self._copy_and_reindex_refs(item.comments, doc=doc, parent_ref=parent_ref)


# handling new references for floating items
if isinstance(item, FloatingItem):
if item.captions:
if isinstance(item_copy, FloatingItem):
item_copy.captions = self._copy_and_reindex_refs(item.captions, doc=doc, parent_ref=parent_ref)

if item.footnotes:
if isinstance(item_copy, FloatingItem):
item_copy.footnotes = self._copy_and_reindex_refs(
item.footnotes, doc=doc, parent_ref=parent_ref
)

if item.references:
if isinstance(item_copy, FloatingItem):
item_copy.references = self._copy_and_reindex_refs(
item.references, doc=doc, parent_ref=parent_ref
)

self._append_item(item=item_copy, parent_ref=parent_ref)

if item_copy.children:
Expand All @@ -4578,6 +4602,31 @@ def _append_item_copies(

return new_refs

def _copy_and_reindex_refs(self, ref_list: list[Any], doc: "DoclingDocument", parent_ref: RefItem) -> list[Any]:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine-grained comment: can we be more precise with the type hints? Should list[Any] rather be list[NodeItem]?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because item.comments expects a list[FineRef] while other fields use list[RefItem], MyPy throws a list invariance error if I type-hint the helper method strictly as list[RefItem] or list[FineRef]. Would you prefer I keep the type hint as list[Any] to handle both cases or split this into two separate helper functions?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ceberam Just a gentle ping on this when you have a moment.

If list[Any] is a bit too loose for the project's typing standards, I can implement a generic TypeVar (e.g., T_Ref = TypeVar("T_Ref", bound="RefItem") alongside Sequence[T_Ref]). This approach should satisfy MyPy's strict list invariance rules while keeping the exact typing intact for both FineRef and RefItem outputs.

Let me know if you'd like me to push that update, or if you're comfortable moving forward with it as-is!

"""Helper to copy referenced items and return their new indices

Args:
    ref_list: The references (e.g., captions, footnotes, comments) whose targets are to be copied.
    doc: The document from which the referenced items are taken.
    parent_ref: The reference of the parent item in the current document to which the copies are appended.

Returns:
    A new list of references pointing to the newly appended items in the current document.
Comment on lines +4606 to +4612
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though we have not been consistent in the past, we want to stick to the Google docstring conventions, at least in new code, as specified in pyproject.toml.

"""
# Nothing to copy: hand back a fresh empty list.
if not ref_list:
return []

# Collects the re-targeted pointers in iteration order of ref_list.
new_refs = []
for ref in ref_list:
# Look the reference up in the source document `doc`.
resolved_item = ref.resolve(doc)
# NOTE(review): indentation is lost in this view; presumably every statement down to
# new_refs.append(...) is guarded by this check, since ref_copy is only bound here - confirm.
if resolved_item:
# Deep-copy the referenced item and append the copy to this document under parent_ref.
ref_copy = resolved_item.model_copy(deep=True)
self._append_item(item=ref_copy, parent_ref=parent_ref)

# Clone the original pointer (rather than building a new one) and re-target only its
# cref at the item that was just appended.
new_ref_pointer = ref.model_copy(deep=True)
new_ref_pointer.cref = ref_copy.get_ref().cref

new_refs.append(new_ref_pointer)
return new_refs

def num_pages(self) -> int:
    """Return the number of pages.

    Returns:
        The number of entries in ``self.pages``.
    """
    # len() of the mapping equals len() of its values view; no need to build the view.
    return len(self.pages)
Expand Down
43 changes: 43 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1986,3 +1986,46 @@ def test_docitem_comments_delete_updates_refs():
# The resolved comment should still work
resolved = updated_para.comments[0].resolve(doc)
assert resolved.text == "Comment on second paragraph."

def test_add_node_items_updates_all_pointers():
    """Check that add_node_items re-targets a copied item's pointers.

    A table carrying one reference, one footnote and one comment is copied
    from a source document into a destination document that already holds a
    text item and a comment. Every pointer on the copied table must resolve,
    against the destination document, to the text it pointed at in the source.
    """
    origin = DoclingDocument(name="source")

    # Items the table will point at.
    origin_reference = origin.add_text(label=DocItemLabel.REFERENCE, text="[1] Source Reference")
    origin_footnote = origin.add_text(label=DocItemLabel.FOOTNOTE, text="* Source Footnote")

    origin_table = origin.add_table(data=TableData(num_rows=1, num_cols=1))
    origin_table.references = [origin_reference.get_ref()]
    origin_table.footnotes = [origin_footnote.get_ref()]
    origin.add_comment(text="Source Comment", targets=[origin_table])

    # Pre-populate the destination so copied items cannot reuse their source indices.
    target = DoclingDocument(name="dest")
    filler = target.add_text(label=DocItemLabel.TEXT, text="Padding Text")
    target.add_comment(text="Padding Comment", targets=[filler])

    target.add_node_items(node_items=[origin_table], doc=origin)
    copied_table = target.tables[0]

    # (pointer list, message if the pointer does not resolve, expected resolved text)
    expectations = (
        (copied_table.references, "Reference pointer is broken!", "[1] Source Reference"),
        (copied_table.footnotes, "Footnote pointer is broken!", "* Source Footnote"),
        (copied_table.comments, "Comment pointer is broken!", "Source Comment"),
    )
    for pointers, broken_message, expected_text in expectations:
        assert len(pointers) == 1
        resolved = pointers[0].resolve(target)
        assert resolved is not None, broken_message
        assert resolved.text == expected_text
Loading