diff --git a/docling_core/transforms/chunker/doc_chunk.py b/docling_core/transforms/chunker/doc_chunk.py index b0bcc6fe..e441774e 100644 --- a/docling_core/transforms/chunker/doc_chunk.py +++ b/docling_core/transforms/chunker/doc_chunk.py @@ -1,14 +1,19 @@ """Data model for chunk metadata.""" +from __future__ import annotations + import logging import re +import warnings +from copy import deepcopy from typing import Annotated, ClassVar, Final, Literal, Optional from pydantic import Field, StringConstraints, field_validator from docling_core.search.package import VERSION_PATTERN from docling_core.transforms.chunker import BaseChunk, BaseMeta -from docling_core.types.doc.document import DocItem, DocumentOrigin +from docling_core.transforms.serializer.common import DocSerializer +from docling_core.types.doc.document import DocItem, DoclingDocument, DocumentOrigin, InlineGroup, ListGroup, RefItem _VERSION: Final = "1.0.0" @@ -87,3 +92,130 @@ class DocChunk(BaseChunk): """Data model for document chunks.""" meta: DocMeta + + def _get_top_containing_items(self, doc: DoclingDocument) -> list[DocItem] | None: + """Get top-level document items that contain this chunk's items. + + Traverses the document tree upward from each item in the chunk to find + the top-level items (direct children of document body) that contain them. + Maintains the original document reading order. + + Args: + doc: The DoclingDocument containing this chunk. + + Returns: + List of top-level DocItems in document order, or None if no items found. 
+ """ + + items = {} + ref_items = [item.self_ref for item in self.meta.doc_items] + for item in ref_items: + # traverse document tree till top level (body) + top_item = RefItem(cref=item).resolve(doc) + while top_item.parent != doc.body.get_ref(): + top_item = top_item.parent.resolve(doc) + items[top_item.self_ref] = top_item + + # maintain the reading order as in the original document + doc_body_refs = [ref.cref for ref in doc.body.children] + doc_ordered_refs = [ref for ref in doc_body_refs if ref in items] + if len(doc_ordered_refs) > 0: + return [items[ref] for ref in doc_ordered_refs] + return None + + def expand_to_item(self, dl_doc: DoclingDocument, serializer: DocSerializer) -> DocChunk: + """Expand chunk to include complete top-level document items. + + Expands the chunk to contain full top-level items (sections, tables, lists) + rather than partial content. This ensures semantic completeness by including + all content from the top-level items that contain any part of the original chunk. + + Args: + dl_doc: The DoclingDocument containing this chunk. + serializer: Serializer to convert document items to text. + + Returns: + New DocChunk with expanded content and updated metadata, or the original + chunk if expansion fails or yields no content. 
+ + Note: + - It is recommended to use same serializer as the original document + """ + top_items = self._get_top_containing_items(dl_doc) + if not top_items: + _logger.warning(f"error in getting top items of {self}") + return self + + content = "" + all_doc_items = [] + + for top_item in top_items: + if isinstance(top_item, ListGroup | InlineGroup | DocItem): + try: + ser_res = serializer.serialize(item=top_item) + content += ser_res.text + "\n" + # Extract doc_items from serialization result + all_doc_items.extend(ser_res.get_unique_doc_items()) + + except Exception as e: + _logger.warning(f"error in extacting text of {top_item}: {e}") + + if len(content.strip()) == 0: + _logger.warning(f"expansion of {self} did not yield any text") + return self + + meta = deepcopy(self.meta) + meta.doc_items = all_doc_items + return DocChunk( + text=content, + meta=meta, + ) + + def expand_to_page(self, doc: DoclingDocument, serializer: DocSerializer) -> DocChunk: + """Expand chunk to include all content from its pages. + + Expands the chunk to contain all content from the pages it spans. This is + useful for maintaining page-level context and ensuring complete page coverage + in retrieval applications. + + Args: + doc: The DoclingDocument containing this chunk. + serializer: Serializer to convert document content to text. + + Returns: + New DocChunk with all content from the chunk's pages and updated metadata, + or the original chunk if expansion is not possible. + + Raises: + UserWarning: If document has no pages or chunk items have no page provenance. + + Example: + If a chunk spans pages 2-3, this expands it to include all content + from both pages, not just the original chunk's items. 
+ + Note: + - It is recommended to use same serializer as the original document + """ + + page_ids = [i.page_no for item in self.meta.doc_items for i in item.prov] + + if len(doc.pages) == 0 or page_ids is None or len(page_ids) == 0: + warnings.warn( + f"cannot expand to page the following chunk: {self}. \n Probably pagination was not supported in document conversion" + ) + return self + + page_serializer = deepcopy(serializer) # avoid mutating the serializer + page_serializer.params.pages = set(page_ids) + ser_res = page_serializer.serialize() + + # Extract doc_items from serialization result + expanded_doc_items = ser_res.get_unique_doc_items() + + # Update metadata + meta = deepcopy(self.meta) + meta.doc_items = expanded_doc_items + return DocChunk( + text=ser_res.text, + meta=meta, + ) diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index 903fcbbd..95850e11 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -440,7 +440,7 @@ def get_header_and_body_lines( the header row and separator row, and body_lines contains the data rows. """ - lines = [line for line in table_text.split("\n") if line.strip()] + lines = [line for line in table_text.splitlines(True) if line.strip()] if len(lines) < 2: # Not enough lines for a proper markdown table (need at least header + separator) diff --git a/test/data/chunker/0d_out_chunks.json b/test/data/chunker/0d_out_chunks.json index 6b1f306f..d3a29df3 100644 --- a/test/data/chunker/0d_out_chunks.json +++ b/test/data/chunker/0d_out_chunks.json @@ -259,7 +259,7 @@ } }, { - "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. 
We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || - | - | - | - | - | - | - | - || | | TTS | Pages/s | Mem | TTS | Pages/s | Mem || Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |", + "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend |\n| - | - | - | - | - | - | - | - |\n| | | TTS | Pages/s | Mem | TTS | Pages/s | Mem |\n| Apple M3 Max | 4 | 177 s 167 s | 1.27 1.34 | 6.20 GB | 103 s 92 s | 2.18 2.45 | 2.56 GB |\n", "meta": { "doc_items": [ "#/tables/0" @@ -271,7 +271,7 @@ } }, { - "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. 
We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend || (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |", + "text": "Table 1: Runtime characteristics of Docling with the standard model pipeline and settings, on our test dataset of 225 pages, on two different systems. OCR is disabled. We show the time-to-solution (TTS), computed throughput in pages per second, and the peak memory used (resident set size) for both the Docling-native PDF backend and for the pypdfium backend, using 4 and 16 threads.\n\n| CPU | Thread budget | native backend | native backend | native backend | pypdfium backend | pypdfium backend | pypdfium backend |\n| (16 cores) Intel(R) Xeon E5-2690 | 16 4 16 | 375 s 244 s | 0.60 0.92 | 6.16 GB | 239 s 143 s | 0.94 1.57 | 2.42 GB |", "meta": { "doc_items": [ "#/tables/0" @@ -638,7 +638,7 @@ } }, { - "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. 
Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset.", + "text": "to avoid this at any cost in order to have clear, unbiased baseline numbers for human document-layout annotation. Third, we introduced the feature of snapping boxes around text segments to obtain a pixel-accurate annotation and again reduce time and effort. The CCS annotation tool automatically shrinks every user-drawn box to the minimum bounding-box around the enclosed text-cells for all purely text-based segments, which excludes only Table and Picture . For the latter, we instructed annotation staff to minimise inclusion of surrounding whitespace while including all graphical lines. A downside of snapping boxes to enclosed text cells is that some wrongly parsed PDF pages cannot be annotated correctly and need to be skipped. Fourth, we established a way to flag pages as rejected for cases where no valid annotation according to the label guidelines could be achieved. Example cases for this would be PDF pages that render incorrectly or contain layouts that are impossible to capture with non-overlapping rectangles. Such rejected pages are not contained in the final dataset. 
With all these measures in place, experienced annotation staff managed to annotate a single page", "meta": { "doc_items": [ "#/texts/513" @@ -650,7 +650,7 @@ } }, { - "text": "With all these measures in place, experienced annotation staff managed to annotate a single page in a typical timeframe of 20s to 60s, depending on its complexity.", + "text": "in a typical timeframe of 20s to 60s, depending on its complexity.", "meta": { "doc_items": [ "#/texts/513" @@ -703,7 +703,7 @@ } }, { - "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture .", + "text": "In Table 2, we present baseline experiments (given in mAP) on Mask R-CNN [12], Faster R-CNN [11], and YOLOv5 [13]. Both training and evaluation were performed on RGB images with dimensions of 1025 \u00d7 1025 pixels. For training, we only used one annotation in case of redundantly annotated pages. 
As one can observe, the variation in mAP between the models is rather low, but overall between 6 and 10% lower than the mAP computed from the pairwise human annotations on triple-annotated pages. This gives a good indication that the DocLayNet dataset poses a worthwhile challenge for the research community to close the gap between human recognition and ML approaches. It is interesting to see that Mask R-CNN and Faster R-CNN produce very comparable mAP scores, indicating that pixel-based image segmentation derived from bounding-boxes does not help to obtain better predictions. On the other hand, the more recent Yolov5x model does very well and even out-performs humans on selected labels such as Text , Table and Picture . This is not entirely surprising, as Text , Table and", "meta": { "doc_items": [ "#/texts/523" @@ -715,7 +715,7 @@ } }, { - "text": "This is not entirely surprising, as Text , Table and Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.", + "text": "Picture are abundant and the most visually distinctive in a document.\ncoioct dcochon modols\nmak enbrel\nFigure 3: Page 6 of the DocLayNet paper. 
If recognized, metadata such as authors are appearing first under the title. Elements recognized as page headers or footers are suppressed in Markdown to deliver uninterrupted content in reading order. Tables are inserted in reading order. The paragraph in '5. Experiments' wrapping over the column end is broken up in two and interrupted by the table.\nKDD '22, August 14-18, 2022, Washington, DC, USA\nBirgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter Staar\nTable 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %\nbetween pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.\nof row 'Total') in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", "meta": { "doc_items": [ "#/texts/523", @@ -725,18 +725,7 @@ "#/texts/529", "#/texts/530", "#/texts/531", - "#/texts/532" - ], - "headings": [ - "Docling Technical Report", - "Baselines for Object Detection" - ] - } - }, - { - "text": "of row 'Total') in the train, test and validation sets. 
The inter-annotator agreement is computed as the mAP@0.5-0.95 metric", - "meta": { - "doc_items": [ + "#/texts/532", "#/texts/533" ], "headings": [ @@ -770,7 +759,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n", "meta": { "doc_items": [ "#/tables/3" @@ -782,7 +771,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 
|", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -794,7 +783,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n", "meta": { "doc_items": [ "#/tables/3" @@ -806,7 +795,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 
0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -818,7 +807,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page-footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -830,7 +819,7 @@ } }, { 
- "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page-header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -842,7 +831,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 
0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -854,7 +843,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Section-header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -866,7 +855,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", + "text": "| class label | Count | % 
of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -878,7 +867,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -890,7 +879,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple 
inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n", "meta": { "doc_items": [ "#/tables/3" @@ -902,7 +891,7 @@ } }, { - "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", + "text": "| class label | Count | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |", "meta": { "doc_items": [ "#/tables/3" @@ -930,7 +919,7 @@ } }, { - "text": "| | | % of Total | % 
of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten || Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |\n| Caption | 22524 | 2.04 | 1.77 | 2.32 | 84-89 | 40-61 | 86-92 | 94-99 | 95-99 | 69-78 | n/a |\n", "meta": { "doc_items": [ "#/tables/4" @@ -942,7 +931,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | 
triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Footnote | 6318 | 0.60 | 0.31 | 0.58 | 83-91 | n/a | 100 | 62-88 | 85-94 | n/a | 82-97 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -954,7 +943,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Formula | 25027 | 2.25 | 1.90 | 2.96 | 83-85 | n/a | n/a | 84-87 | 86-96 | n/a | n/a |\n", "meta": { "doc_items": [ "#/tables/4" @@ -966,7 +955,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |", + "text": "| | | % of Total | 
% of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| List-item | 185660 | 17.19 | 13.34 | 15.82 | 87-88 | 74-83 | 90-92 | 97-97 | 81-85 | 75-88 | 93-95 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -978,7 +967,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page- footer | 70878 | 6.51 | 5.58 | 6.00 | 93-94 | 88-90 | 95-96 | 100 | 92-97 | 100 | 96-98 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -990,7 +979,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 
(%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Page- header | 58022 | 5.10 | 6.70 | 5.06 | 85-89 | 66-76 | 90-94 | 98-100 | 91-92 | 97-99 | 81-86 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1002,7 +991,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Picture | 45976 | 4.21 | 2.78 | 5.31 | 69-71 | 56-59 | 82-86 | 69-82 | 80-95 | 66-71 | 59-76 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1014,7 +1003,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator 
mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Section- header | 142884 | 12.60 | 15.77 | 12.85 | 83-84 | 76-81 | 90-92 | 94-95 | 87-94 | 69-73 | 78-86 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1026,7 +1015,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - 
|\n| Table | 34733 | 3.20 | 2.27 | 3.60 | 77-81 | 75-80 | 83-86 | 98-99 | 58-80 | 79-84 | 70-85 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1038,7 +1027,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Text | 510377 | 45.82 | 49.28 | 45.00 | 84-86 | 81-86 | 88-93 | 89-93 | 87-92 | 71-79 | 87-95 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1050,7 +1039,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple 
inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Title | 5071 | 0.47 | 0.30 | 0.50 | 60-72 | 24-63 | 50-63 | 94-100 | 82-96 | 68-79 | 24-56 |\n", "meta": { "doc_items": [ "#/tables/4" @@ -1062,7 +1051,7 @@ } }, { - "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n| - | - | - | - | - | - | - | - | - | - | - | - || Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |\n3\n,\ngovernment offices,\nWe reviewed the col-\n,\nPage-\nTitle and\n.", + "text": "| | | % of Total | % of Total | % of Total | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) | triple inter- annotator mAP @ 0.5-0.95 (%) |\n\n| - | - | - | - | - | - | - | - | - | - | - | - |\n| Total | 1107470 | 941123 | 99816 | 66531 | 82-83 | 71-74 | 79-81 | 89-94 | 86-91 | 71-76 | 68-85 |\n3\n,\ngovernment offices,\nWe reviewed the col-\n,\nPage-\nTitle and\n.", "meta": { "doc_items": [ "#/tables/4", diff --git a/test/test_doc_chunk_expansion.py b/test/test_doc_chunk_expansion.py new file mode 100644 index 00000000..9d94ec3a --- /dev/null +++ b/test/test_doc_chunk_expansion.py @@ -0,0 +1,367 @@ +"""Tests for DocChunk expansion methods.""" + +import pytest + +from docling_core.transforms.chunker.doc_chunk import DocChunk, DocMeta +from 
def check_lines_equal_in_order(text_a: str, text_b: str) -> bool:
    """Return whether the lines of ``text_a`` form an ordered subsequence of ``text_b``.

    Both strings are split into lines and whitespace-only lines are dropped.
    Every remaining line of ``text_a`` must then occur in ``text_b`` in the
    same relative order. Matching lines need not be consecutive in ``text_b``,
    and lines are compared exactly (surrounding whitespace is significant).

    Args:
        text_a: Candidate subset text.
        text_b: Superset text to search in.

    Returns:
        True if all non-blank lines of ``text_a`` appear in ``text_b`` in the
        same order. A ``text_a`` without non-blank lines is always contained.

    Raises:
        TypeError: If either argument is not a string.
    """
    if not isinstance(text_a, str) or not isinstance(text_b, str):
        raise TypeError("Both inputs must be strings.")

    lines_a = [line for line in text_a.splitlines() if line.strip()]
    lines_b = [line for line in text_b.splitlines() if line.strip()]

    # ``in`` on an iterator consumes it up to and including the first match,
    # so sharing one iterator across all lookups enforces the ordering: each
    # line of A is only searched for after the position of the previous match.
    remaining_b = iter(lines_b)
    return all(line in remaining_b for line in lines_a)
@pytest.fixture
def sample_doc():
    """Build a two-page sample document with headings, paragraphs, a list and a table.

    Content is associated with a page through the ``page_no`` of the
    ``ProvenanceItem`` passed as ``prov`` to each ``add_*`` call: "Section 1"
    and its paragraphs are placed on page 1; "Section 2", the list and the
    table are placed on page 2.
    """
    from docling_core.types.doc.document import BoundingBox, ProvenanceItem

    def prov(page_no, left, top, right, bottom, charspan):
        """Create a ProvenanceItem for the given page, bounding box and char span."""
        return ProvenanceItem(
            page_no=page_no,
            bbox=BoundingBox(l=left, t=top, r=right, b=bottom),
            charspan=charspan,
        )

    doc = DoclingDocument(name="test_doc")

    # Register both pages; the returned page objects are not needed.
    doc.add_page(size=Size(width=612, height=792), page_no=1)
    doc.add_page(size=Size(width=612, height=792), page_no=2)

    # Section 1 on page 1
    doc.add_heading(
        text="Section 1",
        level=1,
        prov=prov(1, 50, 50, 550, 80, (0, 9)),
    )
    doc.add_text(
        text="This is the first paragraph.",
        label=DocItemLabel.PARAGRAPH,
        prov=prov(1, 50, 90, 550, 120, (10, 38)),
    )
    doc.add_text(
        text="This is the second paragraph.",
        label=DocItemLabel.PARAGRAPH,
        prov=prov(1, 50, 130, 550, 160, (39, 68)),
    )

    # Section 2 on page 2
    doc.add_heading(
        text="Section 2",
        level=1,
        prov=prov(2, 50, 50, 550, 80, (69, 78)),
    )
    doc.add_text(
        text="Content in section 2.",
        label=DocItemLabel.PARAGRAPH,
        prov=prov(2, 50, 90, 550, 120, (79, 100)),
    )

    # A three-item list on page 2
    list_group = doc.add_list_group()
    list_items = (
        ("First list item", 130, 150, (101, 116)),
        ("Second list item", 160, 180, (117, 133)),
        ("Third list item", 190, 210, (134, 149)),
    )
    for text, top, bottom, charspan in list_items:
        doc.add_list_item(
            text=text,
            enumerated=False,
            parent=list_group,
            prov=prov(2, 70, top, 550, bottom, charspan),
        )

    # A 4x2 table (header row + three value rows) on page 2
    table_data = TableData(num_cols=2)
    table_rows = (
        ["Header 1", "Header 2"],
        ["Value 1", "Value 2"],
        ["Value 3", "Value 4"],
        ["Value 5", "Value 6"],
    )
    for row in table_rows:
        table_data.add_row(row)
    doc.add_table(
        data=table_data,
        prov=prov(2, 50, 220, 550, 300, (150, 200)),
    )

    return doc


class MarkdownSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that serializes tables with MarkdownTableSerializer."""

    def get_serializer(self, doc: DoclingDocument):
        """Return a ChunkingDocSerializer for ``doc`` using Markdown tables."""
        return ChunkingDocSerializer(
            doc=doc,
            table_serializer=MarkdownTableSerializer(),
        )


@pytest.fixture
def hybrid_chunker():
    """Create a HybridChunker limited to MAX_TOKENS tokens per chunk."""
    return HybridChunker(
        tokenizer=HuggingFaceTokenizer.from_pretrained(
            model_name=EMBED_MODEL_ID,
            max_tokens=MAX_TOKENS,
        ),
        serializer_provider=MarkdownSerializerProvider(),
        repeat_table_header=True,
    )


@pytest.fixture
def sample_chunks(sample_doc, hybrid_chunker):
    """Chunk ``sample_doc`` with ``hybrid_chunker`` and return the chunks as a list.

    The fixture uses pytest's default (function) scope, so the chunks are
    rebuilt for every test that requests them.
    """
    chunks = list(hybrid_chunker.chunk(dl_doc=sample_doc))
    assert len(chunks) > 0, "Expected at least one chunk to be created"
    return chunks


@pytest.fixture
def sample_serializer(sample_doc, hybrid_chunker):
    """Return the serializer that ``hybrid_chunker``'s provider creates for ``sample_doc``."""
    return hybrid_chunker.serializer_provider.get_serializer(sample_doc)
class TestGetTopContainingItems:
    """Tests for _get_top_containing_items method."""

    def _find_chunk_item_in_descendants(self, item, doc, target_refs):
        """Return True if ``item`` or any of its descendants has a ref in ``target_refs``."""
        if item.self_ref in target_refs:
            return True

        # Items without a ``children`` attribute (or with no children) are leaves.
        children = getattr(item, "children", None) or []
        return any(
            self._find_chunk_item_in_descendants(child_ref.resolve(doc), doc, target_refs)
            for child_ref in children
        )

    def test_get_top_items_basic(self, sample_doc, sample_chunks):
        """Test getting top-level items from a chunk."""
        assert len(sample_chunks) > 0, "Should have at least one chunk"

        body_ref = sample_doc.body.get_ref()
        for chunk in sample_chunks:
            top_items = chunk._get_top_containing_items(sample_doc)

            assert top_items is not None, "Should return top items"
            assert len(top_items) > 0, "Should have at least one top item"

            # Every returned item must be a direct child of the document body.
            for item in top_items:
                assert item.parent == body_ref, (
                    f"Item {item.self_ref} should be direct child of body"
                )

            # Every returned top item must contain at least one of the chunk's doc_items.
            chunk_item_refs = {item.self_ref for item in chunk.meta.doc_items}
            for top_item in top_items:
                assert self._find_chunk_item_in_descendants(top_item, sample_doc, chunk_item_refs), (
                    f"Could not find any chunk items in descendants of top item {top_item.self_ref}"
                )

    def test_get_top_items_maintains_order(self, sample_doc, sample_chunks):
        """Test that top items maintain document reading order."""
        # Position of each direct body child; the document is not modified below.
        body_position = {ref.cref: idx for idx, ref in enumerate(sample_doc.body.children)}

        for chunk in sample_chunks:
            top_items = chunk._get_top_containing_items(sample_doc)
            if not top_items or len(top_items) < 2:
                continue

            positions = [body_position[item.self_ref] for item in top_items]
            assert all(prev < curr for prev, curr in zip(positions, positions[1:])), (
                "Items should maintain reading order"
            )

    def test_get_top_items_empty_chunk(self):
        """Test _get_top_containing_items for a chunk whose only item is a direct child of body."""
        doc = DoclingDocument(name="empty_doc")
        text_item = doc.add_text(text="Some text", label=DocItemLabel.PARAGRAPH)

        # The text item is added without an explicit parent; the test expects it
        # to be returned unchanged as the single top-level item.
        meta = DocMeta(doc_items=[text_item])
        chunk = DocChunk(text="test", meta=meta)

        result = chunk._get_top_containing_items(doc)
        assert result is not None, "Should return top items for valid doc_items"
        assert len(result) > 0, "Should have at least one top item"
        assert [item.self_ref for item in result] == [text_item.self_ref], (
            "A direct child of body should be returned as its own top item"
        )
class TestExpandToItem:
    """Tests for expand_to_item method."""

    def test_expand_to_item_basic(self, sample_doc, sample_serializer, sample_chunks):
        """Test basic expansion to full items."""
        for chunk in sample_chunks:
            expanded = chunk.expand_to_item(
                dl_doc=sample_doc,
                serializer=sample_serializer,
            )

            assert expanded is not None, "Should return expanded chunk"
            assert isinstance(expanded, DocChunk), "Should return DocChunk instance"

            # Expanded chunk should have content
            assert len(expanded.text.strip()) > 0, "Expanded chunk should have text"

            # Every line of the original chunk must still be present, in order.
            # The message parts are newline-separated so both texts stay readable
            # in the failure output.
            assert check_lines_equal_in_order(chunk.text, expanded.text), (
                "Expanded chunk should contain the original chunk text.\n"
                f"original: {chunk.text}\n"
                f"expanded: {expanded.text}"
            )
            assert expanded.meta.origin == chunk.meta.origin, "Origin should be preserved"

    def test_expand_to_item_error_handling(self, sample_doc, hybrid_chunker):
        """Test that expand_to_item returns the original chunk when serialization fails."""
        # NOTE(review): ``hybrid_chunker`` is not used by this test; it is kept
        # only to leave the test signature unchanged.

        class FailingSerializer:
            """Stub serializer whose ``serialize`` always raises."""

            def serialize(self, *args, **kwargs):
                # Accept any call shape so the failure is always the intended
                # RuntimeError rather than a signature-mismatch TypeError.
                raise RuntimeError("Serialization failed")

        # Create a chunk with valid doc items
        text_item = sample_doc.texts[0]
        meta = DocMeta(doc_items=[text_item])
        chunk = DocChunk(text="original text", meta=meta)

        # Call expand_to_item with the failing serializer
        expanded = chunk.expand_to_item(
            dl_doc=sample_doc,
            serializer=FailingSerializer(),
        )

        assert expanded == chunk, "Should return original chunk when serialization fails"
class TestExpandToPage:
    """Tests for expand_to_page method."""

    @staticmethod
    def _doc_item_refs(chunk: DocChunk):
        """Return the set of ``self_ref`` values of the chunk's doc_items."""
        return {item.self_ref for item in chunk.meta.doc_items}

    def test_expand_to_page_basic(self, sample_doc, sample_chunks, sample_serializer):
        """Test that page expansion keeps the original content and doc_items."""
        for chunk in sample_chunks:
            # Page numbers referenced by the chunk's items
            page_ids = [prov.page_no for item in chunk.meta.doc_items for prov in item.prov]
            if not page_ids:
                # Nothing to check for chunks without page provenance.
                continue

            expanded = chunk.expand_to_page(
                doc=sample_doc,
                serializer=sample_serializer,
            )
            assert expanded is not None, "Should return expanded chunk"
            assert isinstance(expanded, DocChunk), "Should return DocChunk"
            assert len(expanded.text) > 0, "Expanded chunk should have text"

            # The expanded text must still contain the original lines, in order.
            assert check_lines_equal_in_order(chunk.text, expanded.text), (
                "Expanded text should contain original"
            )

            assert expanded.meta.origin == chunk.meta.origin, "Origin should be preserved"
            assert self._doc_item_refs(chunk) <= self._doc_item_refs(expanded), (
                "Expanded chunk should include every doc_item of the original chunk"
            )

    def test_expand_to_page_no_pages(self, hybrid_chunker):
        """Test expand_to_page when the document has no pages."""
        # Create a document without pages
        doc_no_pages = DoclingDocument(name="no_pages_doc")
        doc_no_pages.add_heading(text="Section 1", level=1)
        doc_no_pages.add_text(text="Some content.", label=DocItemLabel.PARAGRAPH)

        chunks = list(hybrid_chunker.chunk(dl_doc=doc_no_pages))
        serializer = hybrid_chunker.serializer_provider.get_serializer(doc_no_pages)

        assert len(chunks) > 0, "Should have at least one chunk"

        for chunk in chunks:
            result = chunk.expand_to_page(
                doc=doc_no_pages,
                serializer=serializer,
            )

            # Should return original chunk when no pages
            assert result == chunk, "Should return original chunk when document has no pages"