Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docling_jobkit/cli/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def convert(
to_formats=[v.value for v in config.options.to_formats],
generate_page_images=config.options.include_images,
generate_picture_images=config.options.include_images,
compact_tables=options.compact_tables,
)
for source in config.sources:
with get_source_processor(source) as source_processor:
Expand Down
8 changes: 8 additions & 0 deletions docling_jobkit/convert/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def _export_document_as_content(
export_doctags: bool,
image_mode: ImageRefMode,
md_page_break_placeholder: str,
compact_tables: bool = False,
) -> ExportDocumentResponse:
document = ExportDocumentResponse(filename=conv_res.input.file.name)

Expand All @@ -63,11 +64,13 @@ def _export_document_as_content(
document.text_content = new_doc.export_to_markdown(
strict_text=True,
image_mode=image_mode,
compact_tables=compact_tables,
)
if export_md:
document.md_content = new_doc.export_to_markdown(
image_mode=image_mode,
page_break_placeholder=md_page_break_placeholder or None,
compact_tables=compact_tables,
)
if export_doctags:
document.doctags_content = new_doc.export_to_doctags()
Expand All @@ -85,6 +88,7 @@ def _export_documents_as_files(
export_doctags: bool,
image_export_mode: ImageRefMode,
md_page_break_placeholder: str,
compact_tables: bool = False,
):
success_count = 0
failure_count = 0
Expand Down Expand Up @@ -124,6 +128,7 @@ def _export_documents_as_files(
filename=fname,
strict_text=True,
image_mode=ImageRefMode.PLACEHOLDER,
compact_tables=compact_tables,
)

# Export Markdown format:
Expand All @@ -135,6 +140,7 @@ def _export_documents_as_files(
artifacts_dir=artifacts_dir,
image_mode=image_export_mode,
page_break_placeholder=md_page_break_placeholder or None,
compact_tables=compact_tables,
)

# Export Document Tags format:
Expand Down Expand Up @@ -274,6 +280,7 @@ def process_export_results(
export_doctags=export_doctags,
image_mode=conversion_options.image_export_mode,
md_page_break_placeholder=conversion_options.md_page_break_placeholder,
compact_tables=conversion_options.compact_tables,
)
task_result = ExportResult(
content=content,
Expand Down Expand Up @@ -303,6 +310,7 @@ def process_export_results(
export_doctags=export_doctags,
image_export_mode=conversion_options.image_export_mode,
md_page_break_placeholder=conversion_options.md_page_break_placeholder,
compact_tables=conversion_options.compact_tables,
)

files = os.listdir(output_dir)
Expand Down
5 changes: 4 additions & 1 deletion docling_jobkit/convert/results_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
generate_picture_images: bool = False,
export_parquet_file: bool = False,
scratch_dir: Path | None = None,
compact_tables: bool = False,
):
self._target_processor = target_processor

Expand All @@ -51,6 +52,8 @@ def __init__(

self.scratch_dir = scratch_dir or Path(tempfile.mkdtemp(prefix="docling_"))
self.scratch_dir.mkdir(exist_ok=True, parents=True)

self.compact_tables = compact_tables

def __del__(self):
if self.scratch_dir is not None:
Expand Down Expand Up @@ -122,7 +125,7 @@ def process_documents(self, results: Iterable[ConversionResult]):
# Export Docling document format to markdown:
target_key = f"md/{name_without_ext}.md"

data = conv_res.document.export_to_markdown()
data = conv_res.document.export_to_markdown(compact_tables=self.compact_tables,)
self._target_processor.upload_object(
obj=data,
target_filename=target_key,
Expand Down
12 changes: 11 additions & 1 deletion docling_jobkit/datamodel/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,17 @@ class ConvertDocumentsOptions(BaseModel):
examples=["<!-- page-break -->", ""],
),
] = ""

compact_tables: Annotated[
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's name this argument `md_compact_tables`, so it is consistent with the existing Markdown-specific option `md_page_break_placeholder`.

bool,
Field(
description=(
"Whether to use compact table format without column padding. "
"When False (default), tables use padded columns for better "
"visual formatting. When True, tables use minimal whitespace, "
"which is better for large tables and downstream processing."
),
),
] = False
do_code_enrichment: Annotated[
bool,
Field(
Expand Down
1 change: 1 addition & 0 deletions docling_jobkit/kfp_pipeline/docling_s3in_s3out.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def convert_payload(
to_formats=[v.value for v in convert_options.to_formats],
generate_page_images=convert_options.include_images,
generate_picture_images=convert_options.include_images,
compact_tables=convert_options.compact_tables,
)
for item in result_processor.process_documents(
converter.convert_documents(
Expand Down