Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ benchmarks/**

model_artifacts/
scratch/
.worktrees/
ds_convert_models/

# Created by https://www.toptal.com/developers/gitignore/api/python,macos,virtualenv,pycharm,visualstudiocode,emacs,vim,jupyternotebooks
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@
avoiding "too many files open" errors.

Usage:
uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory <input_dir> --output-directory <output_dir> [--sliding-window <int>] [--use-predictions/--no-use-predictions] [--max-files-per-chunk <int>]
uv run python docling_eval/campaign_tools/cvat_create_annotation_tasks_from_folders.py batch-prepare --input-directory <input_dir> --output-directory <output_dir> [--sliding-window <int>] [--window-mode <rolling|parity_even|parity_odd>] [--slice-manifest <path>] [--use-predictions/--no-use-predictions] [--max-files-per-chunk <int>]

Arguments:
input_directory: Root directory containing subdirectories with files to process
output_directory: Where to store the generated datasets (one subdir per input subdir, with chunk suffixes if needed)
sliding_window: Number of pages per CVAT task (default: 1)
window_mode: Window selection mode for PDFs: rolling, parity_even, or parity_odd (default: rolling)
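slice_manifest: Optional path to recombined_slices_manifest.json; if omitted, each input subdirectory is checked for a local recombined_slices_manifest.json (default: None)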
use_predictions: Whether to create prediction dataset and use predictions in CVAT (default: True)
max_files_per_chunk: Maximum number of files to process per chunk (default: 1000)
"""
Expand All @@ -41,6 +43,8 @@ def process_subdirectories(
input_directory: Path,
output_directory: Path,
sliding_window: int = 1,
window_mode: str = "rolling",
slice_manifest: Optional[Path] = None,
use_predictions: bool = True,
max_files_per_chunk: int = 1000,
) -> None:
Expand All @@ -53,6 +57,8 @@ def process_subdirectories(
input_directory: Root directory with subdirectories to process
output_directory: Where to store generated datasets
sliding_window: Number of pages per CVAT task (default: 1)
window_mode: Window selection mode for PDF chunks: rolling, parity_even, or parity_odd (default: rolling)
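slice_manifest: Optional path to a recombined_slices_manifest.json for the parity window modes; if None, a recombined_slices_manifest.json inside the subdirectory being processed is used when it exists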
use_predictions: Whether to create prediction dataset and use predictions in CVAT
max_files_per_chunk: Maximum number of files to process per chunk (default: 1000)
"""
Expand Down Expand Up @@ -154,12 +160,19 @@ def process_subdirectories(
source_dir = (
(eval_dir / "test") if use_predictions else (gt_dir / "test")
)
effective_manifest = slice_manifest
if effective_manifest is None:
candidate_manifest = subdir / "recombined_slices_manifest.json"
if candidate_manifest.exists():
effective_manifest = candidate_manifest
create_cvat(
gt_dir=source_dir,
output_dir=cvat_dir,
bucket_size=100,
use_predictions=use_predictions,
sliding_window=sliding_window,
window_mode=window_mode,
slice_manifest=effective_manifest,
)
else:
typer.echo(f" CVAT dataset already exists, skipping.")
Expand Down Expand Up @@ -190,6 +203,17 @@ def batch_prepare(
sliding_window: int = typer.Option(
1, help="Number of pages per CVAT task (default: 1)"
),
window_mode: str = typer.Option(
"rolling",
help="Window selection mode for PDF chunks: rolling, parity_even, parity_odd.",
),
slice_manifest: Optional[Path] = typer.Option(
None,
help=(
"Optional path to recombined_slices_manifest.json. "
"If omitted, each subdirectory is checked for a local manifest file."
),
),
use_predictions: bool = typer.Option(
True, help="Whether to create prediction dataset and use predictions in CVAT"
),
Expand All @@ -204,6 +228,8 @@ def batch_prepare(
input_directory,
output_directory,
sliding_window,
window_mode,
slice_manifest,
use_predictions,
max_files_per_chunk,
)
Expand Down
21 changes: 21 additions & 0 deletions docling_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,25 @@ def create_cvat(
help="Size of sliding window for page processing (1 for single pages, >1 for multi-page windows)"
),
] = 2,
window_mode: Annotated[
str,
typer.Option(
help=(
"PDF window selection mode: "
"rolling (default), parity_even, or parity_odd. "
"Parity modes require --slice-manifest and sliding-window=2."
)
),
] = "rolling",
slice_manifest: Annotated[
Optional[Path],
typer.Option(
help=(
"Optional manifest from recombine_chunked_slice_pdfs.py. "
"Required for parity_even/parity_odd window modes."
)
),
] = None,
):
"""Create dataset ready to upload to CVAT starting from (ground-truth) dataset.

Expand All @@ -1272,6 +1291,8 @@ def create_cvat(
bucket_size=bucket_size,
use_predictions=use_predictions,
sliding_window=sliding_window,
window_mode=window_mode,
slice_manifest=slice_manifest,
)
builder.prepare_for_annotation()

Expand Down
Loading
Loading