From e8f6f0f0e2e86bfa130d7f0bd257b6e0e4b3759c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Fri, 6 Mar 2026 15:03:03 -0500 Subject: [PATCH 1/4] Add ingest batch and ingest inprocess commands to the new 'nr' cli command --- nemo_retriever/README.md | 26 +- nemo_retriever/chart_stage_config.yaml | 4 +- nemo_retriever/embedding_stage_config.yaml | 2 +- nemo_retriever/harness/HANDOFF.md | 22 +- nemo_retriever/infographic_stage_config.yaml | 4 +- nemo_retriever/pdf_stage_config.yaml | 4 +- nemo_retriever/pyproject.toml | 2 +- .../src/nemo_retriever/adapters/cli/main.py | 12 +- .../src/nemo_retriever/audio/cli.py | 2 +- .../src/nemo_retriever/audio/stage.py | 4 +- .../nemo_retriever/examples/batch_pipeline.py | 2 +- .../examples/online_pipeline.py | 2 +- .../src/nemo_retriever/html/__main__.py | 6 +- .../src/nemo_retriever/ingest-config.yaml | 22 +- .../ingest_modes/inprocess_cli.py | 291 ++++++++++++++++++ .../local/stages/stage1_pdf_extraction.py | 4 +- .../local/stages/stage5_text_embeddings.py | 2 +- .../local/stages/stage6_vdb_upload.py | 2 +- .../local/stages/stage7_vdb_query.py | 2 +- .../stages/stage999_post_mortem_analysis.py | 4 +- .../src/nemo_retriever/txt/__main__.py | 6 +- .../src/nemo_retriever/vector_store/stage.py | 2 +- nemo_retriever/table_stage_config.yaml | 4 +- nemo_retriever/tests/test_audio_stage.py | 2 +- 24 files changed, 366 insertions(+), 67 deletions(-) create mode 100644 nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index d51d20bdf..c48a56b11 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -16,8 +16,8 @@ From the repo root: ```bash cd /path/to/nv-ingest -uv venv .retriever -source .retriever/bin/activate +uv venv .nr +source .nr/bin/activate uv pip install -e ./nemo_retriever ``` @@ -52,7 +52,7 @@ uv run python nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py /path Pass the directory that contains your PDFs as the first argument (`input-dir`). For recall evaluation, the pipeline uses `bo767_query_gt.csv` in the current directory by default; override with `--query-csv `. For document-level recall, use `--recall-match-mode pdf_only` with `query,expected_pdf` data. Recall is skipped if the query file does not exist. By default, per-query details (query, gold, hits) are printed; use `--no-recall-details` to print only the missed-gold summary and recall metrics. To use an existing Ray cluster, pass `--ray-address auto`. If OCR fails with a missing `libcudart.so.13`, install the CUDA 13 runtime and set `LD_LIBRARY_PATH` as shown above. -For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt` with the same examples (e.g. `batch_pipeline.py --input-type html`). HTML files are converted to markdown via markitdown, then chunked with the same tokenizer as .txt. Staged CLI: `retriever html run --input-dir ` writes `*.html_extraction.json`; then `retriever local stage5 run --input-dir --pattern "*.html_extraction.json"` and `retriever local stage6 run --input-dir `. +For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt` with the same examples (e.g. `batch_pipeline.py --input-type html`). HTML files are converted to markdown via markitdown, then chunked with the same tokenizer as .txt. Staged CLI: `nr html run --input-dir ` writes `*.html_extraction.json`; then `nr local stage5 run --input-dir --pattern "*.html_extraction.json"` and `nr local stage6 run --input-dir `. ## Harness (run, sweep, nightly) @@ -61,7 +61,7 @@ For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt - Config files: - `nemo_retriever/harness/test_configs.yaml` - `nemo_retriever/harness/nightly_config.yaml` -- CLI entrypoint is nested under `retriever harness`. +- CLI entrypoint is nested under `nr harness`. - First pass is LanceDB-only and enforces recall-required pass/fail by default. - Single-run artifact directories default to `_`. - Dataset-specific recall adapters are supported via config: @@ -77,37 +77,37 @@ For **HTML** or **text** ingestion, use `--input-type html` or `--input-type txt ```bash # Dataset preset from test_configs.yaml (recall-required example) -retriever harness run --dataset jp20 --preset single_gpu +nr harness run --dataset jp20 --preset single_gpu # Direct dataset path -retriever harness run --dataset /datasets/nv-ingest/bo767 --preset single_gpu +nr harness run --dataset /datasets/nv-ingest/bo767 --preset single_gpu # Add repeatable run or session tags for later review -retriever harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate +nr harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate ``` ### Sweep runs (explicit runs list) ```bash -retriever harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml ``` ### Nightly session ```bash -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml -retriever harness nightly --dry-run -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness nightly --dry-run +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly ``` ### Session inspection ```bash # Print a compact table from a completed sweep/nightly session -retriever harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC +nr harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC # Compare two session summaries by run name -retriever harness compare \ +nr harness compare \ nemo_retriever/artifacts/nightly_20260305_010203_UTC \ nemo_retriever/artifacts/nightly_20260306_010204_UTC ``` diff --git a/nemo_retriever/chart_stage_config.yaml b/nemo_retriever/chart_stage_config.yaml index fada15b68..00abf3c51 100644 --- a/nemo_retriever/chart_stage_config.yaml +++ b/nemo_retriever/chart_stage_config.yaml @@ -1,8 +1,8 @@ # Example config for chart extraction. # # Intended usage (once the chart stage CLI is wired up similarly to table stage): -# - `retriever chart stage run --config --input ` -# - `retriever local stage4 run --config --input ` +# - `nr chart stage run --config --input ` +# - `nr local stage4 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema` # via `nemo_retriever.chart.config.load_chart_extractor_schema_from_dict`. diff --git a/nemo_retriever/embedding_stage_config.yaml b/nemo_retriever/embedding_stage_config.yaml index 46c45d170..df9de3567 100644 --- a/nemo_retriever/embedding_stage_config.yaml +++ b/nemo_retriever/embedding_stage_config.yaml @@ -11,7 +11,7 @@ api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY # Embedding service settings -# If set to null/empty, `retriever local stage5` will fall back to local HF embeddings +# If set to null/empty, `nr local stage5` will fall back to local HF embeddings # via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`. embedding_nim_endpoint: null # embedding_nim_endpoint: "http://localhost:8012/v1" diff --git a/nemo_retriever/harness/HANDOFF.md b/nemo_retriever/harness/HANDOFF.md index e724307b1..ac992ad79 100644 --- a/nemo_retriever/harness/HANDOFF.md +++ b/nemo_retriever/harness/HANDOFF.md @@ -49,36 +49,36 @@ From repo root: ```bash source ~/setup_env.sh -source .retriever/bin/activate +source .nr/bin/activate uv pip install -e ./nemo_retriever ``` Single run: ```bash -retriever harness run --dataset jp20 --preset single_gpu -retriever harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate +nr harness run --dataset jp20 --preset single_gpu +nr harness run --dataset jp20 --preset single_gpu --tag nightly --tag candidate ``` Sweep: ```bash -retriever harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness sweep --runs-config nemo_retriever/harness/nightly_config.yaml ``` Nightly: ```bash -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml -retriever harness nightly --dry-run -retriever harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml +nr harness nightly --dry-run +nr harness nightly --runs-config nemo_retriever/harness/nightly_config.yaml --tag nightly ``` Session inspection: ```bash -retriever harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC -retriever harness compare \ +nr harness summary nemo_retriever/artifacts/nightly_20260305_010203_UTC +nr harness compare \ nemo_retriever/artifacts/nightly_20260305_010203_UTC \ nemo_retriever/artifacts/nightly_20260306_010204_UTC ``` @@ -148,9 +148,9 @@ Notes: - `financebench` now defaults to `data/financebench_train.json` with recall enabled. - Session UX improvements: - Runs, sweeps, and nightly sessions accept repeatable `--tag` values persisted into artifacts. - - `retriever harness summary` prints a compact table from `session_summary.json`. + - `nr harness summary` prints a compact table from `session_summary.json`. - Comparison utility: - - `retriever harness compare` prints pages/sec and recall deltas by run name for two sessions. + - `nr harness compare` prints pages/sec and recall deltas by run name for two sessions. ## Current Validation Status diff --git a/nemo_retriever/infographic_stage_config.yaml b/nemo_retriever/infographic_stage_config.yaml index 67ac68546..d945aa586 100644 --- a/nemo_retriever/infographic_stage_config.yaml +++ b/nemo_retriever/infographic_stage_config.yaml @@ -1,6 +1,6 @@ # Example config for: -# - `retriever infographic stage run --config --input ` -# - `retriever local stage2 run --config --input ` +# - `nr infographic stage run --config --input ` +# - `nr local stage2 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema` # via `nemo_retriever.infographic.config.load_infographic_extractor_schema_from_dict`. diff --git a/nemo_retriever/pdf_stage_config.yaml b/nemo_retriever/pdf_stage_config.yaml index 33126ba61..7e9a7352e 100644 --- a/nemo_retriever/pdf_stage_config.yaml +++ b/nemo_retriever/pdf_stage_config.yaml @@ -1,11 +1,11 @@ -# Example config for: `retriever pdf stage page-elements --config ` +# Example config for: `nr pdf stage page-elements --config ` # # CLI override rule: # - If you pass an option explicitly on the CLI, it wins. # - Otherwise the value from this YAML file is used. # # You can run repeatedly: -# retriever pdf stage page-elements --config nemo_retriever/pdf_stage_config.yaml +# nr pdf stage page-elements --config nemo_retriever/pdf_stage_config.yaml # # Directory containing PDFs (scanned recursively for *.pdf) diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index b9d1ac476..54de81697 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -78,7 +78,7 @@ dev = [ ] [project.scripts] -retriever = "nemo_retriever.__main__:main" +nr = "nemo_retriever.__main__:main" [tool.setuptools.dynamic] version = {attr = "nemo_retriever.version.get_build_version"} diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py index 32d8012de..72ec2fe96 100644 --- a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py +++ b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py @@ -7,12 +7,14 @@ import typer from nemo_retriever.audio import app as audio_app +from nemo_retriever.examples.batch_pipeline import app as batch_app from nemo_retriever.utils.benchmark import app as benchmark_app from nemo_retriever.chart import app as chart_app from nemo_retriever.utils.compare import app as compare_app from nemo_retriever.harness import app as harness_app from nemo_retriever.html import __main__ as html_main from nemo_retriever.utils.image import app as image_app +from nemo_retriever.ingest_modes.inprocess_cli import app as inprocess_app from nemo_retriever.local import app as local_app from nemo_retriever.online import __main__ as online_main from nemo_retriever.pdf import app as pdf_app @@ -21,7 +23,13 @@ from nemo_retriever.vector_store import app as vector_store_app from nemo_retriever.version import get_version_info -app = typer.Typer(help="Retriever") +app = typer.Typer(help="NeMo Retriever – RAG ingestion pipeline CLI.") + +ingest_app = typer.Typer(help="Run ingestion pipelines (batch or in-process).") +ingest_app.add_typer(batch_app, name="batch") +ingest_app.add_typer(inprocess_app, name="inprocess") +app.add_typer(ingest_app, name="ingest") + app.add_typer(audio_app, name="audio") app.add_typer(image_app, name="image") app.add_typer(pdf_app, name="pdf") @@ -54,7 +62,7 @@ def _callback( version: bool = typer.Option( False, "--version", - help="Show retriever version metadata and exit.", + help="Show nr version metadata and exit.", callback=_version_callback, is_eager=True, ) diff --git a/nemo_retriever/src/nemo_retriever/audio/cli.py b/nemo_retriever/src/nemo_retriever/audio/cli.py index 1dbb6ee6d..af91777e8 100644 --- a/nemo_retriever/src/nemo_retriever/audio/cli.py +++ b/nemo_retriever/src/nemo_retriever/audio/cli.py @@ -8,7 +8,7 @@ This module intentionally contains **no configuration logic**. It simply re-exports the `nemo_retriever.audio.stage` Typer application so any arguments provided to: - `retriever audio ...` + `nr audio ...` are handled exactly the same as the stage commands (e.g. `extract`, `discover`). """ diff --git a/nemo_retriever/src/nemo_retriever/audio/stage.py b/nemo_retriever/src/nemo_retriever/audio/stage.py index 000ae99c9..7284ff1cb 100644 --- a/nemo_retriever/src/nemo_retriever/audio/stage.py +++ b/nemo_retriever/src/nemo_retriever/audio/stage.py @@ -5,8 +5,8 @@ """ Audio extraction stage: chunk + ASR only, write *.audio_extraction.json sidecars. -Invoked as `retriever audio extract` / `retriever audio discover` (or -`python -m nemo_retriever.audio extract` / `discover`). Analogous to `retriever pdf stage page-elements`. +Invoked as `nr audio extract` / `nr audio discover` (or +`python -m nemo_retriever.audio extract` / `discover`). Analogous to `nr pdf stage page-elements`. """ from __future__ import annotations diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 6c8574577..1999c6437 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -4,7 +4,7 @@ """ Batch ingestion pipeline with optional recall evaluation. -Run with: uv run python -m nemo_retriever.examples.batch_pipeline +Run with: nr batch """ import json diff --git a/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py index 9808e29d6..d23a82225 100644 --- a/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/online_pipeline.py @@ -7,7 +7,7 @@ - Inprocess: runs the full pipeline locally (no server). - Online: submits each document to the online ingest REST service (start with - `retriever online serve`). Uses the same LanceDB for recall evaluation. + `nr online serve`). Uses the same LanceDB for recall evaluation. Run with: uv run python -m nemo_retriever.examples.online_pipeline diff --git a/nemo_retriever/src/nemo_retriever/html/__main__.py b/nemo_retriever/src/nemo_retriever/html/__main__.py index 28d4e9a37..748f5f361 100644 --- a/nemo_retriever/src/nemo_retriever/html/__main__.py +++ b/nemo_retriever/src/nemo_retriever/html/__main__.py @@ -5,7 +5,7 @@ """ CLI for .html extraction: markitdown -> markdown -> tokenizer split, write *.html_extraction.json. -Use with: retriever local stage5 run --pattern "*.html_extraction.json" then stage6. +Use with: nr local stage5 run --pattern "*.html_extraction.json" then stage6. """ from __future__ import annotations @@ -63,8 +63,8 @@ def run( Scan input_dir for *.html, convert to markdown and chunk each, write .html_extraction.json. Output JSON has the same primitives-like shape as stage5 input (text, path, page_number, metadata). - Then run: retriever local stage5 run --input-dir --pattern "*.html_extraction.json" - and retriever local stage6 run --input-dir . + Then run: nr local stage5 run --input-dir --pattern "*.html_extraction.json" + and nr local stage6 run --input-dir . """ input_dir = Path(input_dir) html_files = sorted(input_dir.glob("*.html")) diff --git a/nemo_retriever/src/nemo_retriever/ingest-config.yaml b/nemo_retriever/src/nemo_retriever/ingest-config.yaml index 59fb95303..b35c1b573 100644 --- a/nemo_retriever/src/nemo_retriever/ingest-config.yaml +++ b/nemo_retriever/src/nemo_retriever/ingest-config.yaml @@ -1,4 +1,4 @@ -# nv-ingest retriever consolidated configuration +# nv-ingest nr consolidated configuration # # This single file replaces the older per-stage YAML configs: # - pdf_stage_config.yaml @@ -17,7 +17,7 @@ # - Sections below are consumed by their respective stages. pdf: - # Example config for: `retriever pdf stage page-elements --config ` + # Example config for: `nr pdf stage page-elements --config ` # # Directory containing PDFs (scanned recursively for *.pdf) input_dir: /home/local/jdyer/datasets/jp20 @@ -59,14 +59,14 @@ pdf: # Optionally limit number of PDFs processed limit: null -# Optional config for `retriever txt run` and .extract_txt() API +# Optional config for `nr txt run` and .extract_txt() API txt: max_tokens: 512 overlap_tokens: 0 tokenizer_model_id: nvidia/llama-3.2-nv-embedqa-1b-v2 encoding: utf-8 -# Optional config for `retriever html run` and .extract_html() API +# Optional config for `nr html run` and .extract_html() API html: max_tokens: 512 overlap_tokens: 0 @@ -94,8 +94,8 @@ audio_asr: table: # Example config for: - # - `retriever table stage run --config --input ` - # - `retriever local stage3 run --config --input ` + # - `nr table stage run --config --input ` + # - `nr local stage3 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema` # via `nemo_retriever.table.config.load_table_extractor_schema_from_dict`. @@ -133,8 +133,8 @@ table: chart: # Example config for: - # - `retriever chart stage run --config --input ` - # - `retriever local stage4 run --config --input ` + # - `nr chart stage run --config --input ` + # - `nr local stage4 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema` # via `nemo_retriever.chart.config.load_chart_extractor_schema_from_dict`. @@ -185,8 +185,8 @@ chart: infographic: # Example config for: - # - `retriever infographic stage run --config --input ` - # - `retriever local stage2 run --config --input ` + # - `nr infographic stage run --config --input ` + # - `nr local stage2 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema` # via `nemo_retriever.infographic.config.load_infographic_extractor_schema_from_dict`. @@ -231,7 +231,7 @@ embedding: api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY # Embedding service settings - # If set to null/empty, `retriever local stage5` will fall back to local HF embeddings + # If set to null/empty, `nr local stage5` will fall back to local HF embeddings # via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`. embedding_nim_endpoint: null # embedding_nim_endpoint: "http://localhost:8012/v1" diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py new file mode 100644 index 000000000..db9f0ef99 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py @@ -0,0 +1,291 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +In-process ingestion pipeline CLI. + +Run with: nr inprocess +""" + +from __future__ import annotations + +import logging +import time +from pathlib import Path +from typing import Optional + +import typer + +logger = logging.getLogger(__name__) + +app = typer.Typer(help="Run the in-process ingestion pipeline (no Ray required).") + +LANCEDB_URI = "lancedb" +LANCEDB_TABLE = "nv-ingest" + + +@app.command() +def main( + input_path: Path = typer.Argument( + ..., + help="File or directory containing PDFs, .txt, .html, or .doc/.pptx files to ingest.", + path_type=Path, + ), + debug: bool = typer.Option( + False, + "--debug/--no-debug", + help="Enable debug-level logging.", + ), + input_type: str = typer.Option( + "pdf", + "--input-type", + help="Input format: 'pdf', 'txt', 'html', or 'doc'. " + "Use 'txt' for .txt, 'html' for .html (markitdown -> chunks), " + "'doc' for .docx/.pptx (converted to PDF via LibreOffice).", + ), + # -- Extract params -------------------------------------------------------- + extract_text: bool = typer.Option( + True, + "--extract-text/--no-extract-text", + help="Extract text from PDF pages.", + ), + extract_tables: bool = typer.Option( + True, + "--extract-tables/--no-extract-tables", + help="Extract tables from PDF pages.", + ), + extract_charts: bool = typer.Option( + True, + "--extract-charts/--no-extract-charts", + help="Extract charts from PDF pages.", + ), + extract_infographics: bool = typer.Option( + False, + "--extract-infographics/--no-extract-infographics", + help="Extract infographics from PDF pages.", + ), + use_table_structure: bool = typer.Option( + False, + "--use-table-structure", + help="Enable the combined table-structure + OCR stage for tables.", + ), + table_output_format: Optional[str] = typer.Option( + None, + "--table-output-format", + help="Table output format: 'pseudo_markdown' (OCR-only) or 'markdown' (table-structure + OCR).", + ), + inference_batch_size: Optional[int] = typer.Option( + None, + "--inference-batch-size", + help="Inference batch size for detection models.", + ), + page_elements_invoke_url: Optional[str] = typer.Option( + None, + "--page-elements-invoke-url", + help="Optional remote endpoint URL for page-elements model inference.", + ), + ocr_invoke_url: Optional[str] = typer.Option( + None, + "--ocr-invoke-url", + help="Optional remote endpoint URL for OCR model inference.", + ), + table_structure_invoke_url: Optional[str] = typer.Option( + None, + "--table-structure-invoke-url", + help="Optional remote endpoint URL for table-structure model inference.", + ), + # -- Embed params ---------------------------------------------------------- + embed_model_name: str = typer.Option( + "nvidia/llama-3.2-nv-embedqa-1b-v2", + "--embed-model-name", + help="Embedding model name passed to .embed().", + ), + embed_invoke_url: Optional[str] = typer.Option( + None, + "--embed-invoke-url", + help="Optional remote endpoint URL for embedding model inference.", + ), + embed_modality: str = typer.Option( + "text", + "--embed-modality", + help="Default embedding modality: 'text', 'image', or 'text_image'.", + ), + embed_granularity: str = typer.Option( + "element", + "--embed-granularity", + help="Embedding granularity: 'element' (per table/chart/text) or 'page' (per page).", + ), + text_elements_modality: Optional[str] = typer.Option( + None, + "--text-elements-modality", + help="Embedding modality override for page-text rows. Falls back to --embed-modality.", + ), + structured_elements_modality: Optional[str] = typer.Option( + None, + "--structured-elements-modality", + help="Embedding modality override for table/chart/infographic rows. Falls back to --embed-modality.", + ), + # -- VDB upload params ----------------------------------------------------- + lancedb_uri: str = typer.Option( + LANCEDB_URI, + "--lancedb-uri", + help="LanceDB URI/path for this run.", + ), + lancedb_table: str = typer.Option( + LANCEDB_TABLE, + "--lancedb-table", + help="LanceDB table name.", + ), + hybrid: bool = typer.Option( + False, + "--hybrid/--no-hybrid", + help="Enable LanceDB hybrid mode (dense + FTS text).", + ), + overwrite: bool = typer.Option( + True, + "--overwrite/--no-overwrite", + help="Overwrite existing LanceDB table (False to append).", + ), + # -- Output ---------------------------------------------------------------- + output_directory: Optional[str] = typer.Option( + None, + "--output-dir", + help="Optional directory to write per-document JSON results.", + ), + # -- Execution params ------------------------------------------------------ + parallel: bool = typer.Option( + False, + "--parallel/--no-parallel", + help="Enable parallel CPU extraction via ProcessPoolExecutor.", + ), + max_workers: Optional[int] = typer.Option( + None, + "--max-workers", + help="Max workers for parallel CPU extraction (default: CPU count).", + ), + gpu_devices: Optional[str] = typer.Option( + None, + "--gpu-devices", + help="Comma-separated GPU device IDs for multi-GPU pipelined execution (e.g. '0,1').", + ), + page_chunk_size: int = typer.Option( + 32, + "--page-chunk-size", + help="Number of pages per chunk for parallel processing.", + ), + show_progress: bool = typer.Option( + True, + "--show-progress/--no-show-progress", + help="Show progress bars during ingestion.", + ), +) -> None: + """Run the in-process ingestion pipeline on local documents (no Ray required).""" + from nemo_retriever.ingest_modes.inprocess import InProcessIngestor + from nemo_retriever.params import ( + EmbedParams, + ExtractParams, + HtmlChunkParams, + IngestExecuteParams, + TextChunkParams, + VdbUploadParams, + ) + + log_level = logging.DEBUG if debug else logging.INFO + logging.basicConfig( + level=log_level, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + force=True, + ) + + lancedb_uri_abs = str(Path(lancedb_uri).expanduser().resolve()) + + input_path = Path(input_path) + if input_path.is_file(): + file_patterns = [str(input_path)] + elif input_path.is_dir(): + ext_map = { + "txt": ["*.txt"], + "html": ["*.html"], + "doc": ["*.docx", "*.pptx"], + } + exts = ext_map.get(input_type, ["*.pdf"]) + file_patterns = [str(input_path / e) for e in exts] + else: + raise typer.BadParameter(f"Path does not exist: {input_path}") + + ingestor = InProcessIngestor() + ingestor = ingestor.files(file_patterns) + + if input_type == "txt": + ingestor = ingestor.extract_txt(TextChunkParams(max_tokens=512, overlap_tokens=0)) + elif input_type == "html": + ingestor = ingestor.extract_html(HtmlChunkParams(max_tokens=512, overlap_tokens=0)) + else: + extract_kw: dict = dict( + extract_text=extract_text, + extract_tables=extract_tables, + extract_charts=extract_charts, + extract_infographics=extract_infographics, + use_table_structure=use_table_structure, + table_output_format=table_output_format, + ) + if inference_batch_size is not None: + extract_kw["inference_batch_size"] = inference_batch_size + if page_elements_invoke_url: + extract_kw["page_elements_invoke_url"] = page_elements_invoke_url + if ocr_invoke_url: + extract_kw["ocr_invoke_url"] = ocr_invoke_url + if table_structure_invoke_url: + extract_kw["table_structure_invoke_url"] = table_structure_invoke_url + ingestor = ingestor.extract(ExtractParams(**extract_kw)) + + ingestor = ingestor.embed( + EmbedParams( + model_name=str(embed_model_name), + embed_invoke_url=embed_invoke_url, + embed_modality=embed_modality, + text_elements_modality=text_elements_modality, + structured_elements_modality=structured_elements_modality, + embed_granularity=embed_granularity, + ) + ) + + ingestor = ingestor.vdb_upload( + VdbUploadParams( + lancedb={ + "lancedb_uri": lancedb_uri_abs, + "table_name": lancedb_table, + "overwrite": overwrite, + "create_index": True, + "hybrid": hybrid, + } + ) + ) + + if output_directory: + ingestor = ingestor.save_to_disk(output_directory=output_directory) + + parsed_gpu_devices = None + if gpu_devices: + parsed_gpu_devices = [int(d.strip()) for d in gpu_devices.split(",") if d.strip()] + + logger.info("Running in-process ingestion...") + start = time.perf_counter() + + ingestor.ingest( + params=IngestExecuteParams( + show_progress=show_progress, + parallel=parallel, + max_workers=max_workers, + gpu_devices=parsed_gpu_devices, + page_chunk_size=page_chunk_size, + ) + ) + + elapsed = time.perf_counter() - start + logger.info(f"In-process ingestion complete in {elapsed:.2f}s") + + +if __name__ == "__main__": + app() diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py b/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py index 06bc08a4a..863bea08c 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage1_pdf_extraction.py @@ -10,11 +10,11 @@ This module intentionally contains **no configuration logic**. It simply re-exports the `nemo_retriever.pdf.stage` Typer application so any arguments provided to: - `retriever local stage1 ...` + `nr local stage1 ...` are handled exactly the same as: - `retriever pdf ...` + `nr pdf ...` """ from nemo_retriever.pdf.stage import app as app diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py b/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py index 3ea038f3d..5cdae2f2a 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage5_text_embeddings.py @@ -10,7 +10,7 @@ This module intentionally contains no configuration logic. It re-exports the `nemo_retriever.text_embed.stage` Typer application so arguments provided to: - `retriever local stage5 ...` + `nr local stage5 ...` are handled by `nemo_retriever.text_embed.stage`, including local-HF fallback options when an embedding endpoint is not configured. diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py b/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py index 17c18b45f..a61808957 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage6_vdb_upload.py @@ -10,7 +10,7 @@ This module intentionally contains no configuration logic. It re-exports the `nemo_retriever.vector_store.stage` Typer application so arguments provided to: - `retriever local stage6 ...` + `nr local stage6 ...` are handled by `nemo_retriever.vector_store.stage`. """ diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py b/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py index 451befd0d..e11be2c5d 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage7_vdb_query.py @@ -10,7 +10,7 @@ This module intentionally contains no configuration logic. It re-exports the `nemo_retriever.recall.vdb_recall` Typer application so arguments provided to: - `retriever local stage7 ...` + `nr local stage7 ...` are handled by `nemo_retriever.recall.vdb_recall`, including a local-HF fallback path when embedding HTTP/gRPC endpoints are not configured. diff --git a/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py b/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py index e4f6ba1ae..951d9dfe5 100644 --- a/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py +++ b/nemo_retriever/src/nemo_retriever/local/stages/stage999_post_mortem_analysis.py @@ -659,7 +659,7 @@ def _arc_for(p: Path) -> str: # Include a small readme to guide recipients. readme = ( - "retriever stage999 gathered results\n" + "nr stage999 gathered results\n" "\n" "Contents:\n" "- bo767_query_gt.csv: original query->pdf_page mapping\n" @@ -1697,7 +1697,7 @@ def _short(s: str, n: int = 90) -> str: summary_cache: Dict[int, Dict[str, Any]] = {} root = tk.Tk() - root.title("retriever stage999 post-mortem analysis") + root.title("nr stage999 post-mortem analysis") # Layout: left search+list, right detail panel. root.columnconfigure(0, weight=0) diff --git a/nemo_retriever/src/nemo_retriever/txt/__main__.py b/nemo_retriever/src/nemo_retriever/txt/__main__.py index b769c774e..fb60cb962 100644 --- a/nemo_retriever/src/nemo_retriever/txt/__main__.py +++ b/nemo_retriever/src/nemo_retriever/txt/__main__.py @@ -5,7 +5,7 @@ """ CLI for .txt extraction: tokenizer-based split, write *.txt_extraction.json. -Use with: retriever local stage5 run --pattern "*.txt_extraction.json" then stage6. +Use with: nr local stage5 run --pattern "*.txt_extraction.json" then stage6. """ from __future__ import annotations @@ -63,8 +63,8 @@ def run( Scan input_dir for *.txt, tokenizer-split each into chunks, write .txt_extraction.json. Output JSON has the same primitives-like shape as stage5 input (text, path, page_number, metadata). - Then run: retriever local stage5 run --input-dir --pattern "*.txt_extraction.json" - and retriever local stage6 run --input-dir . + Then run: nr local stage5 run --input-dir --pattern "*.txt_extraction.json" + and nr local stage6 run --input-dir . """ input_dir = Path(input_dir) txt_files = sorted(input_dir.glob("*.txt")) diff --git a/nemo_retriever/src/nemo_retriever/vector_store/stage.py b/nemo_retriever/src/nemo_retriever/vector_store/stage.py index ba3f994a1..94c535272 100644 --- a/nemo_retriever/src/nemo_retriever/vector_store/stage.py +++ b/nemo_retriever/src/nemo_retriever/vector_store/stage.py @@ -24,7 +24,7 @@ def run( exists=True, file_okay=False, dir_okay=True, - help="Directory containing `*.text_embeddings.json` files (from `retriever local stage5`).", + help="Directory containing `*.text_embeddings.json` files (from `nr local stage5`).", ), recursive: bool = typer.Option(False, "--recursive/--no-recursive", help="Scan subdirectories too."), limit: Optional[int] = typer.Option(None, "--limit", min=1, help="Optionally limit number of input files."), diff --git a/nemo_retriever/table_stage_config.yaml b/nemo_retriever/table_stage_config.yaml index e268c8ba7..8a7b80152 100644 --- a/nemo_retriever/table_stage_config.yaml +++ b/nemo_retriever/table_stage_config.yaml @@ -1,6 +1,6 @@ # Example config for: -# - `retriever table stage run --config --input ` -# - `retriever local stage3 run --config --input ` +# - `nr table stage run --config --input ` +# - `nr local stage3 run --config --input ` # # This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema` # via `nemo_retriever.table.config.load_table_extractor_schema_from_dict`. diff --git a/nemo_retriever/tests/test_audio_stage.py b/nemo_retriever/tests/test_audio_stage.py index 33124adbf..b23209513 100644 --- a/nemo_retriever/tests/test_audio_stage.py +++ b/nemo_retriever/tests/test_audio_stage.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 """ -Tests for the audio extraction-only stage (retriever audio extract). +Tests for the audio extraction-only stage (nr audio extract). """ from pathlib import Path From 2f94f14f3b9a1f75420de03e355e9f4fc5343cf5 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Fri, 6 Mar 2026 15:22:33 -0500 Subject: [PATCH 2/4] updates and unit tests --- nemo_retriever/pyproject.toml | 1 + .../src/nemo_retriever/adapters/cli/main.py | 2 +- .../examples/inprocess_pipeline.py | 2 +- .../ingest_modes/inprocess_cli.py | 291 ------------------ nemo_retriever/tests/test_nr_cli.py | 153 +++++++++ 5 files changed, 156 insertions(+), 293 deletions(-) delete mode 100644 nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py create mode 100644 nemo_retriever/tests/test_nr_cli.py diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 54de81697..24c400f43 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "soundfile>=0.12.0", "scipy>=1.11.0", "nvidia-ml-py", + "pytest" ] [project.optional-dependencies] diff --git a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py index 72ec2fe96..e533892e8 100644 --- a/nemo_retriever/src/nemo_retriever/adapters/cli/main.py +++ b/nemo_retriever/src/nemo_retriever/adapters/cli/main.py @@ -14,7 +14,7 @@ from nemo_retriever.harness import app as harness_app from nemo_retriever.html import __main__ as html_main from nemo_retriever.utils.image import app as image_app -from nemo_retriever.ingest_modes.inprocess_cli import app as inprocess_app +from nemo_retriever.examples.inprocess_pipeline import app as inprocess_app from nemo_retriever.local import app as local_app from nemo_retriever.online import __main__ as online_main from nemo_retriever.pdf import app as pdf_app diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index ea6295c2b..37835af99 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -4,7 +4,7 @@ """ In-process ingestion pipeline (no Ray) with optional recall evaluation. -Run with: uv run python -m nemo_retriever.examples.inprocess_pipeline +Run with: nr ingest inprocess """ import json diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py deleted file mode 100644 index db9f0ef99..000000000 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess_cli.py +++ /dev/null @@ -1,291 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -In-process ingestion pipeline CLI. - -Run with: nr inprocess -""" - -from __future__ import annotations - -import logging -import time -from pathlib import Path -from typing import Optional - -import typer - -logger = logging.getLogger(__name__) - -app = typer.Typer(help="Run the in-process ingestion pipeline (no Ray required).") - -LANCEDB_URI = "lancedb" -LANCEDB_TABLE = "nv-ingest" - - -@app.command() -def main( - input_path: Path = typer.Argument( - ..., - help="File or directory containing PDFs, .txt, .html, or .doc/.pptx files to ingest.", - path_type=Path, - ), - debug: bool = typer.Option( - False, - "--debug/--no-debug", - help="Enable debug-level logging.", - ), - input_type: str = typer.Option( - "pdf", - "--input-type", - help="Input format: 'pdf', 'txt', 'html', or 'doc'. " - "Use 'txt' for .txt, 'html' for .html (markitdown -> chunks), " - "'doc' for .docx/.pptx (converted to PDF via LibreOffice).", - ), - # -- Extract params -------------------------------------------------------- - extract_text: bool = typer.Option( - True, - "--extract-text/--no-extract-text", - help="Extract text from PDF pages.", - ), - extract_tables: bool = typer.Option( - True, - "--extract-tables/--no-extract-tables", - help="Extract tables from PDF pages.", - ), - extract_charts: bool = typer.Option( - True, - "--extract-charts/--no-extract-charts", - help="Extract charts from PDF pages.", - ), - extract_infographics: bool = typer.Option( - False, - "--extract-infographics/--no-extract-infographics", - help="Extract infographics from PDF pages.", - ), - use_table_structure: bool = typer.Option( - False, - "--use-table-structure", - help="Enable the combined table-structure + OCR stage for tables.", - ), - table_output_format: Optional[str] = typer.Option( - None, - "--table-output-format", - help="Table output format: 'pseudo_markdown' (OCR-only) or 'markdown' (table-structure + OCR).", - ), - inference_batch_size: Optional[int] = typer.Option( - None, - "--inference-batch-size", - help="Inference batch size for detection models.", - ), - page_elements_invoke_url: Optional[str] = typer.Option( - None, - "--page-elements-invoke-url", - help="Optional remote endpoint URL for page-elements model inference.", - ), - ocr_invoke_url: Optional[str] = typer.Option( - None, - "--ocr-invoke-url", - help="Optional remote endpoint URL for OCR model inference.", - ), - table_structure_invoke_url: Optional[str] = typer.Option( - None, - "--table-structure-invoke-url", - help="Optional remote endpoint URL for table-structure model inference.", - ), - # -- Embed params ---------------------------------------------------------- - embed_model_name: str = typer.Option( - "nvidia/llama-3.2-nv-embedqa-1b-v2", - "--embed-model-name", - help="Embedding model name passed to .embed().", - ), - embed_invoke_url: Optional[str] = typer.Option( - None, - "--embed-invoke-url", - help="Optional remote endpoint URL for embedding model inference.", - ), - embed_modality: str = typer.Option( - "text", - "--embed-modality", - help="Default embedding modality: 'text', 'image', or 'text_image'.", - ), - embed_granularity: str = typer.Option( - "element", - "--embed-granularity", - help="Embedding granularity: 'element' (per table/chart/text) or 'page' (per page).", - ), - text_elements_modality: Optional[str] = typer.Option( - None, - "--text-elements-modality", - help="Embedding modality override for page-text rows. Falls back to --embed-modality.", - ), - structured_elements_modality: Optional[str] = typer.Option( - None, - "--structured-elements-modality", - help="Embedding modality override for table/chart/infographic rows. Falls back to --embed-modality.", - ), - # -- VDB upload params ----------------------------------------------------- - lancedb_uri: str = typer.Option( - LANCEDB_URI, - "--lancedb-uri", - help="LanceDB URI/path for this run.", - ), - lancedb_table: str = typer.Option( - LANCEDB_TABLE, - "--lancedb-table", - help="LanceDB table name.", - ), - hybrid: bool = typer.Option( - False, - "--hybrid/--no-hybrid", - help="Enable LanceDB hybrid mode (dense + FTS text).", - ), - overwrite: bool = typer.Option( - True, - "--overwrite/--no-overwrite", - help="Overwrite existing LanceDB table (False to append).", - ), - # -- Output ---------------------------------------------------------------- - output_directory: Optional[str] = typer.Option( - None, - "--output-dir", - help="Optional directory to write per-document JSON results.", - ), - # -- Execution params ------------------------------------------------------ - parallel: bool = typer.Option( - False, - "--parallel/--no-parallel", - help="Enable parallel CPU extraction via ProcessPoolExecutor.", - ), - max_workers: Optional[int] = typer.Option( - None, - "--max-workers", - help="Max workers for parallel CPU extraction (default: CPU count).", - ), - gpu_devices: Optional[str] = typer.Option( - None, - "--gpu-devices", - help="Comma-separated GPU device IDs for multi-GPU pipelined execution (e.g. '0,1').", - ), - page_chunk_size: int = typer.Option( - 32, - "--page-chunk-size", - help="Number of pages per chunk for parallel processing.", - ), - show_progress: bool = typer.Option( - True, - "--show-progress/--no-show-progress", - help="Show progress bars during ingestion.", - ), -) -> None: - """Run the in-process ingestion pipeline on local documents (no Ray required).""" - from nemo_retriever.ingest_modes.inprocess import InProcessIngestor - from nemo_retriever.params import ( - EmbedParams, - ExtractParams, - HtmlChunkParams, - IngestExecuteParams, - TextChunkParams, - VdbUploadParams, - ) - - log_level = logging.DEBUG if debug else logging.INFO - logging.basicConfig( - level=log_level, - format="%(asctime)s %(levelname)s %(name)s: %(message)s", - force=True, - ) - - lancedb_uri_abs = str(Path(lancedb_uri).expanduser().resolve()) - - input_path = Path(input_path) - if input_path.is_file(): - file_patterns = [str(input_path)] - elif input_path.is_dir(): - ext_map = { - "txt": ["*.txt"], - "html": ["*.html"], - "doc": ["*.docx", "*.pptx"], - } - exts = ext_map.get(input_type, ["*.pdf"]) - file_patterns = [str(input_path / e) for e in exts] - else: - raise typer.BadParameter(f"Path does not exist: {input_path}") - - ingestor = InProcessIngestor() - ingestor = ingestor.files(file_patterns) - - if input_type == "txt": - ingestor = ingestor.extract_txt(TextChunkParams(max_tokens=512, overlap_tokens=0)) - elif input_type == "html": - ingestor = ingestor.extract_html(HtmlChunkParams(max_tokens=512, overlap_tokens=0)) - else: - extract_kw: dict = dict( - extract_text=extract_text, - extract_tables=extract_tables, - extract_charts=extract_charts, - extract_infographics=extract_infographics, - use_table_structure=use_table_structure, - table_output_format=table_output_format, - ) - if inference_batch_size is not None: - extract_kw["inference_batch_size"] = inference_batch_size - if page_elements_invoke_url: - extract_kw["page_elements_invoke_url"] = page_elements_invoke_url - if ocr_invoke_url: - extract_kw["ocr_invoke_url"] = ocr_invoke_url - if table_structure_invoke_url: - extract_kw["table_structure_invoke_url"] = table_structure_invoke_url - ingestor = ingestor.extract(ExtractParams(**extract_kw)) - - ingestor = ingestor.embed( - EmbedParams( - model_name=str(embed_model_name), - embed_invoke_url=embed_invoke_url, - embed_modality=embed_modality, - text_elements_modality=text_elements_modality, - structured_elements_modality=structured_elements_modality, - embed_granularity=embed_granularity, - ) - ) - - ingestor = ingestor.vdb_upload( - VdbUploadParams( - lancedb={ - "lancedb_uri": lancedb_uri_abs, - "table_name": lancedb_table, - "overwrite": overwrite, - "create_index": True, - "hybrid": hybrid, - } - ) - ) - - if output_directory: - ingestor = ingestor.save_to_disk(output_directory=output_directory) - - parsed_gpu_devices = None - if gpu_devices: - parsed_gpu_devices = [int(d.strip()) for d in gpu_devices.split(",") if d.strip()] - - logger.info("Running in-process ingestion...") - start = time.perf_counter() - - ingestor.ingest( - params=IngestExecuteParams( - show_progress=show_progress, - parallel=parallel, - max_workers=max_workers, - gpu_devices=parsed_gpu_devices, - page_chunk_size=page_chunk_size, - ) - ) - - elapsed = time.perf_counter() - start - logger.info(f"In-process ingestion complete in {elapsed:.2f}s") - - -if __name__ == "__main__": - app() diff --git a/nemo_retriever/tests/test_nr_cli.py b/nemo_retriever/tests/test_nr_cli.py new file mode 100644 index 000000000..6542c3dfa --- /dev/null +++ b/nemo_retriever/tests/test_nr_cli.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for the ``nr`` CLI entrypoint, the ``nr ingest`` command group, +and the ``nr ingest inprocess`` / ``nr ingest batch`` subcommands. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +try: + from nemo_retriever.adapters.cli.main import app +except ImportError as _exc: + pytest.skip(f"CLI dependencies not available: {_exc}", allow_module_level=True) + +from typer.testing import CliRunner + +RUNNER = CliRunner() + + +# --------------------------------------------------------------------------- +# pyproject.toml entrypoint +# --------------------------------------------------------------------------- + + +def test_pyproject_entrypoint_is_nr() -> None: + """The installed script name must be ``nr``, not ``retriever``.""" + pyproject = Path(__file__).resolve().parents[1] / "pyproject.toml" + text = pyproject.read_text() + assert "\nnr = " in text + assert "\nretriever = " not in text + + +# --------------------------------------------------------------------------- +# Top-level app +# --------------------------------------------------------------------------- + + +def test_top_level_help_succeeds() -> None: + result = RUNNER.invoke(app, ["--help"]) + assert result.exit_code == 0 + + +def test_top_level_help_lists_ingest() -> None: + result = RUNNER.invoke(app, ["--help"]) + assert "ingest" in result.output + + +def test_version_flag() -> None: + result = RUNNER.invoke(app, ["--version"]) + assert result.exit_code == 0 + assert result.output.strip() + + +def test_existing_subcommands_still_registered() -> None: + """All pre-existing subcommands must still appear in top-level help.""" + result = RUNNER.invoke(app, ["--help"]) + for name in ("audio", "pdf", "local", "harness", "recall", "benchmark"): + assert name in result.output, f"Expected subcommand {name!r} in top-level help" + + +# --------------------------------------------------------------------------- +# ``nr ingest`` command group +# --------------------------------------------------------------------------- + + +def test_ingest_help_succeeds() -> None: + result = RUNNER.invoke(app, ["ingest", "--help"]) + assert result.exit_code == 0 + + +def test_ingest_help_lists_batch_and_inprocess() -> None: + result = RUNNER.invoke(app, ["ingest", "--help"]) + assert "batch" in result.output + assert "inprocess" in result.output + + +# --------------------------------------------------------------------------- +# ``nr ingest batch`` subcommand +# --------------------------------------------------------------------------- + + +def test_ingest_batch_help_succeeds() -> None: + result = RUNNER.invoke(app, ["ingest", "batch", "--help"]) + assert result.exit_code == 0 + + +def test_ingest_batch_help_shows_key_options() -> None: + result = RUNNER.invoke(app, ["ingest", "batch", "--help"]) + for flag in ("--embed-model-name", "--lancedb-uri", "--input-type", "--debug"): + assert flag in result.output, f"Expected {flag!r} in batch --help output" + + +# --------------------------------------------------------------------------- +# ``nr ingest inprocess`` subcommand – help / option presence +# --------------------------------------------------------------------------- + + +def test_ingest_inprocess_help_succeeds() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + assert result.exit_code == 0 + + +def test_ingest_inprocess_help_shows_extract_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + for flag in ( + "--use-table-structure", + "--table-output-format", + "--table-structure-invoke-url", + "--page-elements-invoke-url", + "--ocr-invoke-url", + ): + assert flag in result.output, f"Expected {flag!r} in inprocess --help" + + +def test_ingest_inprocess_help_shows_embed_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + for flag in ( + "--embed-model-name", + "--embed-invoke-url", + "--embed-modality", + "--embed-granularity", + "--text-elements-modality", + "--structured-elements-modality", + ): + assert flag in result.output, f"Expected {flag!r} in inprocess --help" + + +def test_ingest_inprocess_help_shows_execution_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + for flag in ("--max-workers", "--gpu-devices", "--num-gpus"): + assert flag in result.output, f"Expected {flag!r} in inprocess --help" + + +def test_ingest_inprocess_help_shows_recall_options() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + for flag in ("--query-csv", "--no-recall-details"): + assert flag in result.output, f"Expected {flag!r} in inprocess --help" + + +# --------------------------------------------------------------------------- +# ``nr ingest inprocess`` subcommand – argument validation +# --------------------------------------------------------------------------- + + +def test_ingest_inprocess_rejects_missing_path() -> None: + result = RUNNER.invoke(app, ["ingest", "inprocess"]) + assert result.exit_code != 0 From 775fcf4ac2148b65340c31c5e24984689a1bffa3 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Fri, 6 Mar 2026 15:25:25 -0500 Subject: [PATCH 3/4] updates and unit tests --- .../nemo_retriever/examples/batch_pipeline.py | 2 +- .../examples/inprocess_pipeline.py | 2 +- nemo_retriever/tests/test_nr_cli.py | 28 +++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 1999c6437..c11e96b2f 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -345,7 +345,7 @@ def _hit_key_and_distance(hit: dict) -> tuple[str | None, float | None]: return key, dist -@app.command() +@app.command(name="pipeline") def main( ctx: typer.Context, debug: bool = typer.Option( diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index 37835af99..930505da8 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -96,7 +96,7 @@ def _hit_key_and_distance(hit: dict) -> tuple[str | None, float | None]: return key, dist -@app.command() +@app.command(name="pipeline") def main( input_path: Path = typer.Argument( ..., diff --git a/nemo_retriever/tests/test_nr_cli.py b/nemo_retriever/tests/test_nr_cli.py index 6542c3dfa..24d37ec3c 100644 --- a/nemo_retriever/tests/test_nr_cli.py +++ b/nemo_retriever/tests/test_nr_cli.py @@ -86,14 +86,14 @@ def test_ingest_help_lists_batch_and_inprocess() -> None: def test_ingest_batch_help_succeeds() -> None: - result = RUNNER.invoke(app, ["ingest", "batch", "--help"]) + result = RUNNER.invoke(app, ["ingest", "batch", "pipeline", "--help"]) assert result.exit_code == 0 def test_ingest_batch_help_shows_key_options() -> None: - result = RUNNER.invoke(app, ["ingest", "batch", "--help"]) + result = RUNNER.invoke(app, ["ingest", "batch", "pipeline", "--help"]) for flag in ("--embed-model-name", "--lancedb-uri", "--input-type", "--debug"): - assert flag in result.output, f"Expected {flag!r} in batch --help output" + assert flag in result.output, f"Expected {flag!r} in batch pipeline --help output" # --------------------------------------------------------------------------- @@ -102,12 +102,12 @@ def test_ingest_batch_help_shows_key_options() -> None: def test_ingest_inprocess_help_succeeds() -> None: - result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) assert result.exit_code == 0 def test_ingest_inprocess_help_shows_extract_options() -> None: - result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) for flag in ( "--use-table-structure", "--table-output-format", @@ -115,11 +115,11 @@ def test_ingest_inprocess_help_shows_extract_options() -> None: "--page-elements-invoke-url", "--ocr-invoke-url", ): - assert flag in result.output, f"Expected {flag!r} in inprocess --help" + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" def test_ingest_inprocess_help_shows_embed_options() -> None: - result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) for flag in ( "--embed-model-name", "--embed-invoke-url", @@ -128,26 +128,26 @@ def test_ingest_inprocess_help_shows_embed_options() -> None: "--text-elements-modality", "--structured-elements-modality", ): - assert flag in result.output, f"Expected {flag!r} in inprocess --help" + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" def test_ingest_inprocess_help_shows_execution_options() -> None: - result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) for flag in ("--max-workers", "--gpu-devices", "--num-gpus"): - assert flag in result.output, f"Expected {flag!r} in inprocess --help" + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" def test_ingest_inprocess_help_shows_recall_options() -> None: - result = RUNNER.invoke(app, ["ingest", "inprocess", "--help"]) + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline", "--help"]) for flag in ("--query-csv", "--no-recall-details"): - assert flag in result.output, f"Expected {flag!r} in inprocess --help" + assert flag in result.output, f"Expected {flag!r} in inprocess pipeline --help" # --------------------------------------------------------------------------- -# ``nr ingest inprocess`` subcommand – argument validation +# ``nr ingest inprocess pipeline`` – argument validation # --------------------------------------------------------------------------- def test_ingest_inprocess_rejects_missing_path() -> None: - result = RUNNER.invoke(app, ["ingest", "inprocess"]) + result = RUNNER.invoke(app, ["ingest", "inprocess", "pipeline"]) assert result.exit_code != 0 From 8b2e7b71fbdb2f312489418119ba3e705ef9f770 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Fri, 6 Mar 2026 15:33:09 -0500 Subject: [PATCH 4/4] Fix syntax issue --- .../src/nemo_retriever/examples/batch_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index c11e96b2f..2392f964a 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -855,11 +855,11 @@ def main( ) ingest_elapsed_s = time.perf_counter() - ingest_start + num_rows = ingest_results.groupby("source_id").count().count() logger.info( - f"Ingestion complete. {len(ingest_results)} rows procesed in " - f"{ingest_elapsed_s:.2f} seconds. {len(ingest_results)/ingest_elapsed_s:.2f} PPS" + f"Ingestion complete. {num_rows} rows procesed in " + f"{ingest_elapsed_s:.2f} seconds. {num_rows/ingest_elapsed_s:.2f} PPS" ) - logger.info(f"Ingestion Dataset: {ingestor.get_dataset()}") if isinstance(ingestor, BatchIngestor): error_rows = ingestor.get_error_rows(dataset=ingest_results).materialize()