diff --git a/.github/workflows/pypi-nightly-publish.yml b/.github/workflows/pypi-nightly-publish.yml index 98ea26927..40549a429 100644 --- a/.github/workflows/pypi-nightly-publish.yml +++ b/.github/workflows/pypi-nightly-publish.yml @@ -121,15 +121,16 @@ jobs: run: | cd nemo_retriever python - <<'PY' + import re from datetime import datetime, timezone from pathlib import Path - Path("src/nemo_retriever/_build_info.py").write_text( - '"""Build metadata written by CI before packaging."""\n\n' - 'BUILD_GIT_SHA = "${{ github.sha }}"\n' - f'BUILD_DATE = "{datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")}"\n', - encoding="utf-8", - ) + vf = Path("src/nemo_retriever/version.py") + src = vf.read_text(encoding="utf-8") + src = re.sub(r'^_PACKAGE_BUILD_GIT_SHA = .*$', '_PACKAGE_BUILD_GIT_SHA = "${{ github.sha }}"', src, flags=re.M) + build_date = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + src = re.sub(r'^_PACKAGE_BUILD_DATE = .*$', f'_PACKAGE_BUILD_DATE = "{build_date}"', src, flags=re.M) + vf.write_text(src, encoding="utf-8") PY RETRIEVER_RELEASE_TYPE=${{ env.RELEASE_TYPE }} \ RETRIEVER_VERSION=${{ env.VERSION }} \ diff --git a/.github/workflows/reusable-pypi-build.yml b/.github/workflows/reusable-pypi-build.yml index 7f9947d1d..7df9091a8 100644 --- a/.github/workflows/reusable-pypi-build.yml +++ b/.github/workflows/reusable-pypi-build.yml @@ -91,15 +91,16 @@ jobs: run: | cd nemo_retriever python - <<'PY' + import re from datetime import datetime, timezone from pathlib import Path - Path("src/nemo_retriever/_build_info.py").write_text( - '"""Build metadata written by CI before packaging."""\n\n' - 'BUILD_GIT_SHA = "${{ github.sha }}"\n' - f'BUILD_DATE = "{datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")}"\n', - encoding="utf-8", - ) + vf = Path("src/nemo_retriever/version.py") + src = vf.read_text(encoding="utf-8") + src = re.sub(r'^_PACKAGE_BUILD_GIT_SHA = .*$', '_PACKAGE_BUILD_GIT_SHA = "${{ github.sha }}"', src, flags=re.M) + build_date = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + src = re.sub(r'^_PACKAGE_BUILD_DATE = .*$', f'_PACKAGE_BUILD_DATE = "{build_date}"', src, flags=re.M) + vf.write_text(src, encoding="utf-8") PY RETRIEVER_RELEASE_TYPE=${{ inputs.release-type }} \ RETRIEVER_VERSION=${{ steps.set-version.outputs.version }} \ diff --git a/nemo_retriever/RAY_BALANCING.md b/nemo_retriever/RAY_BALANCING.md deleted file mode 100644 index ec71d3142..000000000 --- a/nemo_retriever/RAY_BALANCING.md +++ /dev/null @@ -1,162 +0,0 @@ -# Ray Balancing Strategy - -This document describes the default Ray Data balancing strategy used by -`nemo_retriever/src/nemo_retriever/ray_balance_dag.py`, why each test family exists, and -what to try next. - -## Goal - -Keep the default experiment set broad enough to find bottlenecks, but small -enough to run repeatedly across multiple machines. The default matrix is now -designed to stay under 1,000 variants. - -## Design Approach - -The default matrix uses a practical DOE-style approach: - -1. **Baseline run** - A stable reference point used for quick comparisons. -2. **One-factor-at-a-time (OFAT) sweeps** - Change one knob while keeping others at baseline to isolate sensitivity. -3. **Targeted interaction sweeps** - Test only high-value parameter interactions where coupling is expected. - -This avoids a full Cartesian product over all knobs (which grows to millions of -runs and is usually not actionable). - -## Default Matrix Definition - -The script’s default matrix includes the following families. 
- -### A) Baseline - -- Single baseline config with balanced CPU/GPU and midrange batch sizes. -- Purpose: anchor for all deltas and detect regressions quickly. - -### B) OFAT Sweeps - -- `pdf_workers`: `[4, 8, 12, 16]` -- `pdf_num_cpus`: `[1.0, 2.0, 3.0, 4.0]` -- `pdf_split_bs`: `[1, 4, 8]` -- `pdf_bs`: `[8, 16, 24, 32]` -- `page_elements_bs`: `[8, 16, 24, 32]` -- `page_elements_workers`: `[1, 2, 3]` -- `ocr_workers`: `[1, 2, 3]` -- `ocr_bs`: `[8, 16, 24, 32]` -- `embed_workers`: `[1, 2, 3]` -- `embed_bs`: `[128, 256, 512, 768]` -- `page_elements_cpus_per_actor`: `[1.0, 2.0, 4.0]` -- `ocr_cpus_per_actor`: `[1.0, 2.0, 4.0]` -- `embed_cpus_per_actor`: `[1.0, 2.0, 4.0]` -- `gpu_page_elements`: `[0.25, 0.5, 0.75]` -- `gpu_ocr`: `[0.75, 1.0]` -- `gpu_embed`: `[0.25, 0.5, 0.75]` - -Why this matters: - -- Identifies which knobs are low-impact (can be fixed) vs high-impact (worth - deeper search). -- Narrows the search space before trying interactions. - -### C) Targeted Interaction Grids - -1. **OCR throughput coupling** - - `ocr_bs x ocr_workers x gpu_ocr` -2. **Embedding throughput coupling** - - `embed_bs x embed_workers x gpu_embed` -3. **Page-elements throughput coupling** - - `page_elements_bs x page_elements_workers x gpu_page_elements` -4. **CPU extraction balance** - - `pdf_workers x pdf_num_cpus x pdf_bs` -5. **Actor CPU pressure** - - `page_elements_cpus_per_actor x ocr_cpus_per_actor x embed_cpus_per_actor` -6. **Pipeline batch-shape interaction** - - `pdf_bs x ocr_bs x embed_bs` - -Why this matters: - -- These are the pairs/triples most likely to create backpressure or starvation. -- Captures non-linear behavior without exploding matrix size. - -## What Has Been Tried So Far - -- Full/fat sweeps were tested early and found to be too large operationally. -- Matrix generation now deduplicates repeated variants and focuses on high-signal - combinations. -- Row-range sharding support (`--row-start`, `--row-end`) is used for distributed - execution across machines. -- Runtime metrics are captured per run: - - Ray Data operator stats (`rd_dataset.stats()`) - - Ray timeline (`ray.timeline(...)`) - -## GPU Constraint Handling - -Some deployments reject fractional `num_gpus` values above `1.0` per actor. - -To avoid invalid scheduling requests, matrix generation/loading normalizes any -`gpu_* > 1.0` request by: - -- setting per-actor GPU to `1.0`, and -- multiplying the corresponding actor count (`*_workers`) by `ceil(gpu_*)`. - -This keeps total requested GPU capacity similar while using valid actor specs. - -## Runtime Metrics Artifacts - -For each run, the pipeline writes metrics files under the run logs directory -(`runtime_metrics/` subdir) with the run prefix: - -- `.rd_dataset.stats.txt` (per-operator Ray Data stats) -- `.ray.timeline.json` (cluster task timeline) -- `.runtime.summary.json` (top-level run summary) - -## LanceDB Isolation and Recall Guarantees - -To prevent cross-run contamination: - -- The matrix runner deletes the configured LanceDB URI path before each run. -- Each run then recreates and writes a fresh `nv-ingest` table. - -To ensure recall is actually executed: - -- The batch pipeline now treats a missing LanceDB table as a hard failure - (after a short retry), instead of silently skipping recall. -- The matrix results CSV includes a `recall_ran` flag and marks runs as failed - if recall metrics are absent. 
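
The matrix construction described above (a baseline, OFAT sweeps, and targeted interaction grids, with duplicates removed) reduces to a short builder. The sketch below is illustrative only: the knob names, sweep values, and baseline settings are taken from the lists in this document, but the builder itself and its constants are not the actual generator in `ray_balance_dag.py`.

```python
from itertools import product

# Knob names and values come from the sweep lists above; the baseline values
# and this builder are illustrative, not the generator in ray_balance_dag.py.
BASELINE = {"pdf_workers": 8, "pdf_bs": 16, "ocr_bs": 16, "embed_bs": 256}
OFAT_SWEEPS = {
    "pdf_workers": [4, 8, 12, 16],
    "pdf_bs": [8, 16, 24, 32],
    "ocr_bs": [8, 16, 24, 32],
    "embed_bs": [128, 256, 512, 768],
}
# One targeted interaction grid: the pipeline batch-shape interaction.
INTERACTION_GRIDS = [("pdf_bs", "ocr_bs", "embed_bs")]


def build_matrix() -> list[dict]:
    variants = [dict(BASELINE)]
    # One-factor-at-a-time: vary a single knob, hold the rest at baseline.
    for knob, values in OFAT_SWEEPS.items():
        variants.extend({**BASELINE, knob: v} for v in values)
    # Targeted interactions: small Cartesian grids over coupled knobs only.
    for knobs in INTERACTION_GRIDS:
        for combo in product(*(OFAT_SWEEPS[k] for k in knobs)):
            variants.append({**BASELINE, **dict(zip(knobs, combo))})
    # Deduplicate repeats (the baseline reappears inside every sweep).
    seen, unique = set(), []
    for v in variants:
        key = tuple(sorted(v.items()))
        if key not in seen:
            seen.add(key)
            unique.append(v)
    return unique


print(f"{len(build_matrix())} variants")
```

This is why the default stays far below a full Cartesian product: each OFAT sweep adds only a handful of rows, and only the explicitly listed interactions expand into grids.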
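
The GPU constraint handling above is likewise a small normalization step. A minimal sketch follows, assuming a flat per-variant dict keyed by the CSV column names; the function name is hypothetical and not part of the script's API.

```python
import math


def normalize_gpu_request(variant: dict, stage: str) -> dict:
    """Clamp a fractional per-actor GPU request above 1.0 to a valid Ray spec."""
    gpu_key = f"gpu_{stage}"          # e.g. "gpu_ocr"
    workers_key = f"{stage}_workers"  # e.g. "ocr_workers"
    gpu = float(variant[gpu_key])
    if gpu > 1.0:
        # One whole GPU per actor, more actors: total capacity stays similar.
        variant[workers_key] = int(variant[workers_key]) * math.ceil(gpu)
        variant[gpu_key] = 1.0
    return variant


# gpu_ocr=1.5 with 2 workers -> gpu_ocr=1.0 with 4 workers.
print(normalize_gpu_request({"gpu_ocr": 1.5, "ocr_workers": 2}, "ocr"))
```

Running it on `gpu_ocr=1.5` with 2 workers yields one whole GPU per actor and 4 actors, so the total requested capacity rounds up from 3.0 to 4.0 GPUs while every per-actor request stays schedulable.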
- -## How to Generate and Run - -Generate matrix CSV only: - -```bash -python nemo_retriever/src/nemo_retriever/ray_balance_dag.py \ - --input-dir /path/to/pdfs \ - --write-default-matrix-csv nemo_retriever/ray_balance_variants.csv \ - --exit-after-writing-matrix -``` - -Run a shard: - -```bash -python nemo_retriever/src/nemo_retriever/ray_balance_dag.py \ - --input-dir /path/to/pdfs \ - --matrix-csv nemo_retriever/ray_balance_variants.csv \ - --row-start 1 \ - --row-end 200 \ - --output-csv nemo_retriever/ray_balance_results_001_200.csv -``` - -## Recommended Next Experiments - -1. **Adaptive second-pass search** - - Keep top 10-20% by throughput and run local neighborhood sweeps. -2. **Constraint-aware optimization** - - Add objective penalties for GPU OOM, high object-store pressure, or low - recall to avoid fragile winners. -3. **Dataset-stratified tests** - - Split small/medium/large PDFs and optimize per segment; mixed corpora often - hide better settings. -4. **Stability runs** - - Re-run top candidates 3-5 times and compare variance, not just best mean. -5. **Multi-objective scoring** - - Rank by weighted score: throughput, recall@k, and cost (GPU-hours). diff --git a/nemo_retriever/chart_stage_config.yaml b/nemo_retriever/chart_stage_config.yaml deleted file mode 100644 index fada15b68..000000000 --- a/nemo_retriever/chart_stage_config.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Example config for chart extraction. -# -# Intended usage (once the chart stage CLI is wired up similarly to table stage): -# - `retriever chart stage run --config --input ` -# - `retriever local stage4 run --config --input ` -# -# This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_chart_schema.ChartExtractorSchema` -# via `nemo_retriever.chart.config.load_chart_extractor_schema_from_dict`. -# -# IMPORTANT: -# If `endpoint_config.yolox_endpoints` is null/empty, chart extraction will fall back to the local -# HuggingFace model (`nemo_retriever.model.local.nemotron_graphic_elements_v1`). -# If `endpoint_config.ocr_endpoints` is null/empty, chart extraction falls back to local Nemotron OCR -# (`nemo_retriever.model.local.nemotron_ocr_v1`) with default HuggingFace model loading. -# - -# Optional worker settings -max_queue_size: 1 -n_workers: 2 -raise_on_failure: false - -# Endpoint configuration for chart extraction (YOLOX graphic-elements + OCR). -endpoint_config: - # Optional auth token for secured services (NIM / hosted endpoints) - auth_token: null - - # Tuple/list in the form: [grpc, http] - # - # Chart extraction uses the YOLOX *graphic-elements* model (not page-elements). - # - # For the provided `docker-compose.yaml`, the host-mapped ports are: - # - graphic-elements HTTP: 8003 (container 8000) - # - graphic-elements gRPC: 8004 (container 8001) - # - # If you're running from inside the docker compose network instead, these often look like: - # - "graphic-elements:8001" and "http://graphic-elements:8000/v1/infer" - # yolox_endpoints: ["localhost:8004", "http://localhost:8003/v1/infer"] - yolox_endpoints: null - # Optional; if omitted it is inferred from which endpoint is present. - # yolox_infer_protocol: grpc - - # OCR model endpoints (same pattern: [grpc, http]). - # For the provided `docker-compose.yaml`, the host-mapped ports are: - # - ocr HTTP: 8019 (container 8000) - # - ocr gRPC: 8010 (container 8001) - # ocr_endpoints: ["localhost:8010", "http://localhost:8019/v1/infer"] - ocr_endpoints: null - # Optional; if omitted it is inferred from which endpoint is present. 
- # ocr_infer_protocol: grpc - - # Optional performance knobs - nim_batch_size: 2 - workers_per_progress_engine: 5 diff --git a/nemo_retriever/embedding_stage_config.yaml b/nemo_retriever/embedding_stage_config.yaml deleted file mode 100644 index f04c29841..000000000 --- a/nemo_retriever/embedding_stage_config.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Text embedding stage config (nemo_retriever.text_embed) -# -# This YAML is passed to: -# nemo_retriever.text_embed.config.load_text_embedding_schema_from_dict(...) -# which validates against nv-ingest-api's `TextEmbeddingSchema`. -# -# Minimal required fields are optional (schema provides defaults), but you -# typically set api_key / endpoint / model to point at your embedding service. - -# Auth (optional; can also be provided via task_config overrides) -api_key: "" # e.g. $NGC_API_KEY or $NVIDIA_API_KEY - -# Embedding service settings -# If set to null/empty, `retriever local stage5` will fall back to local HF embeddings -# via `nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder`. -embedding_nim_endpoint: null -# embedding_nim_endpoint: "http://localhost:8012/v1" -embedding_model: "nvidia/llama-nemotron-embed-1b-v2" - -# Request formatting -encoding_format: "float" # usually "float" -input_type: "passage" # "passage" (docs) or "query" (queries) -truncate: "END" # how the service truncates long inputs - -# Batch sizing (NIM-side batching is handled internally; this is stage batching) -batch_size: 4 - -# Modalities for multi-modal models (leave as "text" for text-only models) -text_elements_modality: "text" -image_elements_modality: "text" -structured_elements_modality: "text" -audio_elements_modality: "text" - -# Behavior -raise_on_failure: false -httpx_log_level: "WARNING" # DEBUG | INFO | WARNING | ERROR | CRITICAL - -# Optional: embed custom content from metadata.custom_content via glom path -# custom_content_field: "my_field" # e.g. "foo" or "nested.foo" -# result_target_field: "my_field_embedding" # where to write embedding under custom_content - -# Optional: request embedding vector size if the backend supports it -# dimensions: 1024 diff --git a/nemo_retriever/infographic_stage_config.yaml b/nemo_retriever/infographic_stage_config.yaml deleted file mode 100644 index 67ac68546..000000000 --- a/nemo_retriever/infographic_stage_config.yaml +++ /dev/null @@ -1,33 +0,0 @@ -# Example config for: -# - `retriever infographic stage run --config --input ` -# - `retriever local stage2 run --config --input ` -# -# This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_infographic_schema.InfographicExtractorSchema` -# via `nemo_retriever.infographic.config.load_infographic_extractor_schema_from_dict`. -# -# IMPORTANT: -# `endpoint_config.ocr_endpoints` must provide at least one endpoint (gRPC or HTTP). Both cannot be null/empty. -# - -# Optional worker settings -max_queue_size: 1 -n_workers: 2 -raise_on_failure: false - -# Endpoint configuration for OCR used to enrich infographic primitives. -endpoint_config: - # Tuple/list in the form: [grpc, http] - # - gRPC example: "ocr:8001" - # - HTTP example: "http://ocr:8000/v1/infer" - #ocr_endpoints: ["localhost:8001", "http://localhost:8019/v1/infer"] - ocr_endpoints: null - - # Optional; if omitted it is inferred from which endpoint is present. 
- # ocr_infer_protocol: grpc - - # Optional auth token for secured services (NIM) - auth_token: null - - # Optional performance knobs - nim_batch_size: 2 - workers_per_progress_engine: 5 diff --git a/nemo_retriever/pdf_stage_config.yaml b/nemo_retriever/pdf_stage_config.yaml deleted file mode 100644 index 33126ba61..000000000 --- a/nemo_retriever/pdf_stage_config.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# Example config for: `retriever pdf stage page-elements --config ` -# -# CLI override rule: -# - If you pass an option explicitly on the CLI, it wins. -# - Otherwise the value from this YAML file is used. -# -# You can run repeatedly: -# retriever pdf stage page-elements --config nemo_retriever/pdf_stage_config.yaml -# - -# Directory containing PDFs (scanned recursively for *.pdf) -input_dir: /home/local/jdyer/datasets/bo767 - -# PDF extraction method: pdfium | pdfium_hybrid | ocr | nemotron_parse | tika | unstructured_io | adobe | llama -method: pdfium - -# Optional auth token for NIM-backed services -auth_token: null - -endpoints: - yolox: - # If set to null then HuggingFace models will be used instead of NIMs - # grpc: localhost:8001 - # http: http://localhost:8000/v1/infer - grpc: null - http: null - - # Only required for method: nemotron_parse - nemotron_parse: - grpc: null - http: null - model_name: null - -extract: - text: true - # Text depth: page | document - text_depth: page - images: false - tables: true - charts: true - infographics: true - page_as_image: false - -outputs: - write_json: true - json_output_dir: /home/local/jdyer/datasets/bo767-results-hf-standalone/ - -# Optionally limit number of PDFs processed -limit: null diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index 87f4ac5d7..7b96665ab 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -80,7 +80,7 @@ dev = [ ] [project.scripts] -retriever = "nemo_retriever.__main__:main" +retriever = "nemo_retriever.adapters.cli.main:main" [tool.setuptools.dynamic] version = {attr = "nemo_retriever.version.get_build_version"} diff --git a/nemo_retriever/ray_balance_variants.csv b/nemo_retriever/ray_balance_variants.csv deleted file mode 100644 index 309b3cdf4..000000000 --- a/nemo_retriever/ray_balance_variants.csv +++ /dev/null @@ -1,240 +0,0 @@ -run_id,pdf_workers,pdf_num_cpus,pdf_split_bs,pdf_bs,page_elements_bs,page_elements_workers,ocr_workers,ocr_bs,embed_workers,embed_bs,page_elements_cpus_per_actor,ocr_cpus_per_actor,embed_cpus_per_actor,gpu_page_elements,gpu_ocr,gpu_embed,ray_address,start_ray -V00001,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00002,4,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00003,12,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00004,16,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00005,8,1.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00006,8,3.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00007,8,4.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00008,8,2.0,4,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00009,8,2.0,8,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00010,8,2.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00011,8,2.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00012,8,2.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00013,8,2.0,1,16,8,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00014,8,2.0,1,16,24,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00015,8,2.0,1,16,32,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false 
-V00016,8,2.0,1,16,16,2,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00017,8,2.0,1,16,16,3,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00018,8,2.0,1,16,16,1,1,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00019,8,2.0,1,16,16,1,3,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00020,8,2.0,1,16,16,1,2,8,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00021,8,2.0,1,16,16,1,2,24,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00022,8,2.0,1,16,16,1,2,32,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00023,8,2.0,1,16,16,1,2,16,2,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00024,8,2.0,1,16,16,1,2,16,3,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00025,8,2.0,1,16,16,1,2,16,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00026,8,2.0,1,16,16,1,2,16,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00027,8,2.0,1,16,16,1,2,16,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00028,8,2.0,1,16,16,1,2,16,1,256,2.0,2.0,1.0,0.5,1.0,0.5,,false -V00029,8,2.0,1,16,16,1,2,16,1,256,4.0,2.0,1.0,0.5,1.0,0.5,,false -V00030,8,2.0,1,16,16,1,2,16,1,256,1.0,1.0,1.0,0.5,1.0,0.5,,false -V00031,8,2.0,1,16,16,1,2,16,1,256,1.0,4.0,1.0,0.5,1.0,0.5,,false -V00032,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,2.0,0.5,1.0,0.5,,false -V00033,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,4.0,0.5,1.0,0.5,,false -V00034,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00035,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00036,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00037,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00038,8,2.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00039,8,2.0,1,16,16,1,1,8,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00040,8,2.0,1,16,16,1,1,8,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00041,8,2.0,1,16,16,1,2,8,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00042,8,2.0,1,16,16,1,3,8,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00043,8,2.0,1,16,16,1,3,8,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00044,8,2.0,1,16,16,1,1,16,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00045,8,2.0,1,16,16,1,3,16,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00046,8,2.0,1,16,16,1,1,24,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00047,8,2.0,1,16,16,1,1,24,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00048,8,2.0,1,16,16,1,2,24,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00049,8,2.0,1,16,16,1,3,24,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00050,8,2.0,1,16,16,1,3,24,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00051,8,2.0,1,16,16,1,1,32,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00052,8,2.0,1,16,16,1,1,32,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00053,8,2.0,1,16,16,1,2,32,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00054,8,2.0,1,16,16,1,3,32,1,256,1.0,2.0,1.0,0.5,0.75,0.5,,false -V00055,8,2.0,1,16,16,1,3,32,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00056,8,2.0,1,16,16,1,2,16,1,128,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00057,8,2.0,1,16,16,1,2,16,1,128,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00058,8,2.0,1,16,16,1,2,16,2,128,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00059,8,2.0,1,16,16,1,2,16,2,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00060,8,2.0,1,16,16,1,2,16,2,128,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00061,8,2.0,1,16,16,1,2,16,3,128,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00062,8,2.0,1,16,16,1,2,16,3,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00063,8,2.0,1,16,16,1,2,16,3,128,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00064,8,2.0,1,16,16,1,2,16,2,256,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00065,8,2.0,1,16,16,1,2,16,2,256,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00066,8,2.0,1,16,16,1,2,16,3,256,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00067,8,2.0,1,16,16,1,2,16,3,256,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00068,8,2.0,1,16,16,1,2,16,1,512,1.0,2.0,1.0,0.5,1.0,0.25,,false 
-V00069,8,2.0,1,16,16,1,2,16,1,512,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00070,8,2.0,1,16,16,1,2,16,2,512,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00071,8,2.0,1,16,16,1,2,16,2,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00072,8,2.0,1,16,16,1,2,16,2,512,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00073,8,2.0,1,16,16,1,2,16,3,512,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00074,8,2.0,1,16,16,1,2,16,3,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00075,8,2.0,1,16,16,1,2,16,3,512,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00076,8,2.0,1,16,16,1,2,16,1,768,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00077,8,2.0,1,16,16,1,2,16,1,768,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00078,8,2.0,1,16,16,1,2,16,2,768,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00079,8,2.0,1,16,16,1,2,16,2,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00080,8,2.0,1,16,16,1,2,16,2,768,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00081,8,2.0,1,16,16,1,2,16,3,768,1.0,2.0,1.0,0.5,1.0,0.25,,false -V00082,8,2.0,1,16,16,1,2,16,3,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00083,8,2.0,1,16,16,1,2,16,3,768,1.0,2.0,1.0,0.5,1.0,0.75,,false -V00084,8,2.0,1,16,8,1,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00085,8,2.0,1,16,8,1,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00086,8,2.0,1,16,8,2,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00087,8,2.0,1,16,8,2,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00088,8,2.0,1,16,8,2,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00089,8,2.0,1,16,8,3,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00090,8,2.0,1,16,8,3,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00091,8,2.0,1,16,8,3,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00092,8,2.0,1,16,16,2,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00093,8,2.0,1,16,16,2,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00094,8,2.0,1,16,16,3,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00095,8,2.0,1,16,16,3,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00096,8,2.0,1,16,24,1,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00097,8,2.0,1,16,24,1,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00098,8,2.0,1,16,24,2,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00099,8,2.0,1,16,24,2,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00100,8,2.0,1,16,24,2,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00101,8,2.0,1,16,24,3,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00102,8,2.0,1,16,24,3,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00103,8,2.0,1,16,24,3,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00104,8,2.0,1,16,32,1,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00105,8,2.0,1,16,32,1,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00106,8,2.0,1,16,32,2,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00107,8,2.0,1,16,32,2,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00108,8,2.0,1,16,32,2,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00109,8,2.0,1,16,32,3,2,16,1,256,1.0,2.0,1.0,0.25,1.0,0.5,,false -V00110,8,2.0,1,16,32,3,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00111,8,2.0,1,16,32,3,2,16,1,256,1.0,2.0,1.0,0.75,1.0,0.5,,false -V00112,4,1.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00113,4,1.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00114,4,1.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00115,4,1.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00116,4,2.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00117,4,2.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00118,4,2.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00119,4,3.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00120,4,3.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00121,4,3.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false 
-V00122,4,3.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00123,4,4.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00124,4,4.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00125,4,4.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00126,4,4.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00127,8,1.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00128,8,1.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00129,8,1.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00130,8,3.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00131,8,3.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00132,8,3.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00133,8,4.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00134,8,4.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00135,8,4.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00136,12,1.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00137,12,1.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00138,12,1.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00139,12,1.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00140,12,2.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00141,12,2.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00142,12,2.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00143,12,3.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00144,12,3.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00145,12,3.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00146,12,3.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00147,12,4.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00148,12,4.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00149,12,4.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00150,12,4.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00151,16,1.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00152,16,1.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00153,16,1.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00154,16,1.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00155,16,2.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00156,16,2.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00157,16,2.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00158,16,3.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00159,16,3.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00160,16,3.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00161,16,3.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00162,16,4.0,1,8,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00163,16,4.0,1,16,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00164,16,4.0,1,24,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00165,16,4.0,1,32,16,1,2,16,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00166,8,2.0,1,16,16,1,2,16,1,256,1.0,1.0,2.0,0.5,1.0,0.5,,false -V00167,8,2.0,1,16,16,1,2,16,1,256,1.0,1.0,4.0,0.5,1.0,0.5,,false -V00168,8,2.0,1,16,16,1,2,16,1,256,1.0,4.0,2.0,0.5,1.0,0.5,,false -V00169,8,2.0,1,16,16,1,2,16,1,256,1.0,4.0,4.0,0.5,1.0,0.5,,false -V00170,8,2.0,1,16,16,1,2,16,1,256,2.0,1.0,1.0,0.5,1.0,0.5,,false -V00171,8,2.0,1,16,16,1,2,16,1,256,2.0,1.0,2.0,0.5,1.0,0.5,,false -V00172,8,2.0,1,16,16,1,2,16,1,256,2.0,1.0,4.0,0.5,1.0,0.5,,false -V00173,8,2.0,1,16,16,1,2,16,1,256,2.0,2.0,2.0,0.5,1.0,0.5,,false -V00174,8,2.0,1,16,16,1,2,16,1,256,2.0,2.0,4.0,0.5,1.0,0.5,,false 
-V00175,8,2.0,1,16,16,1,2,16,1,256,2.0,4.0,1.0,0.5,1.0,0.5,,false -V00176,8,2.0,1,16,16,1,2,16,1,256,2.0,4.0,2.0,0.5,1.0,0.5,,false -V00177,8,2.0,1,16,16,1,2,16,1,256,2.0,4.0,4.0,0.5,1.0,0.5,,false -V00178,8,2.0,1,16,16,1,2,16,1,256,4.0,1.0,1.0,0.5,1.0,0.5,,false -V00179,8,2.0,1,16,16,1,2,16,1,256,4.0,1.0,2.0,0.5,1.0,0.5,,false -V00180,8,2.0,1,16,16,1,2,16,1,256,4.0,1.0,4.0,0.5,1.0,0.5,,false -V00181,8,2.0,1,16,16,1,2,16,1,256,4.0,2.0,2.0,0.5,1.0,0.5,,false -V00182,8,2.0,1,16,16,1,2,16,1,256,4.0,2.0,4.0,0.5,1.0,0.5,,false -V00183,8,2.0,1,16,16,1,2,16,1,256,4.0,4.0,1.0,0.5,1.0,0.5,,false -V00184,8,2.0,1,16,16,1,2,16,1,256,4.0,4.0,2.0,0.5,1.0,0.5,,false -V00185,8,2.0,1,16,16,1,2,16,1,256,4.0,4.0,4.0,0.5,1.0,0.5,,false -V00186,8,2.0,1,8,16,1,2,8,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00187,8,2.0,1,8,16,1,2,8,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00188,8,2.0,1,8,16,1,2,8,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00189,8,2.0,1,8,16,1,2,8,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00190,8,2.0,1,8,16,1,2,16,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00191,8,2.0,1,8,16,1,2,16,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00192,8,2.0,1,8,16,1,2,16,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00193,8,2.0,1,8,16,1,2,24,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00194,8,2.0,1,8,16,1,2,24,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00195,8,2.0,1,8,16,1,2,24,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00196,8,2.0,1,8,16,1,2,24,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00197,8,2.0,1,8,16,1,2,32,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00198,8,2.0,1,8,16,1,2,32,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00199,8,2.0,1,8,16,1,2,32,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00200,8,2.0,1,8,16,1,2,32,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00201,8,2.0,1,16,16,1,2,8,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00202,8,2.0,1,16,16,1,2,8,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00203,8,2.0,1,16,16,1,2,8,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00204,8,2.0,1,16,16,1,2,24,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00205,8,2.0,1,16,16,1,2,24,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00206,8,2.0,1,16,16,1,2,24,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00207,8,2.0,1,16,16,1,2,32,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00208,8,2.0,1,16,16,1,2,32,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00209,8,2.0,1,16,16,1,2,32,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00210,8,2.0,1,24,16,1,2,8,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00211,8,2.0,1,24,16,1,2,8,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00212,8,2.0,1,24,16,1,2,8,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00213,8,2.0,1,24,16,1,2,8,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00214,8,2.0,1,24,16,1,2,16,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00215,8,2.0,1,24,16,1,2,16,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00216,8,2.0,1,24,16,1,2,16,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00217,8,2.0,1,24,16,1,2,24,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00218,8,2.0,1,24,16,1,2,24,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00219,8,2.0,1,24,16,1,2,24,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00220,8,2.0,1,24,16,1,2,24,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00221,8,2.0,1,24,16,1,2,32,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00222,8,2.0,1,24,16,1,2,32,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00223,8,2.0,1,24,16,1,2,32,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00224,8,2.0,1,24,16,1,2,32,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00225,8,2.0,1,32,16,1,2,8,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00226,8,2.0,1,32,16,1,2,8,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00227,8,2.0,1,32,16,1,2,8,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00228,8,2.0,1,32,16,1,2,8,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false 
-V00229,8,2.0,1,32,16,1,2,16,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00230,8,2.0,1,32,16,1,2,16,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00231,8,2.0,1,32,16,1,2,16,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00232,8,2.0,1,32,16,1,2,24,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00233,8,2.0,1,32,16,1,2,24,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00234,8,2.0,1,32,16,1,2,24,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00235,8,2.0,1,32,16,1,2,24,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00236,8,2.0,1,32,16,1,2,32,1,128,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00237,8,2.0,1,32,16,1,2,32,1,256,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00238,8,2.0,1,32,16,1,2,32,1,512,1.0,2.0,1.0,0.5,1.0,0.5,,false -V00239,8,2.0,1,32,16,1,2,32,1,768,1.0,2.0,1.0,0.5,1.0,0.5,,false diff --git a/nemo_retriever/src/nemo_retriever/__init__.py b/nemo_retriever/src/nemo_retriever/__init__.py index 2d0dbc01b..e4b32d309 100644 --- a/nemo_retriever/src/nemo_retriever/__init__.py +++ b/nemo_retriever/src/nemo_retriever/__init__.py @@ -2,7 +2,7 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Retriever application package.""" +"""NeMo Retriever application package.""" from __future__ import annotations diff --git a/nemo_retriever/src/nemo_retriever/__main__.py b/nemo_retriever/src/nemo_retriever/__main__.py deleted file mode 100644 index 652df9193..000000000 --- a/nemo_retriever/src/nemo_retriever/__main__.py +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from .adapters.cli.main import app, main - -__all__ = ["app", "main"] diff --git a/nemo_retriever/src/nemo_retriever/_build_info.py b/nemo_retriever/src/nemo_retriever/_build_info.py deleted file mode 100644 index c446b4406..000000000 --- a/nemo_retriever/src/nemo_retriever/_build_info.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -"""Build metadata written by CI before packaging.""" - -BUILD_GIT_SHA = "unknown" -BUILD_DATE = "unknown" diff --git a/nemo_retriever/src/nemo_retriever/chart/chart_detection.py b/nemo_retriever/src/nemo_retriever/chart/chart_detection.py index 76578a5ad..10cf19512 100644 --- a/nemo_retriever/src/nemo_retriever/chart/chart_detection.py +++ b/nemo_retriever/src/nemo_retriever/chart/chart_detection.py @@ -14,6 +14,7 @@ import pandas as pd from nemo_retriever.nim.nim import invoke_image_inference_batches from nemo_retriever.params import RemoteRetryParams +from nemo_retriever.utils.detection import prediction_to_detections try: import numpy as np @@ -58,64 +59,6 @@ def _decode_b64_image_to_chw_tensor(image_b64: str) -> Tuple["torch.Tensor", Tup return t, (int(h), int(w)) -def _crop_b64_image_by_norm_bbox( - page_image_b64: str, - *, - bbox_xyxy_norm: Sequence[float], - image_format: str = "png", -) -> Tuple[Optional[str], Optional[Tuple[int, int]]]: - """ - Crop a base64-encoded RGB image by a normalized xyxy bbox. 
- - Returns: - - cropped_image_b64 (png) or None - - cropped_shape_hw (H,W) or None - """ - if Image is None: # pragma: no cover - raise ImportError("Cropping requires pillow.") - if not isinstance(page_image_b64, str) or not page_image_b64: - return None, None - try: - x1n, y1n, x2n, y2n = [float(x) for x in bbox_xyxy_norm] - except Exception: - return None, None - - try: - raw = base64.b64decode(page_image_b64) - with Image.open(io.BytesIO(raw)) as im0: - im = im0.convert("RGB") - w, h = im.size - if w <= 1 or h <= 1: - return None, None - - def _clamp_int(v: float, lo: int, hi: int) -> int: - if v != v: # NaN - return lo - return int(min(max(v, float(lo)), float(hi))) - - x1 = _clamp_int(x1n * w, 0, w) - x2 = _clamp_int(x2n * w, 0, w) - y1 = _clamp_int(y1n * h, 0, h) - y2 = _clamp_int(y2n * h, 0, h) - - if x2 <= x1 or y2 <= y1: - return None, None - - crop = im.crop((x1, y1, x2, y2)) - cw, ch = crop.size - if cw <= 1 or ch <= 1: - return None, None - - buf = io.BytesIO() - fmt = str(image_format or "png").lower() - if fmt not in {"png"}: - fmt = "png" - crop.save(buf, format=fmt.upper()) - return base64.b64encode(buf.getvalue()).decode("ascii"), (int(ch), int(cw)) - except Exception: - return None, None - - def _labels_from_model(model: Any) -> List[str]: try: labels = getattr(getattr(model, "_model", None), "labels", None) @@ -136,107 +79,6 @@ def _labels_from_model(model: Any) -> List[str]: return [] -def _prediction_to_detections(pred: Any, *, label_names: List[str]) -> List[Dict[str, Any]]: - if torch is None: # pragma: no cover - raise ImportError("torch required for prediction parsing.") - - boxes = labels = scores = None - if isinstance(pred, dict): - # IMPORTANT: do not use `or` chains here. torch.Tensor truthiness is ambiguous and raises. - def _get_any(d: Dict[str, Any], *keys: str) -> Any: - for k in keys: - if k in d: - v = d.get(k) - if v is not None: - return v - return None - - boxes = _get_any(pred, "boxes", "bboxes", "bbox", "box") - labels = _get_any(pred, "labels", "classes", "class_ids", "class") - scores = _get_any(pred, "scores", "conf", "confidences", "score") - elif isinstance(pred, (list, tuple)) and len(pred) >= 3: - boxes, labels, scores = pred[0], pred[1], pred[2] - - if boxes is None or labels is None: - return [] - - def _to_tensor(x: Any) -> Optional["torch.Tensor"]: - if x is None: - return None - if isinstance(x, torch.Tensor): - return x.detach().cpu() - try: - return torch.as_tensor(x).detach().cpu() - except Exception: - return None - - # Handle string labels (e.g. NIM returns ["chart_title", "xlabel", ...]). - # torch.as_tensor cannot convert strings, so handle them before tensor conversion. 
- _string_labels: Optional[List[str]] = None - if isinstance(labels, (list, tuple)) and labels and isinstance(labels[0], str): - _string_labels = [str(x) for x in labels] - - b = _to_tensor(boxes) - labels_t = _to_tensor(labels) if _string_labels is None else None - s = _to_tensor(scores) if scores is not None else None - if b is None: - return [] - if labels_t is None and _string_labels is None: - return [] - - if b.ndim != 2 or int(b.shape[-1]) != 4: - return [] - if labels_t is not None: - if labels_t.ndim == 2 and int(labels_t.shape[-1]) == 1: - labels_t = labels_t.squeeze(-1) - if labels_t.ndim != 1: - return [] - - n_labels = len(_string_labels) if _string_labels is not None else int(labels_t.shape[0]) - n = int(min(b.shape[0], n_labels)) - dets: List[Dict[str, Any]] = [] - for i in range(n): - try: - x1, y1, x2, y2 = [float(x) for x in b[i].tolist()] - except Exception: - continue - - if _string_labels is not None: - label_i = i - label_name = _string_labels[i] - else: - label_i: Optional[int] - try: - label_i = int(labels_t[i].item()) - except Exception: - label_i = None - - label_name = None - if label_i is not None and 0 <= label_i < len(label_names): - label_name = label_names[label_i] - if not label_name: - label_name = f"label_{label_i}" if label_i is not None else "unknown" - - score_f: Optional[float] - if s is not None and s.ndim >= 1 and int(s.shape[0]) > i: - try: - score_f = float(s[i].item()) - except Exception: - score_f = None - else: - score_f = None - - dets.append( - { - "bbox_xyxy_norm": [x1, y1, x2, y2], - "label": label_i, - "label_name": str(label_name), - "score": score_f, - } - ) - return dets - - def _counts_by_label(detections: Sequence[Dict[str, Any]]) -> Dict[str, int]: out: Dict[str, int] = {} for d in detections: @@ -354,11 +196,11 @@ def graphic_elements_ocr_page_elements( Original columns plus ``chart`` and ``graphic_elements_ocr_v1``. """ from nemo_retriever.ocr.ocr import ( - _blocks_to_text, - _crop_all_from_page, - _extract_remote_ocr_item, - _np_rgb_to_b64_png, - _parse_ocr_result, + blocks_to_text, + crop_all_from_page, + extract_remote_ocr_item, + np_rgb_to_b64_png, + parse_ocr_result, ) from nemo_retriever.util.table_and_chart import join_graphic_elements_and_ocr_output @@ -413,7 +255,7 @@ def graphic_elements_ocr_page_elements( continue # --- Crop all chart detections --- - crops = _crop_all_from_page(page_image_b64, dets, {"chart"}) + crops = crop_all_from_page(page_image_b64, dets, {"chart"}) if not crops: all_chart.append(chart_items) @@ -422,7 +264,7 @@ def graphic_elements_ocr_page_elements( # Pre-compute base64 encodings once for remote paths. 
crop_b64s = ( - [_np_rgb_to_b64_png(crop_array) for _, _, crop_array in crops] + [np_rgb_to_b64_png(crop_array) for _, _, crop_array in crops] if (use_remote_ge or use_remote_ocr) else [] ) @@ -457,7 +299,7 @@ def graphic_elements_ocr_page_elements( if isinstance(pre, torch.Tensor) and pre.ndim == 3: pre = pre.unsqueeze(0) pred = graphic_elements_model.invoke(pre, (h, w)) - ge_dets = _prediction_to_detections(pred, label_names=label_names) + ge_dets = prediction_to_detections(pred, label_names=label_names) ge_results.append(ge_dets) # --- Run OCR on all crops --- @@ -476,7 +318,7 @@ def graphic_elements_ocr_page_elements( if len(ocr_response_items) != len(crops): raise RuntimeError(f"Expected {len(crops)} OCR responses, got {len(ocr_response_items)}") for resp in ocr_response_items: - ocr_results.append(_extract_remote_ocr_item(resp)) + ocr_results.append(extract_remote_ocr_item(resp)) else: for _, _, crop_array in crops: ocr_results.append(ocr_model.invoke(crop_array, merge_level="word")) @@ -492,8 +334,8 @@ def graphic_elements_ocr_page_elements( # Fallback: if no GE detections matched, use OCR-only text. if not text: - blocks = _parse_ocr_result(ocr_preds) - text = _blocks_to_text(blocks) + blocks = parse_ocr_result(ocr_preds) + text = blocks_to_text(blocks) chart_items.append({"bbox_xyxy_norm": bbox, "text": text}) diff --git a/nemo_retriever/src/nemo_retriever/chart/config.py b/nemo_retriever/src/nemo_retriever/chart/config.py index 520631183..04a93a351 100644 --- a/nemo_retriever/src/nemo_retriever/chart/config.py +++ b/nemo_retriever/src/nemo_retriever/chart/config.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Any, Dict -from nemo_retriever.config_utils import endpoints_from_yaml +from nemo_retriever.utils.config_utils import endpoints_from_yaml from nv_ingest_api.internal.schemas.extract.extract_chart_schema import ChartExtractorSchema diff --git a/nemo_retriever/src/nemo_retriever/config/__init__.py b/nemo_retriever/src/nemo_retriever/config/__init__.py deleted file mode 100644 index 23432d208..000000000 --- a/nemo_retriever/src/nemo_retriever/config/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .loader import load_config_file, load_config_section, resolve_config_path - -__all__ = ["load_config_file", "load_config_section", "resolve_config_path"] diff --git a/nemo_retriever/src/nemo_retriever/config/loader.py b/nemo_retriever/src/nemo_retriever/config/loader.py deleted file mode 100644 index a21532d33..000000000 --- a/nemo_retriever/src/nemo_retriever/config/loader.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Dict, Optional, Tuple - -from nemo_retriever.ingest_config import ( - load_ingest_config_file, - load_ingest_config_section, - resolve_ingest_config_path, -) - - -def resolve_config_path(explicit: Optional[Path]) -> Tuple[Optional[Path], str]: - return resolve_ingest_config_path(explicit) - - -def load_config_file(explicit: Optional[Path], *, verbose: bool = True) -> Tuple[Dict[str, Any], Optional[Path], str]: - return load_ingest_config_file(explicit, verbose=verbose) - - -def load_config_section( - explicit: Optional[Path], - *, - section: str, - verbose: bool = True, - warn_if_missing_section: bool = True, -) -> Dict[str, Any]: - return load_ingest_config_section( - explicit, - section=section, - verbose=verbose, - warn_if_missing_section=warn_if_missing_section, - ) diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index b7faf73d5..28c0da622 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -166,19 +166,6 @@ def _write_detection_summary(path: Path, summary: Optional[dict]) -> None: target.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") -def _print_pages_per_second(processed_pages: Optional[int], ingest_elapsed_s: float) -> None: - if ingest_elapsed_s <= 0: - print("Pages/sec: unavailable (ingest elapsed time was non-positive).") - return - if processed_pages is None: - print("Pages/sec: unavailable (could not estimate processed pages). " f"Ingest time: {ingest_elapsed_s:.2f}s") - return - - pps = processed_pages / ingest_elapsed_s - print(f"Pages processed: {processed_pages}") - print(f"Pages/sec (ingest only; excludes Ray startup and recall): {pps:.2f}") - - def _count_materialized_rows(dataset: object) -> int: """Count rows from a materialized Ray Dataset without relying on ``len()``.""" count = getattr(dataset, "count", None) diff --git a/nemo_retriever/src/nemo_retriever/harness/nightly.py b/nemo_retriever/src/nemo_retriever/harness/nightly.py index cfc07f2d5..b8c50469c 100644 --- a/nemo_retriever/src/nemo_retriever/harness/nightly.py +++ b/nemo_retriever/src/nemo_retriever/harness/nightly.py @@ -11,7 +11,7 @@ from nemo_retriever.harness.artifacts import write_session_summary from nemo_retriever.harness.config import DEFAULT_NIGHTLY_CONFIG_PATH, load_nightly_config -from nemo_retriever.harness.run import _normalize_tags, execute_runs +from nemo_retriever.harness.run import normalize_tags, execute_runs from nemo_retriever.harness.slack import load_replay_report, load_session_report, post_report_to_slack @@ -58,7 +58,7 @@ def nightly_command( ), dry_run: bool = typer.Option(False, "--dry-run", help="Print nightly run plan without executing."), ) -> None: - normalized_tags = _normalize_tags(tag) + normalized_tags = normalize_tags(tag) nightly_cfg = load_nightly_config(runs_config) runs = nightly_cfg["runs"] slack_config = nightly_cfg["slack"] diff --git a/nemo_retriever/src/nemo_retriever/harness/run.py b/nemo_retriever/src/nemo_retriever/harness/run.py index f0d10509c..c76cd03a6 100644 --- a/nemo_retriever/src/nemo_retriever/harness/run.py +++ b/nemo_retriever/src/nemo_retriever/harness/run.py @@ -91,7 +91,7 @@ def _collect_run_metadata() -> dict[str, Any]: } -def _normalize_tags(tags: list[str] | None) -> list[str]: +def normalize_tags(tags: 
list[str] | None) -> list[str]: normalized: list[str] = [] seen: set[str] = set() @@ -466,7 +466,7 @@ def _run_entry( artifact_dir.mkdir(parents=True, exist_ok=True) resolved_run_name = run_name or cfg.dataset_label - normalized_tags = _normalize_tags(tags) + normalized_tags = normalize_tags(tags) result = _run_single(cfg, artifact_dir, run_id=resolved_run_name, tags=normalized_tags) run_result = { "run_name": resolved_run_name, @@ -548,7 +548,7 @@ def sweep_command( tag: list[str] = typer.Option([], "--tag", help="Session tag to persist on each run. Repeatable."), dry_run: bool = typer.Option(False, "--dry-run", help="Print run plan without executing."), ) -> None: - normalized_tags = _normalize_tags(tag) + normalized_tags = normalize_tags(tag) runs = load_runs_config(runs_config) if dry_run: typer.echo("Sweep dry run:") diff --git a/nemo_retriever/src/nemo_retriever/html/convert.py b/nemo_retriever/src/nemo_retriever/html/convert.py index a4968a670..222f84af3 100644 --- a/nemo_retriever/src/nemo_retriever/html/convert.py +++ b/nemo_retriever/src/nemo_retriever/html/convert.py @@ -23,7 +23,7 @@ DEFAULT_TOKENIZER_MODEL_ID, split_text_by_tokens, ) -from ..txt.split import _get_tokenizer as _get_txt_tokenizer +from ..txt.split import get_tokenizer as _get_txt_tokenizer def html_to_markdown(html_content: Union[str, bytes, Path]) -> str: diff --git a/nemo_retriever/src/nemo_retriever/infographic/config.py b/nemo_retriever/src/nemo_retriever/infographic/config.py index f44495890..48ae71ac0 100644 --- a/nemo_retriever/src/nemo_retriever/infographic/config.py +++ b/nemo_retriever/src/nemo_retriever/infographic/config.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Any, Dict -from nemo_retriever.config_utils import endpoints_from_yaml +from nemo_retriever.utils.config_utils import endpoints_from_yaml from nv_ingest_api.internal.schemas.extract.extract_infographic_schema import InfographicExtractorSchema diff --git a/nemo_retriever/src/nemo_retriever/infographic/infographic_detection.py b/nemo_retriever/src/nemo_retriever/infographic/infographic_detection.py index 84ca7058f..acd9fa2f0 100644 --- a/nemo_retriever/src/nemo_retriever/infographic/infographic_detection.py +++ b/nemo_retriever/src/nemo_retriever/infographic/infographic_detection.py @@ -22,6 +22,8 @@ import pandas as pd from nemo_retriever.params import RemoteRetryParams from nemo_retriever.nim.nim import invoke_image_inference_batches +from nemo_retriever.ocr.ocr import crop_b64_image_by_norm_bbox +from nemo_retriever.utils.detection import prediction_to_detections try: import numpy as np @@ -66,64 +68,6 @@ def _decode_b64_image_to_chw_tensor(image_b64: str) -> Tuple["torch.Tensor", Tup return t, (int(h), int(w)) -def _crop_b64_image_by_norm_bbox( - page_image_b64: str, - *, - bbox_xyxy_norm: Sequence[float], - image_format: str = "png", -) -> Tuple[Optional[str], Optional[Tuple[int, int]]]: - """ - Crop a base64-encoded RGB image by a normalized xyxy bbox. 
- - Returns: - - cropped_image_b64 (png) or None - - cropped_shape_hw (H,W) or None - """ - if Image is None: # pragma: no cover - raise ImportError("Cropping requires pillow.") - if not isinstance(page_image_b64, str) or not page_image_b64: - return None, None - try: - x1n, y1n, x2n, y2n = [float(x) for x in bbox_xyxy_norm] - except Exception: - return None, None - - try: - raw = base64.b64decode(page_image_b64) - with Image.open(io.BytesIO(raw)) as im0: - im = im0.convert("RGB") - w, h = im.size - if w <= 1 or h <= 1: - return None, None - - def _clamp_int(v: float, lo: int, hi: int) -> int: - if v != v: # NaN - return lo - return int(min(max(v, float(lo)), float(hi))) - - x1 = _clamp_int(x1n * w, 0, w) - x2 = _clamp_int(x2n * w, 0, w) - y1 = _clamp_int(y1n * h, 0, h) - y2 = _clamp_int(y2n * h, 0, h) - - if x2 <= x1 or y2 <= y1: - return None, None - - crop = im.crop((x1, y1, x2, y2)) - cw, ch = crop.size - if cw <= 1 or ch <= 1: - return None, None - - buf = io.BytesIO() - fmt = str(image_format or "png").lower() - if fmt not in {"png"}: - fmt = "png" - crop.save(buf, format=fmt.upper()) - return base64.b64encode(buf.getvalue()).decode("ascii"), (int(ch), int(cw)) - except Exception: - return None, None - - def _labels_from_model(model: Any) -> List[str]: try: labels = getattr(getattr(model, "_model", None), "labels", None) @@ -144,93 +88,6 @@ def _labels_from_model(model: Any) -> List[str]: return [] -def _prediction_to_detections(pred: Any, *, label_names: List[str]) -> List[Dict[str, Any]]: - if torch is None: # pragma: no cover - raise ImportError("torch required for prediction parsing.") - - boxes = labels = scores = None - if isinstance(pred, dict): - # IMPORTANT: do not use `or` chains here. torch.Tensor truthiness is ambiguous and raises. 
- def _get_any(d: Dict[str, Any], *keys: str) -> Any: - for k in keys: - if k in d: - v = d.get(k) - if v is not None: - return v - return None - - boxes = _get_any(pred, "boxes", "bboxes", "bbox", "box") - labels = _get_any(pred, "labels", "classes", "class_ids", "class") - scores = _get_any(pred, "scores", "conf", "confidences", "score") - elif isinstance(pred, (list, tuple)) and len(pred) >= 3: - boxes, labels, scores = pred[0], pred[1], pred[2] - - if boxes is None or labels is None: - return [] - - def _to_tensor(x: Any) -> Optional["torch.Tensor"]: - if x is None: - return None - if isinstance(x, torch.Tensor): - return x.detach().cpu() - try: - return torch.as_tensor(x).detach().cpu() - except Exception: - return None - - b = _to_tensor(boxes) - l = _to_tensor(labels) # noqa: E741 - s = _to_tensor(scores) if scores is not None else None - if b is None or l is None: - return [] - - if b.ndim != 2 or int(b.shape[-1]) != 4: - return [] - if l.ndim == 2 and int(l.shape[-1]) == 1: - l = l.squeeze(-1) # noqa: E741 - if l.ndim != 1: - return [] - - n = int(min(b.shape[0], l.shape[0])) - dets: List[Dict[str, Any]] = [] - for i in range(n): - try: - x1, y1, x2, y2 = [float(x) for x in b[i].tolist()] - except Exception: - continue - - label_i: Optional[int] - try: - label_i = int(l[i].item()) - except Exception: - label_i = None - - score_f: Optional[float] - if s is not None and s.ndim >= 1 and int(s.shape[0]) > i: - try: - score_f = float(s[i].item()) - except Exception: - score_f = None - else: - score_f = None - - label_name = None - if label_i is not None and 0 <= label_i < len(label_names): - label_name = label_names[label_i] - if not label_name: - label_name = f"label_{label_i}" if label_i is not None else "unknown" - - dets.append( - { - "bbox_xyxy_norm": [x1, y1, x2, y2], - "label": label_i, - "label_name": str(label_name), - "score": score_f, - } - ) - return dets - - def _extract_remote_pred_item(response_item: Any) -> Any: if isinstance(response_item, dict): for k in ("prediction", "predictions", "output", "outputs", "data"): @@ -344,7 +201,7 @@ def detect_infographic_elements_v1( raise RuntimeError(f"Expected {len(valid)} remote predictions, got {len(response_items)}") for local_j, row_i in enumerate(valid): pred_item = _extract_remote_pred_item(response_items[local_j]) - dets = _prediction_to_detections(pred_item, label_names=label_names) + dets = prediction_to_detections(pred_item, label_names=label_names) payloads[row_i] = {"detections": dets, "timing": {"seconds": float(elapsed)}, "error": None} except BaseException as e: elapsed = time.perf_counter() - t0 @@ -393,7 +250,7 @@ def detect_infographic_elements_v1( if len(preds_list) != len(idxs): raise RuntimeError("Batched invoke returned unexpected output shape; falling back to per-image calls.") for local_j, row_i in enumerate(idxs): - dets = _prediction_to_detections(preds_list[local_j], label_names=label_names) + dets = prediction_to_detections(preds_list[local_j], label_names=label_names) payloads[row_i] = {"detections": dets, "timing": {"seconds": float(elapsed)}, "error": None} except BaseException: for local_j, row_i in enumerate(idxs): @@ -411,7 +268,7 @@ def detect_infographic_elements_v1( if isinstance(pre, torch.Tensor) and pre.ndim == 3: pre = pre.unsqueeze(0) pred = model.invoke(pre, sh) - dets = _prediction_to_detections(pred, label_names=label_names) + dets = prediction_to_detections(pred, label_names=label_names) payloads[row_i] = { "detections": dets, "timing": {"seconds": float(time.perf_counter() - t1)}, @@ 
-541,7 +398,7 @@ def detect_infographic_elements_v1_from_page_elements_v3( if not isinstance(bbox, (list, tuple)) or len(bbox) != 4: continue - crop_b64, crop_shape_hw = _crop_b64_image_by_norm_bbox( + crop_b64, crop_shape_hw = crop_b64_image_by_norm_bbox( page_image_b64, bbox_xyxy_norm=cast(Sequence[float], bbox) ) if not crop_b64 or crop_shape_hw is None: @@ -587,7 +444,7 @@ def detect_infographic_elements_v1_from_page_elements_v3( raise RuntimeError(f"Expected {len(crop_b64s)} remote predictions, got {len(response_items)}") for resp in response_items: pred_item = _extract_remote_pred_item(resp) - dets = _prediction_to_detections(pred_item, label_names=label_names) + dets = prediction_to_detections(pred_item, label_names=label_names) crop_payloads.append({"detections": dets, "timing": {"seconds": float(elapsed)}, "error": None}) except BaseException as e: elapsed = time.perf_counter() - t0 @@ -650,7 +507,7 @@ def detect_infographic_elements_v1_from_page_elements_v3( "Batched invoke returned unexpected output shape; falling back to per-image calls." ) for local_j, crop_i in enumerate(idxs): - dets = _prediction_to_detections(preds_list[local_j], label_names=label_names) + dets = prediction_to_detections(preds_list[local_j], label_names=label_names) crop_payloads[crop_i] = { "detections": dets, "timing": {"seconds": float(elapsed)}, @@ -672,7 +529,7 @@ def detect_infographic_elements_v1_from_page_elements_v3( if isinstance(pre, torch.Tensor) and pre.ndim == 3: pre = pre.unsqueeze(0) pred = model.invoke(pre, sh) - dets = _prediction_to_detections(pred, label_names=label_names) + dets = prediction_to_detections(pred, label_names=label_names) crop_payloads[crop_i] = { "detections": dets, "timing": {"seconds": float(time.perf_counter() - t1)}, diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py index 1557c81ad..703a51fde 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py @@ -142,7 +142,7 @@ def __call__(self, batch_df: Any) -> Any: return batch_df -class _BatchEmbedActor: +class BatchEmbedActor: """Ray Data actor that holds a local text embedder on a single GPU. When ``embedding_endpoint`` is provided in kwargs, the actor skips local @@ -754,7 +754,7 @@ def embed( embed_actor_num_gpus = self._requested_plan.get_embed_gpus_per_actor() self._rd_dataset = self._rd_dataset.map_batches( - _BatchEmbedActor, + BatchEmbedActor, batch_size=self._requested_plan.get_embed_batch_size(), batch_format="pandas", num_gpus=embed_actor_num_gpus, # pulled from if statement above diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py b/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py index 68c33ad58..ff580a087 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py @@ -28,7 +28,7 @@ from ..params import EmbedParams from ..params import ExtractParams from ..params import PdfSplitParams -from .batch import _BatchEmbedActor +from .batch import BatchEmbedActor from .batch import BatchIngestor from .inprocess import collapse_content_to_page_rows from .inprocess import embed_text_main_text_embed @@ -233,7 +233,7 @@ def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "FusedInges Run page-elements + OCR + explode + embed in one GPU actor stage. `fused` mode intentionally does not support remote NIM invocation. 
- When _pipeline_type == "audio", uses explode + _BatchEmbedActor (no PDF stages). + When _pipeline_type == "audio", uses explode + BatchEmbedActor (no PDF stages). """ resolved = params or EmbedParams(**kwargs) if params is not None and kwargs: @@ -262,7 +262,7 @@ def embed(self, params: EmbedParams | None = None, **kwargs: Any) -> "FusedInges num_gpus=0, ) self._rd_dataset = self._rd_dataset.map_batches( - _BatchEmbedActor, + BatchEmbedActor, batch_size=embed_batch_size, batch_format="pandas", num_cpus=embed_cpus_per_actor, diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py index dd2854ae7..83b337f02 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py @@ -30,7 +30,7 @@ from nemo_retriever.model.local import NemotronOCRV1, NemotronPageElementsV3, NemotronParseV12 from nemo_retriever.chart.chart_detection import graphic_elements_ocr_page_elements from nemo_retriever.page_elements import detect_page_elements_v3 -from nemo_retriever.ocr.ocr import _crop_b64_image_by_norm_bbox, nemotron_parse_page_elements, ocr_page_elements +from nemo_retriever.ocr.ocr import crop_b64_image_by_norm_bbox, nemotron_parse_page_elements, ocr_page_elements from nemo_retriever.table.table_detection import table_structure_ocr_page_elements from nemo_retriever.text_embed.main_text_embed import TextEmbeddingConfig, create_text_embeddings_for_df @@ -57,7 +57,7 @@ from ..params import TextChunkParams from ..params import VdbUploadParams from ..pdf.extract import pdf_extraction -from ..pdf.split import _split_pdf_to_single_page_bytes, pdf_path_to_pages_df +from ..pdf.split import split_pdf_to_single_page_bytes, pdf_path_to_pages_df from ..txt import txt_file_to_chunks_df from ..html import html_file_to_chunks_df @@ -207,7 +207,7 @@ def explode_content_to_rows( if struct_mod in IMAGE_MODALITIES and page_image_b64: bbox = item.get("bbox_xyxy_norm") if bbox and len(bbox) == 4: - cropped_b64, _ = _crop_b64_image_by_norm_bbox(page_image_b64, bbox_xyxy_norm=bbox) + cropped_b64, _ = crop_b64_image_by_norm_bbox(page_image_b64, bbox_xyxy_norm=bbox) content_row["_image_b64"] = cropped_b64 else: content_row["_image_b64"] = page_image_b64 @@ -550,7 +550,7 @@ def pages_df_from_pdf_bytes(pdf_bytes: Union[bytes, bytearray], source_path: str Used by the online ingest mode to run the same pipeline on document bytes received via REST. Columns: bytes, path, page_number. 
""" - pages = _split_pdf_to_single_page_bytes(pdf_bytes) + pages = split_pdf_to_single_page_bytes(pdf_bytes) out_rows = [{"bytes": b, "path": source_path, "page_number": i + 1} for i, b in enumerate(pages)] return pd.DataFrame(out_rows) @@ -1649,7 +1649,7 @@ def _loader(p: str) -> pd.DataFrame: with open(abs_path, "rb") as f: file_bytes = f.read() pdf_bytes = convert_to_pdf_bytes(file_bytes, ext) - pages = _split_pdf_to_single_page_bytes(pdf_bytes) + pages = split_pdf_to_single_page_bytes(pdf_bytes) out_rows = [{"bytes": b, "path": abs_path, "page_number": i + 1} for i, b in enumerate(pages)] return pd.DataFrame(out_rows) except BaseException as e: diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_parse_v1_2.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_parse_v1_2.py index 07e47ddcd..93467887c 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_parse_v1_2.py +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_parse_v1_2.py @@ -37,7 +37,7 @@ def __init__( self._model_path = model_path self._task_prompt = task_prompt - self._device = torch.device(device or ("cuda:0" if torch.cuda.is_available() else "cpu")) + self._device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu")) self._dtype = torch.bfloat16 if self._device.type == "cuda" else torch.float32 hf_cache_dir = configure_global_hf_cache_base(hf_cache_dir) _revision = get_hf_revision(self._model_path) diff --git a/nemo_retriever/src/nemo_retriever/ocr/__init__.py b/nemo_retriever/src/nemo_retriever/ocr/__init__.py index 49e6be1a4..bdffed321 100644 --- a/nemo_retriever/src/nemo_retriever/ocr/__init__.py +++ b/nemo_retriever/src/nemo_retriever/ocr/__init__.py @@ -2,6 +2,26 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from .ocr import OCRActor, ocr_page_elements +from .ocr import ( + OCRActor, + blocks_to_pseudo_markdown, + blocks_to_text, + crop_all_from_page, + crop_b64_image_by_norm_bbox, + extract_remote_ocr_item, + np_rgb_to_b64_png, + ocr_page_elements, + parse_ocr_result, +) -__all__ = ["OCRActor", "ocr_page_elements"] +__all__ = [ + "OCRActor", + "blocks_to_pseudo_markdown", + "blocks_to_text", + "crop_all_from_page", + "crop_b64_image_by_norm_bbox", + "extract_remote_ocr_item", + "np_rgb_to_b64_png", + "ocr_page_elements", + "parse_ocr_result", +] diff --git a/nemo_retriever/src/nemo_retriever/ocr/ocr.py b/nemo_retriever/src/nemo_retriever/ocr/ocr.py index a99955f24..b2ca71c0d 100644 --- a/nemo_retriever/src/nemo_retriever/ocr/ocr.py +++ b/nemo_retriever/src/nemo_retriever/ocr/ocr.py @@ -48,7 +48,7 @@ def _error_payload(*, stage: str, exc: BaseException) -> Dict[str, Any]: } -def _crop_b64_image_by_norm_bbox( +def crop_b64_image_by_norm_bbox( page_image_b64: str, *, bbox_xyxy_norm: Sequence[float], @@ -110,7 +110,7 @@ def _clamp_int(v: float, lo: int, hi: int) -> int: return None, None -def _crop_all_from_page( +def crop_all_from_page( page_image_b64: str, detections: List[Dict[str, Any]], wanted_labels: set, @@ -188,7 +188,7 @@ def _clamp_int(v: float, lo: int, hi: int) -> int: return results -def _np_rgb_to_b64_png(crop_array: np.ndarray) -> str: +def np_rgb_to_b64_png(crop_array: np.ndarray) -> str: if Image is None: # pragma: no cover raise ImportError("Pillow is required for image encoding.") img = Image.fromarray(crop_array.astype(np.uint8), mode="RGB") @@ -197,7 +197,7 @@ def _np_rgb_to_b64_png(crop_array: np.ndarray) -> str: return base64.b64encode(buf.getvalue()).decode("ascii") -def _extract_remote_ocr_item(response_item: 
Any) -> Any: +def extract_remote_ocr_item(response_item: Any) -> Any: if isinstance(response_item, dict): # NIM text_detections format: return full list (not v[0]) td = response_item.get("text_detections") @@ -212,7 +212,7 @@ def _extract_remote_ocr_item(response_item: Any) -> Any: return response_item -def _parse_ocr_result(preds: Any) -> List[Dict[str, Any]]: +def parse_ocr_result(preds: Any) -> List[Dict[str, Any]]: """ Parse the output of ``NemotronOCRV1.invoke()`` into a flat list of ``{"text": str, "sort_y": float, "sort_x": float}`` blocks. @@ -317,13 +317,13 @@ def _parse_ocr_result(preds: Any) -> List[Dict[str, Any]]: return blocks -def _blocks_to_text(blocks: List[Dict[str, Any]]) -> str: +def blocks_to_text(blocks: List[Dict[str, Any]]) -> str: """Sort text blocks by reading order (y then x) and join with newlines.""" blocks.sort(key=lambda b: (b.get("sort_y", 0.0), b.get("sort_x", 0.0))) return "\n".join(b["text"] for b in blocks if b.get("text")) -def _blocks_to_pseudo_markdown(blocks: List[Dict[str, Any]]) -> str: +def blocks_to_pseudo_markdown(blocks: List[Dict[str, Any]]) -> str: """Convert OCR text blocks into pseudo-markdown table format. Uses DBSCAN clustering on y-coordinates to identify rows, then @@ -501,13 +501,13 @@ def ocr_page_elements( continue # --- decode page image once, crop all matching detections --- - crops = _crop_all_from_page(page_image_b64, dets, wanted_labels) + crops = crop_all_from_page(page_image_b64, dets, wanted_labels) if use_remote: crop_b64s: List[str] = [] crop_meta: List[Tuple[str, List[float], Tuple[int, int]]] = [] for label_name, bbox, crop_array in crops: - crop_b64s.append(_np_rgb_to_b64_png(crop_array)) + crop_b64s.append(np_rgb_to_b64_png(crop_array)) crop_meta.append((label_name, bbox, (crop_array.shape[0], crop_array.shape[1]))) if crop_b64s: @@ -525,7 +525,7 @@ def ocr_page_elements( raise RuntimeError(f"Expected {len(crop_meta)} OCR responses, got {len(response_items)}") for i, (label_name, bbox, crop_hw) in enumerate(crop_meta): - preds = _extract_remote_ocr_item(response_items[i]) + preds = extract_remote_ocr_item(response_items[i]) if label_name == "chart" and use_graphic_elements: ge_dets = _find_ge_detections_for_bbox(row, bbox) @@ -535,11 +535,11 @@ def ocr_page_elements( chart_items.append({"bbox_xyxy_norm": bbox, "text": text}) continue - blocks = _parse_ocr_result(preds) + blocks = parse_ocr_result(preds) if label_name == "table": - text = _blocks_to_pseudo_markdown(blocks) or _blocks_to_text(blocks) + text = blocks_to_pseudo_markdown(blocks) or blocks_to_text(blocks) else: - text = _blocks_to_text(blocks) + text = blocks_to_text(blocks) entry = {"bbox_xyxy_norm": bbox, "text": text} if label_name == "table": table_items.append(entry) @@ -572,13 +572,13 @@ def _append_local_result( if text: chart_items.append({"bbox_xyxy_norm": bbox, "text": text}) return - blocks = _parse_ocr_result(preds) + blocks = parse_ocr_result(preds) if label_name == "table": - text = _blocks_to_pseudo_markdown(blocks) + text = blocks_to_pseudo_markdown(blocks) if not text: - text = _blocks_to_text(blocks) + text = blocks_to_text(blocks) else: - text = _blocks_to_text(blocks) + text = blocks_to_text(blocks) entry = {"bbox_xyxy_norm": bbox, "text": text} if label_name == "table": table_items.append(entry) @@ -832,7 +832,7 @@ def nemotron_parse_page_elements( all_meta.append({"timing": None, "error": None}) continue - crops = _crop_all_from_page(page_image_b64, dets, wanted_labels) + crops = crop_all_from_page(page_image_b64, dets, wanted_labels) # 
Parse-only mode may skip page-elements detection entirely. In that # case, parse the full page once and fan out the text to enabled # content channels. @@ -849,7 +849,7 @@ def nemotron_parse_page_elements( crop_b64s: List[str] = [] crop_meta: List[Tuple[str, List[float]]] = [] for label_name, bbox, crop_array in crops: - crop_b64s.append(_np_rgb_to_b64_png(crop_array)) + crop_b64s.append(np_rgb_to_b64_png(crop_array)) crop_meta.append((label_name, bbox)) if crop_b64s: diff --git a/nemo_retriever/src/nemo_retriever/pdf/config.py b/nemo_retriever/src/nemo_retriever/pdf/config.py index f91b0a685..ab831a45d 100644 --- a/nemo_retriever/src/nemo_retriever/pdf/config.py +++ b/nemo_retriever/src/nemo_retriever/pdf/config.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Any, Dict -from nemo_retriever.config_utils import endpoints_from_yaml +from nemo_retriever.utils.config_utils import endpoints_from_yaml from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema diff --git a/nemo_retriever/src/nemo_retriever/pdf/split.py b/nemo_retriever/src/nemo_retriever/pdf/split.py index c750d54fc..ea04069f3 100644 --- a/nemo_retriever/src/nemo_retriever/pdf/split.py +++ b/nemo_retriever/src/nemo_retriever/pdf/split.py @@ -46,7 +46,7 @@ def _error_record( } -def _split_pdf_to_single_page_bytes(pdf_binary: Any) -> List[bytes]: +def split_pdf_to_single_page_bytes(pdf_binary: Any) -> List[bytes]: """ Split a PDF into single-page PDFs (raw bytes) using pypdfium2. """ @@ -103,7 +103,7 @@ def pdf_path_to_pages_df(path: str) -> pd.DataFrame: out_rows: List[Dict[str, Any]] = [] try: raw_bytes = Path(abs_path).read_bytes() - pages = _split_pdf_to_single_page_bytes(raw_bytes) + pages = split_pdf_to_single_page_bytes(raw_bytes) for page_idx, page_bytes in enumerate(pages): out_rows.append( { @@ -141,7 +141,7 @@ def split_pdf_batch(pdf_batch: Any, params: PdfSplitParams | None = None) -> pd. 
if not isinstance(pdf_bytes, (bytes, bytearray, memoryview)): raise ValueError(f"Unsupported bytes payload type: {type(pdf_bytes)!r}") - pages = _split_pdf_to_single_page_bytes(pdf_bytes) + pages = split_pdf_to_single_page_bytes(pdf_bytes) start_idx = 0 if start_page is None else max(int(start_page) - 1, 0) end_idx = (len(pages) - 1) if end_page is None else min(int(end_page) - 1, len(pages) - 1) if len(pages) == 0 or start_idx > end_idx: diff --git a/nemo_retriever/src/nemo_retriever/recall/vdb_recall.py b/nemo_retriever/src/nemo_retriever/recall/vdb_recall.py index f223cbf1a..a2d212fda 100644 --- a/nemo_retriever/src/nemo_retriever/recall/vdb_recall.py +++ b/nemo_retriever/src/nemo_retriever/recall/vdb_recall.py @@ -12,7 +12,7 @@ import pandas as pd # noqa: F401 from rich.console import Console -from .core import RecallConfig, evaluate_recall, retrieve_and_score, _normalize_query_df # noqa: F401 +from .core import RecallConfig, retrieve_and_score app = typer.Typer(help="Embed query CSV rows, search LanceDB, print hits, and compute recall@k.") console = Console() diff --git a/nemo_retriever/src/nemo_retriever/table/config.py b/nemo_retriever/src/nemo_retriever/table/config.py index 412dba1c0..535c4df8b 100644 --- a/nemo_retriever/src/nemo_retriever/table/config.py +++ b/nemo_retriever/src/nemo_retriever/table/config.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Any, Dict -from nemo_retriever.config_utils import endpoints_from_yaml +from nemo_retriever.utils.config_utils import endpoints_from_yaml from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema diff --git a/nemo_retriever/src/nemo_retriever/table/table_detection.py b/nemo_retriever/src/nemo_retriever/table/table_detection.py index 14df8f22c..974ecfefa 100644 --- a/nemo_retriever/src/nemo_retriever/table/table_detection.py +++ b/nemo_retriever/src/nemo_retriever/table/table_detection.py @@ -11,6 +11,7 @@ import pandas as pd from nemo_retriever.params import RemoteRetryParams +from nemo_retriever.utils.detection import prediction_to_detections try: import torch @@ -41,114 +42,6 @@ def _labels_from_model(model: Any) -> List[str]: return [] -def _prediction_to_detections(pred: Any, *, label_names: List[str]) -> List[Dict[str, Any]]: - """ - Best-effort conversion of model output into a standard detection list. - - Produces dicts of the form: - {"bbox_xyxy_norm": [...], "label": int|None, "label_name": str, "score": float|None} - """ - if torch is None: # pragma: no cover - raise ImportError("torch required for prediction parsing.") - - boxes = labels = scores = None - if isinstance(pred, dict): - # IMPORTANT: do not use `or` chains here. torch.Tensor truthiness is ambiguous and raises. - def _get_any(d: Dict[str, Any], *keys: str) -> Any: - for k in keys: - if k in d: - v = d.get(k) - if v is not None: - return v - return None - - boxes = _get_any(pred, "boxes", "bboxes", "bbox", "box") - labels = _get_any(pred, "labels", "classes", "class_ids", "class") - scores = _get_any(pred, "scores", "conf", "confidences", "score") - elif isinstance(pred, (list, tuple)) and len(pred) >= 3: - boxes, labels, scores = pred[0], pred[1], pred[2] - - if boxes is None or labels is None: - return [] - - # Normalize to torch tensors. - def _to_tensor(x: Any) -> Optional["torch.Tensor"]: - if x is None: - return None - if isinstance(x, torch.Tensor): - return x.detach().cpu() - try: - return torch.as_tensor(x).detach().cpu() - except Exception: - return None - - # Handle string labels (e.g. 
NIM returns ["cell", "row", "column", ...]). - # torch.as_tensor cannot convert strings, so handle them before tensor conversion. - _string_labels: Optional[List[str]] = None - if isinstance(labels, (list, tuple)) and labels and isinstance(labels[0], str): - _string_labels = [str(x) for x in labels] - - b = _to_tensor(boxes) - labels_t = _to_tensor(labels) if _string_labels is None else None - s = _to_tensor(scores) if scores is not None else None - if b is None: - return [] - if labels_t is None and _string_labels is None: - return [] - - # Expect boxes (N,4), labels (N,) - if b.ndim != 2 or int(b.shape[-1]) != 4: - return [] - if labels_t is not None: - if labels_t.ndim == 2 and int(labels_t.shape[-1]) == 1: - labels_t = labels_t.squeeze(-1) - if labels_t.ndim != 1: - return [] - - n_labels = len(_string_labels) if _string_labels is not None else int(labels_t.shape[0]) - n = int(min(b.shape[0], n_labels)) - dets: List[Dict[str, Any]] = [] - for i in range(n): - try: - x1, y1, x2, y2 = [float(x) for x in b[i].tolist()] - except Exception: - continue - - if _string_labels is not None: - label_i = i - label_name = _string_labels[i] - else: - try: - label_i = int(labels_t[i].item()) - except Exception: - label_i = None - - label_name = None - if label_i is not None and 0 <= label_i < len(label_names): - label_name = label_names[label_i] - if not label_name: - label_name = f"label_{label_i}" if label_i is not None else "unknown" - - score_f: Optional[float] - if s is not None and s.ndim >= 1 and int(s.shape[0]) > i: - try: - score_f = float(s[i].item()) - except Exception: - score_f = None - else: - score_f = None - - dets.append( - { - "bbox_xyxy_norm": [x1, y1, x2, y2], - "label": label_i, - "label_name": str(label_name), - "score": score_f, - } - ) - return dets - - def _parse_nim_bounding_boxes(response_item: Any) -> List[Dict[str, Any]]: """Parse the ``bounding_boxes`` NIM response format. @@ -255,13 +148,13 @@ def table_structure_ocr_page_elements( """ from nemo_retriever.nim.nim import invoke_image_inference_batches from nemo_retriever.ocr.ocr import ( - _blocks_to_pseudo_markdown, - _crop_all_from_page, - _extract_remote_ocr_item, - _np_rgb_to_b64_png, - _parse_ocr_result, + blocks_to_pseudo_markdown, + crop_all_from_page, + extract_remote_ocr_item, + np_rgb_to_b64_png, + parse_ocr_result, ) - from nemo_retriever.util.table_and_chart import join_table_structure_and_ocr_output + from nemo_retriever.utils.table_and_chart import join_table_structure_and_ocr_output retry = remote_retry or RemoteRetryParams( remote_max_pool_workers=int(kwargs.get("remote_max_pool_workers", 16)), @@ -316,7 +209,7 @@ def table_structure_ocr_page_elements( continue # --- Pass 1: Collect table crops --- - crops = _crop_all_from_page(page_image_b64, dets, {"table"}) + crops = crop_all_from_page(page_image_b64, dets, {"table"}) if not crops: all_table.append(table_items) @@ -325,7 +218,7 @@ def table_structure_ocr_page_elements( # Pre-compute base64 encodings once for remote paths. 
crop_b64s = ( - [_np_rgb_to_b64_png(crop_array) for _, _, crop_array in crops] + [np_rgb_to_b64_png(crop_array) for _, _, crop_array in crops] if (use_remote_ts or use_remote_ocr) else [] ) @@ -350,7 +243,7 @@ def table_structure_ocr_page_elements( parsed = _parse_nim_bounding_boxes(resp) if not parsed: pred_item = _extract_remote_pred_item(resp) - parsed = _prediction_to_detections(pred_item, label_names=label_names) + parsed = prediction_to_detections(pred_item, label_names=label_names) structure_results.append(parsed) else: # Local batched inference. @@ -365,7 +258,7 @@ def table_structure_ocr_page_elements( if isinstance(pre, torch.Tensor) and pre.ndim == 3: pre = pre.unsqueeze(0) pred = table_structure_model.invoke(pre, (h, w)) - dets = _prediction_to_detections(pred, label_names=label_names) + dets = prediction_to_detections(pred, label_names=label_names) structure_results.append(dets) # --- Pass 3: Run OCR on all crops --- @@ -384,7 +277,7 @@ def table_structure_ocr_page_elements( if len(ocr_response_items) != len(crops): raise RuntimeError(f"Expected {len(crops)} OCR responses, got {len(ocr_response_items)}") for resp in ocr_response_items: - ocr_results.append(_extract_remote_ocr_item(resp)) + ocr_results.append(extract_remote_ocr_item(resp)) else: for _, _, crop_array in crops: ocr_results.append(ocr_model.invoke(crop_array, merge_level="word")) @@ -400,13 +293,13 @@ def table_structure_ocr_page_elements( # Fallback: if no cells were detected, use OCR-only pseudo-markdown. if not markdown: - blocks = _parse_ocr_result(ocr_preds) - markdown = _blocks_to_pseudo_markdown(blocks) + blocks = parse_ocr_result(ocr_preds) + markdown = blocks_to_pseudo_markdown(blocks) if not markdown: # Last resort: plain text. - from nemo_retriever.ocr.ocr import _blocks_to_text + from nemo_retriever.ocr.ocr import blocks_to_text - markdown = _blocks_to_text(blocks) + markdown = blocks_to_text(blocks) table_items.append({"bbox_xyxy_norm": bbox, "text": markdown}) diff --git a/nemo_retriever/src/nemo_retriever/txt/split.py b/nemo_retriever/src/nemo_retriever/txt/split.py index d47b8dfd3..7c5cb4172 100644 --- a/nemo_retriever/src/nemo_retriever/txt/split.py +++ b/nemo_retriever/src/nemo_retriever/txt/split.py @@ -22,7 +22,7 @@ DEFAULT_OVERLAP_TOKENS = 0 -def _get_tokenizer(model_id: str, cache_dir: Optional[str] = None): # noqa: ANN201 +def get_tokenizer(model_id: str, cache_dir: Optional[str] = None): # noqa: ANN201 """Lazy-load HuggingFace tokenizer.""" from transformers import AutoTokenizer @@ -131,7 +131,7 @@ def txt_file_to_chunks_df( path = str(Path(path).resolve()) raw = Path(path).read_text(encoding=encoding, errors="replace") model_id = tokenizer_model_id or DEFAULT_TOKENIZER_MODEL_ID - tokenizer = _get_tokenizer(model_id, cache_dir=tokenizer_cache_dir) + tokenizer = get_tokenizer(model_id, cache_dir=tokenizer_cache_dir) chunk_texts = split_text_by_tokens( raw, tokenizer=tokenizer, @@ -183,7 +183,7 @@ def txt_bytes_to_chunks_df( path = str(Path(path).resolve()) raw = content_bytes.decode(encoding, errors="replace") model_id = tokenizer_model_id or DEFAULT_TOKENIZER_MODEL_ID - tokenizer = _get_tokenizer(model_id, cache_dir=tokenizer_cache_dir) + tokenizer = get_tokenizer(model_id, cache_dir=tokenizer_cache_dir) chunk_texts = split_text_by_tokens( raw, tokenizer=tokenizer, diff --git a/nemo_retriever/src/nemo_retriever/util/__init__.py b/nemo_retriever/src/nemo_retriever/util/__init__.py deleted file mode 100644 index 6aa2e3d5b..000000000 --- a/nemo_retriever/src/nemo_retriever/util/__init__.py +++ 
/dev/null @@ -1,3 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 diff --git a/nemo_retriever/src/nemo_retriever/config_utils.py b/nemo_retriever/src/nemo_retriever/utils/config_utils.py similarity index 100% rename from nemo_retriever/src/nemo_retriever/config_utils.py rename to nemo_retriever/src/nemo_retriever/utils/config_utils.py diff --git a/nemo_retriever/src/nemo_retriever/utils/detection.py b/nemo_retriever/src/nemo_retriever/utils/detection.py new file mode 100644 index 000000000..882cfe5ae --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/utils/detection.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +try: + import torch +except Exception: # pragma: no cover + torch = None # type: ignore[assignment] + + +def prediction_to_detections(pred: Any, *, label_names: List[str]) -> List[Dict[str, Any]]: + """ + Best-effort conversion of model output into a standard detection list. + + Produces dicts of the form: + {"bbox_xyxy_norm": [...], "label": int|None, "label_name": str, "score": float|None} + """ + if torch is None: # pragma: no cover + raise ImportError("torch required for prediction parsing.") + + boxes = labels = scores = None + if isinstance(pred, dict): + # IMPORTANT: do not use `or` chains here. torch.Tensor truthiness is ambiguous and raises. + def _get_any(d: Dict[str, Any], *keys: str) -> Any: + for k in keys: + if k in d: + v = d.get(k) + if v is not None: + return v + return None + + boxes = _get_any(pred, "boxes", "bboxes", "bbox", "box") + labels = _get_any(pred, "labels", "classes", "class_ids", "class") + scores = _get_any(pred, "scores", "conf", "confidences", "score") + elif isinstance(pred, (list, tuple)) and len(pred) >= 3: + boxes, labels, scores = pred[0], pred[1], pred[2] + + if boxes is None or labels is None: + return [] + + def _to_tensor(x: Any) -> Optional["torch.Tensor"]: + if x is None: + return None + if isinstance(x, torch.Tensor): + return x.detach().cpu() + try: + return torch.as_tensor(x).detach().cpu() + except Exception: + return None + + # Handle string labels (e.g. NIM returns ["chart_title", "xlabel", ...]). + # torch.as_tensor cannot convert strings, so handle them before tensor conversion. 
+    _string_labels: Optional[List[str]] = None
+    if isinstance(labels, (list, tuple)) and labels and isinstance(labels[0], str):
+        _string_labels = [str(x) for x in labels]
+
+    b = _to_tensor(boxes)
+    labels_t = _to_tensor(labels) if _string_labels is None else None
+    s = _to_tensor(scores) if scores is not None else None
+    if b is None:
+        return []
+    if labels_t is None and _string_labels is None:
+        return []
+
+    if b.ndim != 2 or int(b.shape[-1]) != 4:
+        return []
+    if labels_t is not None:
+        if labels_t.ndim == 2 and int(labels_t.shape[-1]) == 1:
+            labels_t = labels_t.squeeze(-1)
+        if labels_t.ndim != 1:
+            return []
+
+    n_labels = len(_string_labels) if _string_labels is not None else int(labels_t.shape[0])
+    n = int(min(b.shape[0], n_labels))
+    dets: List[Dict[str, Any]] = []
+    for i in range(n):
+        try:
+            x1, y1, x2, y2 = [float(x) for x in b[i].tolist()]
+        except Exception:
+            continue
+
+        if _string_labels is not None:
+            # String labels are used directly as label_name; label is the row index.
+            label_i = i
+            label_name = _string_labels[i]
+        else:
+            try:
+                label_i = int(labels_t[i].item())
+            except Exception:
+                label_i = None
+            label_name = None
+            if label_i is not None and 0 <= label_i < len(label_names):
+                label_name = label_names[label_i]
+            if not label_name:
+                label_name = f"label_{label_i}" if label_i is not None else "unknown"
+
+        score_f: Optional[float]
+        if s is not None and s.ndim >= 1 and int(s.shape[0]) > i:
+            try:
+                score_f = float(s[i].item())
+            except Exception:
+                score_f = None
+        else:
+            score_f = None
+
+        dets.append(
+            {
+                "bbox_xyxy_norm": [x1, y1, x2, y2],
+                "label": label_i,
+                "label_name": str(label_name),
+                "score": score_f,
+            }
+        )
+    return dets
diff --git a/nemo_retriever/src/nemo_retriever/util/table_and_chart.py b/nemo_retriever/src/nemo_retriever/utils/table_and_chart.py
similarity index 99%
rename from nemo_retriever/src/nemo_retriever/util/table_and_chart.py
rename to nemo_retriever/src/nemo_retriever/utils/table_and_chart.py
index f28f05567..bcf26abf1 100644
--- a/nemo_retriever/src/nemo_retriever/util/table_and_chart.py
+++ b/nemo_retriever/src/nemo_retriever/utils/table_and_chart.py
@@ -541,7 +541,7 @@ def _structure_dets_to_class_boxes(
     Parameters
     ----------
     dets : list[dict]
-        Output of ``_prediction_to_detections()`` — each dict has
+        Output of ``prediction_to_detections()`` — each dict has
         ``bbox_xyxy_norm`` (normalized [0,1]) and ``label_name``.
     crop_hw : (int, int)
         ``(H, W)`` of the crop image.
@@ -575,7 +575,7 @@ def join_table_structure_and_ocr_output(
     Parameters
     ----------
     structure_dets : list[dict]
-        From ``_prediction_to_detections()`` with label_names cell/row/column
+        From ``prediction_to_detections()`` with label_names cell/row/column
        and ``bbox_xyxy_norm`` in [0, 1].
     ocr_preds : list | dict
         Raw OCR output from ``NemotronOCRV1.invoke()``.
@@ -613,7 +613,7 @@ def join_graphic_elements_and_ocr_output(
     Parameters
     ----------
     ge_dets : list[dict]
-        From ``_prediction_to_detections()`` with chart-element label_names
+        From ``prediction_to_detections()`` with chart-element label_names
         and ``bbox_xyxy_norm`` in [0, 1].
     ocr_preds : list | dict
         Raw OCR output from ``NemotronOCRV1.invoke()``.
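A minimal usage sketch of the relocated helper, mirroring the updated tests in nemo_retriever/tests/test_chart_graphic_elements.py (tensor values are illustrative; the result comments describe the expected shape, not verified output):

    import torch

    from nemo_retriever.utils.detection import prediction_to_detections

    # Integer class ids are resolved through label_names.
    dets = prediction_to_detections(
        {
            "boxes": torch.tensor([[0.1, 0.2, 0.3, 0.4]]),
            "labels": torch.tensor([1]),
            "scores": torch.tensor([0.9]),
        },
        label_names=["chart_title", "xlabel"],
    )
    # One detection dict: label == 1, label_name == "xlabel", score ~= 0.9.

    # String labels (e.g. from a NIM response) become label_name directly;
    # label is then the row index, so label_names may be empty.
    dets = prediction_to_detections(
        {
            "boxes": torch.tensor([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]),
            "labels": ["chart_title", "xlabel"],
            "scores": torch.tensor([0.9, 0.8]),
        },
        label_names=[],
    )
    # Two detection dicts with label_name == "chart_title" and "xlabel".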
diff --git a/nemo_retriever/src/nemo_retriever/version.py b/nemo_retriever/src/nemo_retriever/version.py index 81c099013..553b2d464 100644 --- a/nemo_retriever/src/nemo_retriever/version.py +++ b/nemo_retriever/src/nemo_retriever/version.py @@ -13,13 +13,9 @@ import os import subprocess -try: - from ._build_info import BUILD_DATE as _PACKAGE_BUILD_DATE - from ._build_info import BUILD_GIT_SHA as _PACKAGE_BUILD_GIT_SHA -except ImportError: - # During setuptools build isolation the package may not be importable - _PACKAGE_BUILD_DATE = "unknown" - _PACKAGE_BUILD_GIT_SHA = "unknown" +# Overwritten by CI before packaging; see .github/workflows/*.yml. +_PACKAGE_BUILD_GIT_SHA = "unknown" +_PACKAGE_BUILD_DATE = "unknown" _PKG_NAME = "nemo-retriever" _UNKNOWN = "unknown" diff --git a/nemo_retriever/table_stage_config.yaml b/nemo_retriever/table_stage_config.yaml deleted file mode 100644 index e268c8ba7..000000000 --- a/nemo_retriever/table_stage_config.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Example config for: -# - `retriever table stage run --config --input ` -# - `retriever local stage3 run --config --input ` -# -# This YAML is parsed into `nv_ingest_api.internal.schemas.extract.extract_table_schema.TableExtractorSchema` -# via `nemo_retriever.table.config.load_table_extractor_schema_from_dict`. -# -# IMPORTANT: -# `endpoint_config.yolox_endpoints` and `endpoint_config.ocr_endpoints` must each provide at least one -# endpoint (gRPC or HTTP). Both cannot be null/empty for either entry. -# - -# Optional worker settings -max_queue_size: 1 -n_workers: 2 -raise_on_failure: false - -# Endpoint configuration for table extraction (YOLOX table-structure + OCR). -endpoint_config: - # Optional auth token for secured services (NIM) - auth_token: null - - # Tuple/list in the form: [grpc, http] - # YOLOX table-structure model endpoints - # yolox_endpoints: ["localhost:8007", "http://localhost:8006/v1/infer"] - yolox_endpoints: null - # Optional; if omitted it is inferred from which endpoint is present. - # yolox_infer_protocol: grpc - - # OCR model endpoints - # ocr_endpoints: ["localhost:8010", "http://localhost:8019/v1/infer"] - ocr_endpoints: null - # Optional; if omitted it is inferred from which endpoint is present. 
- # ocr_infer_protocol: grpc - - # Optional performance knobs - nim_batch_size: 2 - workers_per_progress_engine: 5 diff --git a/nemo_retriever/tests/test_audio_pipeline_batch.py b/nemo_retriever/tests/test_audio_pipeline_batch.py index 09ce1a7ad..9325ca610 100644 --- a/nemo_retriever/tests/test_audio_pipeline_batch.py +++ b/nemo_retriever/tests/test_audio_pipeline_batch.py @@ -177,7 +177,7 @@ def test_inprocess_audio_pipeline_local_asr_mocked(tmp_path: Path): @pytest.mark.skipif(not is_media_available(), reason="ffmpeg not available") def test_fused_audio_pipeline_with_mocked_asr(tmp_path: Path): - """Fused: same as batch but FusedIngestor; embed() uses explode + _BatchEmbedActor when _pipeline_type==audio.""" + """Fused: same as batch but FusedIngestor; embed() uses explode + BatchEmbedActor when _pipeline_type==audio.""" ray = pytest.importorskip("ray") pytest.importorskip("lancedb") diff --git a/nemo_retriever/tests/test_chart_graphic_elements.py b/nemo_retriever/tests/test_chart_graphic_elements.py index 25fd499a7..b462af553 100644 --- a/nemo_retriever/tests/test_chart_graphic_elements.py +++ b/nemo_retriever/tests/test_chart_graphic_elements.py @@ -382,7 +382,7 @@ def test_graphic_elements_flag_does_not_affect_table_stages(self) -> None: # --------------------------------------------------------------------------- -# _prediction_to_detections string labels test +# prediction_to_detections string labels test # --------------------------------------------------------------------------- @@ -390,28 +390,28 @@ def test_graphic_elements_flag_does_not_affect_table_stages(self) -> None: class TestPredictionToDetectionsStringLabels: def test_string_labels_handled(self) -> None: import torch - from nemo_retriever.chart.chart_detection import _prediction_to_detections + from nemo_retriever.utils.detection import prediction_to_detections pred = { "boxes": torch.tensor([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]), "labels": ["chart_title", "xlabel"], "scores": torch.tensor([0.9, 0.8]), } - dets = _prediction_to_detections(pred, label_names=[]) + dets = prediction_to_detections(pred, label_names=[]) assert len(dets) == 2 assert dets[0]["label_name"] == "chart_title" assert dets[1]["label_name"] == "xlabel" def test_integer_labels_still_work(self) -> None: import torch - from nemo_retriever.chart.chart_detection import _prediction_to_detections + from nemo_retriever.utils.detection import prediction_to_detections pred = { "boxes": torch.tensor([[0.1, 0.2, 0.3, 0.4]]), "labels": torch.tensor([1]), "scores": torch.tensor([0.9]), } - dets = _prediction_to_detections(pred, label_names=["chart_title", "xlabel"]) + dets = prediction_to_detections(pred, label_names=["chart_title", "xlabel"]) assert len(dets) == 1 assert dets[0]["label_name"] == "xlabel" assert dets[0]["label"] == 1 diff --git a/nemo_retriever/tests/test_multimodal_embed.py b/nemo_retriever/tests/test_multimodal_embed.py index f357193ef..caa8259ed 100644 --- a/nemo_retriever/tests/test_multimodal_embed.py +++ b/nemo_retriever/tests/test_multimodal_embed.py @@ -204,7 +204,7 @@ def test_text_mode_tags_modality(self): assert list(result["_embed_modality"]) == ["text", "text"] assert "_image_b64" not in result.columns - @patch("nemo_retriever.ingest_modes.inprocess._crop_b64_image_by_norm_bbox") + @patch("nemo_retriever.ingest_modes.inprocess.crop_b64_image_by_norm_bbox") def test_text_image_carries_image(self, mock_crop): """text_image mode copies page image to _image_b64, crops for structured content.""" mock_crop.return_value = 
("cropped_b64", None) diff --git a/nemo_retriever/tests/test_table_structure.py b/nemo_retriever/tests/test_table_structure.py index 7499dff1a..d200cbae1 100644 --- a/nemo_retriever/tests/test_table_structure.py +++ b/nemo_retriever/tests/test_table_structure.py @@ -14,7 +14,7 @@ import pandas as pd import pytest -from nemo_retriever.util.table_and_chart import join_table_structure_and_ocr_output +from nemo_retriever.utils.table_and_chart import join_table_structure_and_ocr_output def _can_import(mod: str) -> bool: