diff --git a/.github/workflows/perform-release.yml b/.github/workflows/perform-release.yml index e389962d0..addd44a69 100644 --- a/.github/workflows/perform-release.yml +++ b/.github/workflows/perform-release.yml @@ -1,13 +1,34 @@ name: Perform Release +run-name: "Perform Release ${{ inputs.version || inputs.source-branch || github.ref_name }}${{ inputs.dry-run == true && ' (dry run)' || '' }}" on: workflow_dispatch: inputs: + source-branch: + description: '*** SOURCE BRANCH *** — Branch to build artifacts from (leave empty to use the branch selected above)' + required: false + type: string + default: '' + version: + description: 'Override release version (leave empty to derive from source branch name)' + required: false + type: string + default: '' dry-run: description: 'Build and validate artifacts without publishing' required: false type: boolean default: false + skip-docker: + description: '⛔ SKIP Docker — disable Docker build and publish entirely' + required: false + type: boolean + default: false + skip-helm-chart: + description: '⛔ SKIP Helm chart — disable Helm build and publish entirely' + required: false + type: boolean + default: false release-type: description: 'Release type for PyPI packages' required: false @@ -18,26 +39,41 @@ on: - dev permissions: - contents: read - packages: write + contents: write jobs: determine-version: name: Determine Release Version runs-on: ubuntu-latest outputs: - version: ${{ steps.version.outputs.version }} + version: ${{ steps.resolve.outputs.version }} + source-ref: ${{ steps.resolve.outputs.source_ref }} steps: - - name: Extract version from branch name - id: version + - name: Resolve source branch and version + id: resolve run: | - VERSION="${{ github.ref_name }}" + if [ -n "${{ inputs.source-branch }}" ]; then + SOURCE_REF="${{ inputs.source-branch }}" + else + SOURCE_REF="${{ github.ref_name }}" + fi + echo "source_ref=$SOURCE_REF" >> $GITHUB_OUTPUT + + if [ -n "${{ inputs.version }}" ]; then + VERSION="${{ inputs.version }}" + echo "Using explicit version: $VERSION" + else + VERSION="$SOURCE_REF" + echo "Derived version from source branch: $VERSION" + fi echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "### Release Configuration" >> $GITHUB_STEP_SUMMARY echo "| Setting | Value |" >> $GITHUB_STEP_SUMMARY echo "|---------|-------|" >> $GITHUB_STEP_SUMMARY echo "| Version | \`$VERSION\` |" >> $GITHUB_STEP_SUMMARY - echo "| Branch | \`${{ github.ref_name }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| Source Branch | \`$SOURCE_REF\` |" >> $GITHUB_STEP_SUMMARY + echo "| Workflow Branch | \`${{ github.ref_name }}\` |" >> $GITHUB_STEP_SUMMARY echo "| Dry Run | \`${{ inputs.dry-run }}\` |" >> $GITHUB_STEP_SUMMARY echo "| Release Type | \`${{ inputs.release-type }}\` |" >> $GITHUB_STEP_SUMMARY @@ -53,20 +89,22 @@ jobs: with: version: ${{ needs.determine-version.outputs.version }} release-type: ${{ inputs.release-type }} - source-ref: ${{ github.ref_name }} + source-ref: ${{ needs.determine-version.outputs.source-ref }} + workflow-ref: ${{ github.ref_name }} runner: linux-large-disk nvingest-docker-build: name: Build nv-ingest Docker Image + if: ${{ !inputs.skip-docker }} needs: determine-version runs-on: linux-large-disk outputs: image: ${{ steps.meta.outputs.image }} steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: - ref: ${{ github.ref_name }} + ref: ${{ needs.determine-version.outputs.source-ref }} - name: Setup Docker Buildx uses: ./.github/actions/setup-docker-buildx @@ -74,10 +112,25 @@ jobs: - name: Set 
image metadata id: meta run: | - echo "image=${{ secrets.DOCKER_REGISTRY }}/nv-ingest:${{ needs.determine-version.outputs.version }}" >> $GITHUB_OUTPUT + if [ -z "$DOCKER_REGISTRY" ]; then + echo "::error::DOCKER_REGISTRY secret is not set" + exit 1 + fi + echo "image=${DOCKER_REGISTRY}/nv-ingest:${{ needs.determine-version.outputs.version }}" >> $GITHUB_OUTPUT + env: + DOCKER_REGISTRY: ${{ secrets.DOCKER_REGISTRY }} + + - name: Create HF token file + env: + HF_ACCESS_TOKEN: ${{ secrets.HF_ACCESS_TOKEN }} + run: | + mkdir -p ./scripts/private_local + if [ -n "${HF_ACCESS_TOKEN}" ]; then + printf '%s' "${HF_ACCESS_TOKEN}" > ./scripts/private_local/hf_token.txt + fi - name: Build image (validate) - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . push: false @@ -85,56 +138,40 @@ jobs: target: runtime platforms: linux/amd64 build-args: | - HF_ACCESS_TOKEN=${{ secrets.HF_ACCESS_TOKEN }} DOWNLOAD_LLAMA_TOKENIZER=True GIT_COMMIT=${{ github.sha }} tags: ${{ steps.meta.outputs.image }} + secret-files: hf_token=./scripts/private_local/hf_token.txt cache-to: type=gha,scope=nvingest,mode=max cache-from: type=gha,scope=nvingest - retriever-docker-build: - name: Build nemo-retriever Docker Image - needs: determine-version - runs-on: ubuntu-latest - outputs: - image: ${{ steps.meta.outputs.image }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.ref_name }} - fetch-depth: 0 - - - name: Setup Docker Buildx - uses: ./.github/actions/setup-docker-buildx - - - name: Set image metadata - id: meta + - name: Export Docker image run: | - IMAGE="ghcr.io/$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')/nemo-retriever" - echo "image=$IMAGE" >> $GITHUB_OUTPUT + docker save "${{ steps.meta.outputs.image }}" | gzip -1 > nv-ingest-docker-image.tar.gz + ls -lh nv-ingest-docker-image.tar.gz - - name: Build image (validate) - uses: docker/build-push-action@v5 + - name: Upload Docker image artifact + uses: actions/upload-artifact@v5 with: - context: . - file: ./nemo_retriever/Dockerfile - push: false - load: true - platforms: linux/amd64 - tags: ${{ steps.meta.outputs.image }}:${{ needs.determine-version.outputs.version }} - cache-to: type=gha,scope=retriever,mode=max - cache-from: type=gha,scope=retriever + name: nv-ingest-docker-image + path: nv-ingest-docker-image.tar.gz + if-no-files-found: error helm-build: name: Build Helm Chart + if: ${{ !inputs.skip-helm-chart }} needs: determine-version runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: - ref: ${{ github.ref_name }} + ref: ${{ needs.determine-version.outputs.source-ref }} + + - name: Overlay CI scripts from workflow branch + run: | + git fetch --depth=1 origin "${{ github.ref_name }}" + git checkout FETCH_HEAD -- ci/scripts/ - name: Setup Helm uses: azure/setup-helm@v4 @@ -162,7 +199,7 @@ jobs: helm dependency build helm/ - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.12' @@ -178,126 +215,85 @@ jobs: --version "${{ needs.determine-version.outputs.version }}" \ --dry-run + - name: Upload Helm chart artifact + uses: actions/upload-artifact@v5 + with: + name: helm-chart + path: nv-ingest-*.tgz + if-no-files-found: error + # ══════════════════════════════════════════════════════════════════════ # Publish Phase — runs only after ALL builds succeed and dry-run is # off. 
Every publish job depends on every build job so that a single # build failure prevents any artifact from being published. # ══════════════════════════════════════════════════════════════════════ - pypi-publish: - name: Publish Python Wheels - if: ${{ !inputs.dry-run }} - needs: - - pypi-build - - nvingest-docker-build - - retriever-docker-build - - helm-build - uses: ./.github/workflows/reusable-pypi-publish.yml - secrets: - ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }} - ARTIFACTORY_USERNAME: ${{ secrets.ARTIFACTORY_USERNAME }} - ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} - nvingest-docker-publish: name: Publish nv-ingest Docker Image - if: ${{ !inputs.dry-run }} + if: ${{ !inputs.dry-run && !inputs.skip-docker && !cancelled() && !failure() }} needs: - determine-version - - pypi-build - nvingest-docker-build - - retriever-docker-build + - pypi-build - helm-build runs-on: linux-large-disk + outputs: + image: ${{ steps.push.outputs.image }} steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: - ref: ${{ github.ref_name }} - - - name: Setup Docker Buildx - uses: ./.github/actions/setup-docker-buildx - with: - use-qemu: 'true' - platforms: 'linux/amd64,linux/arm64' + ref: ${{ needs.determine-version.outputs.source-ref }} - name: Login to NGC uses: ./.github/actions/docker-login-ngc with: password: ${{ secrets.DOCKER_PASSWORD }} - - name: Build and push multi-platform image - uses: docker/build-push-action@v5 + - name: Download Docker image artifact + uses: actions/download-artifact@v5 with: - context: . - push: true - target: runtime - platforms: linux/amd64,linux/arm64 - build-args: | - HF_ACCESS_TOKEN=${{ secrets.HF_ACCESS_TOKEN }} - DOWNLOAD_LLAMA_TOKENIZER=True - GIT_COMMIT=${{ github.sha }} - tags: ${{ needs.nvingest-docker-build.outputs.image }} - cache-from: type=gha,scope=nvingest + name: nv-ingest-docker-image - retriever-docker-publish: - name: Publish nemo-retriever Docker Image - if: ${{ !inputs.dry-run }} - needs: - - determine-version - - pypi-build - - nvingest-docker-build - - retriever-docker-build - - helm-build - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.ref_name }} - fetch-depth: 0 - - - name: Setup Docker Buildx - uses: ./.github/actions/setup-docker-buildx - with: - use-qemu: 'true' - platforms: 'linux/amd64,linux/arm64' - - - name: Login to GHCR - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build and push multi-platform image - uses: docker/build-push-action@v5 - with: - context: . - file: ./nemo_retriever/Dockerfile - push: true - platforms: linux/amd64,linux/arm64 - tags: | - ${{ needs.retriever-docker-build.outputs.image }}:${{ needs.determine-version.outputs.version }} - ${{ needs.retriever-docker-build.outputs.image }}:latest - cache-from: type=gha,scope=retriever + - name: Load and push image + id: push + run: | + echo "Loading image from tarball..." + LOAD_OUTPUT=$(gunzip -c nv-ingest-docker-image.tar.gz | docker load) + echo "$LOAD_OUTPUT" + IMAGE=$(echo "$LOAD_OUTPUT" | sed -n 's/^Loaded image: //p') + if [ -z "$IMAGE" ]; then + echo "::error::Failed to parse image name from docker load output" + exit 1 + fi + echo "image=${IMAGE}" >> $GITHUB_OUTPUT + echo "Pushing ${IMAGE}..." 
+ docker push "${IMAGE}" helm-publish: name: Publish Helm Chart - if: ${{ !inputs.dry-run }} + if: ${{ !inputs.dry-run && !inputs.skip-helm-chart && !cancelled() && !failure() }} needs: - determine-version - - pypi-build - nvingest-docker-build - - retriever-docker-build + - pypi-build - helm-build runs-on: ubuntu-latest env: NGC_CLI_API_KEY: ${{ secrets.NVIDIA_API_KEY }} + NGC_CLI_ORG: ${{ secrets.NGC_ORG }} + NGC_CLI_TEAM: ${{ secrets.NGC_TEAM }} + NGC_CLI_FORMAT_TYPE: json steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: - ref: ${{ github.ref_name }} + ref: ${{ needs.determine-version.outputs.source-ref }} + + - name: Overlay CI scripts from workflow branch + run: | + git fetch --depth=1 origin "${{ github.ref_name }}" + git checkout FETCH_HEAD -- ci/scripts/ - name: Setup Helm uses: azure/setup-helm@v4 @@ -308,23 +304,13 @@ jobs: curl -sSL "https://github.com/norwoodj/helm-docs/releases/download/v${HELM_DOCS_VERSION}/helm-docs_${HELM_DOCS_VERSION}_Linux_x86_64.tar.gz" \ | tar xz -C /usr/local/bin helm-docs - - name: Install NGC CLI - run: | - curl -sSL "https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.55.0/files/ngccli_linux.zip" -o /tmp/ngccli.zip - unzip -q /tmp/ngccli.zip -d /tmp - sudo mv /tmp/ngc-cli/ngc /usr/local/bin/ngc - sudo chmod +x /usr/local/bin/ngc + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' - - name: Configure and verify NGC CLI - run: | - ngc config set <> $GITHUB_STEP_SUMMARY + echo "Created and pushed tag: \`$VERSION\`" >> $GITHUB_STEP_SUMMARY + # ── Announcement ───────────────────────────────────────────────────── announce: name: Release Announcement @@ -368,10 +385,9 @@ jobs: - pypi-publish - nvingest-docker-build - nvingest-docker-publish - - retriever-docker-build - - retriever-docker-publish - helm-build - helm-publish + - tag-release runs-on: ubuntu-latest steps: - name: Generate Slack announcement @@ -379,18 +395,21 @@ jobs: VERSION: ${{ needs.determine-version.outputs.version }} DRY_RUN: ${{ inputs.dry-run }} RELEASE_TYPE: ${{ inputs.release-type }} - BASE_BRANCH: ${{ github.ref_name }} + SOURCE_BRANCH: ${{ needs.determine-version.outputs.source-ref }} PYPI_BUILD_RESULT: ${{ needs.pypi-build.result }} PYPI_PUBLISH_RESULT: ${{ needs.pypi-publish.result }} + PYPI_VERSION: ${{ needs.pypi-build.outputs.version }} NVINGEST_BUILD_RESULT: ${{ needs.nvingest-docker-build.result }} NVINGEST_PUBLISH_RESULT: ${{ needs.nvingest-docker-publish.result }} - RETRIEVER_BUILD_RESULT: ${{ needs.retriever-docker-build.result }} - RETRIEVER_PUBLISH_RESULT: ${{ needs.retriever-docker-publish.result }} HELM_BUILD_RESULT: ${{ needs.helm-build.result }} HELM_PUBLISH_RESULT: ${{ needs.helm-publish.result }} - NVINGEST_IMAGE: ${{ needs.nvingest-docker-build.outputs.image }} - RETRIEVER_IMAGE: ${{ needs.retriever-docker-build.outputs.image }} + TAG_RESULT: ${{ needs.tag-release.result }} + SKIP_DOCKER: ${{ inputs.skip-docker }} + SKIP_HELM: ${{ inputs.skip-helm-chart }} + NVINGEST_IMAGE: ${{ needs.nvingest-docker-publish.outputs.image || needs.nvingest-docker-build.outputs.image }} + REPO_URL: ${{ github.server_url }}/${{ github.repository }} RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + ARTIFACTORY_URL: ${{ secrets.ARTIFACTORY_URL }} run: | status_emoji() { case "$1" in @@ -403,8 +422,8 @@ jobs: } ALL_BUILDS_OK="true" - for r in "$PYPI_BUILD_RESULT" "$NVINGEST_BUILD_RESULT" "$RETRIEVER_BUILD_RESULT" "$HELM_BUILD_RESULT"; do - 
if [ "$r" != "success" ]; then + for r in "$PYPI_BUILD_RESULT" "$NVINGEST_BUILD_RESULT" "$HELM_BUILD_RESULT"; do + if [ "$r" != "success" ] && [ "$r" != "skipped" ]; then ALL_BUILDS_OK="false" break fi @@ -418,10 +437,13 @@ jobs: HEADER=":rocket: *Release ${VERSION} Published* :rocket:" fi + PIP_VER="${PYPI_VERSION:-${VERSION}}" + TAG_URL="${REPO_URL}/releases/tag/${VERSION}" + MSG="${HEADER}" MSG+="\n" MSG+="\n*Version:* \`${VERSION}\`" - MSG+="\n*Branch:* \`${BASE_BRANCH}\`" + MSG+="\n*Source Branch:* \`${SOURCE_BRANCH}\`" MSG+="\n*Release Type:* \`${RELEASE_TYPE}\`" MSG+="\n" MSG+="\n*Artifacts:*" @@ -434,36 +456,32 @@ jobs: elif [ "$PYPI_BUILD_RESULT" != "success" ]; then MSG+="\n$(status_emoji "$PYPI_BUILD_RESULT") *PyPI Wheels* — Build: ${PYPI_BUILD_RESULT}" else - MSG+="\n$(status_emoji "$PYPI_PUBLISH_RESULT") *PyPI Wheels* — Publish blocked (other build failed)" + MSG+="\n$(status_emoji "$PYPI_PUBLISH_RESULT") *PyPI Wheels* — Publish blocked (other publish failed)" fi - MSG+="\n \`nv-ingest-api\` \`nv-ingest-client\` \`nv-ingest\` \`nemo-retriever\`" + MSG+="\n \`nv-ingest-api==${PIP_VER}\` \`nv-ingest-client==${PIP_VER}\` \`nv-ingest==${PIP_VER}\` \`nemo-retriever==${PIP_VER}\`" + MSG+="\n" + MSG+="\n *Quick install:*" + MSG+="\n \`\`\`pip install --index-url ${ARTIFACTORY_URL} nv-ingest-api==${PIP_VER} nv-ingest-client==${PIP_VER} nv-ingest==${PIP_VER} nemo-retriever==${PIP_VER}\`\`\`" # — nv-ingest Docker — - if [ "$DRY_RUN" = "true" ]; then + if [ "$SKIP_DOCKER" = "true" ]; then + MSG+="\n:fast_forward: *nv-ingest Docker* — Disabled (skip-docker)" + elif [ "$DRY_RUN" = "true" ]; then MSG+="\n$(status_emoji "$NVINGEST_BUILD_RESULT") *nv-ingest Docker* — Built and validated (not pushed)" elif [ "$NVINGEST_PUBLISH_RESULT" = "success" ]; then MSG+="\n:white_check_mark: *nv-ingest Docker* — \`${NVINGEST_IMAGE}\`" + MSG+="\n \`\`\`docker pull ${NVINGEST_IMAGE}\`\`\`" elif [ "$NVINGEST_BUILD_RESULT" != "success" ]; then MSG+="\n$(status_emoji "$NVINGEST_BUILD_RESULT") *nv-ingest Docker* — Build: ${NVINGEST_BUILD_RESULT}" else MSG+="\n$(status_emoji "$NVINGEST_PUBLISH_RESULT") *nv-ingest Docker* — Publish blocked (other build failed)" fi - # — nemo-retriever Docker — - RETRIEVER_TAG="${RETRIEVER_IMAGE}:${VERSION}" - if [ "$DRY_RUN" = "true" ]; then - MSG+="\n$(status_emoji "$RETRIEVER_BUILD_RESULT") *nemo-retriever Docker* — Built and validated (not pushed)" - elif [ "$RETRIEVER_PUBLISH_RESULT" = "success" ]; then - MSG+="\n:white_check_mark: *nemo-retriever Docker* — \`${RETRIEVER_TAG}\`" - elif [ "$RETRIEVER_BUILD_RESULT" != "success" ]; then - MSG+="\n$(status_emoji "$RETRIEVER_BUILD_RESULT") *nemo-retriever Docker* — Build: ${RETRIEVER_BUILD_RESULT}" - else - MSG+="\n$(status_emoji "$RETRIEVER_PUBLISH_RESULT") *nemo-retriever Docker* — Publish blocked (other build failed)" - fi - # — Helm Chart — HELM_REF="nvidian/nemo-llm/nv-ingest:${VERSION}" - if [ "$DRY_RUN" = "true" ]; then + if [ "$SKIP_HELM" = "true" ]; then + MSG+="\n:fast_forward: *Helm Chart* — Disabled (skip-helm-chart)" + elif [ "$DRY_RUN" = "true" ]; then MSG+="\n$(status_emoji "$HELM_BUILD_RESULT") *Helm Chart* — Built and validated (not published)" elif [ "$HELM_PUBLISH_RESULT" = "success" ]; then MSG+="\n:white_check_mark: *Helm Chart* — \`${HELM_REF}\` (NGC)" @@ -473,6 +491,15 @@ jobs: MSG+="\n$(status_emoji "$HELM_PUBLISH_RESULT") *Helm Chart* — Publish blocked (other build failed)" fi + # — Git Tag — + if [ "$DRY_RUN" = "true" ]; then + MSG+="\n:fast_forward: *Git Tag* — Skipped (dry run)" + elif [ "$TAG_RESULT" 
= "success" ]; then + MSG+="\n:white_check_mark: *Git Tag* — <${TAG_URL}|\`${VERSION}\`>" + else + MSG+="\n$(status_emoji "$TAG_RESULT") *Git Tag* — ${TAG_RESULT}" + fi + MSG+="\n" MSG+="\n<${RUN_URL}|:github: View Workflow Run>" diff --git a/.github/workflows/release-helm.yml b/.github/workflows/release-helm.yml index fd3edae1e..cc772f771 100644 --- a/.github/workflows/release-helm.yml +++ b/.github/workflows/release-helm.yml @@ -39,9 +39,12 @@ jobs: runs-on: ubuntu-latest env: NGC_CLI_API_KEY: ${{ secrets.NVIDIA_API_KEY }} + NGC_CLI_ORG: ${{ inputs.ngc-org }} + NGC_CLI_TEAM: ${{ inputs.ngc-team }} + NGC_CLI_FORMAT_TYPE: json steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ inputs.source-ref }} @@ -54,23 +57,13 @@ jobs: curl -sSL "https://github.com/norwoodj/helm-docs/releases/download/v${HELM_DOCS_VERSION}/helm-docs_${HELM_DOCS_VERSION}_Linux_x86_64.tar.gz" \ | tar xz -C /usr/local/bin helm-docs - - name: Install NGC CLI - run: | - curl -sSL "https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.55.0/files/ngccli_linux.zip" -o /tmp/ngccli.zip - unzip -q /tmp/ngccli.zip -d /tmp - sudo mv /tmp/ngc-cli/ngc /usr/local/bin/ngc - sudo chmod +x /usr/local/bin/ngc + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' - - name: Configure and verify NGC CLI - run: | - ngc config set <> "$GITHUB_PATH" + - name: Install unit test dependencies run: | - python -m pip install --upgrade pip - python -m pip install pytest pandas pydantic pyyaml typer scikit-learn - python -m pip install api/ + uv pip install --system -e src/ -e api/ -e client/ + uv pip install --system -e nemo_retriever - name: Run retriever unit tests env: diff --git a/.github/workflows/reusable-pypi-build.yml b/.github/workflows/reusable-pypi-build.yml index 7f9947d1d..28442d533 100644 --- a/.github/workflows/reusable-pypi-build.yml +++ b/.github/workflows/reusable-pypi-build.yml @@ -18,6 +18,11 @@ on: required: false type: string default: 'main' + workflow-ref: + description: 'Git ref of the workflow branch (used to overlay pyproject.toml files)' + required: false + type: string + default: '' runner: description: 'GitHub runner to use' required: false @@ -36,10 +41,16 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ inputs.source-ref }} + - name: Overlay build config from workflow branch + if: ${{ inputs.workflow-ref != '' && inputs.workflow-ref != inputs.source-ref }} + run: | + git fetch --depth=1 origin "${{ inputs.workflow-ref }}" + git checkout FETCH_HEAD -- api/pyproject.toml client/pyproject.toml src/pyproject.toml nemo_retriever/pyproject.toml + - name: Determine version id: set-version run: | @@ -52,7 +63,7 @@ jobs: echo "Building version: $VERSION" - name: Setup Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.12' @@ -103,12 +114,12 @@ jobs: PY RETRIEVER_RELEASE_TYPE=${{ inputs.release-type }} \ RETRIEVER_VERSION=${{ steps.set-version.outputs.version }} \ - RETRIEVER_BUILD_NUMBER=${{ github.run_number }} \ + RETRIEVER_BUILD_NUMBER=${{ inputs.release-type == 'release' && '0' || github.run_number }} \ RETRIEVER_GIT_SHA=${{ github.sha }} \ python -m build - name: Upload wheel artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: python-wheels path: | diff --git a/.github/workflows/reusable-pypi-publish.yml b/.github/workflows/reusable-pypi-publish.yml index 746755db9..0a585eea4 
100644 --- a/.github/workflows/reusable-pypi-publish.yml +++ b/.github/workflows/reusable-pypi-publish.yml @@ -16,13 +16,18 @@ jobs: steps: - name: Download wheel artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: name: python-wheels path: ./dist + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + - name: Install twine - run: pip install twine + run: pip install 'twine>=6.1' - name: Publish wheels to Artifactory env: @@ -31,7 +36,7 @@ jobs: ARTIFACTORY_PASSWORD: ${{ secrets.ARTIFACTORY_PASSWORD }} run: | # Publish all wheels - twine upload \ + twine upload --verbose \ --repository-url $ARTIFACTORY_URL \ -u $ARTIFACTORY_USERNAME \ -p $ARTIFACTORY_PASSWORD \ diff --git a/Dockerfile b/Dockerfile index 87f1e9304..f89038926 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,13 +39,13 @@ RUN chmod +x scripts/install_ffmpeg.sh \ # For GPL-licensed components, we provide their source code in the container # via `apt-get source` below to satisfy GPL requirements. ARG GPL_LIBS="\ + libfreetype6 \ libltdl7 \ libhunspell-1.7-0 \ libhyphen0 \ libdbus-1-3 \ " ARG FORCE_REMOVE_PKGS="\ - libfreetype6 \ ucf \ liblangtag-common \ libjbig0 \ diff --git a/api/pyproject.toml b/api/pyproject.toml index 17b5f7a8c..3e1da46d8 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -13,10 +13,10 @@ readme = "README.md" authors = [ {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} ] -license = {file = "LICENSE"} +license = "Apache-2.0" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] dependencies = [ diff --git a/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py b/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py index f06f80d29..ef64c8f4c 100644 --- a/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +++ b/api/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py @@ -16,7 +16,8 @@ import tritonclient.grpc as grpcclient from nv_ingest_api.internal.primitives.nim import ModelInterface -from nv_ingest_api.internal.primitives.nim.model_interface.decorators import multiprocessing_cache +from nv_ingest_api.internal.primitives.nim.model_interface.decorators import global_cache +from nv_ingest_api.internal.primitives.nim.model_interface.decorators import lock from nv_ingest_api.internal.primitives.nim.model_interface.helpers import preprocess_image_for_paddle from nv_ingest_api.util.image_processing.transforms import base64_to_numpy @@ -752,12 +753,11 @@ def _format_single_batch( raise ValueError("Invalid protocol specified. Must be 'grpc' or 'http'.") -@multiprocessing_cache(max_calls=100) # Cache results first to avoid redundant retries from backoff @backoff.on_predicate(backoff.expo, max_time=30) def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MODEL_NAME): """ Determines the OCR model name by checking the environment, querying the gRPC endpoint, - or falling back to a default. + or falling back to a default. Only caches when the repository is successfully queried. """ # 1. Check for an explicit override from the environment variable first. ocr_model_name = os.getenv("OCR_MODEL_NAME", None) @@ -769,14 +769,25 @@ def get_ocr_model_name(ocr_grpc_endpoint=None, default_model_name=DEFAULT_OCR_MO logger.debug(f"No OCR gRPC endpoint provided. Falling back to default model name '{default_model_name}'.") return default_model_name - # 3. 
Attempt to query the gRPC endpoint to discover the model name. + # 3. Check cache (only populated on successful repository query). + key = ( + "get_ocr_model_name", + (ocr_grpc_endpoint,), + frozenset({"default_model_name": default_model_name}.items()), + ) + with lock: + if key in global_cache: + return global_cache[key] + + # 4. Attempt to query the gRPC endpoint to discover the model name. try: client = grpcclient.InferenceServerClient(ocr_grpc_endpoint) model_index = client.get_model_repository_index(as_json=True) model_names = [x["name"] for x in model_index.get("models", [])] ocr_model_name = model_names[0] + with lock: + global_cache[key] = ocr_model_name + return ocr_model_name except Exception: logger.warning(f"Failed to get ocr model name after 30 seconds. Falling back to '{default_model_name}'.") - ocr_model_name = default_model_name - - return ocr_model_name + return default_model_name diff --git a/api/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py b/api/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py index 0b1084905..ff93cb953 100644 --- a/api/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +++ b/api/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py @@ -20,6 +20,8 @@ from nv_ingest_api.internal.primitives.nim import ModelInterface import tritonclient.grpc as grpcclient +from nv_ingest_api.internal.primitives.nim.model_interface.decorators import global_cache +from nv_ingest_api.internal.primitives.nim.model_interface.decorators import lock from nv_ingest_api.internal.primitives.nim.model_interface.decorators import multiprocessing_cache from nv_ingest_api.internal.primitives.nim.model_interface.helpers import get_model_name from nv_ingest_api.util.image_processing import scale_image_to_encoding_size @@ -135,10 +137,36 @@ def __init__( self.class_labels = class_labels if endpoints: - self.model_name = get_yolox_model_name(endpoints[0], default_model_name="yolox_ensemble") - self._grpc_uses_bls = self.model_name == "pipeline" + self._yolox_grpc_endpoint = endpoints[0] + self._model_name = None + self._grpc_uses_bls_value = None # Resolved on first use else: - self._grpc_uses_bls = False + self._yolox_grpc_endpoint = None + self._model_name = None + self._grpc_uses_bls_value = False + + def _resolve_yolox_model_name_if_needed(self) -> None: + """Resolve model name and BLS flag from the gRPC endpoint on first use. 
Cached on the instance.""" + if self._yolox_grpc_endpoint is None: + return + if self._model_name is not None: + return + self._model_name = get_yolox_model_name(self._yolox_grpc_endpoint, default_model_name="yolox_ensemble") + self._grpc_uses_bls_value = self._model_name == "pipeline" + + @property + def model_name(self) -> Optional[str]: + self._resolve_yolox_model_name_if_needed() + return self._model_name + + @model_name.setter + def model_name(self, value: Optional[str]) -> None: + self._model_name = value + + @property + def _grpc_uses_bls(self) -> bool: + self._resolve_yolox_model_name_if_needed() + return bool(self._grpc_uses_bls_value) def prepare_data_for_inference(self, data: Dict[str, Any]) -> Dict[str, Any]: """ @@ -2117,7 +2145,6 @@ def postprocess_included_texts(boxes, confs, labels, classes): return boxes, labels, confs -@multiprocessing_cache(max_calls=100) # Cache results first to avoid redundant retries from backoff @backoff.on_predicate(backoff.expo, max_time=30) def get_yolox_model_name(yolox_grpc_endpoint, default_model_name="yolox"): # If a gRPC endpoint isn't provided (common when using HTTP-only NIM endpoints), @@ -2131,6 +2158,15 @@ def get_yolox_model_name(yolox_grpc_endpoint, default_model_name="yolox"): ): return default_model_name + key = ( + "get_yolox_model_name", + (yolox_grpc_endpoint,), + frozenset({"default_model_name": default_model_name}.items()), + ) + with lock: + if key in global_cache: + return global_cache[key] + try: client = grpcclient.InferenceServerClient(yolox_grpc_endpoint) model_index = client.get_model_repository_index(as_json=True) @@ -2148,14 +2184,23 @@ def get_yolox_model_name(yolox_grpc_endpoint, default_model_name="yolox"): "nemoretriever-page-elements-v2", ): if preferred in model_names: - return preferred + result = preferred + with lock: + global_cache[key] = result + return result # Otherwise pick a best-effort match for newer model names. candidates = [m for m in model_names if isinstance(m, str) and ("yolox" in m or "page-elements" in m)] if candidates: - return sorted(candidates)[0] - - return default_model_name + result = sorted(candidates)[0] + with lock: + global_cache[key] = result + return result + + result = default_model_name + with lock: + global_cache[key] = result + return result except Exception as e: logger.warning( "Failed to inspect YOLOX model repository at '%s' (%s). 
Falling back to '%s'.", diff --git a/api/src/nv_ingest_api/internal/transform/split_text.py b/api/src/nv_ingest_api/internal/transform/split_text.py index 9b88ec5ce..9d099ec7f 100644 --- a/api/src/nv_ingest_api/internal/transform/split_text.py +++ b/api/src/nv_ingest_api/internal/transform/split_text.py @@ -56,14 +56,8 @@ def _get_tokenizer( if cache_key in _tokenizer_cache: return _tokenizer_cache[cache_key] - from nemo_retriever.utils.hf_model_registry import get_hf_revision - logger.info("Loading and caching tokenizer: %s", tokenizer_identifier) - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_identifier, - revision=get_hf_revision(tokenizer_identifier), - token=token, - ) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_identifier, token=token) _tokenizer_cache[cache_key] = tokenizer return tokenizer diff --git a/ci/scripts/release_helm_chart.py b/ci/scripts/release_helm_chart.py index 3c8529017..7ca7f5eb9 100644 --- a/ci/scripts/release_helm_chart.py +++ b/ci/scripts/release_helm_chart.py @@ -10,11 +10,15 @@ -t nemo-llm -v 24.06 -n nv-ingest + +Requires: pip install ngcsdk pyyaml +Env vars: NGC_CLI_API_KEY (required for publish) """ import argparse import os import subprocess +import sys import yaml @@ -101,17 +105,13 @@ def main() -> None: shell=True, ) - # Update the version and chart name chart = yaml.safe_load(open(f"dist/{n}/Chart.yaml").read()) chart["name"] = n chart["version"] = v with open(f"dist/{n}/Chart.yaml", "w") as f: f.write(yaml.safe_dump(chart)) - # Update the README.md overview = f"dist/{n}/README.md" - - ngc = "ngc registry chart" logo = args.logo_url if args.logo_url else LOGO subprocess.check_call(f"helm package dist/{n}", shell=True) @@ -120,15 +120,33 @@ def main() -> None: print(f"[DRY RUN] Chart packaged successfully: {n}-{v}.tgz") print(f"[DRY RUN] Skipping NGC chart update and push for {o}/{t}/{n}:{v}") else: - subprocess.check_call( - f"{ngc} update {o}/{t}/{n} --overview-filename {overview} --short-desc '{d}'" - + f" --logo '{logo}' --display-name '{dn}' --publisher NVIDIA", - shell=True, + api_key = os.environ.get("NGC_CLI_API_KEY", "") + if not api_key: + print("ERROR: NGC_CLI_API_KEY environment variable is not set", file=sys.stderr) + sys.exit(1) + + from ngcsdk import Client + + clt = Client() + clt.configure(api_key=api_key, org_name=o, team_name=t) + + target = f"{o}/{t}/{n}" + print(f"Updating chart metadata for {target} ...") + clt.registry.chart.update( + target=target, + overview_filepath=overview, + short_description=d, + logo=logo, + display_name=dn, + publisher="NVIDIA", ) - subprocess.check_call( - f"{ngc} push --org {o} --team {t} {o}/{t}/{n}:{v}", - shell=True, + + print(f"Pushing chart {target}:{v} ...") + clt.registry.chart.push( + target=f"{target}:{v}", + source_dir=".", ) + print(f"Successfully pushed {target}:{v}") if __name__ == "__main__": diff --git a/ci/scripts/validate_deployment_configs.py b/ci/scripts/validate_deployment_configs.py index 1cb528949..14c8b3d43 100755 --- a/ci/scripts/validate_deployment_configs.py +++ b/ci/scripts/validate_deployment_configs.py @@ -49,9 +49,9 @@ def __str__(self) -> str: "page-elements": "page_elements", "graphic-elements": "graphic_elements", "table-structure": "table_structure", - "ocr": "nemoretriever_ocr_v1", + "ocr": "ocr", "embedding": "embedqa", - "reranker": "llama_3_2_nv_rerankqa_1b_v2", + "reranker": "rerankqa", "nemotron-parse": "nemotron_parse", "vlm": "nemotron_nano_12b_v2_vl", "audio": "audio", diff --git a/client/pyproject.toml b/client/pyproject.toml index 
09dde9966..7ffa4d8ce 100644 --- a/client/pyproject.toml +++ b/client/pyproject.toml @@ -13,7 +13,8 @@ readme = "README.md" authors = [ {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} ] -license = {file = "LICENSE"} +license = "Apache-2.0" +license-files = ["LICENSE"] requires-python = ">=3.11" classifiers = [ "Programming Language :: Python :: 3.11", diff --git a/docker-compose.a100-40gb.yaml b/docker-compose.a100-40gb.yaml index a717d7a3a..cbe16ebff 100644 --- a/docker-compose.a100-40gb.yaml +++ b/docker-compose.a100-40gb.yaml @@ -6,14 +6,17 @@ services: page-elements: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 graphic-elements: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 table-structure: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 ocr: environment: diff --git a/docker-compose.l40s.yaml b/docker-compose.l40s.yaml index 8f8414e5a..55da32ca1 100644 --- a/docker-compose.l40s.yaml +++ b/docker-compose.l40s.yaml @@ -6,14 +6,17 @@ services: page-elements: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 graphic-elements: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 table-structure: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 ocr: environment: diff --git a/docker-compose.rtx-pro-4500.yaml b/docker-compose.rtx-pro-4500.yaml index a717d7a3a..cbe16ebff 100644 --- a/docker-compose.rtx-pro-4500.yaml +++ b/docker-compose.rtx-pro-4500.yaml @@ -6,14 +6,17 @@ services: page-elements: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 graphic-elements: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 table-structure: environment: - NIM_TRITON_MAX_BATCH_SIZE=1 + - NIM_TRITON_DATA_MAX_BATCH_SIZE=1 ocr: environment: diff --git a/docker/scripts/post_build_triggers.py b/docker/scripts/post_build_triggers.py index 1488e6339..8eb26f301 100644 --- a/docker/scripts/post_build_triggers.py +++ b/docker/scripts/post_build_triggers.py @@ -4,30 +4,6 @@ from transformers import AutoTokenizer -try: - from nemo_retriever.utils.hf_model_registry import get_hf_revision -except ModuleNotFoundError: - # Fallback for Docker build stages where nemo_retriever isn't installed yet. - _REVISIONS = { - "meta-llama/Llama-3.2-1B": "4e20de362430cd3b72f300e6b0f18e50e7166e08", - "intfloat/e5-large-unsupervised": "15af9288f69a6291f37bfb89b47e71abc747b206", - } - - def get_hf_revision(model_id, *, strict=True): # type: ignore[misc] - revision = _REVISIONS.get(model_id) - if revision is not None: - return revision - msg = ( - f"No pinned HuggingFace revision for model '{model_id}'. " - "Add an entry to _REVISIONS in post_build_triggers.py (and " - "HF_MODEL_REVISIONS in hf_model_registry.py) to pin it." 
- ) - if strict: - raise ValueError(msg) - print(f"WARNING: {msg} Falling back to the default (main) branch.") - return None - - MAX_RETRIES = 5 @@ -36,7 +12,7 @@ def download_tokenizer(model_name, save_path, token=None): for attempt in range(MAX_RETRIES): try: - tokenizer = AutoTokenizer.from_pretrained(model_name, revision=get_hf_revision(model_name), token=token) + tokenizer = AutoTokenizer.from_pretrained(model_name, token=token) tokenizer.save_pretrained(save_path) return except Exception as e: diff --git a/docs/docs/extraction/audio.md b/docs/docs/extraction/audio.md index 4be7ee8ac..192c4ecf4 100644 --- a/docs/docs/extraction/audio.md +++ b/docs/docs/extraction/audio.md @@ -27,7 +27,7 @@ to transcribe speech to text, which is then embedded by using the Nemotron embed !!! important - Due to limitations in available VRAM controls in the current release, the RIVA ASR NIM microservice must run on a [dedicated additional GPU](support-matrix.md). For the full list of requirements, refer to [Support Matrix](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/support-matrix.html). + Due to limitations in available VRAM controls in the current release, the RIVA ASR NIM microservice must run on a [dedicated additional GPU](support-matrix.md). For the full list of requirements, refer to [Support Matrix](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/support-matrix/support-matrix.html). This pipeline enables users to retrieve speech files at the segment level. diff --git a/docs/docs/extraction/benchmarking.md b/docs/docs/extraction/benchmarking.md index 62abbf302..54e6eb171 100644 --- a/docs/docs/extraction/benchmarking.md +++ b/docs/docs/extraction/benchmarking.md @@ -35,20 +35,20 @@ Before you use this documentation, you need the following: ### Run Your First Test ```bash -# 1. Navigate to the nemo-retriever-bench directory +# 1. Navigate to the harness directory cd tools/harness # 2. Install dependencies uv sync # 3. 
Run with a pre-configured dataset (assumes services are running) -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # Or use a custom path that uses the "active" configuration -uv run nemo-retriever-bench --case=e2e --dataset=/path/to/your/data +uv run nv-ingest-harness-run --case=e2e --dataset=/path/to/your/data # With managed infrastructure (starts/stops services) -uv run nemo-retriever-bench --case=e2e --dataset=bo767 --managed +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed ``` ## Configuration System @@ -144,13 +144,13 @@ datasets: **Usage:** ```bash # Single dataset - configs applied automatically -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # Multiple datasets (sweeping) - each gets its own config -uv run nemo-retriever-bench --case=e2e --dataset=bo767,earnings,bo20 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767,earnings,bo20 # Custom path still works (uses active section config) -uv run nemo-retriever-bench --case=e2e --dataset=/custom/path +uv run nv-ingest-harness-run --case=e2e --dataset=/custom/path ``` **Dataset Extraction Settings:** @@ -176,7 +176,7 @@ Example: # YAML active section has api_version: v1 # Dataset bo767 has extract_images: false # Override via environment variable (highest priority) -EXTRACT_IMAGES=true API_VERSION=v2 uv run nemo-retriever-bench --case=e2e --dataset=bo767 +EXTRACT_IMAGES=true API_VERSION=v2 uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # Result: Uses bo767 path, but extract_images=true (env override) and api_version=v2 (env override) ``` @@ -240,13 +240,13 @@ Configuration is validated on load with helpful error messages. ```bash # Run with default YAML configuration (assumes services are running) -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # With document-level analysis -uv run nemo-retriever-bench --case=e2e --dataset=bo767 --doc-analysis +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --doc-analysis # With managed infrastructure (starts/stops services) -uv run nemo-retriever-bench --case=e2e --dataset=bo767 --managed +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed ``` ### Dataset Sweeping @@ -255,7 +255,7 @@ Run multiple datasets in a single command - each dataset automatically gets its ```bash # Sweep multiple datasets -uv run nemo-retriever-bench --case=e2e --dataset=bo767,earnings,bo20 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767,earnings,bo20 # Each dataset runs sequentially with its own: # - Extraction settings (from dataset config) @@ -263,13 +263,13 @@ uv run nemo-retriever-bench --case=e2e --dataset=bo767,earnings,bo20 # - Results summary at the end # With managed infrastructure (services start once, shared across all datasets) -uv run nemo-retriever-bench --case=e2e --dataset=bo767,earnings,bo20 --managed +uv run nv-ingest-harness-run --case=e2e --dataset=bo767,earnings,bo20 --managed # E2E+Recall sweep (each dataset ingests then evaluates recall) -uv run nemo-retriever-bench --case=e2e_recall --dataset=bo767,earnings +uv run nv-ingest-harness-run --case=e2e_recall --dataset=bo767,earnings # Recall-only sweep (evaluates existing collections) -uv run nemo-retriever-bench --case=recall --dataset=bo767,earnings +uv run nv-ingest-harness-run --case=recall --dataset=bo767,earnings ``` **Sweep Behavior:** @@ -283,10 +283,10 @@ uv run nemo-retriever-bench 
--case=recall --dataset=bo767,earnings ```bash # Override via environment (useful for CI/CD) -API_VERSION=v2 EXTRACT_TABLES=false uv run nemo-retriever-bench --case=e2e +API_VERSION=v2 EXTRACT_TABLES=false uv run nv-ingest-harness-run --case=e2e # Temporary changes without editing YAML -DATASET_DIR=/custom/path uv run nemo-retriever-bench --case=e2e +DATASET_DIR=/custom/path uv run nv-ingest-harness-run --case=e2e ``` ## Test Scenarios @@ -472,23 +472,23 @@ recall: ```bash # Evaluate existing bo767 collections (no reranker) # recall_dataset automatically set from dataset config -uv run nemo-retriever-bench --case=recall --dataset=bo767 +uv run nv-ingest-harness-run --case=recall --dataset=bo767 # With reranker only (set reranker_mode in YAML recall section) -uv run nemo-retriever-bench --case=recall --dataset=bo767 +uv run nv-ingest-harness-run --case=recall --dataset=bo767 # Sweep multiple datasets for recall evaluation -uv run nemo-retriever-bench --case=recall --dataset=bo767,earnings +uv run nv-ingest-harness-run --case=recall --dataset=bo767,earnings ``` **E2E + Recall (fresh ingestion):** ```bash # Fresh ingestion with recall evaluation # recall_dataset automatically set from dataset config -uv run nemo-retriever-bench --case=e2e_recall --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e_recall --dataset=bo767 # Sweep multiple datasets (each ingests then evaluates) -uv run nemo-retriever-bench --case=e2e_recall --dataset=bo767,earnings +uv run nv-ingest-harness-run --case=e2e_recall --dataset=bo767,earnings ``` **Dataset configuration:** @@ -536,7 +536,7 @@ The easiest way to test multiple datasets is using dataset sweeping: ```bash # Test multiple datasets - each gets its native config automatically -uv run nemo-retriever-bench --case=e2e --dataset=bo767,earnings,bo20 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767,earnings,bo20 # Each dataset runs with its pre-configured extraction settings # Results are organized in separate artifact directories @@ -547,7 +547,7 @@ uv run nemo-retriever-bench --case=e2e --dataset=bo767,earnings,bo20 To sweep through different parameter values: 1. **Edit** `test_configs.yaml` - Update values in the `active` section -2. **Run** the test: `uv run nemo-retriever-bench --case=e2e --dataset=` +2. **Run** the test: `uv run nv-ingest-harness-run --case=e2e --dataset=` 3. **Analyze** results in `artifacts/_/` 4. **Repeat** steps 1-3 for next parameter combination @@ -555,18 +555,18 @@ Example parameter sweep workflow: ```bash # Test 1: Baseline V1 vim test_configs.yaml # Set: api_version=v1, extract_tables=true -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # Test 2: V2 with 32-page splitting vim test_configs.yaml # Set: api_version=v2, pdf_split_page_count=32 -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # Test 3: V2 with 8-page splitting vim test_configs.yaml # Set: pdf_split_page_count=8 -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 # Test 4: Tables disabled (override via env var) -EXTRACT_TABLES=false uv run nemo-retriever-bench --case=e2e --dataset=bo767 +EXTRACT_TABLES=false uv run nv-ingest-harness-run --case=e2e --dataset=bo767 ``` **Note**: Each test run creates a new timestamped artifact directory, so you can compare results across sweeps. 
@@ -576,7 +576,7 @@ EXTRACT_TABLES=false uv run nemo-retriever-bench --case=e2e --dataset=bo767 ### Attach Mode (Default) ```bash -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 ``` - **Default behavior**: Assumes services are already running @@ -588,7 +588,7 @@ uv run nemo-retriever-bench --case=e2e --dataset=bo767 ### Managed Mode ```bash -uv run nemo-retriever-bench --case=e2e --dataset=bo767 --managed +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed ``` - Starts Docker services automatically @@ -600,10 +600,10 @@ uv run nemo-retriever-bench --case=e2e --dataset=bo767 --managed **Managed mode options:** ```bash # Skip Docker image rebuild (faster startup) -uv run nemo-retriever-bench --case=e2e --dataset=bo767 --managed --no-build +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed --no-build # Keep services running after test (useful for multi-test scenarios) -uv run nemo-retriever-bench --case=e2e --dataset=bo767 --managed --keep-up +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed --keep-up ``` ## Artifacts and Logging @@ -631,7 +631,7 @@ tools/harness/artifacts/__UTC/ Enable per-document element breakdown: ```bash -uv run nemo-retriever-bench --case=e2e --doc-analysis +uv run nv-ingest-harness-run --case=e2e --doc-analysis ``` **Sample Output:** @@ -812,7 +812,7 @@ The framework is dataset-agnostic and supports multiple approaches: **Option 1: Use pre-configured dataset (Recommended)** ```bash # Dataset configs automatically applied -uv run nemo-retriever-bench --case=e2e --dataset=bo767 +uv run nv-ingest-harness-run --case=e2e --dataset=bo767 ``` **Option 2: Add new dataset to YAML** @@ -827,17 +827,17 @@ datasets: extract_infographics: false recall_dataset: null # or set to evaluator name if applicable ``` -Then use: `uv run nemo-retriever-bench --case=e2e --dataset=my_dataset` +Then use: `uv run nv-ingest-harness-run --case=e2e --dataset=my_dataset` **Option 3: Use custom path (uses active section config)** ```bash -uv run nemo-retriever-bench --case=e2e --dataset=/path/to/your/dataset +uv run nv-ingest-harness-run --case=e2e --dataset=/path/to/your/dataset ``` **Option 4: Environment variable override** ```bash # Override specific settings via env vars -EXTRACT_IMAGES=true uv run nemo-retriever-bench --case=e2e --dataset=bo767 +EXTRACT_IMAGES=true uv run nv-ingest-harness-run --case=e2e --dataset=bo767 ``` **Best Practice**: For repeated testing, add your dataset to the `datasets` section with its native extraction settings. This ensures consistent configuration and enables dataset sweeping. diff --git a/docs/docs/extraction/cli-reference.md b/docs/docs/extraction/cli-reference.md index 5bee4f046..6ce311402 100644 --- a/docs/docs/extraction/cli-reference.md +++ b/docs/docs/extraction/cli-reference.md @@ -203,7 +203,7 @@ nemo-retriever \ To submit a .pdf file with both a splitting task and an extraction task, run the following code. !!! note - Currently, `split` only works for pdfium, nemotron-parse, and Unstructured.io. + Currently, `split` only works for pdfium and nemotron-parse. ```bash nemo-retriever \ diff --git a/docs/docs/extraction/content-metadata.md b/docs/docs/extraction/content-metadata.md index ae384b6fc..5cd194f9e 100644 --- a/docs/docs/extraction/content-metadata.md +++ b/docs/docs/extraction/content-metadata.md @@ -164,7 +164,7 @@ Describes the structural location of content within a document. 
| `span` | `int` | `-1` | Span identifier within a line, for finer granularity. | | `nearby_objects` | `NearbyObjectsSchema` | `NearbyObjectsSchema()` | Information about objects (text, images, structured data) near the current content. See [NearbyObjectsSchema](#nearbyobjectsschema). | -### `NearbyObjectsSchema` (Currently Unused) +### `NearbyObjectsSchema` (Currently Unused) {#nearbyobjectsschema} Container for different types of nearby objects. | Field | Type | Default Value | Description | @@ -243,7 +243,7 @@ Specific metadata for audio content. | `audio_transcript` | `str` | `""` | Transcript of the audio content. | | `audio_type` | `str` | `""` | Type or format of the audio (e.g., `mp3`, `wav`). | -### `ErrorMetadataSchema` (Currently Unused) +### `ErrorMetadataSchema` (Currently Unused) {#errormetadataschema} Metadata describing errors encountered during processing. | Field | Type | Default Value | Description | @@ -253,7 +253,7 @@ Metadata describing errors encountered during processing. | `source_id` | `str` | `""` | Identifier of the source item that caused the error, if applicable. | | `error_msg` | `str` | *Required* | The error message. | -### `InfoMessageMetadataSchema` (Currently Unused) +### `InfoMessageMetadataSchema` (Currently Unused) {#infomessagemetadataschema} Informational messages related to processing. | Field | Type | Default Value | Description | diff --git a/docs/docs/extraction/custom-metadata.md b/docs/docs/extraction/custom-metadata.md index 1ac644243..613443e93 100644 --- a/docs/docs/extraction/custom-metadata.md +++ b/docs/docs/extraction/custom-metadata.md @@ -60,7 +60,7 @@ For more information about the `Ingestor` class, see [Use the NeMo Retriever Lib For more information about the `vdb_upload` method, see [Upload Data](data-store.md). ```python -from nemo_retriever.client import Ingestor +from nv_ingest_client.client.interface import Ingestor hostname="localhost" collection_name = "nemo_retriever_collection" @@ -142,7 +142,7 @@ you can use the `content_metadata` field to filter search results. The following example uses a filter expression to narrow results by department. 
```python -from nemo_retriever.util.milvus import query +from nv_ingest_client.util.vdb.milvus import nvingest_retrieval hostname="localhost" collection_name = "nemo_retriever_collection" @@ -156,15 +156,15 @@ queries = ["this is expensive"] q_results = [] for que in queries: q_results.append( - query( - [que], - collection_name, - milvus_uri=f"http://{hostname}:19530", - embedding_endpoint=f"http://{hostname}:8012/v1", - hybrid=sparse, - top_k=top_k, - model_name=model_name, - gpu_search=False, + nvingest_retrieval( + [que], + collection_name=collection_name, + milvus_uri=f"http://{hostname}:19530", + embedding_endpoint=f"http://{hostname}:8012/v1", + hybrid=sparse, + top_k=top_k, + model_name=model_name, + gpu_search=False, _filter=filter_expr ) ) diff --git a/docs/docs/extraction/faq.md b/docs/docs/extraction/faq.md index d7eabd490..a83d2c20c 100644 --- a/docs/docs/extraction/faq.md +++ b/docs/docs/extraction/faq.md @@ -76,12 +76,11 @@ For more information, refer to [Extract Specific Elements from PDFs](python-api- ```python Ingestor(client=client) .files("data/multimodal_test.pdf") - .extract( + .extract( extract_text=True, extract_tables=True, extract_charts=True, extract_images=True, - paddle_output_format="markdown", extract_infographics=True, text_depth="page" ) diff --git a/docs/docs/extraction/nimclient.md b/docs/docs/extraction/nimclient.md index cc1c402f2..9d4a5fe42 100644 --- a/docs/docs/extraction/nimclient.md +++ b/docs/docs/extraction/nimclient.md @@ -12,7 +12,7 @@ The NimClient architecture consists of two main components: 1. **NimClient**: The client class that handles communication with NIM endpoints via gRPC or HTTP protocols 2. **ModelInterface**: An abstract base class that defines how to format input data, parse output responses, and process inference results for specific models -For advanced usage patterns, see the existing model interfaces in `api/src/nemo_retriever/internal/primitives/nim/model_interface/`. +For advanced usage patterns, see the existing model interfaces in `api/src/nv_ingest_api/internal/primitives/nim/model_interface/`. 
## Quick Start @@ -20,8 +20,8 @@ For advanced usage patterns, see the existing model interfaces in `api/src/nemo_ ### Basic NimClient Creation ```python -from nemo_retriever.util.nim import create_inference_client -from nemo_retriever.internal.primitives.nim import ModelInterface +from nv_ingest_api.util.nim import create_inference_client +from nv_ingest_api.internal.primitives.nim import ModelInterface # Create a custom model interface (see examples below) model_interface = MyCustomModelInterface() @@ -48,7 +48,7 @@ results = client.infer(data, model_name="your-model-name") ```python import os -from nemo_retriever.util.nim import create_inference_client +from nv_ingest_api.util.nim import create_inference_client # Use environment variables for configuration auth_token = os.getenv("NGC_API_KEY") @@ -71,7 +71,7 @@ To integrate a new NIM, you need to create a custom `ModelInterface` subclass th ```python from typing import Dict, Any, List, Tuple, Optional import numpy as np -from nemo_retriever.internal.primitives.nim import ModelInterface +from nv_ingest_api.internal.primitives.nim import ModelInterface class MyCustomModelInterface(ModelInterface): """ @@ -305,7 +305,7 @@ class TextGenerationModelInterface(ModelInterface): ```python import base64 -from nemo_retriever.util.image_processing.transforms import numpy_to_base64 +from nv_ingest_api.util.image_processing.transforms import numpy_to_base64 class ImageAnalysisModelInterface(ModelInterface): """Interface for image analysis NIMs (e.g., vision models).""" @@ -382,8 +382,8 @@ class ImageAnalysisModelInterface(ModelInterface): ### Basic UDF with NimClient ```python -from nemo_retriever.internal.primitives.control_message import IngestControlMessage -from nemo_retriever.util.nim import create_inference_client +from nv_ingest_api.internal.primitives.control_message import IngestControlMessage +from nv_ingest_api.util.nim import create_inference_client import os def analyze_document_with_nim(control_message: IngestControlMessage) -> IngestControlMessage: @@ -570,7 +570,7 @@ If memory issues persist, you can reduce the `NIM_TRITON_RATE_LIMIT` value — e import logging # Enable debug logging -logging.getLogger("nemo_retriever.internal.primitives.nim").setLevel(logging.DEBUG) +logging.getLogger("nv_ingest_api.internal.primitives.nim").setLevel(logging.DEBUG) # Test your model interface separately model_interface = MyCustomModelInterface() diff --git a/docs/docs/extraction/overview.md b/docs/docs/extraction/overview.md index 263204ddc..3c2d390d9 100644 --- a/docs/docs/extraction/overview.md +++ b/docs/docs/extraction/overview.md @@ -28,7 +28,7 @@ NeMo Retriever Library is a microservice service that does the following: - Accept a JSON job description, containing a document payload, and a set of ingestion tasks to perform on that payload. - Allow the results of a job to be retrieved. The result is a JSON dictionary that contains a list of metadata describing objects extracted from the base document, and processing annotations and timing/trace data. -- Support multiple methods of extraction for each document type to balance trade-offs between throughput and accuracy. For example, for .pdf documents, extraction is performed by using pdfium, [nemotron-parse](https://build.nvidia.com/nvidia/nemotron-parse), Unstructured.io, and Adobe Content Extraction Services. +- Support multiple methods of extraction for each document type to balance trade-offs between throughput and accuracy. 
For example, for .pdf documents, extraction is performed by using pdfium and [nemotron-parse](https://build.nvidia.com/nvidia/nemotron-parse). - Support various types of pre- and post- processing operations, including text splitting and chunking, transform and filtering, embedding generation, and image offloading to storage. NeMo Retriever Library supports the following file types: diff --git a/docs/docs/extraction/prerequisites.md b/docs/docs/extraction/prerequisites.md index 902c499c8..c96af36fa 100644 --- a/docs/docs/extraction/prerequisites.md +++ b/docs/docs/extraction/prerequisites.md @@ -11,6 +11,7 @@ Before you begin using [NeMo Retriever Library](overview.md), ensure the followi ## Software Requirements - Linux operating systems (Ubuntu 22.04 or later recommended) +- **Python 3.12 or later** (required for NeMo Retriever Library packages; see note below) - [Docker](https://docs.docker.com/engine/install/) - [Docker Compose](https://docs.docker.com/compose/install/) - [Docker Buildx](https://docs.docker.com/build/concepts/overview/#buildx) `>= 0.17` (Compose 2.40+ enforces this) @@ -21,7 +22,7 @@ Before you begin using [NeMo Retriever Library](overview.md), ensure the followi !!! note - You install Python later. + Install **Python 3.12 or later** before creating your environment. Using Python 3.10 or 3.11 will cause dependency resolution failures when installing NeMo Retriever Library packages. diff --git a/docs/docs/extraction/python-api-reference.md b/docs/docs/extraction/python-api-reference.md index b9d914649..e5908d028 100644 --- a/docs/docs/extraction/python-api-reference.md +++ b/docs/docs/extraction/python-api-reference.md @@ -80,7 +80,7 @@ The caption task can call a vision-language model (VLM) with the following optio Example: ```python -from nemo_retriever.client.interface import Ingestor +from nv_ingest_client.client.interface import Ingestor ingestor = ( Ingestor() @@ -224,7 +224,7 @@ The `extract` method enables different types of data to be extracted. Use the following code to extract a single PDF file. ```python -from nemo_retriever.client.interface import Ingestor +from nv_ingest_client.client.interface import Ingestor # Initialize Ingestor with a local PDF file ingestor = Ingestor().files("path/to/document.pdf") @@ -527,7 +527,7 @@ The caption task can call a VLM with optional prompt and system prompt overrides Example: ```python -from nemo_retriever.client.interface import Ingestor +from nv_ingest_client.client.interface import Ingestor ingestor = ( Ingestor() @@ -662,7 +662,7 @@ For more information on environment variables, refer to [Environment Variables]( Use the following code to extract mp3 audio content. ```python -from nemo_retriever.client import Ingestor +from nv_ingest_client.client.interface import Ingestor ingestor = Ingestor().files("audio_file.mp3") diff --git a/docs/docs/extraction/quickstart-guide.md b/docs/docs/extraction/quickstart-guide.md index 97bcfb578..415f123a6 100644 --- a/docs/docs/extraction/quickstart-guide.md +++ b/docs/docs/extraction/quickstart-guide.md @@ -82,6 +82,12 @@ h. Run the command `docker ps`. You should see output similar to the following. ``` CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + ... + ``` + +To run the NeMo Retriever Library Python client from your host machine, **Python 3.12 or later is required**. 
Create a virtual environment and install the client packages: + +```shell uv venv --python 3.12 nv-ingest-dev source nv-ingest-dev/bin/activate uv pip install nv-ingest==26.1.2 nv-ingest-api==26.1.2 nv-ingest-client==26.1.2 @@ -89,7 +95,7 @@ uv pip install nv-ingest==26.1.2 nv-ingest-api==26.1.2 nv-ingest-client==26.1.2 !!! tip - To confirm that you have activated your Conda environment, run `which pip` and `which python`, and confirm that you see `nemo_retriever` in the result. You can do this before any pip or python command that you run. + To confirm that you have activated your virtual environment, run `which pip` and `which python`, and confirm that you see `nemo_retriever` or your venv path in the result. You can do this before any pip or python command that you run. !!! note @@ -131,9 +137,10 @@ The following examples demonstrate how to extract text, charts, tables, and imag ```python import logging, os, time -from nemo_retriever.client import Ingestor, NemoRetrieverClient -from nemo_retriever.util.process_json_files import ingest_json_results_to_blob -client = NemoRetrieverClient( +from nv_ingest_client.client.interface import Ingestor +from nv_ingest_client.client import NvIngestClient +from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob +client = NvIngestClient( message_client_port=7670, message_client_hostname="localhost" ) @@ -459,7 +466,7 @@ docker compose \ ## Specify MIG slices for NIM models -When you deploy NeMo Retriever Library with NIM models on MIG‑enabled GPUs, MIG device slices are requested and scheduled through the `values.yaml` file for the corresponding NIM microservice. For IBM Content-Aware Storage (CAS) deployments, this allows NeMo Retriever Library NIM pods to land only on nodes that expose the desired MIG profiles [raw.githubusercontent](https://raw.githubusercontent.com/NVIDIA/NeMo-Retriever/main/helm/README.md%E2%80%8B).​ +When you deploy NeMo Retriever Library with NIM models on MIG‑enabled GPUs, MIG device slices are requested and scheduled through the `values.yaml` file for the corresponding NIM microservice. For IBM Content-Aware Storage (CAS) deployments, this allows NeMo Retriever Library NIM pods to land only on nodes that expose the desired MIG profiles [raw.githubusercontent](https://raw.githubusercontent.com/NVIDIA/NeMo-Retriever/main/helm/README.md).​ To target a specific MIG profile—for example, a 3g.20gb slice on an A100, which is a hardware-partitioned virtual GPU instance that gives your workload a fixed mid-sized share of the A100’s compute plus 20 GB of dedicated GPU memory and behaves like a smaller independent GPU—for a given NIM, configure the `resources` and `nodeSelector` under that NIM’s values path in `values.yaml`. diff --git a/docs/docs/extraction/quickstart-library-mode.md b/docs/docs/extraction/quickstart-library-mode.md index e65b4fac1..5f7a0f51d 100644 --- a/docs/docs/extraction/quickstart-library-mode.md +++ b/docs/docs/extraction/quickstart-library-mode.md @@ -81,10 +81,11 @@ On a 4 CPU core low end laptop, the following code should take about 10 seconds. 
```python import time -from nemo_retriever.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline -from nemo_retriever.client import Ingestor, NemoRetrieverClient -from nemo_retriever.util.message_brokers.simple_message_broker import SimpleClient -from nemo_retriever.util.process_json_files import ingest_json_results_to_blob +from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline +from nv_ingest_client.client.interface import Ingestor +from nv_ingest_client.client import NvIngestClient +from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient +from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob def main(): # Start the pipeline subprocess for library mode @@ -190,7 +191,7 @@ To query for relevant snippets of the ingested content, and use them with an LLM ```python import os from openai import OpenAI -from nemo_retriever.util.milvus import query +from nv_ingest_client.util.vdb.milvus import nvingest_retrieval milvus_uri = "milvus.db" collection_name = "test" @@ -198,16 +199,16 @@ sparse=False queries = ["Which animal is responsible for the typos?"] -retrieved_docs = query( +retrieved_docs = nvingest_retrieval( queries, - collection_name, + collection_name=collection_name, milvus_uri=milvus_uri, hybrid=sparse, top_k=1, ) # simple generation example -extract = retrieved_docs[0][0]["entity"]["text"] +extract = retrieved_docs[0][0].get("entity", retrieved_docs[0][0]).get("text", "") client = OpenAI( base_url = "https://integrate.api.nvidia.com/v1", api_key = os.environ["NVIDIA_API_KEY"] @@ -307,8 +308,8 @@ It listens for ingestion requests on port `7671` from an external client. import logging import os -from nemo_retriever.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline -from nemo_retriever.util.logging.configuration import configure_logging as configure_local_logging +from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline +from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging # Configure the logger logger = logging.getLogger(__name__) @@ -353,11 +354,11 @@ import logging import os import time -from nemo_retriever.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline -from nemo_retriever.util.logging.configuration import configure_logging as configure_local_logging -from nemo_retriever.util.message_brokers.simple_message_broker import SimpleClient -from nemo_retriever.client import Ingestor -from nemo_retriever.client import NemoRetrieverClient +from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline +from nv_ingest_api.util.logging.configuration import configure_logging as configure_local_logging +from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient +from nv_ingest_client.client.interface import Ingestor +from nv_ingest_client.client import NvIngestClient # Configure the logger logger = logging.getLogger(__name__) diff --git a/docs/docs/extraction/support-matrix.md b/docs/docs/extraction/support-matrix.md index eec709b8c..7873e014c 100644 --- a/docs/docs/extraction/support-matrix.md +++ b/docs/docs/extraction/support-matrix.md @@ -7,12 +7,17 @@ Before you begin using [NeMo Retriever Library](overview.md), ensure that you ha NVIDIA Ingest (nv-ingest) has been renamed to the NeMo Retriever Library. +## Software Requirements + +- **Python**: 3.12 or later. 
The NeMo Retriever Library core and harness require Python 3.12+; the client supports Python 3.11+. Using Python 3.10 or earlier will cause dependency resolution failures. For details, see [Prerequisites](prerequisites.md). + + ## Core and Advanced Pipeline Features The NeMo Retriever Library core pipeline features run on a single A10G or better GPU. The core pipeline features include the following: -- llama3.2-nv-embedqa-1b-v2 — Embedding model for converting text chunks into vectors. +- llama-nemotron-embed-1b-v2 — Embedding model for converting text chunks into vectors. - nemotron-page-elements-v3 — Detects and classifies images on a page as a table, chart or infographic. - nemotron-table-structure-v1 — Detects rows, columns, and cells within a table to preserve table structure and convert to Markdown format. - nemotron-graphic-elements-v1 — Detects graphic elements within chart images such as titles, legends, axes, and numerical values. @@ -39,6 +44,7 @@ This includes the following: NeMo Retriever Library supports the following GPU hardware. - [RTX Pro 6000 Blackwell Server Edition](https://www.nvidia.com/en-us/data-center/rtx-pro-6000-blackwell-server-edition/) +- [RTX PRO 4500 Blackwell](https://www.nvidia.com/en-us/products/workstations/professional-desktop-gpus/rtx-pro-4500/) - [DGX B200](https://www.nvidia.com/en-us/data-center/dgx-b200/) - [H200 NVL](https://www.nvidia.com/en-us/data-center/h200/) - [H100 Tensor Core GPU](https://www.nvidia.com/en-us/data-center/h100/) @@ -49,24 +55,30 @@ NeMo Retriever Library supports the following GPU hardware. The following are the hardware requirements to run NeMo Retriever Library. -|Feature | GPU Option | RTX Pro 6000 | B200 | H200 NVL | H100 | A100 80GB | A100 40GB | A10G | L40S | -|----------------|---------------------------|---------------|---------------|---------------|-------------|-------------|---------------|---------------|--------| -| GPU | Memory | 96GB | 180GB | 141GB | 80GB | 80GB | 40GB | 24GB | 48GB | -| Core Features | Total GPUs | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -| Core Features | Total Disk Space | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | -| Audio | Additional Dedicated GPUs | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | -| Audio | Additional Disk Space | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | -| nemotron-parse | Additional Dedicated GPUs | Not supported | Not supported | Not supported | 1 | 1 | 1 | 1 | 1 | -| nemotron-parse | Additional Disk Space | Not supported | Not supported | Not supported | ~16GB | ~16GB | ~16GB | ~16GB | ~16GB | -| VLM | Additional Dedicated GPUs | 1 | 1 | 1 | 1 | 1 | Not supported | Not supported | 1 | -| VLM | Additional Disk Space | ~16GB | ~16GB | ~16GB | ~16GB | ~16GB | Not supported | Not supported | ~16GB | -| Reranker | With Core Pipeline | Yes | Yes | Yes | Yes | Yes | No* | No* | No* | -| Reranker | Standalone (recall only) | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | +|Feature | GPU Option | RTX Pro 6000 | RTX PRO 4500 | B200 | H200 NVL | H100 | A100 80GB | A100 40GB | A10G | L40S | +|----------------|---------------------------|---------------|---------------|---------------|---------------|-------------|-------------|---------------|---------------|--------| +| GPU | Memory | 96GB | 32GB | 180GB | 141GB | 80GB | 80GB | 40GB | 24GB | 48GB | +| Core Features | Total GPUs | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| Core Features | Total Disk Space | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | ~150GB | +| Audio | 
Additional Dedicated GPUs | 1 | 1† | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +| Audio | Additional Disk Space | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | ~37GB | +| nemotron-parse | Additional Dedicated GPUs | Not supported | Not supported‡| Not supported | Not supported | 1 | 1 | 1 | 1 | 1 | +| nemotron-parse | Additional Disk Space | Not supported | Not supported | Not supported | Not supported | ~16GB | ~16GB | ~16GB | ~16GB | ~16GB | +| VLM | Additional Dedicated GPUs | 1 | Not supported§| 1 | 1 | 1 | 1 | Not supported | Not supported | 1 | +| VLM | Additional Disk Space | ~16GB | Not supported | ~16GB | ~16GB | ~16GB | ~16GB | Not supported | Not supported | ~16GB | +| Reranker | With Core Pipeline | Yes | No* | Yes | Yes | Yes | Yes | No* | No* | No* | +| Reranker | Standalone (recall only) | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | \* GPUs with less than 80GB VRAM cannot run the reranker concurrently with the core pipeline. To perform recall testing with the reranker on these GPUs, shut down the core pipeline NIM microservices and run only the embedder, reranker, and your vector database. +† Audio (Parakeet) runs but requires a runtime engine build — no pre-defined model profile for this GPU. Dev team to confirm official support status. + +‡ Nemotron Parse fails to start on 32GB despite being supported on A10G (24GB). Pending engineering investigation — may be Blackwell architecture compatibility issue (see related bug). + +§ VLM (nemotron-nano-12b-v2-vl) fails to load on 32GB, consistent with "Not supported" on A100-40GB (40GB). 32GB is below the threshold. + ## Related Topics diff --git a/docs/docs/extraction/user-defined-functions.md b/docs/docs/extraction/user-defined-functions.md index d5f2b72c8..d7a48a02c 100644 --- a/docs/docs/extraction/user-defined-functions.md +++ b/docs/docs/extraction/user-defined-functions.md @@ -16,7 +16,7 @@ This guide covers how to write, validate, and submit UDFs using both the CLI and Create a Python function that accepts an `IngestControlMessage` and returns a modified `IngestControlMessage`: ```python -from nemo_retriever.internal.primitives.ingest_control_message import IngestControlMessage +from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage def my_custom_processor(control_message: IngestControlMessage) -> IngestControlMessage: """Add custom metadata to all documents.""" @@ -77,7 +77,7 @@ nemo-retriever \ ### 3. Submit via Python Client ```python -from nemo_retriever.client.interface import Ingestor +from nv_ingest_client.client.interface import Ingestor # Create an Ingestor instance with default client ingestor = Ingestor() @@ -305,7 +305,7 @@ UDFs can be executed at different stages of the pipeline by specifying the `targ - `broker_response` - Response message handling - `otel_tracer` - OpenTelemetry tracing -> **Note:** For the complete and up-to-date list of pipeline stages, see the [default_pipeline.yaml](../../../config/default_pipeline.yaml) configuration file. +> **Note:** For the complete and up-to-date list of pipeline stages, see the [default_pipeline.yaml](https://github.com/NVIDIA/nv-ingest/blob/main/config/default_pipeline.yaml) configuration file. 
#### Target Stage Selection Examples @@ -461,9 +461,9 @@ NVIDIA Inference Microservices (NIMs) provide powerful AI capabilities that can ### Quick NIM Integration ```python -from nemo_retriever.internal.primitives.control_message import IngestControlMessage -from nemo_retriever.util.nim import create_inference_client -from nemo_retriever.internal.primitives.nim.model_interface.vlm import VLMModelInterface +from nv_ingest_api.internal.primitives.control_message import IngestControlMessage +from nv_ingest_api.util.nim import create_inference_client +from nv_ingest_api.internal.primitives.nim.model_interface.vlm import VLMModelInterface import os def document_analysis_with_nim(control_message: IngestControlMessage) -> IngestControlMessage: @@ -873,7 +873,7 @@ Test your UDF functions in isolation before deploying them to the pipeline: ```python import pandas as pd -from nemo_retriever.internal.primitives.ingest_control_message import IngestControlMessage +from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage def test_my_udf(): # Create test data diff --git a/docs/docs/extraction/user-defined-stages.md b/docs/docs/extraction/user-defined-stages.md index a20e17673..54dd8edb8 100644 --- a/docs/docs/extraction/user-defined-stages.md +++ b/docs/docs/extraction/user-defined-stages.md @@ -44,8 +44,8 @@ The following example demonstrates how to create a valid Lambda function and con ```python import pandas as pd from pydantic import BaseModel -from nemo_retriever.internal.primitives.ingest_control_message import IngestControlMessage -from nemo_retriever.internal.schemas.meta.metadata_schema import validate_metadata +from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage +from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata # Config schema for your stage class MyToyConfig(BaseModel): @@ -166,7 +166,7 @@ After you change any metadata, you can validate it by using the `validate_metada as demonstrated in the following code example. 
```python -from nemo_retriever.internal.schemas.meta.metadata_schema import validate_metadata +from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata def edit_metadata(control_message: IngestControlMessage, stage_config: MyToyConfig) -> IngestControlMessage: df = control_message.payload() @@ -235,8 +235,8 @@ The following example adds user-defined stages to your NeMo Retriever Library p ```python # my_pipeline/stages.py from pydantic import BaseModel - from nemo_retriever.internal.primitives.ingest_control_message import IngestControlMessage - from nemo_retriever.internal.schemas.meta.metadata_schema import validate_metadata + from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage + from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata class DoubleConfig(BaseModel): multiply_by: int = 2 diff --git a/docs/docs/extraction/v2-api-guide.md b/docs/docs/extraction/v2-api-guide.md index 1ac15d216..52f7ae22e 100644 --- a/docs/docs/extraction/v2-api-guide.md +++ b/docs/docs/extraction/v2-api-guide.md @@ -30,7 +30,7 @@ The V2 API automatically splits large PDFs into smaller chunks before processing ### Minimal Example ```python -from nemo_retriever.client import Ingestor +from nv_ingest_client.client.interface import Ingestor # Two-step configuration ingestor = Ingestor( @@ -432,7 +432,7 @@ For test scripts like `tools/harness/src/nemo_retriever_harness/cases/e2e.py`: ```python import os -from nemo_retriever.client import Ingestor +from nv_ingest_client.client.interface import Ingestor # Read from environment api_version = os.getenv("API_VERSION", "v1") diff --git a/docs/docs/extraction/vlm-embed.md b/docs/docs/extraction/vlm-embed.md index 331379ab3..2e493675c 100644 --- a/docs/docs/extraction/vlm-embed.md +++ b/docs/docs/extraction/vlm-embed.md @@ -1,6 +1,6 @@ # Use Multimodal Embedding with NeMo Retriever Library -This guide explains how to use the [NeMo Retriever Library](https://www.perplexity.ai/search/overview.md) with the multimodal embedding model [Llama Nemotron Embed VL 1B v2](https://build.nvidia.com/nvidia/llama-nemotron-embed-vl-1b-v2). +This guide explains how to use the [NeMo Retriever Library](overview.md) with the multimodal embedding model [Llama Nemotron Embed VL 1B v2](https://build.nvidia.com/nvidia/llama-nemotron-embed-vl-1b-v2). The `Llama Nemotron Embed VL 1B v2` model is optimized for multimodal question-answering and retrieval tasks. It can embed documents as text, images, or paired text-image combinations. 
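
The documentation hunks above rename every `nemo_retriever.*` import to the split `nv_ingest` / `nv_ingest_api` / `nv_ingest_client` packages, so a quick import smoke test is a cheap way to catch any path the rename missed. The following is a minimal sketch, not part of the patch: the import paths are copied from the updated docs rather than verified against the installed package layout, and it assumes the pinned packages from the quickstart (`nv-ingest==26.1.2`, `nv-ingest-api==26.1.2`, `nv-ingest-client==26.1.2`) are installed in a Python 3.12 environment.

```python
# Import smoke test for the renamed module paths referenced in the updated docs.
# Sketch only: every path below is taken verbatim from the documentation hunks
# above; a failure here points at a doc import path the rename missed.
from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_runners import run_pipeline
from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage
from nv_ingest_api.internal.primitives.nim import ModelInterface
from nv_ingest_api.internal.schemas.meta.metadata_schema import validate_metadata
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
from nv_ingest_api.util.nim import create_inference_client
from nv_ingest_client.client import NvIngestClient
from nv_ingest_client.client.interface import Ingestor
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob
from nv_ingest_client.util.vdb.milvus import nvingest_retrieval

print("All renamed import paths from the updated docs resolve.")
```

Running this after `uv pip install nv-ingest==26.1.2 nv-ingest-api==26.1.2 nv-ingest-client==26.1.2` should fail loudly on any module path that the documentation still spells incorrectly.
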
diff --git a/helm/README.md b/helm/README.md index 6bc711a8b..35a06176e 100644 --- a/helm/README.md +++ b/helm/README.md @@ -298,7 +298,7 @@ You can also use NV-Ingest's Python client API to interact with the service runn | envVars.AUDIO_GRPC_ENDPOINT | string | `"audio:50051"` | | | envVars.AUDIO_INFER_PROTOCOL | string | `"grpc"` | | | envVars.COMPONENTS_TO_READY_CHECK | string | `"ALL"` | | -| envVars.EMBEDDING_NIM_ENDPOINT | string | `"http://llama-32-nv-embedqa-1b-v2:8000/v1"` | | +| envVars.EMBEDDING_NIM_ENDPOINT | string | `"http://llama-nemotron-embed-1b-v2:8000/v1"` | | | envVars.EMBEDDING_NIM_MODEL_NAME | string | `"nvidia/llama-nemotron-embed-1b-v2"` | | | envVars.IMAGE_STORAGE_PUBLIC_BASE_URL | string | `""` | | | envVars.IMAGE_STORAGE_URI | string | `"s3://nv-ingest/artifacts/store/images"` | | @@ -465,46 +465,46 @@ You can also use NV-Ingest's Python client API to interact with the service runn | nimOperator.graphic_elements.storage.pvc.create | bool | `true` | | | nimOperator.graphic_elements.storage.pvc.size | string | `"25Gi"` | | | nimOperator.graphic_elements.storage.pvc.volumeAccessMode | string | `"ReadWriteOnce"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.authSecret | string | `"ngc-api"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.enabled | bool | `false` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.env[0].name | string | `"NIM_HTTP_API_PORT"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.env[0].value | string | `"8000"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.env[1].name | string | `"NIM_TRITON_LOG_VERBOSE"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.env[1].value | string | `"1"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.expose.service.grpcPort | int | `8001` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.expose.service.port | int | `8000` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.expose.service.type | string | `"ClusterIP"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.pullPolicy | string | `"IfNotPresent"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.pullSecrets[0] | string | `"ngc-secret"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.repository | string | `"nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.tag | string | `"1.10.0"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.replicas | int | `1` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.resources.limits."nvidia.com/gpu" | int | `1` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.create | bool | `true` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.size | string | `"50Gi"` | | -| nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.volumeAccessMode | string | `"ReadWriteOnce"` | | -| nimOperator.nemoretriever_ocr_v1.authSecret | string | `"ngc-api"` | | -| nimOperator.nemoretriever_ocr_v1.enabled | bool | `true` | | -| nimOperator.nemoretriever_ocr_v1.env[0].name | string | `"OMP_NUM_THREADS"` | | -| nimOperator.nemoretriever_ocr_v1.env[0].value | string | `"8"` | | -| nimOperator.nemoretriever_ocr_v1.env[1].name | string | `"NIM_HTTP_API_PORT"` | | -| nimOperator.nemoretriever_ocr_v1.env[1].value | string | `"8000"` | | -| nimOperator.nemoretriever_ocr_v1.env[2].name | string | `"NIM_TRITON_LOG_VERBOSE"` | | -| nimOperator.nemoretriever_ocr_v1.env[2].value | string | `"1"` | | -| nimOperator.nemoretriever_ocr_v1.env[3].name | string | `"NIM_TRITON_MAX_BATCH_SIZE"` | | -| nimOperator.nemoretriever_ocr_v1.env[3].value | string | `"32"` | | -| 
nimOperator.nemoretriever_ocr_v1.expose.service.grpcPort | int | `8001` | | -| nimOperator.nemoretriever_ocr_v1.expose.service.port | int | `8000` | | -| nimOperator.nemoretriever_ocr_v1.expose.service.type | string | `"ClusterIP"` | | -| nimOperator.nemoretriever_ocr_v1.image.pullPolicy | string | `"IfNotPresent"` | | -| nimOperator.nemoretriever_ocr_v1.image.pullSecrets[0] | string | `"ngc-secret"` | | -| nimOperator.nemoretriever_ocr_v1.image.repository | string | `"nvcr.io/nim/nvidia/nemotron-ocr-v1"` | | -| nimOperator.nemoretriever_ocr_v1.image.tag | string | `"1.3.0"` | | -| nimOperator.nemoretriever_ocr_v1.replicas | int | `1` | | -| nimOperator.nemoretriever_ocr_v1.resources.limits."nvidia.com/gpu" | int | `1` | | -| nimOperator.nemoretriever_ocr_v1.storage.pvc.create | bool | `true` | | -| nimOperator.nemoretriever_ocr_v1.storage.pvc.size | string | `"25Gi"` | | -| nimOperator.nemoretriever_ocr_v1.storage.pvc.volumeAccessMode | string | `"ReadWriteOnce"` | | +| nimOperator.rerankqa.authSecret | string | `"ngc-api"` | | +| nimOperator.rerankqa.enabled | bool | `false` | | +| nimOperator.rerankqa.env[0].name | string | `"NIM_HTTP_API_PORT"` | | +| nimOperator.rerankqa.env[0].value | string | `"8000"` | | +| nimOperator.rerankqa.env[1].name | string | `"NIM_TRITON_LOG_VERBOSE"` | | +| nimOperator.rerankqa.env[1].value | string | `"1"` | | +| nimOperator.rerankqa.expose.service.grpcPort | int | `8001` | | +| nimOperator.rerankqa.expose.service.port | int | `8000` | | +| nimOperator.rerankqa.expose.service.type | string | `"ClusterIP"` | | +| nimOperator.rerankqa.image.pullPolicy | string | `"IfNotPresent"` | | +| nimOperator.rerankqa.image.pullSecrets[0] | string | `"ngc-secret"` | | +| nimOperator.rerankqa.image.repository | string | `"nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2"` | | +| nimOperator.rerankqa.image.tag | string | `"1.10.0"` | | +| nimOperator.rerankqa.replicas | int | `1` | | +| nimOperator.rerankqa.resources.limits."nvidia.com/gpu" | int | `1` | | +| nimOperator.rerankqa.storage.pvc.create | bool | `true` | | +| nimOperator.rerankqa.storage.pvc.size | string | `"50Gi"` | | +| nimOperator.rerankqa.storage.pvc.volumeAccessMode | string | `"ReadWriteOnce"` | | +| nimOperator.ocr.authSecret | string | `"ngc-api"` | | +| nimOperator.ocr.enabled | bool | `true` | | +| nimOperator.ocr.env[0].name | string | `"OMP_NUM_THREADS"` | | +| nimOperator.ocr.env[0].value | string | `"8"` | | +| nimOperator.ocr.env[1].name | string | `"NIM_HTTP_API_PORT"` | | +| nimOperator.ocr.env[1].value | string | `"8000"` | | +| nimOperator.ocr.env[2].name | string | `"NIM_TRITON_LOG_VERBOSE"` | | +| nimOperator.ocr.env[2].value | string | `"1"` | | +| nimOperator.ocr.env[3].name | string | `"NIM_TRITON_MAX_BATCH_SIZE"` | | +| nimOperator.ocr.env[3].value | string | `"32"` | | +| nimOperator.ocr.expose.service.grpcPort | int | `8001` | | +| nimOperator.ocr.expose.service.port | int | `8000` | | +| nimOperator.ocr.expose.service.type | string | `"ClusterIP"` | | +| nimOperator.ocr.image.pullPolicy | string | `"IfNotPresent"` | | +| nimOperator.ocr.image.pullSecrets[0] | string | `"ngc-secret"` | | +| nimOperator.ocr.image.repository | string | `"nvcr.io/nim/nvidia/nemotron-ocr-v1"` | | +| nimOperator.ocr.image.tag | string | `"1.3.0"` | | +| nimOperator.ocr.replicas | int | `1` | | +| nimOperator.ocr.resources.limits."nvidia.com/gpu" | int | `1` | | +| nimOperator.ocr.storage.pvc.create | bool | `true` | | +| nimOperator.ocr.storage.pvc.size | string | `"25Gi"` | | +| 
nimOperator.ocr.storage.pvc.volumeAccessMode | string | `"ReadWriteOnce"` | | | nimOperator.nemotron_nano_12b_v2_vl.authSecret | string | `"ngc-api"` | | | nimOperator.nemotron_nano_12b_v2_vl.enabled | bool | `false` | | | nimOperator.nemotron_nano_12b_v2_vl.env[0].name | string | `"NIM_HTTP_API_PORT"` | | diff --git a/helm/mig/nv-ingest-mig-values-25x.yaml b/helm/mig/nv-ingest-mig-values-25x.yaml index d1b108e2e..5f9757518 100644 --- a/helm/mig/nv-ingest-mig-values-25x.yaml +++ b/helm/mig/nv-ingest-mig-values-25x.yaml @@ -38,7 +38,7 @@ nemotron-table-structure-v1: nvidia.com/gpu: 0 nvidia.com/mig-1g.10gb: 1 -nvidia-nim-llama-32-nv-embedqa-1b-v2: +nvidia-nim-llama-nemotron-embed-1b-v2: resources: limits: nvidia.com/gpu: 0 @@ -75,8 +75,8 @@ text-embedding-nim: nvidia.com/gpu: 0 nvidia.com/mig-1g.10gb: 1 -# If you want to deploy llama-32-nv-rerankqa-1b-v2 -llama-32-nv-rerankqa-1b-v2: +# If you want to deploy llama-nemotron-rerank-1b-v2 +llama-nemotron-rerank-1b-v2: resources: limits: nvidia.com/gpu: 0 diff --git a/helm/mig/nv-ingest-mig-values.yaml b/helm/mig/nv-ingest-mig-values.yaml index 8ae0e8c83..97707a5da 100644 --- a/helm/mig/nv-ingest-mig-values.yaml +++ b/helm/mig/nv-ingest-mig-values.yaml @@ -39,7 +39,7 @@ nimOperator: nvidia.com/gpu: "0" nvidia.com/mig-1g.10gb: 1 - nemoretriever_ocr_v1: + ocr: resources: limits: nvidia.com/gpu: "0" @@ -48,8 +48,8 @@ nimOperator: nvidia.com/gpu: "0" nvidia.com/mig-1g.20gb: 1 - # If you want to deploy llama-32-nv-rerankqa-1b-v2 - llama_3_2_nv_rerankqa_1b_v2: + # If you want to deploy llama-nemotron-rerank-1b-v2 + rerankqa: enabled: true resources: limits: diff --git a/helm/overrides/values-a100-40gb.yaml b/helm/overrides/values-a100-40gb.yaml index 003c234ba..828bfdd33 100644 --- a/helm/overrides/values-a100-40gb.yaml +++ b/helm/overrides/values-a100-40gb.yaml @@ -13,6 +13,8 @@ nimOperator: value: "1" - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" - name: NIM_TRITON_CPU_THREADS_PRE_PROCESSOR value: "2" - name: OMP_NUM_THREADS @@ -44,6 +46,8 @@ nimOperator: value: "3" - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" - name: NIM_TRITON_CUDA_MEMORY_POOL_MB value: "2048" - name: OMP_NUM_THREADS @@ -59,12 +63,14 @@ nimOperator: value: "3" - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" - name: NIM_TRITON_CUDA_MEMORY_POOL_MB value: "2048" - name: OMP_NUM_THREADS value: "1" - nemoretriever_ocr_v1: + ocr: env: - name: OMP_NUM_THREADS value: "8" @@ -75,7 +81,7 @@ nimOperator: - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" - llama_3_2_nv_rerankqa_1b_v2: + rerankqa: env: - name: NIM_HTTP_API_PORT value: "8000" diff --git a/helm/overrides/values-a10g.yaml b/helm/overrides/values-a10g.yaml index 0ad99584c..36a9bed4d 100644 --- a/helm/overrides/values-a10g.yaml +++ b/helm/overrides/values-a10g.yaml @@ -70,7 +70,7 @@ nimOperator: - name: OMP_NUM_THREADS value: "1" - nemoretriever_ocr_v1: + ocr: env: - name: OMP_NUM_THREADS value: "8" @@ -81,7 +81,7 @@ nimOperator: - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" - llama_3_2_nv_rerankqa_1b_v2: + rerankqa: env: - name: NIM_HTTP_API_PORT value: "8000" diff --git a/helm/overrides/values-l40s.yaml b/helm/overrides/values-l40s.yaml index 7f4e3a680..d430e39f1 100644 --- a/helm/overrides/values-l40s.yaml +++ b/helm/overrides/values-l40s.yaml @@ -13,6 +13,8 @@ nimOperator: value: "1" - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: 
"1" - name: NIM_TRITON_CPU_THREADS_PRE_PROCESSOR value: "2" - name: OMP_NUM_THREADS @@ -44,6 +46,8 @@ nimOperator: value: "3" - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" - name: NIM_TRITON_CUDA_MEMORY_POOL_MB value: "2048" - name: OMP_NUM_THREADS @@ -59,12 +63,14 @@ nimOperator: value: "3" - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" - name: NIM_TRITON_CUDA_MEMORY_POOL_MB value: "2048" - name: OMP_NUM_THREADS value: "1" - nemoretriever_ocr_v1: + ocr: env: - name: OMP_NUM_THREADS value: "8" @@ -75,7 +81,7 @@ nimOperator: - name: NIM_TRITON_MAX_BATCH_SIZE value: "1" - llama_3_2_nv_rerankqa_1b_v2: + rerankqa: env: - name: NIM_HTTP_API_PORT value: "8000" diff --git a/helm/overrides/values-rtx-pro-4500.yaml b/helm/overrides/values-rtx-pro-4500.yaml new file mode 100644 index 000000000..55a38482e --- /dev/null +++ b/helm/overrides/values-rtx-pro-4500.yaml @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# GPU-specific overrides for RTX Pro 4500 (loaded by harness when --deployment-type helm --sku rtx-pro-4500). +# Sets NIM_TRITON_MAX_BATCH_SIZE=1 per NIM to match docker-compose.rtx-pro-4500.yaml. + +nimOperator: + page_elements: + env: + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_TRITON_LOG_VERBOSE + value: "1" + - name: NIM_TRITON_MAX_BATCH_SIZE + value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" + - name: NIM_TRITON_CPU_THREADS_PRE_PROCESSOR + value: "2" + - name: OMP_NUM_THREADS + value: "2" + - name: NIM_TRITON_CPU_THREADS_POST_PROCESSOR + value: "1" + - name: NIM_ENABLE_OTEL + value: "true" + - name: NIM_OTEL_SERVICE_NAME + value: "page-elements" + - name: NIM_OTEL_TRACES_EXPORTER + value: "otlp" + - name: NIM_OTEL_METRICS_EXPORTER + value: "console" + - name: NIM_OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector:4318" + - name: TRITON_OTEL_URL + value: "http://otel-collector:4318/v1/traces" + - name: TRITON_OTEL_RATE + value: "1" + + graphic_elements: + env: + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_TRITON_LOG_VERBOSE + value: "1" + - name: NIM_TRITON_RATE_LIMIT + value: "3" + - name: NIM_TRITON_MAX_BATCH_SIZE + value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" + - name: NIM_TRITON_CUDA_MEMORY_POOL_MB + value: "2048" + - name: OMP_NUM_THREADS + value: "1" + + table_structure: + env: + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_TRITON_LOG_VERBOSE + value: "1" + - name: NIM_TRITON_RATE_LIMIT + value: "3" + - name: NIM_TRITON_MAX_BATCH_SIZE + value: "1" + - name: NIM_TRITON_DATA_MAX_BATCH_SIZE + value: "1" + - name: NIM_TRITON_CUDA_MEMORY_POOL_MB + value: "2048" + - name: OMP_NUM_THREADS + value: "1" + + ocr: + env: + - name: OMP_NUM_THREADS + value: "8" + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_TRITON_LOG_VERBOSE + value: "1" + - name: NIM_TRITON_MAX_BATCH_SIZE + value: "1" + + rerankqa: + env: + - name: NIM_HTTP_API_PORT + value: "8000" + - name: NIM_TRITON_LOG_VERBOSE + value: "1" + - name: NIM_TRITON_MAX_BATCH_SIZE + value: "1" diff --git a/helm/templates/llama-3.2-nv-rerankqa-1b-v2.yaml b/helm/templates/llama-3.2-nv-rerankqa-1b-v2.yaml deleted file mode 100644 index 12e69da27..000000000 --- a/helm/templates/llama-3.2-nv-rerankqa-1b-v2.yaml +++ /dev/null @@ -1,47 +0,0 @@ -{{ if and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") (eq 
.Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.enabled true) -}} -apiVersion: apps.nvidia.com/v1alpha1 -kind: NIMCache -metadata: - name: llama-nemotron-rerank-1b-v2 - annotations: - helm.sh/resource-policy: keep -spec: - source: - ngc: - modelPuller: "{{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.repository }}:{{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.tag }}" - pullSecret: "{{ index .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.pullSecrets 0 }}" - authSecret: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.authSecret }} - storage: - pvc: - create: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.create }} - storageClass: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.storageClass }} - size: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.size }} - volumeAccessMode: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.storage.pvc.volumeAccessMode }} ---- -apiVersion: apps.nvidia.com/v1alpha1 -kind: NIMService -metadata: - name: llama-32-nv-rerankqa-1b-v2 -spec: - image: - repository: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.repository }} - tag: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.tag }} - pullPolicy: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.pullPolicy }} - pullSecrets: -{{ toYaml .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.image.pullSecrets | nindent 6 }} - authSecret: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.authSecret }} - storage: - nimCache: - name: llama-nemotron-rerank-1b-v2 - replicas: {{ .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.replicas }} - nodeSelector: -{{ toYaml .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.nodeSelector | nindent 4 }} - resources: -{{ toYaml .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.resources | nindent 4 }} - tolerations: -{{ toYaml .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.tolerations | nindent 4 }} - expose: -{{ toYaml .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.expose | nindent 4 }} - env: -{{ toYaml .Values.nimOperator.llama_3_2_nv_rerankqa_1b_v2.env | nindent 4 }} -{{- end }} diff --git a/helm/templates/llama-3.2-nv-embedqa-1b-v2.yaml b/helm/templates/llama-nemotron-embed-1b-v2.yaml similarity index 98% rename from helm/templates/llama-3.2-nv-embedqa-1b-v2.yaml rename to helm/templates/llama-nemotron-embed-1b-v2.yaml index e9376ced7..199bcdc9c 100644 --- a/helm/templates/llama-3.2-nv-embedqa-1b-v2.yaml +++ b/helm/templates/llama-nemotron-embed-1b-v2.yaml @@ -21,7 +21,7 @@ spec: apiVersion: apps.nvidia.com/v1alpha1 kind: NIMService metadata: - name: llama-32-nv-embedqa-1b-v2 + name: llama-nemotron-embed-1b-v2 spec: image: repository: {{ .Values.nimOperator.embedqa.image.repository }} diff --git a/helm/templates/llama-nemotron-rerank-1b-v2.yaml b/helm/templates/llama-nemotron-rerank-1b-v2.yaml new file mode 100644 index 000000000..6cfc2fcfc --- /dev/null +++ b/helm/templates/llama-nemotron-rerank-1b-v2.yaml @@ -0,0 +1,47 @@ +{{ if and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") (eq .Values.nimOperator.rerankqa.enabled true) -}} +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMCache +metadata: + name: llama-nemotron-rerank-1b-v2 + annotations: + helm.sh/resource-policy: keep +spec: + source: + ngc: + modelPuller: "{{ .Values.nimOperator.rerankqa.image.repository }}:{{ .Values.nimOperator.rerankqa.image.tag }}" + pullSecret: "{{ index .Values.nimOperator.rerankqa.image.pullSecrets 0 }}" + authSecret: {{ .Values.nimOperator.rerankqa.authSecret }} + storage: + pvc: + create: {{ 
.Values.nimOperator.rerankqa.storage.pvc.create }} + storageClass: {{ .Values.nimOperator.rerankqa.storage.pvc.storageClass }} + size: {{ .Values.nimOperator.rerankqa.storage.pvc.size }} + volumeAccessMode: {{ .Values.nimOperator.rerankqa.storage.pvc.volumeAccessMode }} +--- +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMService +metadata: + name: llama-nemotron-rerank-1b-v2 +spec: + image: + repository: {{ .Values.nimOperator.rerankqa.image.repository }} + tag: {{ .Values.nimOperator.rerankqa.image.tag }} + pullPolicy: {{ .Values.nimOperator.rerankqa.image.pullPolicy }} + pullSecrets: +{{ toYaml .Values.nimOperator.rerankqa.image.pullSecrets | nindent 6 }} + authSecret: {{ .Values.nimOperator.rerankqa.authSecret }} + storage: + nimCache: + name: llama-nemotron-rerank-1b-v2 + replicas: {{ .Values.nimOperator.rerankqa.replicas }} + nodeSelector: +{{ toYaml .Values.nimOperator.rerankqa.nodeSelector | nindent 4 }} + resources: +{{ toYaml .Values.nimOperator.rerankqa.resources | nindent 4 }} + tolerations: +{{ toYaml .Values.nimOperator.rerankqa.tolerations | nindent 4 }} + expose: +{{ toYaml .Values.nimOperator.rerankqa.expose | nindent 4 }} + env: +{{ toYaml .Values.nimOperator.rerankqa.env | nindent 4 }} +{{- end }} diff --git a/helm/templates/nemoretriever-ocr-v1.yaml b/helm/templates/nemoretriever-ocr-v1.yaml deleted file mode 100644 index 6606d12f5..000000000 --- a/helm/templates/nemoretriever-ocr-v1.yaml +++ /dev/null @@ -1,41 +0,0 @@ -{{ if and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") (eq .Values.nimOperator.nemoretriever_ocr_v1.enabled true) -}} -apiVersion: apps.nvidia.com/v1alpha1 -kind: NIMCache -metadata: - name: nemotron-ocr-v1 - annotations: - helm.sh/resource-policy: keep -spec: - source: - ngc: - modelPuller: "{{ .Values.nimOperator.nemoretriever_ocr_v1.image.repository }}:{{ .Values.nimOperator.nemoretriever_ocr_v1.image.tag }}" - pullSecret: "{{ index .Values.nimOperator.nemoretriever_ocr_v1.image.pullSecrets 0 }}" - authSecret: {{ .Values.nimOperator.nemoretriever_ocr_v1.authSecret }} - storage: - pvc: - create: {{ .Values.nimOperator.nemoretriever_ocr_v1.storage.pvc.create }} - storageClass: {{ .Values.nimOperator.nemoretriever_ocr_v1.storage.pvc.storageClass }} - size: {{ .Values.nimOperator.nemoretriever_ocr_v1.storage.pvc.size }} - volumeAccessMode: {{ .Values.nimOperator.nemoretriever_ocr_v1.storage.pvc.volumeAccessMode }} ---- -apiVersion: apps.nvidia.com/v1alpha1 -kind: NIMService -metadata: - name: nemotron-ocr-v1 -spec: - image: - repository: {{ .Values.nimOperator.nemoretriever_ocr_v1.image.repository }} - tag: {{ .Values.nimOperator.nemoretriever_ocr_v1.image.tag }} - pullPolicy: {{ .Values.nimOperator.nemoretriever_ocr_v1.image.pullPolicy }} - pullSecrets: {{ toYaml .Values.nimOperator.nemoretriever_ocr_v1.image.pullSecrets | nindent 6 }} - authSecret: {{ .Values.nimOperator.nemoretriever_ocr_v1.authSecret }} - storage: - nimCache: - name: nemotron-ocr-v1 - replicas: {{ .Values.nimOperator.nemoretriever_ocr_v1.replicas }} - nodeSelector: {{ toYaml .Values.nimOperator.nemoretriever_ocr_v1.nodeSelector | nindent 4 }} - resources: {{ toYaml .Values.nimOperator.nemoretriever_ocr_v1.resources | nindent 4 }} - tolerations: {{ toYaml .Values.nimOperator.nemoretriever_ocr_v1.tolerations | nindent 4 }} - expose: {{ toYaml .Values.nimOperator.nemoretriever_ocr_v1.expose | nindent 4 }} - env: {{ toYaml .Values.nimOperator.nemoretriever_ocr_v1.env | nindent 4 }} -{{- end }} diff --git a/helm/templates/nemoretriever-graphic-elements-v1.yaml 
b/helm/templates/nemotron-graphic-elements-v1.yaml similarity index 100% rename from helm/templates/nemoretriever-graphic-elements-v1.yaml rename to helm/templates/nemotron-graphic-elements-v1.yaml diff --git a/helm/templates/nemotron-ocr-v1.yaml b/helm/templates/nemotron-ocr-v1.yaml new file mode 100644 index 000000000..7ae0f2dea --- /dev/null +++ b/helm/templates/nemotron-ocr-v1.yaml @@ -0,0 +1,41 @@ +{{ if and (.Capabilities.APIVersions.Has "apps.nvidia.com/v1alpha1") (eq .Values.nimOperator.ocr.enabled true) -}} +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMCache +metadata: + name: nemotron-ocr-v1 + annotations: + helm.sh/resource-policy: keep +spec: + source: + ngc: + modelPuller: "{{ .Values.nimOperator.ocr.image.repository }}:{{ .Values.nimOperator.ocr.image.tag }}" + pullSecret: "{{ index .Values.nimOperator.ocr.image.pullSecrets 0 }}" + authSecret: {{ .Values.nimOperator.ocr.authSecret }} + storage: + pvc: + create: {{ .Values.nimOperator.ocr.storage.pvc.create }} + storageClass: {{ .Values.nimOperator.ocr.storage.pvc.storageClass }} + size: {{ .Values.nimOperator.ocr.storage.pvc.size }} + volumeAccessMode: {{ .Values.nimOperator.ocr.storage.pvc.volumeAccessMode }} +--- +apiVersion: apps.nvidia.com/v1alpha1 +kind: NIMService +metadata: + name: nemotron-ocr-v1 +spec: + image: + repository: {{ .Values.nimOperator.ocr.image.repository }} + tag: {{ .Values.nimOperator.ocr.image.tag }} + pullPolicy: {{ .Values.nimOperator.ocr.image.pullPolicy }} + pullSecrets: {{ toYaml .Values.nimOperator.ocr.image.pullSecrets | nindent 6 }} + authSecret: {{ .Values.nimOperator.ocr.authSecret }} + storage: + nimCache: + name: nemotron-ocr-v1 + replicas: {{ .Values.nimOperator.ocr.replicas }} + nodeSelector: {{ toYaml .Values.nimOperator.ocr.nodeSelector | nindent 4 }} + resources: {{ toYaml .Values.nimOperator.ocr.resources | nindent 4 }} + tolerations: {{ toYaml .Values.nimOperator.ocr.tolerations | nindent 4 }} + expose: {{ toYaml .Values.nimOperator.ocr.expose | nindent 4 }} + env: {{ toYaml .Values.nimOperator.ocr.env | nindent 4 }} +{{- end }} diff --git a/helm/templates/nemoretriever-page-elements-v3.yaml b/helm/templates/nemotron-page-elements-v3.yaml similarity index 100% rename from helm/templates/nemoretriever-page-elements-v3.yaml rename to helm/templates/nemotron-page-elements-v3.yaml diff --git a/helm/templates/nemoretriever-table-structure-v1.yaml b/helm/templates/nemotron-table-structure-v1.yaml similarity index 100% rename from helm/templates/nemoretriever-table-structure-v1.yaml rename to helm/templates/nemotron-table-structure-v1.yaml diff --git a/helm/values.yaml b/helm/values.yaml index 20323f68a..c0597062e 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -170,7 +170,7 @@ envVars: AUDIO_GRPC_ENDPOINT: "audio:50051" AUDIO_INFER_PROTOCOL: "grpc" - EMBEDDING_NIM_ENDPOINT: "http://llama-32-nv-embedqa-1b-v2:8000/v1" + EMBEDDING_NIM_ENDPOINT: "http://llama-nemotron-embed-1b-v2:8000/v1" EMBEDDING_NIM_MODEL_NAME: "nvidia/llama-nemotron-embed-1b-v2" NEMOTRON_PARSE_HTTP_ENDPOINT: http://nemotron-parse:8000/v1/chat/completions @@ -828,16 +828,16 @@ nimOperator: - name: NIM_TRITON_PERFORMANCE_MODE value: "throughput" - ## @param nemoretriever_ocr_v1 [object] Configuration for NemoRetriever OCR v1 NIM - ## @param nemoretriever_ocr_v1.enabled [bool] Enable the NEMORetriever OCR v1 service - ## @param nemoretriever_ocr_v1.image.* [various] Image settings for NEMORetriever OCR v1 - ## @param nemoretriever_ocr_v1.authSecret [string] Secret for authentication - ## @param 
nemoretriever_ocr_v1.storage.* [object] Storage/PVC configuration - ## @param nemoretriever_ocr_v1.replicas [int] Number of service replicas - ## @param nemoretriever_ocr_v1.resources [object] Limits/requests for compute resources - ## @param nemoretriever_ocr_v1.expose.* [object] Ports and service config - ## @param nemoretriever_ocr_v1.env [array] Additional environment variables - nemoretriever_ocr_v1: + ## @param ocr [object] Configuration for Nemotron OCR v1 NIM + ## @param ocr.enabled [bool] Enable the Nemotron OCR v1 service + ## @param ocr.image.* [various] Image settings for Nemotron OCR v1 + ## @param ocr.authSecret [string] Secret for authentication + ## @param ocr.storage.* [object] Storage/PVC configuration + ## @param ocr.replicas [int] Number of service replicas + ## @param ocr.resources [object] Limits/requests for compute resources + ## @param ocr.expose.* [object] Ports and service config + ## @param ocr.env [array] Additional environment variables + ocr: enabled: true image: repository: nvcr.io/nim/nvidia/nemotron-ocr-v1 @@ -870,16 +870,16 @@ nimOperator: - name: NIM_TRITON_MAX_BATCH_SIZE value: "32" - ## @param llama_3_2_nv_rerankqa_1b_v2 [object] Configuration for LLaMA-3.2 NV RerankQA 1B v2 NIM - ## @param llama_3_2_nv_rerankqa_1b_v2.enabled [bool] Enable this NIM - ## @param llama_3_2_nv_rerankqa_1b_v2.image.* [various] Image repository/tag for this NIM - ## @param llama_3_2_nv_rerankqa_1b_v2.authSecret [string] Authentication secret for the NIM - ## @param llama_3_2_nv_rerankqa_1b_v2.storage.* [various] Storage/PVC configuration - ## @param llama_3_2_nv_rerankqa_1b_v2.replicas [int] Number of replicas - ## @param llama_3_2_nv_rerankqa_1b_v2.resources [object] Limits/requests for resources - ## @param llama_3_2_nv_rerankqa_1b_v2.expose.* [object] Port/service configuration - ## @param llama_3_2_nv_rerankqa_1b_v2.env [array] Additional environment variables - llama_3_2_nv_rerankqa_1b_v2: + ## @param rerankqa [object] Configuration for LLaMA-3.2 NV RerankQA 1B v2 NIM + ## @param rerankqa.enabled [bool] Enable this NIM + ## @param rerankqa.image.* [various] Image repository/tag for this NIM + ## @param rerankqa.authSecret [string] Authentication secret for the NIM + ## @param rerankqa.storage.* [various] Storage/PVC configuration + ## @param rerankqa.replicas [int] Number of replicas + ## @param rerankqa.resources [object] Limits/requests for resources + ## @param rerankqa.expose.* [object] Port/service configuration + ## @param rerankqa.env [array] Additional environment variables + rerankqa: enabled: false image: repository: nvcr.io/nim/nvidia/llama-nemotron-rerank-1b-v2 diff --git a/nemo_retriever/Dockerfile b/nemo_retriever/Dockerfile index a3b536abc..a6fc16607 100644 --- a/nemo_retriever/Dockerfile +++ b/nemo_retriever/Dockerfile @@ -4,7 +4,9 @@ # syntax=docker/dockerfile:1.3 # # Build from repo root: docker build -f nemo_retriever/Dockerfile -t nemo-retriever . 
-# Run: docker run nemo-retriever (help) or docker run -v /host/docs:/data nemo-retriever /data +# Run: docker run nemo-retriever (shell with venv active) +# Run with dev mount: docker run -v $(pwd):/workspace -it nemo-retriever (code changes reflect without rebuild) +# Run with data: docker run -v /host/docs:/data nemo-retriever /data ARG BASE_IMG=nvcr.io/nvidia/base/ubuntu ARG BASE_IMG_TAG=jammy-20250619 @@ -91,6 +93,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # --------------------------------------------------------------------------- # Install nemo_retriever and path deps (build context = repo root) +# To pick up dev changes without rebuilding, run with: +# -v /path/to/NeMo-Retriever/main:/workspace +# The editable install points at /workspace, so the mounted tree is used. # --------------------------------------------------------------------------- FROM base AS install @@ -99,17 +104,21 @@ WORKDIR /workspace # Unbuffered stdout/stderr so CLI output appears when run without a TTY (e.g. docker run without -it) ENV PYTHONUNBUFFERED=1 -# COPY nemo_retriever nemo_retriever -# COPY src src -# COPY api api -# COPY client client - -# Use base stage's venv at /opt/retriever_runtime; install nemo_retriever in editable mode (path deps: ../src, ../api, ../client) -# SHELL ["/bin/bash", "-c"] -# RUN --mount=type=cache,target=/root/.cache/pip \ -# --mount=type=cache,target=/root/.cache/uv \ -# . /opt/retriever_runtime/bin/activate \ -# && uv pip install -e ./nemo_retriever +# Activate venv by default so CLI and python see nemo_retriever; mount over /workspace for dev. +ENV VIRTUAL_ENV=/opt/retriever_runtime +ENV PATH=/opt/retriever_runtime/bin:/root/.local/bin:$PATH + +COPY nemo_retriever nemo_retriever +COPY src src +COPY api api +COPY client client + +# Editable install: at runtime, -v host_repo:/workspace overrides these dirs so dev changes apply. +SHELL ["/bin/bash", "-c"] +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ + . /opt/retriever_runtime/bin/activate \ + && uv pip install -e ./nemo_retriever # Default: run in-process pipeline (help if no args) CMD ["/bin/bash"] diff --git a/nemo_retriever/LICENSE b/nemo_retriever/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/nemo_retriever/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/nemo_retriever/README.md b/nemo_retriever/README.md index dd4d29eff..71f58efd0 100644 --- a/nemo_retriever/README.md +++ b/nemo_retriever/README.md @@ -10,6 +10,8 @@ You’ll set up a CUDA 13–compatible environment, install the library and its ## Prerequisites +> **Warning:** The `online` and `fused` run modes are experimental and not fully supported. They may be incomplete, unstable, or subject to breaking changes. Use `batch` or `inprocess` modes for production workloads. + Before you start, make sure your system meets the following requirements: - The host is running CUDA 13.x so that `libcudart.so.13` is available. @@ -165,6 +167,51 @@ uv run python -m nemo_retriever.examples.batch_pipeline /datasets/nemo-retriever ``` This uses the module form of the NeMo Retriever Library batch pipeline example and points it at a sample dataset directory, verifying both ingestion and OCR under CUDA 13. +7. 
Ingest image files + +NeMo Retriever Library can ingest standalone image files through the same detection, OCR, and embedding pipeline used for PDFs. Supported formats are PNG, JPEG, BMP, TIFF, and SVG. SVG support requires the optional `cairosvg` package. Each image is treated as a single page. + +To run the batch pipeline on a directory of images, use `--input-type image` to match all supported formats at once. + +```bash +uv run python nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py /path/to/images \ + --input-type image +``` + +You can also pass a single-format shortcut to restrict which files are picked up. + +```bash +uv run python nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py /path/to/images \ + --input-type png +``` + +Valid single-format values are `png`, `jpg`, `jpeg`, `bmp`, `tiff`, `tif`, and `svg`. + +For in-process mode, build the ingestor chain with `extract_image_files` instead of `extract`. + +```python +from nemo_retriever import create_ingestor +from nemo_retriever.params import ExtractParams, EmbedParams + +ingestor = ( + create_ingestor(run_mode="inprocess") + .files("images/*.png") + .extract_image_files( + ExtractParams( + extract_text=True, + extract_tables=True, + extract_charts=True, + extract_infographics=True, + ) + ) + .embed() + .vdb_upload() + .ingest() +) +``` + +All `ExtractParams` options (`extract_text`, `extract_tables`, `extract_charts`, `extract_infographics`) apply to image ingestion. + ### Render one document as markdown If you want a readable page-by-page markdown view of a single in-process result, pass the diff --git a/nemo_retriever/pyproject.toml b/nemo_retriever/pyproject.toml index ea22099fb..c21968dc4 100644 --- a/nemo_retriever/pyproject.toml +++ b/nemo_retriever/pyproject.toml @@ -15,11 +15,11 @@ requires-python = ">=3.12" authors = [ { name = "Jeremy Dyer", email = "jdyer@nvidia.com" }, ] -license = { text = "Apache-2.0" } +license = "Apache-2.0" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.12", - "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ] dependencies = [ @@ -63,6 +63,7 @@ dependencies = [ "nemotron-ocr>=0.dev0", "markitdown", "timm==1.0.22", + "tqdm", "accelerate==1.12.0", "albumentations==2.0.8", "scikit-learn>=1.6.0", diff --git a/nemo_retriever/src/nemo_retriever/audio/asr_actor.py b/nemo_retriever/src/nemo_retriever/audio/asr_actor.py index 019df4761..0c4d0b043 100644 --- a/nemo_retriever/src/nemo_retriever/audio/asr_actor.py +++ b/nemo_retriever/src/nemo_retriever/audio/asr_actor.py @@ -142,6 +142,12 @@ def __call__(self, batch_df: pd.DataFrame) -> pd.DataFrame: columns=["path", "source_path", "duration", "chunk_index", "metadata", "page_number", "text"] ) + if self._client is not None: + return self._call_remote_batch(batch_df) + return self._call_local_batch(batch_df) + + def _call_remote_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame: + """Remote ASR: one infer call per row (no batching on server side).""" out_rows: List[Dict[str, Any]] = [] for _, row in batch_df.iterrows(): try: @@ -158,6 +164,91 @@ def __call__(self, batch_df: pd.DataFrame) -> pd.DataFrame: ) return pd.DataFrame(out_rows) + def _call_local_batch(self, batch_df: pd.DataFrame) -> pd.DataFrame: + """Local ASR: one batched transcribe call for the whole batch.""" + if self._model is None: + return pd.DataFrame( + columns=["path", "source_path", "duration", "chunk_index", "metadata", "page_number", "text"] + 
) + temp_paths: List[Optional[str]] = [] + paths_for_model: List[str] = [] + rows_list: List[pd.Series] = [] + for _, row in batch_df.iterrows(): + rows_list.append(row) + raw = row.get("bytes") + path = row.get("path") + path_to_use: Optional[str] = None + temp_created: Optional[str] = None + if path and Path(path).exists(): + path_to_use = str(path) + elif raw is not None: + try: + f = tempfile.NamedTemporaryFile(suffix=".audio", delete=False) + f.write(raw) + f.close() + path_to_use = f.name + temp_created = f.name + except Exception as e: + logger.warning("Failed to write temp file for ASR: %s", e) + path_to_use = "" + else: + if path: + try: + with open(path, "rb") as fp: + raw = fp.read() + except Exception as e: + logger.warning("Could not read %s: %s", path, e) + path_to_use = "" + else: + try: + f = tempfile.NamedTemporaryFile(suffix=".audio", delete=False) + f.write(raw) + f.close() + path_to_use = f.name + temp_created = f.name + except Exception as e: + logger.warning("Failed to write temp file for ASR: %s", e) + path_to_use = "" + else: + path_to_use = "" + paths_for_model.append(path_to_use or "") + temp_paths.append(temp_created) + + try: + transcripts = self._model.transcribe(paths_for_model) if paths_for_model else [] + finally: + for p in temp_paths: + if p: + Path(p).unlink(missing_ok=True) + + out_rows: List[Dict[str, Any]] = [] + for row, transcript in zip(rows_list, transcripts): + path = row.get("path") + source_path = row.get("source_path", path) + duration = row.get("duration") + chunk_index = row.get("chunk_index", 0) + metadata = row.get("metadata") + if not isinstance(metadata, dict): + metadata = {"source_path": source_path, "chunk_index": chunk_index, "duration": duration} + page_number = row.get("page_number", chunk_index) + out_rows.append( + { + "path": path, + "source_path": source_path, + "duration": duration, + "chunk_index": chunk_index, + "metadata": metadata, + "page_number": page_number, + "text": transcript or "", + } + ) + + if not out_rows: + return pd.DataFrame( + columns=["path", "source_path", "duration", "chunk_index", "metadata", "page_number", "text"] + ) + return pd.DataFrame(out_rows) + def _transcribe_remote(self, raw: bytes, path: Optional[str]) -> Optional[str]: """Use remote gRPC client to transcribe audio bytes.""" audio_b64 = base64.b64encode(raw).decode("ascii") diff --git a/nemo_retriever/src/nemo_retriever/chart/chart_detection.py b/nemo_retriever/src/nemo_retriever/chart/chart_detection.py index 1e5a9bb18..23e1d8798 100644 --- a/nemo_retriever/src/nemo_retriever/chart/chart_detection.py +++ b/nemo_retriever/src/nemo_retriever/chart/chart_detection.py @@ -30,6 +30,13 @@ except Exception: # pragma: no cover Image = None # type: ignore[assignment] +try: + from nv_ingest_api.internal.primitives.nim.model_interface.yolox import ( + YOLOX_GRAPHIC_MIN_SCORE, + ) +except ImportError: + YOLOX_GRAPHIC_MIN_SCORE = 0.1 # type: ignore[assignment] + def _error_payload(*, stage: str, exc: BaseException) -> Dict[str, Any]: return { @@ -443,7 +450,13 @@ def graphic_elements_ocr_page_elements( if len(response_items) != len(crops): raise RuntimeError(f"Expected {len(crops)} GE responses, got {len(response_items)}") for resp in response_items: - ge_results.append(_remote_response_to_ge_detections(resp)) + ge_results.append( + [ + d + for d in _remote_response_to_ge_detections(resp) + if (d.get("score") or 0.0) >= YOLOX_GRAPHIC_MIN_SCORE + ] + ) else: # Local batched inference. 
for _, _, crop_array in crops: @@ -458,7 +471,7 @@ def graphic_elements_ocr_page_elements( pre = pre.unsqueeze(0) pred = graphic_elements_model.invoke(pre, (h, w)) ge_dets = _prediction_to_detections(pred, label_names=label_names) - ge_results.append(ge_dets) + ge_results.append([d for d in ge_dets if (d.get("score") or 0.0) >= YOLOX_GRAPHIC_MIN_SCORE]) # --- Run OCR on all crops --- ocr_results: List[Any] = [] diff --git a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py index 5aef13f6d..b36958b22 100644 --- a/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/batch_pipeline.py @@ -189,6 +189,12 @@ def main( "--debug/--no-debug", help="Enable debug-level logging for this full pipeline run.", ), + dpi: int = typer.Option( + 300, + "--dpi", + min=72, + help="Render DPI for PDF page images (default: 300).", + ), input_path: Path = typer.Argument( ..., help="File or directory containing PDFs, .txt, .html, or .doc/.pptx files to ingest.", @@ -284,7 +290,7 @@ def main( method: str = typer.Option( "pdfium", "--method", - help="PDF text extraction method: 'pdfium' (native only), 'pdfium_hybrid' (native + OCR for scanned), or 'ocr' (OCR all pages).", # noqa: E501 + help="PDF text extraction method: 'pdfium' (native only), 'pdfium_hybrid' (native + OCR for scanned), 'ocr' (OCR all pages), or 'nemotron_parse' (Nemotron Parse only, auto-configured).", # noqa: E501 ), log_file: Optional[Path] = typer.Option( None, @@ -425,6 +431,14 @@ def main( "--runtime-metrics-prefix", help="Optional filename prefix for per-run metrics artifacts.", ), + reranker: Optional[bool] = typer.Option( + False, "--reranker/--no-reranker", help="Enable a re-ranking stage with a cross-encoder model." + ), + reranker_model_name: str = typer.Option( + "nvidia/llama-nemotron-rerank-1b-v2", + "--reranker-model-name", + help="Cross-encoder model name for re-ranking stage (passed to .embed()).", + ), structured_elements_modality: Optional[str] = typer.Option( None, "--structured-elements-modality", @@ -462,6 +476,24 @@ def main( "(used when --table-output-format=markdown)." ), ), + text_chunk: bool = typer.Option( + False, + "--text-chunk", + help=( + "Re-chunk extracted page text by token count before embedding. " + "Uses --text-chunk-max-tokens and --text-chunk-overlap-tokens (defaults: 1024, 150)." + ), + ), + text_chunk_max_tokens: Optional[int] = typer.Option( + None, + "--text-chunk-max-tokens", + help="Max tokens per text chunk (default: 1024). Implies --text-chunk.", + ), + text_chunk_overlap_tokens: Optional[int] = typer.Option( + None, + "--text-chunk-overlap-tokens", + help="Token overlap between consecutive text chunks (default: 150). 
Implies --text-chunk.", + ), ) -> None: log_handle, original_stdout, original_stderr = _configure_logging(log_file, debug=bool(debug)) try: @@ -591,6 +623,7 @@ def main( "embed_workers": embed_actors, "embed_batch_size": int(embed_batch_size), "embed_cpus_per_actor": float(embed_cpus_per_actor), + "gpu_embed": float(embed_gpus_per_actor), }, ) # txt/html don't use embed_granularity from batch_tuning the same way, @@ -627,6 +660,7 @@ def main( def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: return ExtractParams( method=method, + dpi=int(dpi), extract_text=True, extract_tables=True, extract_charts=True, @@ -643,33 +677,31 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: batch_tuning={**batch_tuning, **overrides}, ) + _text_chunk_params = TextChunkParams( + max_tokens=text_chunk_max_tokens or 1024, + overlap_tokens=text_chunk_overlap_tokens if text_chunk_overlap_tokens is not None else 150, + ) + if input_type == "txt": - ingestor = ( - ingestor.files(file_patterns) - .extract_txt(TextChunkParams(max_tokens=512, overlap_tokens=0)) - .embed(embed_params) - ) + ingestor = ingestor.files(file_patterns).extract_txt(_text_chunk_params) elif input_type == "html": - ingestor = ( - ingestor.files(file_patterns) - .extract_html(TextChunkParams(max_tokens=512, overlap_tokens=0)) - .embed(embed_params) - ) + ingestor = ingestor.files(file_patterns).extract_html(_text_chunk_params) elif input_type == "image": - ingestor = ( - ingestor.files(file_patterns) - .extract_image_files(_extract_params(_detection_batch_tuning)) - .embed(embed_params) - ) + ingestor = ingestor.files(file_patterns).extract_image_files(_extract_params(_detection_batch_tuning)) elif input_type == "doc": - ingestor = ingestor.files(file_patterns).extract(_extract_params(_pdf_batch_tuning)).embed(embed_params) + ingestor = ingestor.files(file_patterns).extract(_extract_params(_pdf_batch_tuning)) else: - ingestor = ( - ingestor.files(file_patterns) - .extract(_extract_params(_pdf_batch_tuning, inference_batch_size=page_elements_batch_size)) - .embed(embed_params) + ingestor = ingestor.files(file_patterns).extract( + _extract_params(_pdf_batch_tuning, inference_batch_size=page_elements_batch_size) ) + enable_text_chunk = text_chunk or text_chunk_max_tokens is not None or text_chunk_overlap_tokens is not None + if enable_text_chunk: + ingestor = ingestor.split(_text_chunk_params) + + ingestor = ingestor.embed(embed_params) + + logger.info("Running extraction...") ingest_start = time.perf_counter() ingest_results = ( @@ -766,6 +798,7 @@ def _extract_params(batch_tuning: dict, **overrides: Any) -> ExtractParams: ks=(1, 5, 10), hybrid=hybrid, match_mode=recall_match_mode, + reranker=reranker_model_name if reranker else None, ) # Capture recall only times. diff --git a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py index d275c2068..b4bdb34ef 100644 --- a/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py +++ b/nemo_retriever/src/nemo_retriever/examples/inprocess_pipeline.py @@ -44,7 +44,7 @@ def main( input_type: str = typer.Option( "pdf", "--input-type", - help="Input format: 'pdf', 'txt', 'html', or 'doc'. Use 'txt' for .txt, 'html' for .html (markitdown -> chunks), 'doc' for .docx/.pptx (converted to PDF via LibreOffice).", # noqa: E501 + help="Input format: 'pdf', 'txt', 'html', 'doc', or 'image'. 
Use 'txt' for .txt, 'html' for .html (markitdown -> chunks), 'doc' for .docx/.pptx (converted to PDF via LibreOffice), 'image' for standalone image files (PNG, JPEG, BMP, TIFF, SVG).", # noqa: E501 ), query_csv: Path = typer.Option( "bo767_query_gt.csv", @@ -95,25 +95,7 @@ def main( method: str = typer.Option( "pdfium", "--method", - help="PDF text extraction method: 'pdfium' (native only), 'pdfium_hybrid' (native + OCR for scanned), or 'ocr' (OCR all pages).", # noqa: E501 - ), - nemotron_parse_actors: float = typer.Option( - 0.0, - "--nemotron-parse-actors", - min=0.0, - help="Enable Parse-only extraction path when > 0.0 with parse GPU/batch-size.", - ), - nemotron_parse_gpus_per_actor: float = typer.Option( - 0.0, - "--nemotron-parse-gpus-per-actor", - min=0.0, - help="GPU allocation hint for Parse-only extraction path.", - ), - nemotron_parse_ray_batch_size: float = typer.Option( - 0.0, - "--nemotron-parse-ray-batch-size", - min=0.0, - help="Parse stage batch size (enables Parse-only path when > 0.0 with parse workers/GPU).", + help="PDF text extraction method: 'pdfium' (native only), 'pdfium_hybrid' (native + OCR for scanned), 'ocr' (OCR all pages), or 'nemotron_parse' (Nemotron Parse only).", # noqa: E501 ), embed_modality: str = typer.Option( "text", @@ -168,6 +150,29 @@ def main( "--graphic-elements-invoke-url", help="Optional remote endpoint URL for graphic-elements model inference.", ), + hybrid: bool = typer.Option( + False, + "--hybrid/--no-hybrid", + help="Enable LanceDB hybrid mode (dense + FTS text).", + ), + text_chunk: bool = typer.Option( + False, + "--text-chunk", + help=( + "Re-chunk extracted page text by token count before embedding. " + "Uses --text-chunk-max-tokens and --text-chunk-overlap-tokens (defaults: 1024, 150)." + ), + ), + text_chunk_max_tokens: Optional[int] = typer.Option( + None, + "--text-chunk-max-tokens", + help="Max tokens per text chunk (default: 1024). Implies --text-chunk.", + ), + text_chunk_overlap_tokens: Optional[int] = typer.Option( + None, + "--text-chunk-overlap-tokens", + help="Token overlap between consecutive text chunks (default: 150). 
Implies --text-chunk.", + ), ) -> None: if gpu_devices is not None and num_gpus is not None: raise typer.BadParameter("--gpu-devices and --num-gpus are mutually exclusive.") @@ -186,6 +191,7 @@ def main( "txt": ["*.txt"], "html": ["*.html"], "doc": ["*.docx", "*.pptx"], + "image": ["*.png", "*.jpg", "*.jpeg", "*.bmp", "*.tiff", "*.tif", "*.svg"], } exts = ext_map.get(input_type, ["*.pdf"]) file_patterns = [str(input_path / e) for e in exts] @@ -194,146 +200,101 @@ def main( ingestor = create_ingestor(run_mode="inprocess") if input_type == "txt": - ingestor = ( - ingestor.files(file_patterns) - .extract_txt(TextChunkParams(max_tokens=512, overlap_tokens=0)) - .embed( - EmbedParams( - model_name=str(embed_model_name), - embed_invoke_url=embed_invoke_url, - embed_modality=embed_modality, - text_elements_modality=text_elements_modality, - structured_elements_modality=structured_elements_modality, - embed_granularity=embed_granularity, - ) - ) - .vdb_upload( - VdbUploadParams( - lancedb={ - "lancedb_uri": LANCEDB_URI, - "table_name": LANCEDB_TABLE, - "overwrite": True, - "create_index": True, - } - ) + ingestor = ingestor.files(file_patterns).extract_txt( + TextChunkParams( + max_tokens=text_chunk_max_tokens or 1024, + overlap_tokens=text_chunk_overlap_tokens if text_chunk_overlap_tokens is not None else 150, ) ) elif input_type == "html": - ingestor = ( - ingestor.files(file_patterns) - .extract_html(TextChunkParams(max_tokens=512, overlap_tokens=0)) - .embed( - EmbedParams( - model_name=str(embed_model_name), - embed_invoke_url=embed_invoke_url, - embed_modality=embed_modality, - text_elements_modality=text_elements_modality, - structured_elements_modality=structured_elements_modality, - embed_granularity=embed_granularity, - ) + ingestor = ingestor.files(file_patterns).extract_html( + TextChunkParams( + max_tokens=text_chunk_max_tokens or 1024, + overlap_tokens=text_chunk_overlap_tokens if text_chunk_overlap_tokens is not None else 150, ) - .vdb_upload( - VdbUploadParams( - lancedb={ - "lancedb_uri": LANCEDB_URI, - "table_name": LANCEDB_TABLE, - "overwrite": True, - "create_index": True, - } - ) + ) + elif input_type == "image": + ingestor = ingestor.files(file_patterns).extract_image_files( + ExtractParams( + method=method, + extract_text=True, + extract_tables=True, + extract_charts=True, + extract_infographics=False, + use_graphic_elements=use_graphic_elements, + graphic_elements_invoke_url=graphic_elements_invoke_url, + use_table_structure=use_table_structure, + table_output_format=table_output_format, + table_structure_invoke_url=table_structure_invoke_url, + page_elements_invoke_url=page_elements_invoke_url, + ocr_invoke_url=ocr_invoke_url, ) ) elif input_type == "doc": - ingestor = ( - ingestor.files(file_patterns) - .extract( - ExtractParams( - method=method, - extract_text=True, - extract_tables=True, - extract_charts=True, - extract_infographics=False, - use_graphic_elements=use_graphic_elements, - graphic_elements_invoke_url=graphic_elements_invoke_url, - use_table_structure=use_table_structure, - table_output_format=table_output_format, - table_structure_invoke_url=table_structure_invoke_url, - page_elements_invoke_url=page_elements_invoke_url, - ocr_invoke_url=ocr_invoke_url, - batch_tuning={ - "nemotron_parse_workers": float(nemotron_parse_actors), - "gpu_nemotron_parse": float(nemotron_parse_gpus_per_actor), - "nemotron_parse_batch_size": float(nemotron_parse_ray_batch_size), - }, - ) - ) - .embed( - EmbedParams( - model_name=str(embed_model_name), - 
embed_invoke_url=embed_invoke_url, - embed_modality=embed_modality, - text_elements_modality=text_elements_modality, - structured_elements_modality=structured_elements_modality, - embed_granularity=embed_granularity, - ) - ) - .vdb_upload( - VdbUploadParams( - lancedb={ - "lancedb_uri": LANCEDB_URI, - "table_name": LANCEDB_TABLE, - "overwrite": True, - "create_index": True, - } - ) + ingestor = ingestor.files(file_patterns).extract( + ExtractParams( + method=method, + extract_text=True, + extract_tables=True, + extract_charts=True, + extract_infographics=False, + use_graphic_elements=use_graphic_elements, + graphic_elements_invoke_url=graphic_elements_invoke_url, + use_table_structure=use_table_structure, + table_output_format=table_output_format, + table_structure_invoke_url=table_structure_invoke_url, + page_elements_invoke_url=page_elements_invoke_url, + ocr_invoke_url=ocr_invoke_url, ) ) else: - ingestor = ( - ingestor.files(file_patterns) - .extract( - ExtractParams( - method=method, - extract_text=True, - extract_tables=True, - extract_charts=True, - extract_infographics=False, - use_graphic_elements=use_graphic_elements, - graphic_elements_invoke_url=graphic_elements_invoke_url, - use_table_structure=use_table_structure, - table_output_format=table_output_format, - table_structure_invoke_url=table_structure_invoke_url, - page_elements_invoke_url=page_elements_invoke_url, - ocr_invoke_url=ocr_invoke_url, - batch_tuning={ - "nemotron_parse_workers": float(nemotron_parse_actors), - "gpu_nemotron_parse": float(nemotron_parse_gpus_per_actor), - "nemotron_parse_batch_size": float(nemotron_parse_ray_batch_size), - }, - ) - ) - .embed( - EmbedParams( - model_name=str(embed_model_name), - embed_invoke_url=embed_invoke_url, - embed_modality=embed_modality, - text_elements_modality=text_elements_modality, - structured_elements_modality=structured_elements_modality, - embed_granularity=embed_granularity, - ) + ingestor = ingestor.files(file_patterns).extract( + ExtractParams( + method=method, + extract_text=True, + extract_tables=True, + extract_charts=True, + extract_infographics=False, + use_graphic_elements=use_graphic_elements, + graphic_elements_invoke_url=graphic_elements_invoke_url, + use_table_structure=use_table_structure, + table_output_format=table_output_format, + table_structure_invoke_url=table_structure_invoke_url, + page_elements_invoke_url=page_elements_invoke_url, + ocr_invoke_url=ocr_invoke_url, ) - .vdb_upload( - VdbUploadParams( - lancedb={ - "lancedb_uri": LANCEDB_URI, - "table_name": LANCEDB_TABLE, - "overwrite": True, - "create_index": True, - } - ) + ) + + enable_text_chunk = text_chunk or text_chunk_max_tokens is not None or text_chunk_overlap_tokens is not None + if enable_text_chunk: + ingestor = ingestor.split( + TextChunkParams( + max_tokens=text_chunk_max_tokens or 1024, + overlap_tokens=text_chunk_overlap_tokens if text_chunk_overlap_tokens is not None else 150, ) ) + ingestor = ingestor.embed( + EmbedParams( + model_name=str(embed_model_name), + embed_invoke_url=embed_invoke_url, + embed_modality=embed_modality, + text_elements_modality=text_elements_modality, + structured_elements_modality=structured_elements_modality, + embed_granularity=embed_granularity, + ) + ).vdb_upload( + VdbUploadParams( + lancedb={ + "lancedb_uri": LANCEDB_URI, + "table_name": LANCEDB_TABLE, + "overwrite": True, + "create_index": True, + "hybrid": hybrid, + } + ) + ) + print("Running extraction...") ingest_start = time.perf_counter() ingestor.ingest( @@ -375,6 +336,7 @@ def main( 
embedding_http_endpoint=embed_invoke_url, top_k=10, ks=(1, 5, 10), + hybrid=hybrid, ) _df_query, _gold, _raw_hits, _retrieved_keys, metrics = retrieve_and_score(query_csv=query_csv, cfg=cfg) diff --git a/nemo_retriever/src/nemo_retriever/html/ray_data.py b/nemo_retriever/src/nemo_retriever/html/ray_data.py index f2dafbd80..1c87a18b7 100644 --- a/nemo_retriever/src/nemo_retriever/html/ray_data.py +++ b/nemo_retriever/src/nemo_retriever/html/ray_data.py @@ -35,12 +35,14 @@ def __call__(self, batch_df: pd.DataFrame) -> pd.DataFrame: out_dfs: List[pd.DataFrame] = [] for _, row in batch_df.iterrows(): raw = row.get("bytes") + text = row.get("text") path = row.get("path") - if raw is None or path is None: + if (raw is None and text is None) or path is None: continue path_str = str(path) if path is not None else "" try: - chunk_df = html_bytes_to_chunks_df(raw, path_str, params=params) + payload = raw or text.encode("utf-8") + chunk_df = html_bytes_to_chunks_df(payload, path_str, params=params) if not chunk_df.empty: out_dfs.append(chunk_df) except Exception: diff --git a/nemo_retriever/src/nemo_retriever/ingest-config.yaml b/nemo_retriever/src/nemo_retriever/ingest-config.yaml index 1d8acd6c1..25f10997c 100644 --- a/nemo_retriever/src/nemo_retriever/ingest-config.yaml +++ b/nemo_retriever/src/nemo_retriever/ingest-config.yaml @@ -42,6 +42,13 @@ pdf: http: null model_name: null + # PDF rendering mode for page-element detection images: + # full_dpi – render at `dpi` (default 300), then resize_pad down to 1024×1024. + # Higher source resolution, but bilinear downscale may differ from NIM. + # fit_to_model – render directly at the scale that fits within 1024×1024 (~93 DPI + # for US Letter), matching the nv-ingest/NIM container rasterization. + render_mode: fit_to_model + extract: text: true # Text depth: page | document diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py index 49e568770..e00037285 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/batch.py @@ -38,6 +38,7 @@ ) from nemo_retriever.ingest_modes.inprocess import collapse_content_to_page_rows, explode_content_to_rows +from ..image.load import SUPPORTED_IMAGE_EXTENSIONS from ..ingestor import Ingestor from ..params import ASRParams from ..params import AudioChunkParams @@ -295,6 +296,9 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "BatchI This does not run extraction yet; it records configuration so the batch executor can build a concrete pipeline later. + If all input files have a ``.txt`` extension, the pipeline automatically + delegates to :meth:`extract_txt` with default :class:`TextChunkParams`. + Resource-tuning kwargs (auto-detected from available resources if omitted): - ``pdf_split_batch_size``: Batch size for PDF split stage (default 1). @@ -308,6 +312,18 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "BatchI - ``ocr_cpus_per_actor``: CPUs reserved per OCR actor (default 1). 
""" + if self._input_documents and all(f.lower().endswith(".txt") for f in self._input_documents): + txt_params = TextChunkParams( + max_tokens=kwargs.pop("max_tokens", 1024), + overlap_tokens=kwargs.pop("overlap_tokens", 0), + ) + return self.extract_txt(params=txt_params) + + if self._input_documents and all( + os.path.splitext(f)[1].lower() in SUPPORTED_IMAGE_EXTENSIONS for f in self._input_documents + ): + return self.extract_image_files(params=params, **kwargs) + resolved = _coerce_params(params, ExtractParams, kwargs) if ( any( @@ -322,6 +338,7 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "BatchI and not resolved.api_key ): resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()}) + kwargs = { **resolved.model_dump(mode="python", exclude={"remote_retry", "batch_tuning"}, exclude_none=True), **resolved.remote_retry.model_dump(mode="python", exclude_none=True), @@ -341,10 +358,9 @@ def _endpoint_count(raw: Any) -> int: # 200 DPI is sufficient for both detection and OCR. YOLOX resizes to # 1024x1024 internally, and NemotronOCR also resizes crops to 1024x1024, - # so resolution above ~1200px per side is wasted. 200 DPI (Letter = - # 1700x2200) gives enough detail while reducing extraction time and - # memory usage by ~30-40% vs 300 DPI. - kwargs.setdefault("dpi", 200) + # nv-ingest NIM uses 300 DPI for page-element detection; match that + # default here so local-model recall matches the container path. + kwargs.setdefault("dpi", 300) kwargs.setdefault("image_format", "jpeg") kwargs.setdefault("jpeg_quality", 100) self._pipeline_type = "pdf" @@ -384,10 +400,37 @@ def _endpoint_count(raw: Any) -> int: compute=rd.TaskPoolStrategy(size=self._requested_plan.get_pdf_extract_tasks()), ) + self._apply_nemotron_parse_overrides(kwargs) + self._append_detection_stages(kwargs) return self + def _apply_nemotron_parse_overrides(self, kwargs: dict[str, Any]) -> None: + """Update ``_requested_plan`` with user-provided Nemotron Parse resource overrides + and set ``_use_nemotron_parse_only``.""" + nemotron_parse_workers = float(kwargs.get("nemotron_parse_workers", 0.0) or 0.0) + gpu_nemotron_parse = float(kwargs.get("gpu_nemotron_parse", 0.0) or 0.0) + nemotron_parse_batch_size = float(kwargs.get("nemotron_parse_batch_size", 0.0) or 0.0) + self._use_nemotron_parse_only = kwargs.get("method") == "nemotron_parse" or ( + nemotron_parse_workers > 0.0 and gpu_nemotron_parse > 0.0 and nemotron_parse_batch_size > 0.0 + ) + + # Forward CLI overrides into the RequestedPlan so that downstream Ray + # actor pools (batch size, GPU fraction, pool size) honour them. + overrides: dict[str, Any] = {} + if nemotron_parse_workers > 0.0: + workers = int(nemotron_parse_workers) + overrides["nemotron_parse_initial_actors"] = workers + overrides["nemotron_parse_min_actors"] = workers + overrides["nemotron_parse_max_actors"] = workers + if gpu_nemotron_parse > 0.0: + overrides["nemotron_parse_gpus_per_actor"] = gpu_nemotron_parse + if nemotron_parse_batch_size > 0.0: + overrides["nemotron_parse_batch_size"] = int(nemotron_parse_batch_size) + if overrides: + self._requested_plan = self._requested_plan.model_copy(update=overrides) + def _append_detection_stages(self, kwargs: dict[str, Any]) -> None: """Append downstream GPU detection stages (page elements, OCR, table/chart/infographic). 
@@ -637,6 +680,7 @@ def extract_image_files(self, params: ExtractParams | None = None, **kwargs: Any and not resolved.api_key ): resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()}) + kwargs = { **resolved.model_dump(mode="python", exclude={"remote_retry", "batch_tuning"}, exclude_none=True), **resolved.remote_retry.model_dump(mode="python", exclude_none=True), @@ -655,10 +699,32 @@ def extract_image_files(self, params: ExtractParams | None = None, **kwargs: Any ) # Downstream detection stages (page elements, OCR, table/chart/infographic). + self._apply_nemotron_parse_overrides(kwargs) self._append_detection_stages(kwargs) return self + def split(self, params: TextChunkParams | None = None, **kwargs: Any) -> "BatchIngestor": + """ + Re-chunk the ``text`` column by token count (post-extraction transform). + + Adds a ``map_batches(TextChunkActor, ...)`` stage to the Ray Dataset so + already-extracted text is re-chunked before embedding. + """ + from nemo_retriever.txt.ray_data import TextChunkActor + + resolved = _coerce_params(params, TextChunkParams, kwargs) + self._tasks.append(("split", resolved.model_dump(mode="python"))) + + self._rd_dataset = self._rd_dataset.map_batches( + TextChunkActor, + batch_size=4, + batch_format="pandas", + num_cpus=1, + fn_constructor_kwargs={"params": resolved}, + ) + return self + def extract_txt(self, params: TextChunkParams | None = None, **kwargs: Any) -> "BatchIngestor": """ Configure txt-only pipeline: read_binary_files -> TxtSplitActor (bytes -> chunk rows). @@ -717,6 +783,8 @@ def extract_audio( Use with .files("mp3/*.mp3").extract_audio(...).embed().vdb_upload().ingest(). Do not call .extract() when using .extract_audio(). ASR requires a remote or self-deployed Parakeet/Riva gRPC endpoint (see ASRParams.audio_endpoints). + Optional kwargs: audio_chunk_batch_size (default 4), asr_batch_size (default 8), + asr_num_gpus (default 0.5; GPUs reserved per ASR actor for local Parakeet). """ from nemo_retriever.audio import ASRActor from nemo_retriever.audio import MediaChunkActor @@ -732,6 +800,7 @@ def extract_audio( audio_chunk_batch_size = kwargs.get("audio_chunk_batch_size", 4) asr_batch_size = kwargs.get("asr_batch_size", 8) + asr_num_gpus = kwargs.get("asr_num_gpus", 0.5) self._rd_dataset = self._rd_dataset.map_batches( MediaChunkActor, @@ -745,6 +814,7 @@ def extract_audio( batch_size=asr_batch_size, batch_format="pandas", num_cpus=1, + num_gpus=asr_num_gpus, fn_constructor_kwargs={"params": ASRParams(**self._extract_audio_asr_kwargs)}, ) return self @@ -785,6 +855,7 @@ def embed( resolved = _coerce_params(params, EmbedParams, kwargs) if any((resolved.embedding_endpoint, resolved.embed_invoke_url)) and not resolved.api_key: resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()}) + kwargs = build_embed_kwargs(resolved, include_batch_tuning=True) # Remaining kwargs are forwarded to the actor constructor. 
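
A minimal usage sketch of the new `split()` stage added to `BatchIngestor` above (the file glob, `run_mode` string, and pipeline wiring are illustrative assumptions; only the chained method names and the 1024/150 token defaults come from this change):

```python
from nemo_retriever import create_ingestor
from nemo_retriever.params import ExtractParams, TextChunkParams

# Re-chunk already-extracted page text by token count before embedding.
ingestor = (
    create_ingestor(run_mode="batch")        # "inprocess" exposes the same split() API
    .files("docs/*.pdf")                     # hypothetical input glob
    .extract(ExtractParams(extract_text=True, extract_tables=True))
    .split(TextChunkParams(max_tokens=1024, overlap_tokens=150))  # defaults shown explicitly
    .embed()
    .vdb_upload()
)
ingest_results = ingestor.ingest()
```

Omitting `.split(...)` keeps the previous behaviour, where embedding runs directly on the extracted page text.
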
diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py b/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py index 6df62b656..7fd35373a 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/fused.py @@ -200,7 +200,7 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "FusedI pdf_extract_workers = int(kwargs.pop("pdf_extract_workers", max(1, self._num_cpus // 2))) kwargs.setdefault("extract_page_as_image", True) - kwargs.setdefault("dpi", 200) + kwargs.setdefault("dpi", 300) self._tasks.append(("extract", dict(kwargs))) self._fused_extract_flags = { diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py index 35f8e5185..1f1d229a2 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/inprocess.py @@ -45,6 +45,7 @@ pdfium = None # type: ignore[assignment] _PDFIUM_IMPORT_ERROR = e +from ..image.load import SUPPORTED_IMAGE_EXTENSIONS from ..utils.convert import SUPPORTED_EXTENSIONS, convert_to_pdf_bytes from ..ingestor import Ingestor from ..params import ASRParams @@ -1001,7 +1002,16 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "InProc # NOTE: `kwargs` passed to `.extract()` are intended primarily for PDF extraction # (e.g. `extract_text`, `dpi`, etc). Downstream model stages do NOT necessarily # accept the same keyword arguments. Keep per-stage kwargs isolated. - + if self._input_documents and all(f.lower().endswith(".txt") for f in self._input_documents): + txt_params = TextChunkParams() + return self.extract_txt(params=txt_params) + if self._input_documents and all(f.lower().endswith(".html") for f in self._input_documents): + html_params = HtmlChunkParams() + return self.extract_html(params=html_params) + if self._input_documents and all( + os.path.splitext(f)[1].lower() in SUPPORTED_IMAGE_EXTENSIONS for f in self._input_documents + ): + return self.extract_image_files(params=params, **kwargs) resolved = _coerce_params(params, ExtractParams, kwargs) if ( any( @@ -1017,13 +1027,7 @@ def extract(self, params: ExtractParams | None = None, **kwargs: Any) -> "InProc ): resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()}) kwargs = resolved.model_dump(mode="python") - batch_tuning = kwargs.get("batch_tuning") if isinstance(kwargs.get("batch_tuning"), dict) else {} - nemotron_parse_workers = float(batch_tuning.get("nemotron_parse_workers", 0.0) or 0.0) - gpu_nemotron_parse = float(batch_tuning.get("gpu_nemotron_parse", 0.0) or 0.0) - nemotron_parse_batch_size = float(batch_tuning.get("nemotron_parse_batch_size", 0.0) or 0.0) - use_nemotron_parse_only = ( - nemotron_parse_workers > 0.0 and gpu_nemotron_parse > 0.0 and nemotron_parse_batch_size > 0.0 - ) + use_nemotron_parse_only = kwargs.get("method") == "nemotron_parse" extract_kwargs = dict(kwargs) # Downstream in-process stages (page elements / table / chart / infographic) assume # `page_image.image_b64` exists. Ensure PDF extraction emits a page image unless @@ -1051,9 +1055,6 @@ def _append_detection_tasks( Shared by ``extract()`` (PDF) and ``extract_image_files()`` (standalone images). 
""" - batch_tuning = kwargs.get("batch_tuning") if isinstance(kwargs.get("batch_tuning"), dict) else {} - nemotron_parse_batch_size = float(batch_tuning.get("nemotron_parse_batch_size", 0.0) or 0.0) - # Common, optional knobs shared by our detect_* helpers. detect_passthrough_keys = { "inference_batch_size", @@ -1099,7 +1100,6 @@ def _detect_kwargs_with_model(model_obj: Any, *, stage_name: str, allow_remote: parse_flags["extract_charts"] = True if kwargs.get("extract_infographics") is True: parse_flags["extract_infographics"] = True - parse_flags["inference_batch_size"] = int(nemotron_parse_batch_size) parse_flags.update(_stage_remote_kwargs("nemotron_parse")) parse_invoke_url = kwargs.get( "nemotron_parse_invoke_url", kwargs.get("ocr_invoke_url", kwargs.get("invoke_url", "")) @@ -1256,17 +1256,26 @@ def extract_image_files(self, params: ExtractParams | None = None, **kwargs: Any ): resolved = resolved.model_copy(update={"api_key": resolve_remote_api_key()}) kwargs = resolved.model_dump(mode="python") - batch_tuning = kwargs.get("batch_tuning") if isinstance(kwargs.get("batch_tuning"), dict) else {} - nemotron_parse_workers = float(batch_tuning.get("nemotron_parse_workers", 0.0) or 0.0) - gpu_nemotron_parse = float(batch_tuning.get("gpu_nemotron_parse", 0.0) or 0.0) - nemotron_parse_batch_size = float(batch_tuning.get("nemotron_parse_batch_size", 0.0) or 0.0) - use_nemotron_parse_only = ( - nemotron_parse_workers > 0.0 and gpu_nemotron_parse > 0.0 and nemotron_parse_batch_size > 0.0 - ) + use_nemotron_parse_only = kwargs.get("method") == "nemotron_parse" self._pipeline_type = "image" self._append_detection_tasks(kwargs, use_nemotron_parse_only=use_nemotron_parse_only) return self + def split(self, params: TextChunkParams | None = None, **kwargs: Any) -> "InProcessIngestor": + """ + Re-chunk the ``text`` column by token count (post-extraction transform). + + Appends :func:`~nemo_retriever.txt.split.split_df` as a GPU-category + task so it runs in sequence after extraction and before embedding. + """ + from nemo_retriever.txt.split import split_df + + resolved = _coerce_params(params, TextChunkParams, kwargs) + split_kwargs = resolved.model_dump(mode="python") + split_kwargs.pop("encoding", None) + self._tasks.append((split_df, split_kwargs)) + return self + def extract_txt(self, params: TextChunkParams | None = None, **kwargs: Any) -> "InProcessIngestor": """ Configure txt ingestion: tokenizer-based chunking only (no PDF extraction). @@ -1274,9 +1283,13 @@ def extract_txt(self, params: TextChunkParams | None = None, **kwargs: Any) -> " Use with .files("*.txt").extract_txt(...).embed().vdb_upload().ingest(). Do not call .extract() when using .extract_txt(). """ + from nemo_retriever.txt.ray_data import TxtSplitActor + self._pipeline_type = "txt" resolved = _coerce_params(params, TextChunkParams, kwargs) self._extract_txt_kwargs = resolved.model_dump(mode="python") + text_split = TxtSplitActor(params=TextChunkParams(**self._extract_txt_kwargs)) + self._tasks.append((text_split, {})) return self def extract_html(self, params: HtmlChunkParams | None = None, **kwargs: Any) -> "InProcessIngestor": @@ -1286,9 +1299,15 @@ def extract_html(self, params: HtmlChunkParams | None = None, **kwargs: Any) -> Use with .files("*.html").extract_html(...).embed().vdb_upload().ingest(). Do not call .extract() when using .extract_html(). 
""" + from nemo_retriever.html.ray_data import HtmlSplitActor + self._pipeline_type = "html" resolved = _coerce_params(params, HtmlChunkParams, kwargs) self._extract_html_kwargs = resolved.model_dump(mode="python") + html_split = HtmlSplitActor( + params=HtmlChunkParams(**self._extract_html_kwargs), + ) + self._tasks.append((html_split, {})) return self def extract_audio( diff --git a/nemo_retriever/src/nemo_retriever/ingest_modes/lancedb_utils.py b/nemo_retriever/src/nemo_retriever/ingest_modes/lancedb_utils.py index 1e3a98069..41fd24378 100644 --- a/nemo_retriever/src/nemo_retriever/ingest_modes/lancedb_utils.py +++ b/nemo_retriever/src/nemo_retriever/ingest_modes/lancedb_utils.py @@ -128,6 +128,13 @@ def build_lancedb_row( metadata_obj["pdf_page"] = pdf_page metadata_obj.update(_build_detection_metadata(row)) + # Preserve split metadata (chunk_index, chunk_count) from the original row. + orig_meta = getattr(row, "metadata", None) + if isinstance(orig_meta, dict): + for k in ("chunk_index", "chunk_count"): + if k in orig_meta: + metadata_obj[k] = orig_meta[k] + source_obj: Dict[str, Any] = {"source_id": str(path)} row_out: Dict[str, Any] = { @@ -190,7 +197,9 @@ def lancedb_schema(vector_dim: int = 2048) -> Any: pa.field("pdf_basename", pa.string()), pa.field("page_number", pa.int32()), pa.field("source", pa.string()), - pa.field("source_id", pa.string()), + pa.field( + "source_id", pa.string() + ), # Different than the source. Field contains path+page_number for aggregation tasks pa.field("path", pa.string()), pa.field("text", pa.string()), pa.field("metadata", pa.string()), diff --git a/nemo_retriever/src/nemo_retriever/ingestor.py b/nemo_retriever/src/nemo_retriever/ingestor.py index 9e64f259f..7bbc19486 100644 --- a/nemo_retriever/src/nemo_retriever/ingestor.py +++ b/nemo_retriever/src/nemo_retriever/ingestor.py @@ -22,6 +22,7 @@ from nemo_retriever.application.modes.factory import create_runmode_ingestor from nemo_retriever.params import EmbedParams from nemo_retriever.params import ExtractParams +from nemo_retriever.params import TextChunkParams from nemo_retriever.params import IngestExecuteParams from nemo_retriever.params import IngestorCreateParams from nemo_retriever.params import RunMode @@ -132,8 +133,9 @@ def filter(self) -> "ingestor": """Record a filter task configuration.""" self._not_implemented("filter") - def split(self) -> "ingestor": + def split(self, params: TextChunkParams | None = None, **kwargs: Any) -> "ingestor": """Record a split task configuration.""" + _ = _merge_params(params, kwargs) self._not_implemented("split") def store(self) -> "ingestor": diff --git a/nemo_retriever/src/nemo_retriever/io/markdown.py b/nemo_retriever/src/nemo_retriever/io/markdown.py index 366677e40..a473aa951 100644 --- a/nemo_retriever/src/nemo_retriever/io/markdown.py +++ b/nemo_retriever/src/nemo_retriever/io/markdown.py @@ -59,11 +59,11 @@ def to_markdown_by_page(results: object) -> dict[int, str]: return rendered -def to_markdown(results: object) -> str: +def to_markdown(results: object) -> str | None: """Render a single document result as one markdown document.""" pages = to_markdown_by_page(results) if not pages: - return f"# {_DOCUMENT_TITLE}\n\n_No content found._" + return None return f"# {_DOCUMENT_TITLE}\n\n" + "\n\n".join(pages.values()) @@ -110,6 +110,9 @@ def _records_from_mapping(results: Mapping[str, Any]) -> list[dict[str, Any]]: return [dict(results)] raise ValueError("Markdown rendering expects a document row, row list, or saved results payload.") + # 
TODO(jioffe): Centralize retriever result-shape detection so helpers and + # actor outputs do not rely on duplicated key-based heuristics. + def _looks_like_record(record: Mapping[str, Any]) -> bool: return any( diff --git a/nemo_retriever/src/nemo_retriever/model/local/__init__.py b/nemo_retriever/src/nemo_retriever/model/local/__init__.py index 7fa66d3f7..791df4daa 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/__init__.py +++ b/nemo_retriever/src/nemo_retriever/model/local/__init__.py @@ -17,6 +17,7 @@ "NemotronTableStructureV1", "NemotronGraphicElementsV1", "NemotronParseV12", + "NemotronRerankV2", "ParakeetCTC1B1ASR", ] @@ -42,6 +43,10 @@ def __getattr__(name: str): from .nemotron_parse_v1_2 import NemotronParseV12 return NemotronParseV12 + if name == "NemotronRerankV2": + from .nemotron_rerank_v2 import NemotronRerankV2 + + return NemotronRerankV2 if name == "ParakeetCTC1B1ASR": from .parakeet_ctc_1_1b_asr import ParakeetCTC1B1ASR diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_page_elements_v3.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_page_elements_v3.py index 9e2cd1074..21c9077da 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/nemotron_page_elements_v3.py +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_page_elements_v3.py @@ -67,6 +67,8 @@ def preprocess(self, tensor: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: raise TypeError(f"resize_pad returned non-tensor: {type(y)!r}") if y.ndim != 3: raise ValueError(f"Expected CHW from resize_pad, got {tuple(y.shape)}") + # Match NIM preprocessing: quantize to uint8 after interpolation + y = torch.clamp(y, 0, 255).to(torch.uint8).float() return y.unsqueeze(0) outs: List[torch.Tensor] = [] @@ -74,6 +76,8 @@ def preprocess(self, tensor: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: y = resize_pad_page_elements(x[i], self.input_shape) if not isinstance(y, torch.Tensor) or y.ndim != 3: raise ValueError(f"resize_pad produced unexpected output for batch item {i}: {type(y)!r}") + # Match NIM preprocessing: quantize to uint8 after interpolation + y = torch.clamp(y, 0, 255).to(torch.uint8).float() outs.append(y) return torch.stack(outs, dim=0) @@ -83,6 +87,8 @@ def preprocess(self, tensor: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: raise TypeError(f"resize_pad returned non-tensor: {type(y)!r}") if y.ndim != 3: raise ValueError(f"Expected CHW from resize_pad, got {tuple(y.shape)}") + # Match NIM preprocessing: quantize to uint8 after interpolation + y = torch.clamp(y, 0, 255).to(torch.uint8).float() return y.unsqueeze(0) raise ValueError(f"Expected CHW or BCHW tensor, got shape {tuple(x.shape)}") @@ -133,8 +139,13 @@ def postprocess(self, preds: Union[Dict[str, torch.Tensor], Sequence[Dict[str, t # may pass a *list* of per-image preds for batched inference, so handle both cases # and always return torch tensors (or lists of torch tensors). + # Use a zero threshold so all NMS survivors reach WBF before per-class + # filtering. The real per-class gate is _apply_final_score_filter (after WBF), + # matching the NIM pipeline ordering. 
+ passthrough_thresholds = {k: 0.0 for k in self._model.thresholds_per_class} + def _one(p: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - b_np, l_np, s_np = postprocess_preds_page_element(p, self._model.thresholds_per_class, self._model.labels) + b_np, l_np, s_np = postprocess_preds_page_element(p, passthrough_thresholds, self._model.labels) b = torch.as_tensor(b_np, dtype=torch.float32) l = torch.as_tensor(l_np, dtype=torch.int64) # noqa: E741 s = torch.as_tensor(s_np, dtype=torch.float32) @@ -212,7 +223,7 @@ def output(self) -> Any: "labels": "List[str] - class names", "scores": "np.ndarray[N] - confidence scores", }, - "classes": ["table", "chart", "infographic", "title", "text", "header_footer"], + "classes": ["table", "chart", "title", "infographic", "text", "header_footer"], "post_processing": {"conf_thresh": 0.01, "iou_thresh": 0.5}, } diff --git a/nemo_retriever/src/nemo_retriever/model/local/nemotron_rerank_v2.py b/nemo_retriever/src/nemo_retriever/model/local/nemotron_rerank_v2.py new file mode 100644 index 000000000..eca0ee674 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/model/local/nemotron_rerank_v2.py @@ -0,0 +1,210 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Local wrapper for nvidia/llama-nemotron-rerank-1b-v2 cross-encoder reranker.""" + +from __future__ import annotations + +from typing import List, Optional + +from nemo_retriever.utils.hf_cache import configure_global_hf_cache_base +from ..model import BaseModel, RunMode + + +_DEFAULT_MODEL = "nvidia/llama-nemotron-rerank-1b-v2" +_DEFAULT_MAX_LENGTH = 512 +_DEFAULT_BATCH_SIZE = 32 + + +def _prompt_template(query: str, passage: str) -> str: + """Format a (query, passage) pair as the model expects.""" + return f"question:{query} \n \n passage:{passage}" + + +class NemotronRerankV2(BaseModel): + """ + Local cross-encoder reranker wrapping nvidia/llama-nemotron-rerank-1b-v2. + + The model scores (query, document) pairs and returns raw logits; higher + values indicate greater relevance. It is fine-tuned from + meta-llama/Llama-3.2-1B with bi-directional attention and supports 26 + languages with sequences up to 8 192 tokens. 
+ + Example:: + + reranker = NemotronRerankV2() + scores = reranker.score("What is ML?", ["Machine learning is…", "Paris is…"]) + # scores -> [20.6, -23.1] (higher = more relevant) + """ + + def __init__( + self, + model_name: str = _DEFAULT_MODEL, + device: Optional[str] = None, + hf_cache_dir: Optional[str] = None, + ) -> None: + super().__init__() + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + + configure_global_hf_cache_base() + + self._model_name = model_name + self._device = device or ("cuda" if torch.cuda.is_available() else "cpu") + + kwargs: dict = {"trust_remote_code": True} + if hf_cache_dir: + kwargs["cache_dir"] = hf_cache_dir + + self._tokenizer = AutoTokenizer.from_pretrained( + model_name, + padding_side="left", + **kwargs, + ) + if self._tokenizer.pad_token is None: + self._tokenizer.pad_token = self._tokenizer.eos_token + + self._model = ( + AutoModelForSequenceClassification.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + **kwargs, + ) + .eval() + .to(self._device) + ) + + if self._model.config.pad_token_id is None: + self._model.config.pad_token_id = self._tokenizer.eos_token_id + + # ------------------------------------------------------------------ + # BaseModel abstract properties + # ------------------------------------------------------------------ + + @property + def model_name(self) -> str: + return self._model_name + + @property + def model_type(self) -> str: + return "reranker" + + @property + def model_runmode(self) -> RunMode: + return "local" + + @property + def input(self): + return "List[Tuple[str, str]]" + + @property + def output(self): + return "List[float]" + + @property + def input_batch_size(self) -> int: + return _DEFAULT_BATCH_SIZE + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def score( + self, + query: str, + documents: List[str], + *, + max_length: int = _DEFAULT_MAX_LENGTH, + batch_size: int = _DEFAULT_BATCH_SIZE, + ) -> List[float]: + """ + Score relevance of *documents* to *query*. + + Parameters + ---------- + query: + The search query. + documents: + Candidate passages/documents to score. + max_length: + Tokenizer truncation length (default 512; max supported 8 192). + batch_size: + Number of (query, doc) pairs to process per GPU forward pass. + + Returns + ------- + List[float] + Raw logit scores aligned with *documents* (higher = more relevant). + """ + import torch + + if not documents: + return [] + + texts = [_prompt_template(query, d) for d in documents] + all_scores: List[float] = [] + + with torch.inference_mode(): + for start in range(0, len(texts), batch_size): + chunk = texts[start : start + batch_size] + batch = self._tokenizer( + chunk, + padding=True, + truncation=True, + return_tensors="pt", + max_length=max_length, + ) + batch = {k: v.to(self._device) for k, v in batch.items()} + logits = self._model(**batch).logits + all_scores.extend(logits.view(-1).cpu().tolist()) + + return all_scores + + def score_pairs( + self, + pairs: List[tuple], + *, + max_length: int = _DEFAULT_MAX_LENGTH, + batch_size: int = _DEFAULT_BATCH_SIZE, + ) -> List[float]: + """ + Score a list of (query, document) pairs. + + Parameters + ---------- + pairs: + Sequence of ``(query, document)`` tuples. + max_length: + Tokenizer truncation length. + batch_size: + GPU forward-pass batch size. + + Returns + ------- + List[float] + Raw logit scores (higher = more relevant). 
+ """ + import torch + + if not pairs: + return [] + + texts = [_prompt_template(q, d) for q, d in pairs] + all_scores: List[float] = [] + + with torch.inference_mode(): + for start in range(0, len(texts), batch_size): + chunk = texts[start : start + batch_size] + batch = self._tokenizer( + chunk, + padding=True, + truncation=True, + return_tensors="pt", + max_length=max_length, + ) + batch = {k: v.to(self._device) for k, v in batch.items()} + logits = self._model(**batch).logits + all_scores.extend(logits.view(-1).cpu().tolist()) + + return all_scores diff --git a/nemo_retriever/src/nemo_retriever/model/local/parakeet_ctc_1_1b_asr.py b/nemo_retriever/src/nemo_retriever/model/local/parakeet_ctc_1_1b_asr.py index 06b1cfe77..3138c1d4e 100644 --- a/nemo_retriever/src/nemo_retriever/model/local/parakeet_ctc_1_1b_asr.py +++ b/nemo_retriever/src/nemo_retriever/model/local/parakeet_ctc_1_1b_asr.py @@ -137,32 +137,56 @@ def _ensure_loaded(self) -> None: def transcribe(self, paths: List[str]) -> List[str]: """ - Transcribe one or more audio files to text. + Transcribe one or more audio files to text (batched inference). - Each path is loaded and resampled to 16 kHz mono as required by the model. - Returns one string per path; empty string on load/transcribe failure. + Each path is loaded and resampled to 16 kHz mono; then all are processed + in a single model forward pass. Returns one string per path; empty string + on load/transcribe failure. tokens are removed via skip_special_tokens and/or post-processing. """ self._ensure_loaded() - results: List[str] = [] + audios: List[Optional[np.ndarray]] = [] for path in paths: audio = _load_audio_16k(path) - if audio is None: - results.append("") - continue - results.append(self._transcribe_audio(audio) or "") - return results + audios.append(audio) + return self.transcribe_audios(audios) - def _transcribe_audio(self, audio: np.ndarray) -> str: - if self._model is None or self._processor is None: - return "" + def transcribe_audios(self, audios: List[Optional[np.ndarray]]) -> List[str]: + """ + Transcribe a batch of audio arrays (16 kHz mono float32) in one forward pass. + + Each element can be np.ndarray or None; None yields "" in the output. + Returns one string per input; empty string for None or on failure. 
+ """ + self._ensure_loaded() + valid: List[np.ndarray] = [] + indices: List[int] = [] + for i, audio in enumerate(audios): + if audio is not None and audio.size > 0: + valid.append(audio) + indices.append(i) + if not valid: + return [""] * len(audios) + try: + transcripts = self._transcribe_audio_batch(valid) + except Exception as e: + logger.warning("ASR (transformers) batch failed: %s", e) + transcripts = [""] * len(valid) + # Map back to original order; empty string for missing/failed + result = [""] * len(audios) + for idx, text in zip(indices, transcripts): + result[idx] = _strip_pad_from_transcript((text or "").strip()) + return result + + def _transcribe_audio_batch(self, audios: List[np.ndarray]) -> List[str]: + """Single forward pass for a list of audio arrays; returns one string per array.""" + if self._model is None or self._processor is None or not audios: + return [""] * len(audios) try: import torch - # Single sample: wrap in list for processor - speech = [audio] inputs = self._processor( - speech, + audios, sampling_rate=self._processor.feature_extractor.sampling_rate, return_tensors="pt", padding=True, @@ -170,11 +194,13 @@ def _transcribe_audio(self, audio: np.ndarray) -> str: inputs = inputs.to(self._model.device, dtype=self._model.dtype) with torch.no_grad(): outputs = self._model.generate(**inputs) - # batch_decode with skip_special_tokens to drop pad tokens decoded = self._processor.batch_decode(outputs, skip_special_tokens=True) - text = decoded[0] if decoded else "" - # Fallback: strip any remaining and normalize spaces - return _strip_pad_from_transcript(text.strip()) + return [t.strip() for t in decoded] except Exception as e: - logger.warning("ASR (transformers) failed: %s", e) - return "" + logger.warning("ASR (transformers) batch failed: %s", e) + return [""] * len(audios) + + def _transcribe_audio(self, audio: np.ndarray) -> str: + """Single-sample path for API compatibility; delegates to batch.""" + results = self._transcribe_audio_batch([audio]) + return results[0] if results else "" diff --git a/nemo_retriever/src/nemo_retriever/ocr/ocr.py b/nemo_retriever/src/nemo_retriever/ocr/ocr.py index 8ef24ebfe..34ae7258b 100644 --- a/nemo_retriever/src/nemo_retriever/ocr/ocr.py +++ b/nemo_retriever/src/nemo_retriever/ocr/ocr.py @@ -337,16 +337,29 @@ def _parse_ocr_result(preds: Any) -> List[Dict[str, Any]]: def _blocks_to_text(blocks: List[Dict[str, Any]]) -> str: - """Sort text blocks by reading order (y then x) and join with newlines.""" + """Sort text blocks by reading order (y then x) and join with whitespace.""" blocks.sort(key=lambda b: (b.get("sort_y", 0.0), b.get("sort_x", 0.0))) - return "\n".join(b["text"] for b in blocks if b.get("text")) + return " ".join(b["text"] for b in blocks if b.get("text")) -def _blocks_to_pseudo_markdown(blocks: List[Dict[str, Any]]) -> str: +def _blocks_to_pseudo_markdown( + blocks: List[Dict[str, Any]], + crop_hw: Tuple[int, int] = (0, 0), +) -> str: """Convert OCR text blocks into pseudo-markdown table format. - Uses DBSCAN clustering on y-coordinates to identify rows, then + Uses DBSCAN clustering on pixel y-coordinates to identify rows, then sorts within each row by x-coordinate and joins with pipe separators. + + Parameters + ---------- + blocks : list of dict + OCR text blocks with ``sort_y`` (normalised [0,1]) and ``sort_x``. + crop_hw : (height, width) + Pixel dimensions of the crop image. 
When provided the normalised + ``sort_y`` values are scaled to pixels and clustered with + ``eps=10`` (matching nv-ingest behaviour). Falls back to the old + normalised-space heuristic when the height is unavailable. """ if not blocks: return "" @@ -358,19 +371,27 @@ def _blocks_to_pseudo_markdown(blocks: List[Dict[str, Any]]) -> str: from sklearn.cluster import DBSCAN df = pd.DataFrame(valid) + df = df.sort_values("sort_y") - # Normalize y-coordinates to [0,1] for scale-invariant clustering. y_vals = df["sort_y"].values - y_range = y_vals.max() - y_vals.min() - if y_range > 0: - y_norm = (y_vals - y_vals.min()) / y_range - eps = 0.03 # ~3% of bbox height ≈ one text line + crop_h = crop_hw[0] if crop_hw else 0 + + if crop_h > 0: + # Pixel-space clustering (matches nv-ingest eps=10). + y_pixels = (y_vals * crop_h).astype(int) + eps = 10 else: - y_norm = y_vals - eps = 0.1 + # Fallback: normalise to [0,1] when pixel dims are unknown. + y_range = y_vals.max() - y_vals.min() + if y_range > 0: + y_pixels = (y_vals - y_vals.min()) / y_range + eps = 0.03 + else: + y_pixels = y_vals + eps = 0.1 dbscan = DBSCAN(eps=eps, min_samples=1) - dbscan.fit(y_norm.reshape(-1, 1)) + dbscan.fit(y_pixels.reshape(-1, 1)) df["cluster"] = dbscan.labels_ df = df.sort_values(["cluster", "sort_x"]) @@ -574,7 +595,15 @@ def ocr_page_elements( blocks = _parse_ocr_result(preds) if label_name == "table": - text = _blocks_to_pseudo_markdown(blocks) or _blocks_to_text(blocks) + crop_hw_table: Tuple[int, int] = (0, 0) + try: + _raw = base64.b64decode(crop_b64s[i]) + with Image.open(io.BytesIO(_raw)) as _cim: + _cw, _ch = _cim.size + crop_hw_table = (_ch, _cw) + except Exception: + pass + text = _blocks_to_pseudo_markdown(blocks, crop_hw=crop_hw_table) or _blocks_to_text(blocks) else: text = _blocks_to_text(blocks) entry = {"bbox_xyxy_norm": bbox, "text": text} @@ -615,7 +644,7 @@ def _append_local_result( return blocks = _parse_ocr_result(preds) if label_name == "table": - text = _blocks_to_pseudo_markdown(blocks) + text = _blocks_to_pseudo_markdown(blocks, crop_hw=crop_hw) if not text: text = _blocks_to_text(blocks) else: diff --git a/nemo_retriever/src/nemo_retriever/page_elements/page_elements.py b/nemo_retriever/src/nemo_retriever/page_elements/page_elements.py index fc9052ddf..09a3179f9 100644 --- a/nemo_retriever/src/nemo_retriever/page_elements/page_elements.py +++ b/nemo_retriever/src/nemo_retriever/page_elements/page_elements.py @@ -34,10 +34,12 @@ from nv_ingest_api.internal.primitives.nim.model_interface.yolox import ( postprocess_page_elements_v3, YOLOX_PAGE_V3_CLASS_LABELS, + YOLOX_PAGE_V3_FINAL_SCORE, ) except ImportError: postprocess_page_elements_v3 = None # type: ignore[assignment,misc] YOLOX_PAGE_V3_CLASS_LABELS = None # type: ignore[assignment] + YOLOX_PAGE_V3_FINAL_SCORE = {} # type: ignore[assignment] from nemo_retriever.nim.nim import invoke_page_elements_batches @@ -123,6 +125,10 @@ def _decode_b64_image_to_np_array(image_b64: str) -> Tuple["np.array", Tuple[int im = im0.convert("RGB") w, h = im.size arr = np.array(im) + # The NIM container receives BGR images (PNG encoded from BGR numpy + # arrays) and decodes the raw channels as-is, so the model effectively + # runs on BGR input. Match that here by reversing the channel order. 
+ arr = arr[:, :, ::-1].copy() return arr, (int(h), int(w)) @@ -339,6 +345,25 @@ def _bounding_boxes_to_detections( return dets +def _apply_final_score_filter( + dets: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Filter detections by per-class final score thresholds (YOLOX_PAGE_V3_FINAL_SCORE). + + This should be applied **after** WBF post-processing to match the NIM pipeline ordering. + Maps retriever label "text" to API label "paragraph" for threshold lookup. + """ + if not YOLOX_PAGE_V3_FINAL_SCORE or not dets: + return dets + filtered: List[Dict[str, Any]] = [] + for d in dets: + api_name = _RETRIEVER_TO_API.get(d["label_name"], d["label_name"]) + threshold = YOLOX_PAGE_V3_FINAL_SCORE.get(api_name, 0.0) + if d.get("score") is not None and d["score"] >= threshold: + filtered.append(d) + return filtered + + def _apply_page_elements_v3_postprocess( dets: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: @@ -495,7 +520,11 @@ def detect_page_elements_v3( if model is not None and hasattr(model, "thresholds_per_class"): thresholds_per_class = getattr(model, "thresholds_per_class") else: - thresholds_per_class = [0.0 for _ in label_names] + # Use the same per-class thresholds as the yolox pipeline. + # label_names uses "text" where yolox uses "paragraph"; _RETRIEVER_TO_API maps between them. + thresholds_per_class = [ + YOLOX_PAGE_V3_FINAL_SCORE.get(_RETRIEVER_TO_API.get(name, name), 0.0) for name in label_names + ] for _, row in pages_df.iterrows(): try: @@ -671,6 +700,7 @@ def detect_page_elements_v3( labels_list.append(torch.as_tensor(l_np, dtype=torch.int64)) scores_list.append(torch.as_tensor(s_np, dtype=torch.float32)) boxes, labels, scores = boxes_list, labels_list, scores_list + per_image_dets = _postprocess_to_per_image_detections( boxes=boxes, labels=labels, @@ -678,8 +708,10 @@ def detect_page_elements_v3( batch_size=len(pre_list), label_names=label_names, ) - # Apply v3 postprocessing (box fusion, title matching, expansion, overlap removal) + # Apply v3 postprocessing (box fusion via WBF at iou=0.01, title matching, expansion, overlap removal) per_image_dets = [_apply_page_elements_v3_postprocess(dets) for dets in per_image_dets] + # Apply per-class final score filtering AFTER WBF (matches NIM pipeline ordering) + per_image_dets = [_apply_final_score_filter(dets) for dets in per_image_dets] for local_i, row_i in enumerate(chunk_idx): dets = per_image_dets[local_i] if local_i < len(per_image_dets) else [] row_payloads[row_i] = { diff --git a/nemo_retriever/src/nemo_retriever/params/models.py b/nemo_retriever/src/nemo_retriever/params/models.py index d6f407cd0..1f81e38e0 100644 --- a/nemo_retriever/src/nemo_retriever/params/models.py +++ b/nemo_retriever/src/nemo_retriever/params/models.py @@ -65,7 +65,7 @@ class PdfSplitParams(_ParamsModel): class TextChunkParams(_ParamsModel): - max_tokens: int = 512 + max_tokens: int = 1024 overlap_tokens: int = 0 tokenizer_model_id: Optional[str] = None encoding: str = "utf-8" @@ -167,6 +167,7 @@ class ExtractParams(_ParamsModel): dpi: int = 200 image_format: str = "jpeg" jpeg_quality: int = 100 + render_mode: Literal["full_dpi", "fit_to_model"] = "fit_to_model" inference_batch_size: int = 8 ocr_model_dir: Optional[str] = None diff --git a/nemo_retriever/src/nemo_retriever/pdf/extract.py b/nemo_retriever/src/nemo_retriever/pdf/extract.py index af2a92a8c..992c18ebe 100644 --- a/nemo_retriever/src/nemo_retriever/pdf/extract.py +++ b/nemo_retriever/src/nemo_retriever/pdf/extract.py @@ -6,7 +6,7 @@ from io import BytesIO from dataclasses 
import dataclass -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Literal, Optional, Tuple import base64 import traceback @@ -33,6 +33,37 @@ except Exception: # pragma: no cover np = None # type: ignore[assignment] +# Default model input size used by nv-ingest for page-element detection. +_MODEL_INPUT_SIZE: Tuple[int, int] = (1024, 1024) + +# Allowed render-mode values. +RenderMode = Literal["full_dpi", "fit_to_model"] + + +def _compute_fit_to_model_scale( + page: Any, + target_wh: Tuple[int, int] = _MODEL_INPUT_SIZE, + max_dpi: int = 300, +) -> float: + """Compute a pdfium render scale that fits the page within *target_wh* pixels. + + This mirrors the logic in ``nv_ingest_api.util.pdf.pdfium._compute_render_scale_to_fit`` + combined with the ``min(base_scale, fit_scale)`` cap applied in + ``pdfium_pages_to_numpy`` when ``scale_tuple`` is provided. + + For a US-Letter page (612×792 pt) fitting into 1024×1024 the result is + ``min(300/72, min(1024/612, 1024/792)) ≈ 1.293`` → ~93 effective DPI. + """ + target_w, target_h = target_wh + page_w = float(page.get_width()) + page_h = float(page.get_height()) + if page_w <= 0 or page_h <= 0 or target_w <= 0 or target_h <= 0: + return max(float(max_dpi) / 72.0, 0.01) + + fit_scale = max(min(target_w / page_w, target_h / page_h), 1e-3) + base_scale = max(float(max_dpi) / 72.0, 0.01) + return min(base_scale, fit_scale) + def _render_page_to_base64( page: Any, @@ -40,17 +71,28 @@ def _render_page_to_base64( dpi: int = 200, image_format: str = "jpeg", jpeg_quality: int = 100, + render_mode: RenderMode = "fit_to_model", ) -> Dict[str, Any]: - """ - Render a page at full DPI and encode as JPEG or PNG. + """Render a page and encode as JPEG or PNG. + + Parameters + ---------- + render_mode: + ``"full_dpi"`` – render at *dpi* (default 300 → 2550×3300 for US Letter). + ``"fit_to_model"`` – render at the nv-ingest fit-to-1024 scale (~93 DPI + for US Letter) so the raster is already close to the model's input size, + avoiding a large bilinear down-scale in ``resize_pad``. Returns dict with: - image_b64: str - encoding: str ("jpeg" or "png") - orig_shape_hw: tuple[int,int] (H,W) of the rendered raster """ - base_scale = max(float(dpi) / 72.0, 0.01) - bitmap = page.render(scale=base_scale) + if render_mode == "fit_to_model": + render_scale = _compute_fit_to_model_scale(page, _MODEL_INPUT_SIZE, max_dpi=dpi) + else: + render_scale = max(float(dpi) / 72.0, 0.01) + bitmap = page.render(scale=render_scale) arr = convert_bitmap_to_corrected_numpy(bitmap) @@ -144,6 +186,7 @@ def pdf_extraction( jpeg_quality: int = 100, text_extraction_method: str = "pdfium_hybrid", text_depth: str = "page", + render_mode: RenderMode = "fit_to_model", **kwargs: Any, ) -> Any: """ @@ -250,6 +293,7 @@ def pdf_extraction( dpi=dpi, image_format=image_format, jpeg_quality=jpeg_quality, + render_mode=render_mode, ) page_record: Dict[str, Any] = { diff --git a/nemo_retriever/src/nemo_retriever/pdf/stage.py b/nemo_retriever/src/nemo_retriever/pdf/stage.py index d34313697..be39cb332 100644 --- a/nemo_retriever/src/nemo_retriever/pdf/stage.py +++ b/nemo_retriever/src/nemo_retriever/pdf/stage.py @@ -145,6 +145,7 @@ def _normalize_page_elements_config(raw: Dict[str, Any]) -> Dict[str, Any]: outputs, "json_output_dir", "json-output-dir" ) out["limit"] = _cfg_get(raw, "limit") + out["render_mode"] = _cfg_get(raw, "render_mode") # Drop Nones so "not specified" stays not specified. 
return {k: v for k, v in out.items() if v is not None} @@ -522,6 +523,14 @@ def render_page_elements( "--text-depth", help="Text depth for extracted text primitives: 'page' or 'document'.", ), + render_mode: str = typer.Option( + "fit_to_model", + "--render-mode", + help=( + "Page rendering mode: 'full_dpi' (render at DPI then resize_pad) or " + "'fit_to_model' (render at nv-ingest fit-to-1024 scale, ~93 DPI for US Letter)." + ), + ), write_json_outputs: bool = typer.Option( True, "--write-json-outputs/--no-write-json-outputs", @@ -583,6 +592,9 @@ def render_page_elements( if not _argv_has_any(["--text-depth"]): text_depth = str(cfg_raw.get("text_depth", text_depth)) + if not _argv_has_any(["--render-mode"]): + render_mode = str(cfg_raw.get("render_mode", render_mode)) + if not _argv_has_any(["--write-json-outputs", "--no-write-json-outputs"]): write_json_outputs = bool(cfg_raw.get("write_json_outputs", write_json_outputs)) if not _argv_has_any(["--json-output-dir"]): diff --git a/nemo_retriever/src/nemo_retriever/recall/core.py b/nemo_retriever/src/nemo_retriever/recall/core.py index d5174b968..882e3722b 100644 --- a/nemo_retriever/src/nemo_retriever/recall/core.py +++ b/nemo_retriever/src/nemo_retriever/recall/core.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Tuple +from nemo_retriever.retriever import Retriever import json logger = logging.getLogger(__name__) @@ -48,6 +49,10 @@ class RecallConfig: # - pdf_page: compare on "{pdf}_{page}" keys # - pdf_only: compare on "{pdf}" document keys match_mode: str = "pdf_page" + reranker: Optional[str] = None + reranker_endpoint: Optional[str] = None + reranker_api_key: str = "" + reranker_batch_size: int = 32 def _normalize_pdf_name(value: str) -> str: @@ -179,81 +184,21 @@ def _embed_queries_local_hf( return vecs.detach().to("cpu").tolist() -def _search_lancedb( - *, - lancedb_uri: str, - table_name: str, - query_vectors: List[List[float]], - top_k: int, - vector_column_name: str = "vector", - nprobes: int = 0, - refine_factor: int = 10, - query_texts: Optional[List[str]] = None, - hybrid: bool = False, -) -> List[List[Dict[str, Any]]]: - import lancedb # type: ignore - - db = lancedb.connect(lancedb_uri) - table = db.open_table(table_name) - - # Determine nprobes: 0 means "search all partitions" for exhaustive ANN search. - # Read the actual partition count from the index so we don't hard-code it. 
- effective_nprobes = nprobes - if effective_nprobes <= 0: - try: - indices = table.list_indices() - for idx in indices: - np_ = getattr(idx, "num_partitions", None) - if np_ and int(np_) > 0: - effective_nprobes = int(np_) - break - except Exception: - pass - if effective_nprobes <= 0: - effective_nprobes = 16 # safe fallback matching default index config - - results: List[List[Dict[str, Any]]] = [] - for i, v in enumerate(query_vectors): - q = np.asarray(v, dtype="float32") - - if hybrid and query_texts is not None: - from lancedb.rerankers import RRFReranker # type: ignore - - text = query_texts[i] - hits = ( - table.search(query_type="hybrid") - .vector(q) - .text(text) - .nprobes(effective_nprobes) - .refine_factor(refine_factor) - .select(["text", "metadata", "source", "page_number"]) - .limit(top_k) - .rerank(RRFReranker()) - .to_list() - ) - else: - hits = ( - table.search(q, vector_column_name=vector_column_name) - .nprobes(effective_nprobes) - .refine_factor(refine_factor) - .select(["text", "metadata", "source", "page_number", "_distance"]) - .limit(top_k) - .to_list() - ) - - results.append(hits) - return results - - def _hits_to_keys(raw_hits: List[List[Dict[str, Any]]]) -> List[List[str]]: retrieved_keys: List[List[str]] = [] for hits in raw_hits: keys: List[str] = [] for h in hits: + page_number = h["page_number"] + source = h["source"] page_number = h["page_number"] source = h["source"] # Prefer explicit `pdf_page` column; fall back to derived form. # if res.get("page_number") is not None and source.get("source_id"): + if page_number is not None and source: + filename = Path(source).stem + keys.append(f"{filename}_{str(page_number)}") + # if res.get("page_number") is not None and source.get("source_id"): if page_number is not None and source: filename = Path(source).stem keys.append(f"{filename}_{str(page_number)}") @@ -359,35 +304,34 @@ def retrieve_and_score( queries = df_query["query"].astype(str).tolist() gold = df_query["golden_answer"].astype(str).tolist() - endpoint, use_grpc = _resolve_embedding_endpoint(cfg) - if endpoint is not None and use_grpc is not None: - vectors = _embed_queries_nim( - queries, - endpoint=endpoint, - model=cfg.embedding_model, - api_key=cfg.embedding_api_key, - grpc=bool(use_grpc), - ) - else: - vectors = _embed_queries_local_hf( - queries, - device=cfg.local_hf_device, - cache_dir=cfg.local_hf_cache_dir, - batch_size=int(cfg.local_hf_batch_size), - model_name=cfg.embedding_model, - ) - raw_hits = _search_lancedb( + retriever = Retriever( lancedb_uri=cfg.lancedb_uri, - table_name=cfg.lancedb_table, - query_vectors=vectors, - top_k=int(cfg.top_k), - vector_column_name=vector_column_name, - nprobes=int(cfg.nprobes), - refine_factor=int(cfg.refine_factor), - query_texts=queries, + lancedb_table=cfg.lancedb_table, + embedder=cfg.embedding_model or "nvidia/llama-nemotron-embed-1b-v2", + embedding_http_endpoint=cfg.embedding_http_endpoint, + embedding_api_key=cfg.embedding_api_key, + top_k=cfg.top_k, + nprobes=cfg.nprobes, + refine_factor=cfg.refine_factor, hybrid=bool(cfg.hybrid), + local_hf_device=cfg.local_hf_device, + local_hf_cache_dir=cfg.local_hf_cache_dir, + local_hf_batch_size=cfg.local_hf_batch_size, + reranker=cfg.reranker, + reranker_endpoint=cfg.reranker_endpoint, + reranker_api_key=cfg.reranker_api_key, + reranker_batch_size=cfg.reranker_batch_size, ) + start = time.time() + raw_hits = retriever.queries(queries) + end_queries = time.time() - start + print( + f"Retrieval time for {len(queries)} ", + f"queries: {end_queries:.2f} 
seconds ", + f"(average {len(queries)/end_queries:.2f} queries/second)", + ) + retrieved_keys = _hits_to_keys(raw_hits) metrics = { f"recall@{k}": _recall_at_k(gold, retrieved_keys, int(k), match_mode=str(cfg.match_mode)) for k in cfg.ks diff --git a/nemo_retriever/src/nemo_retriever/rerank/__init__.py b/nemo_retriever/src/nemo_retriever/rerank/__init__.py new file mode 100644 index 000000000..988355cdd --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/rerank/__init__.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Reranking stage using nvidia/llama-nemotron-rerank-1b-v2. + +Exports +------- +NemotronRerankActor + Ray Data-compatible stateful actor that initialises the cross-encoder once + per worker and scores (query, document) pairs in batch DataFrames. +rerank_hits + Convenience function to rerank a list of LanceDB hit dicts for a single + query string, using either a local ``NemotronRerankV2`` model or a remote + vLLM / NIM ``/rerank`` endpoint. +""" + +from .rerank import NemotronRerankActor, rerank_hits + +__all__ = [ + "NemotronRerankActor", + "rerank_hits", +] diff --git a/nemo_retriever/src/nemo_retriever/rerank/rerank.py b/nemo_retriever/src/nemo_retriever/rerank/rerank.py new file mode 100644 index 000000000..189b56a89 --- /dev/null +++ b/nemo_retriever/src/nemo_retriever/rerank/rerank.py @@ -0,0 +1,377 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Reranking stage using nvidia/llama-nemotron-rerank-1b-v2. + +Provides: + - ``rerank_hits`` – rerank a list of LanceDB hits for a single query + - ``NemotronRerankActor`` – Ray Data-compatible stateful actor for batch DataFrames + +Remote endpoint +--------------- +When ``invoke_url`` is set the actor/function calls a vLLM (>=0.14) or NIM +server that exposes the OpenAI-compatible ``/rerank`` REST API:: + + POST /rerank + { + "model": "nvidia/llama-nemotron-rerank-1b-v2", + "query": "...", + "documents": ["...", "..."], + "top_n": N + } + +Local model +----------- +When no endpoint is configured the model is loaded directly from HuggingFace +(or ``hf_cache_dir``) using ``NemotronRerankV2``. + +Ray Data actor usage:: + + import ray + ds = ds.map_batches( + NemotronRerankActor, + batch_size=64, + batch_format="pandas", + num_gpus=1, + compute=ray.data.ActorPoolStrategy(size=4), + fn_constructor_kwargs={ + "model_name": "nvidia/llama-nemotron-rerank-1b-v2", + "query_column": "query", + "text_column": "text", + "score_column": "rerank_score", + "max_length": 512, + "batch_size": 32, + }, + ) +""" + +from __future__ import annotations + +import traceback +from typing import Any, Dict, List, Optional + +import pandas as pd + + +_DEFAULT_MODEL = "nvidia/llama-nemotron-rerank-1b-v2" +_DEFAULT_MAX_LENGTH = 512 +_DEFAULT_BATCH_SIZE = 32 +_SCORE_COLUMN = "rerank_score" + + +# --------------------------------------------------------------------------- +# Remote endpoint helper +# --------------------------------------------------------------------------- + + +def _rerank_via_endpoint( + query: str, + documents: List[str], + *, + endpoint: str, + model_name: str = _DEFAULT_MODEL, + api_key: str = "", + top_n: Optional[int] = None, +) -> List[float]: + """ + Call a vLLM / NIM ``/rerank`` REST endpoint and return per-document scores. 
+ + The server must expose the OpenAI-compatible rerank API introduced in + vLLM >= 0.14.0:: + + POST {endpoint}/rerank + {"model": ..., "query": ..., "documents": [...], "top_n": N} + + Returns + ------- + List[float] + Scores aligned with *documents* (higher = more relevant). + Documents not returned by ``top_n`` truncation receive ``-inf``. + """ + import requests + + url = endpoint.rstrip("/") + "/rerank" + headers: Dict[str, str] = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + payload: Dict[str, Any] = { + "model": model_name, + "query": query, + "documents": documents, + } + if top_n is not None: + payload["top_n"] = top_n + + response = requests.post(url, json=payload, headers=headers, timeout=120) + response.raise_for_status() + data = response.json() + + # Build score list aligned with input document order. + scores = [float("-inf")] * len(documents) + for item in data.get("results", []): + idx = item.get("index") + score = item.get("relevance_score") + if idx is not None and score is not None: + scores[idx] = float(score) + return scores + + +# --------------------------------------------------------------------------- +# Public helper: rerank LanceDB hits for a single query +# --------------------------------------------------------------------------- + + +def rerank_hits( + query: str, + hits: List[Dict[str, Any]], + *, + model: Optional[Any] = None, + invoke_url: Optional[str] = None, + model_name: str = _DEFAULT_MODEL, + api_key: str = "", + max_length: int = _DEFAULT_MAX_LENGTH, + batch_size: int = _DEFAULT_BATCH_SIZE, + top_n: Optional[int] = None, + text_key: str = "text", +) -> List[Dict[str, Any]]: + """ + Rerank *hits* (list of LanceDB result dicts) by relevance to *query*. + + Each hit that has a ``text_key`` field is scored; hits without text are + placed at the end. The returned list is sorted highest-score first and + each dict gains a ``"_rerank_score"`` field. + + Parameters + ---------- + query: + The search query. + hits: + LanceDB result dicts (as returned by ``Retriever.queries()``). + model: + A ``NemotronRerankV2`` instance (local GPU inference). Ignored when + *invoke_url* is set. + invoke_url: + Base URL of a vLLM / NIM ``/rerank`` endpoint. Takes priority over + *model*. + model_name: + Model identifier sent to the remote endpoint (default + ``"nvidia/llama-nemotron-rerank-1b-v2"``). + api_key: + Bearer token for the remote endpoint. + max_length: + Tokenizer truncation length for local inference (max 8 192). + batch_size: + GPU forward-pass batch size for local inference. + top_n: + If set, only the top-N results (after reranking) are returned. + text_key: + Dict key used to extract document text from each hit (default + ``"text"``). + + Returns + ------- + List[dict] + Hits sorted by ``"_rerank_score"`` descending. Each dict has a new + ``"_rerank_score"`` key with the raw logit (local) or relevance score + (remote). 
+ """ + if not hits: + return hits + + documents = [str(h.get(text_key) or "") for h in hits] + + if invoke_url: + scores = _rerank_via_endpoint( + query, + documents, + endpoint=invoke_url, + model_name=model_name, + api_key=api_key, + ) + elif model is not None: + scores = model.score(query, documents, max_length=max_length, batch_size=batch_size) + else: + raise ValueError("Either 'model' (NemotronRerankV2 instance) or 'invoke_url' must be provided.") + + ranked = sorted( + [{"_rerank_score": s, **h} for s, h in zip(scores, hits)], + key=lambda x: x["_rerank_score"], + reverse=True, + ) + + if top_n is not None: + ranked = ranked[:top_n] + + return ranked + + +# --------------------------------------------------------------------------- +# Error payload helper (mirrors other actors in this project) +# --------------------------------------------------------------------------- + + +def _error_payload(*, stage: str, exc: BaseException) -> Dict[str, Any]: + return { + "status": "error", + "stage": stage, + "error_message": str(exc), + "traceback": traceback.format_exc(), + } + + +# --------------------------------------------------------------------------- +# Ray Data actor +# --------------------------------------------------------------------------- + + +class NemotronRerankActor: + """ + Ray Data-compatible stateful actor for cross-encoder reranking. + + Initialises ``nvidia/llama-nemotron-rerank-1b-v2`` **once** per actor + instance and reuses it across batches, avoiding repeated model loads. + + Each row in the input DataFrame is expected to have a *query* column and a + *text* (document) column. The actor appends a ``rerank_score`` column + (name configurable) with the raw logit score. + + Usage with Ray Data:: + + import ray + ds = ds.map_batches( + NemotronRerankActor, + batch_size=64, + batch_format="pandas", + num_gpus=1, + compute=ray.data.ActorPoolStrategy(size=4), + fn_constructor_kwargs={ + "model_name": "nvidia/llama-nemotron-rerank-1b-v2", + "query_column": "query", + "text_column": "text", + "score_column": "rerank_score", + "max_length": 512, + "batch_size": 32, + }, + ) + + Parameters + ---------- + model_name: + HuggingFace model ID (default ``"nvidia/llama-nemotron-rerank-1b-v2"``). + invoke_url: + Base URL of a vLLM / NIM ``/rerank`` endpoint. When set the actor + skips local model creation and delegates all scoring to the endpoint. + Also accepted as ``rerank_invoke_url``. + api_key: + Bearer token for the remote endpoint. + device: + Torch device string (default: ``"cuda"`` if available, else ``"cpu"``). + hf_cache_dir: + Directory for HuggingFace model cache. + query_column: + DataFrame column containing query strings (default ``"query"``). + text_column: + DataFrame column containing document/passage text (default ``"text"``). + score_column: + Output column name for rerank scores (default ``"rerank_score"``). + max_length: + Tokenizer truncation length (default 512). + batch_size: + GPU forward-pass micro-batch size (default 32). + sort_results: + If ``True`` (default) rows in each batch are sorted by score descending. 
+ """ + + __slots__ = ("_kwargs", "_model") + + def __init__(self, **kwargs: Any) -> None: + self._kwargs = dict(kwargs) + + invoke_url = str(self._kwargs.get("rerank_invoke_url") or self._kwargs.get("invoke_url") or "").strip() + if invoke_url and "invoke_url" not in self._kwargs: + self._kwargs["invoke_url"] = invoke_url + + if invoke_url: + self._model = None + else: + from nemo_retriever.model.local import NemotronRerankV2 + + self._model = NemotronRerankV2( + model_name=str(self._kwargs.get("model_name", _DEFAULT_MODEL)), + device=self._kwargs.get("device") or None, + hf_cache_dir=str(self._kwargs["hf_cache_dir"]) if self._kwargs.get("hf_cache_dir") else None, + ) + + def __call__(self, batch_df: Any, **override_kwargs: Any) -> Any: + try: + return _rerank_batch(batch_df, model=self._model, **self._kwargs, **override_kwargs) + except BaseException as exc: + if isinstance(batch_df, pd.DataFrame): + out = batch_df.copy() + payload = _error_payload(stage="actor_call", exc=exc) + score_col = str(self._kwargs.get("score_column", _SCORE_COLUMN)) + out[score_col] = [payload for _ in range(len(out.index))] + return out + return [{"rerank_score": _error_payload(stage="actor_call", exc=exc)}] + + +# --------------------------------------------------------------------------- +# Batch processing function (called by actor and usable standalone) +# --------------------------------------------------------------------------- + + +def _rerank_batch( + batch_df: pd.DataFrame, + *, + model: Optional[Any] = None, + invoke_url: Optional[str] = None, + model_name: str = _DEFAULT_MODEL, + api_key: str = "", + query_column: str = "query", + text_column: str = "text", + score_column: str = _SCORE_COLUMN, + max_length: int = _DEFAULT_MAX_LENGTH, + batch_size: int = _DEFAULT_BATCH_SIZE, + sort_results: bool = True, + **_ignored: Any, +) -> pd.DataFrame: + """ + Score each (query, document) row in *batch_df* and append *score_column*. + + When *sort_results* is ``True`` the returned DataFrame is sorted by score + descending within the batch. + """ + if not isinstance(batch_df, pd.DataFrame): + raise TypeError(f"Expected a pandas DataFrame, got {type(batch_df)}") + + queries = batch_df[query_column].tolist() + texts = batch_df[text_column].tolist() + pairs = list(zip(queries, texts)) + + if invoke_url: + # Remote endpoint: score pair-by-pair (each row may have a different query). 
+ scores: List[float] = [] + for q, d in pairs: + row_scores = _rerank_via_endpoint( + q, + [d], + endpoint=invoke_url, + model_name=model_name, + api_key=api_key, + ) + scores.append(row_scores[0]) + elif model is not None: + scores = model.score_pairs(pairs, max_length=max_length, batch_size=batch_size) + else: + raise ValueError("Either 'model' or 'invoke_url' must be provided to NemotronRerankActor.") + + out = batch_df.copy() + out[score_column] = scores + + if sort_results: + out = out.sort_values(score_column, ascending=False).reset_index(drop=True) + + return out diff --git a/nemo_retriever/src/nemo_retriever/retriever.py b/nemo_retriever/src/nemo_retriever/retriever.py index bffd35cf0..aab11b519 100644 --- a/nemo_retriever/src/nemo_retriever/retriever.py +++ b/nemo_retriever/src/nemo_retriever/retriever.py @@ -4,14 +4,39 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import Any, Optional, Sequence +from tqdm import tqdm @dataclass class Retriever: - """Simple query helper over LanceDB with configurable embedders.""" + """Simple query helper over LanceDB with configurable embedders. + + Retrieval pipeline + ------------------ + 1. Embed query strings (NIM endpoint or local HuggingFace model). + 2. Search LanceDB (vector or hybrid vector+BM25). + 3. Optionally rerank the results with ``nvidia/llama-nemotron-rerank-1b-v2`` + (NIM/vLLM endpoint or local HuggingFace model). + + Reranking + --------- + Set ``reranker`` to a model name (e.g. + ``"nvidia/llama-nemotron-rerank-1b-v2"``) to enable post-retrieval + reranking. Results are re-sorted by the cross-encoder score and a + ``"_rerank_score"`` key is added to each hit dict. + + Use ``reranker_endpoint`` to delegate to a running vLLM (>=0.14) or NIM + server instead of loading the model locally:: + + retriever = Retriever( + reranker="nvidia/llama-nemotron-rerank-1b-v2", + reranker_endpoint="http://localhost:8000", + ) + results = retriever.query("What is machine learning?") + """ lancedb_uri: str = "lancedb" lancedb_table: str = "nv-ingest" @@ -27,6 +52,25 @@ class Retriever: local_hf_device: Optional[str] = None local_hf_cache_dir: Optional[Path] = None local_hf_batch_size: int = 64 + # Reranking ----------------------------------------------------------- + reranker: Optional[bool] = False + """True to enable reranking with the default model, will use the reranker_model_name as hf model""" + reranker_model_name: Optional[str] = "nvidia/llama-nemotron-rerank-1b-v2" + """HuggingFace model ID for local reranking (e.g. 'nvidia/llama-nemotron-rerank-1b-v2'). + Set to None to skip reranking (default).""" + reranker_endpoint: Optional[str] = None + """Base URL of a vLLM / NIM /rerank endpoint. Takes priority over local model.""" + reranker_api_key: str = "" + """Bearer token for the remote rerank endpoint.""" + reranker_max_length: int = 512 + """Tokenizer truncation length for local reranking (max 8 192).""" + reranker_batch_size: int = 32 + """GPU micro-batch size for local reranking.""" + reranker_refine_factor: int = 4 + """Number of candidates to rerank = top_k * reranker_refine_factor. + Set to 1 to rerank only the top_k results.""" + # Internal cache for the local rerank model (not part of the public API). 
+ _reranker_model: Any = field(default=None, init=False, repr=False, compare=False) def _resolve_embedding_endpoint(self) -> Optional[str]: http_ep = self.embedding_http_endpoint.strip() if isinstance(self.embedding_http_endpoint, str) else None @@ -107,6 +151,8 @@ def _search_lancedb( results: list[list[dict[str, Any]]] = [] for i, vector in enumerate(query_vectors): q = np.asarray(vector, dtype="float32") + # doubling top_k for both hybrid and dense search in order to have more to rerank + top_k = self.top_k if not self.reranker else self.top_k * self.reranker_refine_factor if self.hybrid: from lancedb.rerankers import RRFReranker # type: ignore @@ -116,8 +162,8 @@ def _search_lancedb( .text(query_texts[i]) .nprobes(effective_nprobes) .refine_factor(int(self.refine_factor)) - .select(["text", "metadata", "source"]) - .limit(int(self.top_k)) + .select(["text", "metadata", "source", "page_number"]) + .limit(int(top_k)) .rerank(RRFReranker()) .to_list() ) @@ -126,13 +172,62 @@ def _search_lancedb( table.search(q, vector_column_name=self.vector_column_name) .nprobes(effective_nprobes) .refine_factor(int(self.refine_factor)) - .select(["text", "metadata", "source", "_distance"]) - .limit(int(self.top_k)) + .select(["text", "metadata", "source", "page_number", "_distance"]) + .limit(int(top_k)) .to_list() ) results.append(hits) return results + # ------------------------------------------------------------------ + # Reranking helpers + # ------------------------------------------------------------------ + + def _get_reranker_model(self) -> Any: + """Lazily load and cache the local NemotronRerankV2 model.""" + if self._reranker_model is None and self.reranker: + from nemo_retriever.model.local import NemotronRerankV2 + + cache_dir = str(self.local_hf_cache_dir) if self.local_hf_cache_dir else None + self._reranker_model = NemotronRerankV2( + model_name=self.reranker_model_name if self.reranker else None, + device=self.local_hf_device, + hf_cache_dir=cache_dir, + ) + return self._reranker_model + + def _rerank_results( + self, + query_texts: list[str], + results: list[list[dict[str, Any]]], + ) -> list[list[dict[str, Any]]]: + """Rerank each per-query result list using the configured reranker.""" + from nemo_retriever.rerank import rerank_hits + + reranker_endpoint = (self.reranker_endpoint or "").strip() or None + model = None if reranker_endpoint else self._get_reranker_model() + + reranked: list[list[dict[str, Any]]] = [] + for query, hits in tqdm(zip(query_texts, results), desc="Reranking", unit="query", total=len(query_texts)): + reranked.append( + rerank_hits( + query, + hits, + model=model, + invoke_url=reranker_endpoint, + model_name=str(self.reranker), + api_key=(self.reranker_api_key or "").strip(), + max_length=int(self.reranker_max_length), + batch_size=int(self.reranker_batch_size), + top_n=int(self.top_k), + ) + ) + return reranked + + # ------------------------------------------------------------------ + # Public query API + # ------------------------------------------------------------------ + def query( self, query: str, @@ -157,7 +252,13 @@ def queries( lancedb_uri: Optional[str] = None, lancedb_table: Optional[str] = None, ) -> list[list[dict[str, Any]]]: - """Run retrieval for multiple query strings.""" + """Run retrieval for multiple query strings. + + If ``reranker`` is set on this instance the initial vector-search + results are re-scored with ``nvidia/llama-nemotron-rerank-1b-v2`` + (or the configured endpoint) and returned sorted by cross-encoder + score. 
Each hit gains a ``"_rerank_score"`` key. + """ query_texts = [str(q) for q in queries] if not query_texts: return [] @@ -179,13 +280,21 @@ def queries( model_name=resolved_embedder, ) - return self._search_lancedb( + results = self._search_lancedb( lancedb_uri=resolved_lancedb_uri, lancedb_table=resolved_lancedb_table, query_vectors=vectors, query_texts=query_texts, ) + if self.reranker: + assert self.top_k * self.reranker_refine_factor == len( + results[0] + ), "top_k must be at least 1/4 of the number of retrieved hits for reranking to work properly." + results = self._rerank_results(query_texts, results) + + return results + # Backward compatibility alias. retriever = Retriever diff --git a/nemo_retriever/src/nemo_retriever/table/table_detection.py b/nemo_retriever/src/nemo_retriever/table/table_detection.py index bfb82a187..841526431 100644 --- a/nemo_retriever/src/nemo_retriever/table/table_detection.py +++ b/nemo_retriever/src/nemo_retriever/table/table_detection.py @@ -17,6 +17,13 @@ except Exception: # pragma: no cover torch = None # type: ignore[assignment] +try: + from nv_ingest_api.internal.primitives.nim.model_interface.yolox import ( + YOLOX_TABLE_MIN_SCORE, + ) +except ImportError: + YOLOX_TABLE_MIN_SCORE = 0.1 # type: ignore[assignment] + _DEFAULT_TABLE_STRUCTURE_LABELS: List[str] = ["cell", "row", "column"] @@ -351,7 +358,7 @@ def table_structure_ocr_page_elements( if not parsed: pred_item = _extract_remote_pred_item(resp) parsed = _prediction_to_detections(pred_item, label_names=label_names) - structure_results.append(parsed) + structure_results.append([d for d in parsed if (d.get("score") or 0.0) >= YOLOX_TABLE_MIN_SCORE]) else: # Local batched inference. for _, _, crop_array in crops: @@ -366,7 +373,7 @@ def table_structure_ocr_page_elements( pre = pre.unsqueeze(0) pred = table_structure_model.invoke(pre, (h, w)) dets = _prediction_to_detections(pred, label_names=label_names) - structure_results.append(dets) + structure_results.append([d for d in dets if (d.get("score") or 0.0) >= YOLOX_TABLE_MIN_SCORE]) # --- Pass 3: Run OCR on all crops --- ocr_results: List[Any] = [] diff --git a/nemo_retriever/src/nemo_retriever/txt/ray_data.py b/nemo_retriever/src/nemo_retriever/txt/ray_data.py index 1cb86970e..b74f482cd 100644 --- a/nemo_retriever/src/nemo_retriever/txt/ray_data.py +++ b/nemo_retriever/src/nemo_retriever/txt/ray_data.py @@ -17,6 +17,29 @@ from .split import txt_bytes_to_chunks_df +class TextChunkActor: + """ + Ray Data map_batches callable: re-chunk existing ``text`` column by token count. + + This is the batch-mode equivalent of :func:`~nemo_retriever.txt.split.split_df`. + Constructor takes :class:`TextChunkParams`; ``__call__`` receives a pandas batch + and returns the split result. + """ + + def __init__(self, params: TextChunkParams | None = None) -> None: + self._params = params or TextChunkParams() + + def __call__(self, batch_df: pd.DataFrame) -> pd.DataFrame: + from .split import split_df + + if not isinstance(batch_df, pd.DataFrame) or batch_df.empty: + return batch_df + + kw = self._params.model_dump(mode="python") + kw.pop("encoding", None) + return split_df(batch_df, **kw) + + class TxtSplitActor: """ Ray Data map_batches callable: DataFrame with bytes, path -> DataFrame of chunks. 
@@ -35,12 +58,14 @@ def __call__(self, batch_df: pd.DataFrame) -> pd.DataFrame: out_dfs: List[pd.DataFrame] = [] for _, row in batch_df.iterrows(): raw = row.get("bytes") + text = row.get("text") path = row.get("path") - if raw is None or path is None: + if (raw is None and text is None) or path is None: continue path_str = str(path) if path is not None else "" try: - chunk_df = txt_bytes_to_chunks_df(raw, path_str, params=params) + payload = raw or text.encode("utf-8") + chunk_df = txt_bytes_to_chunks_df(payload, path_str, params=params) if not chunk_df.empty: out_dfs.append(chunk_df) except Exception: diff --git a/nemo_retriever/src/nemo_retriever/txt/split.py b/nemo_retriever/src/nemo_retriever/txt/split.py index d47b8dfd3..b94dba30d 100644 --- a/nemo_retriever/src/nemo_retriever/txt/split.py +++ b/nemo_retriever/src/nemo_retriever/txt/split.py @@ -18,7 +18,7 @@ from nemo_retriever.params import TextChunkParams DEFAULT_TOKENIZER_MODEL_ID = "nvidia/llama-nemotron-embed-1b-v2" -DEFAULT_MAX_TOKENS = 512 +DEFAULT_MAX_TOKENS = 1024 DEFAULT_OVERLAP_TOKENS = 0 @@ -91,6 +91,84 @@ def split_text_by_tokens( return chunks if chunks else [text] +def split_df( + df: pd.DataFrame, + *, + max_tokens: int = DEFAULT_MAX_TOKENS, + overlap_tokens: int = DEFAULT_OVERLAP_TOKENS, + tokenizer_model_id: Optional[str] = None, + tokenizer_cache_dir: Optional[str] = None, + encoding: str = "utf-8", +) -> pd.DataFrame: + """ + Re-chunk a DataFrame's ``text`` column by token count. + + This is a **post-extraction** transform: it takes rows that already have a + ``text`` column (produced by ``extract`` / ``extract_txt`` / etc.) and + splits long texts into multiple rows using :func:`split_text_by_tokens`. + All other columns (``path``, ``page_number``, ``metadata``, …) are + preserved on every output row. Each chunk row's ``metadata`` dict is + updated with ``chunk_index`` and ``chunk_count``. + + Rows whose ``text`` is empty or missing are passed through unchanged. + + Parameters + ---------- + df : pd.DataFrame + Input DataFrame with at least a ``text`` column. + max_tokens, overlap_tokens, tokenizer_model_id, tokenizer_cache_dir, encoding + Forwarded to :func:`split_text_by_tokens` / :func:`_get_tokenizer`. + + Returns + ------- + pd.DataFrame + Expanded DataFrame (one row per chunk). 
+ """ + if df.empty: + return df.copy() + + model_id = tokenizer_model_id or DEFAULT_TOKENIZER_MODEL_ID + tokenizer = _get_tokenizer(model_id, cache_dir=tokenizer_cache_dir) + + out_rows: List[Dict[str, Any]] = [] + for _, row in df.iterrows(): + row_dict = row.to_dict() + text = row_dict.get("text") + if not isinstance(text, str) or not text.strip(): + out_rows.append(row_dict) + continue + + chunks = split_text_by_tokens( + text, + tokenizer=tokenizer, + max_tokens=max_tokens, + overlap_tokens=overlap_tokens, + ) + if len(chunks) <= 1: + out_rows.append(row_dict) + continue + + import copy + + for i, chunk in enumerate(chunks): + new_row = {k: copy.deepcopy(v) if isinstance(v, (dict, list)) else v for k, v in row_dict.items()} + new_row["text"] = chunk + if "content" in new_row: + new_row["content"] = chunk + meta = new_row.get("metadata") + if isinstance(meta, dict): + meta["chunk_index"] = i + meta["chunk_count"] = len(chunks) + meta["content"] = chunk + new_row["page_number"] = i + 1 + out_rows.append(new_row) + + if not out_rows: + return df.iloc[:0].copy() + + return pd.DataFrame(out_rows) + + def txt_file_to_chunks_df( path: str, params: TextChunkParams | None = None, diff --git a/nemo_retriever/src/nemo_retriever/utils/benchmark/audio_extract_actor.py b/nemo_retriever/src/nemo_retriever/utils/benchmark/audio_extract_actor.py index 748a00975..8ade02b36 100644 --- a/nemo_retriever/src/nemo_retriever/utils/benchmark/audio_extract_actor.py +++ b/nemo_retriever/src/nemo_retriever/utils/benchmark/audio_extract_actor.py @@ -55,6 +55,73 @@ def __call__(self, batch_df: pd.DataFrame) -> pd.DataFrame: app = typer.Typer(help="Benchmark audio extraction (MediaChunkActor + ASRActor) throughput (chunk rows/sec).") +def run_benchmark( + audio_path: Path, + rows: int = 16, + workers: str = "1,2", + batch_sizes: str = "2,4,8", + mock_asr: bool = True, + split_type: str = "size", + split_interval: int = 450, + ray_address: Optional[str] = None, + output_json: Optional[Path] = None, +) -> None: + if not is_media_available(): + raise typer.BadParameter("Audio benchmark requires ffmpeg on PATH.") + + if split_type not in ("size", "time", "frame"): + raise typer.BadParameter("--split-type must be one of: size, time, frame") + + maybe_init_ray(ray_address) + worker_grid = parse_csv_ints(workers, name="workers") + batch_grid = parse_csv_ints(batch_sizes, name="batch_sizes") + seed_row = make_seed_audio_row(audio_path) + + chunk_params = AudioChunkParams( + split_type=split_type, + split_interval=split_interval, + ) + + def _map(ds: rd.Dataset, worker_count: int, batch_size: int) -> rd.Dataset: + chunk_actor = MediaChunkActor(params=chunk_params) + if mock_asr: + asr_actor = MockASRActor() + else: + asr_actor = ASRActor(params=asr_params_from_env()) + + ds = ds.map_batches( + chunk_actor, + batch_size=int(batch_size), + batch_format="pandas", + num_cpus=1, + num_gpus=0, + compute=rd.TaskPoolStrategy(size=int(worker_count)), + ) + ds = ds.map_batches( + asr_actor, + batch_size=int(batch_size), + batch_format="pandas", + num_cpus=1, + num_gpus=0.25 if not mock_asr else 0, + compute=rd.TaskPoolStrategy(size=int(worker_count)), + ) + return ds + + best, results = benchmark_sweep( + stage_name="audio_extract", + seed_row=seed_row, + rows=int(rows), + workers=worker_grid, + batch_sizes=batch_grid, + map_builder=_map, + ) + typer.echo( + f"BEST audio_extract: workers={best.workers} batch_size={best.batch_size} " + f"chunk_rows={best.rows} elapsed={best.elapsed_seconds:.3f}s 
rows_per_second={best.rows_per_second:.2f}" + ) + maybe_write_results_json(output_json, best=best, results=results) + + @app.command("run") def run( audio_path: Path = typer.Option( @@ -108,57 +175,14 @@ def run( help="Optional output JSON summary path.", ), ) -> None: - if not is_media_available(): - raise typer.BadParameter("Audio benchmark requires ffmpeg on PATH.") - - if split_type not in ("size", "time", "frame"): - raise typer.BadParameter("--split-type must be one of: size, time, frame") - - maybe_init_ray(ray_address) - worker_grid = parse_csv_ints(workers, name="workers") - batch_grid = parse_csv_ints(batch_sizes, name="batch_sizes") - seed_row = make_seed_audio_row(audio_path) - - chunk_params = AudioChunkParams( + run_benchmark( + audio_path=audio_path, + rows=rows, + workers=workers, + batch_sizes=batch_sizes, + mock_asr=mock_asr, split_type=split_type, split_interval=split_interval, + ray_address=ray_address, + output_json=output_json, ) - - def _map(ds: rd.Dataset, worker_count: int, batch_size: int) -> rd.Dataset: - chunk_actor = MediaChunkActor(params=chunk_params) - if mock_asr: - asr_actor = MockASRActor() - else: - asr_actor = ASRActor(params=asr_params_from_env()) - - ds = ds.map_batches( - chunk_actor, - batch_size=int(batch_size), - batch_format="pandas", - num_cpus=1, - num_gpus=0, - compute=rd.TaskPoolStrategy(size=int(worker_count)), - ) - ds = ds.map_batches( - asr_actor, - batch_size=int(batch_size), - batch_format="pandas", - num_cpus=1, - num_gpus=0.25 if not mock_asr else 0, - compute=rd.TaskPoolStrategy(size=int(worker_count)), - ) - return ds - - best, results = benchmark_sweep( - stage_name="audio_extract", - seed_row=seed_row, - rows=int(rows), - workers=worker_grid, - batch_sizes=batch_grid, - map_builder=_map, - ) - typer.echo( - f"BEST audio_extract: workers={best.workers} batch_size={best.batch_size} " - f"chunk_rows={best.rows} elapsed={best.elapsed_seconds:.3f}s rows_per_second={best.rows_per_second:.2f}" - ) - maybe_write_results_json(output_json, best=best, results=results) diff --git a/nemo_retriever/src/nemo_retriever/utils/hf_model_registry.py b/nemo_retriever/src/nemo_retriever/utils/hf_model_registry.py index 46022b03f..2589e198a 100644 --- a/nemo_retriever/src/nemo_retriever/utils/hf_model_registry.py +++ b/nemo_retriever/src/nemo_retriever/utils/hf_model_registry.py @@ -28,6 +28,7 @@ "nvidia/llama-nemotron-embed-vl-1b-v2": "859e1f2dac29c56c37a5279cf55f53f3e74efc6b", "meta-llama/Llama-3.2-1B": "4e20de362430cd3b72f300e6b0f18e50e7166e08", "intfloat/e5-large-unsupervised": "15af9288f69a6291f37bfb89b47e71abc747b206", + "nvidia/llama-nemotron-rerank-1b-v2": "aee9a1be0bbd89489f8bd0ec5763614c8bb85878", } diff --git a/nemo_retriever/src/nemo_retriever/vector_store/lancedb_store.py b/nemo_retriever/src/nemo_retriever/vector_store/lancedb_store.py index ebe460204..2b46ecbb5 100644 --- a/nemo_retriever/src/nemo_retriever/vector_store/lancedb_store.py +++ b/nemo_retriever/src/nemo_retriever/vector_store/lancedb_store.py @@ -295,26 +295,6 @@ def write_text_embeddings_dir_to_lancedb( lancedb.run(results) - # all_rows: List[Dict[str, Any]] = [] - # for p in files: - # try: - # df = _read_text_embeddings_json_df(p) - # if df.empty: - # skipped += 1 - # continue - # rows = _build_lancedb_rows_from_df(df) - # if not rows: - # skipped += 1 - # continue - # all_rows.extend(rows) - # processed += 1 - # except Exception: - # failed += 1 - # logger.exception("Failed reading embeddings from %s", p) - - # # Write once so --overwrite behaves as expected. 
- # _write_rows_to_lancedb(all_rows, cfg=cfg) - return { "input_dir": str(input_dir), "n_files": len(files), diff --git a/nemo_retriever/src/nemo_retriever/version.py b/nemo_retriever/src/nemo_retriever/version.py index 81c099013..9999c919c 100644 --- a/nemo_retriever/src/nemo_retriever/version.py +++ b/nemo_retriever/src/nemo_retriever/version.py @@ -108,6 +108,18 @@ def _base_version() -> str: return os.getenv("RETRIEVER_VERSION") or os.getenv("NV_INGEST_VERSION") or _build_datetime().strftime("%Y.%m.%d") +def _has_prerelease(version_str: str) -> bool: + """Return True if *version_str* already contains a PEP 440 pre-release segment.""" + from packaging.version import Version + + try: + return Version(version_str).pre is not None + except Exception: + import re + + return bool(re.search(r"(a|alpha|b|beta|rc|c|dev|pre)[-_.]?\d*", version_str, re.I)) + + def get_build_version() -> str: """Return a PEP 440 compliant version string for packaging.""" release_type = (os.getenv("RETRIEVER_RELEASE_TYPE") or os.getenv("NV_INGEST_RELEASE_TYPE") or "dev").lower() @@ -116,6 +128,8 @@ def get_build_version() -> str: build_number = _build_number() if release_type == "release": + if _has_prerelease(base_version): + return base_version return f"{base_version}.post{build_number}" if int(build_number) > 0 else base_version if release_type == "dev": return f"{base_version}.dev{build_number}" diff --git a/nemo_retriever/tests/test_audio_benchmark.py b/nemo_retriever/tests/test_audio_benchmark.py index 3862d3a83..a6b67f092 100644 --- a/nemo_retriever/tests/test_audio_benchmark.py +++ b/nemo_retriever/tests/test_audio_benchmark.py @@ -31,25 +31,12 @@ def test_audio_benchmark_run_mock_asr(tmp_path: Path): wav = tmp_path / "tiny.wav" _make_small_wav(wav, duration_sec=0.3) - from typer.testing import CliRunner - - from nemo_retriever.utils.benchmark.audio_extract_actor import app - - runner = CliRunner() - result = runner.invoke( - app, - [ - "run", - "--audio-path", - str(wav), - "--rows", - "2", - "--workers", - "1", - "--batch-sizes", - "2", - "--mock-asr", - ], + from nemo_retriever.utils.benchmark.audio_extract_actor import run_benchmark + + run_benchmark( + audio_path=wav, + rows=2, + workers="1", + batch_sizes="2", + mock_asr=True, ) - assert result.exit_code == 0, (result.stdout, result.stderr) - assert "audio_extract" in result.stdout or "BEST" in result.stdout diff --git a/nemo_retriever/tests/test_audio_pipeline_batch.py b/nemo_retriever/tests/test_audio_pipeline_batch.py index 09ce1a7ad..ae0d3a136 100644 --- a/nemo_retriever/tests/test_audio_pipeline_batch.py +++ b/nemo_retriever/tests/test_audio_pipeline_batch.py @@ -96,6 +96,7 @@ def test_batch_audio_pipeline_with_mocked_asr(tmp_path: Path): runtime_env={"working_dir": str(_nv_ingest_root)}, ) results = ingestor.ingest() + results = results._rd_dataset.take_all() if results is not None else None finally: try: ray.shutdown() @@ -219,6 +220,7 @@ def test_fused_audio_pipeline_with_mocked_asr(tmp_path: Path): runtime_env={"working_dir": str(_nv_ingest_root)}, ) results = ingestor.ingest() + results = results._rd_dataset.take_all() if results is not None else None finally: try: ray.shutdown() diff --git a/nemo_retriever/tests/test_batch_pipeline.py b/nemo_retriever/tests/test_batch_pipeline.py index 2d18d92bb..6dfc913a6 100644 --- a/nemo_retriever/tests/test_batch_pipeline.py +++ b/nemo_retriever/tests/test_batch_pipeline.py @@ -1,23 +1,6 @@ -import pytest - -pytest.importorskip("ray") - -from nemo_retriever.examples.batch_pipeline import 
_count_materialized_rows from nemo_retriever.utils.input_files import resolve_input_patterns -class _DatasetWithoutLen: - def count(self) -> int: - return 42 - - def __len__(self) -> int: - raise AssertionError("__len__ should not be used") - - -def test_count_materialized_rows_prefers_dataset_count() -> None: - assert _count_materialized_rows(_DatasetWithoutLen()) == 42 - - def test_resolve_input_file_patterns_recurses_for_directory_inputs(tmp_path) -> None: dataset_dir = tmp_path / "earnings_consulting" dataset_dir.mkdir() diff --git a/nemo_retriever/tests/test_html_convert.py b/nemo_retriever/tests/test_html_convert.py index e558a4b29..61a72012d 100644 --- a/nemo_retriever/tests/test_html_convert.py +++ b/nemo_retriever/tests/test_html_convert.py @@ -11,7 +11,12 @@ import pandas as pd import pytest -from nemo_retriever.html.convert import html_bytes_to_chunks_df, html_file_to_chunks_df, html_to_markdown +from nemo_retriever.html.convert import ( + html_bytes_to_chunks_df, + html_file_to_chunks_df, + html_to_markdown, + HtmlChunkParams, +) def test_html_to_markdown_str(): @@ -49,8 +54,7 @@ def test_html_file_to_chunks_df(tmp_path: Path): ) df = html_file_to_chunks_df( str(f), - max_tokens=512, - overlap_tokens=0, + params=HtmlChunkParams(max_tokens=512, overlap_tokens=0), ) assert isinstance(df, pd.DataFrame) assert "text" in df.columns and "path" in df.columns and "page_number" in df.columns and "metadata" in df.columns @@ -67,7 +71,7 @@ def test_html_file_to_chunks_df_empty_content(tmp_path: Path): pytest.importorskip("transformers") f = tmp_path / "empty.html" f.write_text("", encoding="utf-8") - df = html_file_to_chunks_df(str(f), max_tokens=512) + df = html_file_to_chunks_df(str(f), params=HtmlChunkParams(max_tokens=512)) assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["text", "path", "page_number", "metadata"] assert len(df) == 0 @@ -78,7 +82,7 @@ def test_html_bytes_to_chunks_df(tmp_path: Path): pytest.importorskip("transformers") html_bytes = b"
<html><body><p>Chunk content from bytes.</p></body></html>
" path = str(tmp_path / "virtual.html") - df = html_bytes_to_chunks_df(html_bytes, path, max_tokens=512, overlap_tokens=0) + df = html_bytes_to_chunks_df(html_bytes, path, params=HtmlChunkParams(max_tokens=512, overlap_tokens=0)) assert isinstance(df, pd.DataFrame) assert "text" in df.columns and "path" in df.columns and "page_number" in df.columns and "metadata" in df.columns assert len(df) >= 1 diff --git a/nemo_retriever/tests/test_io_markdown.py b/nemo_retriever/tests/test_io_markdown.py index e2ce7ed52..837f6d8e0 100644 --- a/nemo_retriever/tests/test_io_markdown.py +++ b/nemo_retriever/tests/test_io_markdown.py @@ -120,8 +120,8 @@ def test_to_markdown_reads_saved_records_wrapper(tmp_path: Path) -> None: assert "### Table 1" in markdown -def test_to_markdown_empty_results_returns_placeholder() -> None: - assert to_markdown([]) == "# Extracted Content\n\n_No content found._" +def test_to_markdown_empty_results_returns_none() -> None: + assert to_markdown([]) is None def test_to_markdown_rejects_multi_document_results() -> None: diff --git a/nemo_retriever/tests/test_lancedb_utils.py b/nemo_retriever/tests/test_lancedb_utils.py index cc0541195..9fd6734f3 100644 --- a/nemo_retriever/tests/test_lancedb_utils.py +++ b/nemo_retriever/tests/test_lancedb_utils.py @@ -198,6 +198,7 @@ def test_returns_schema_with_correct_fields(self): assert "text" in names assert "metadata" in names assert "source" in names + assert "source_id" in names assert len(names) == 10 diff --git a/nemo_retriever/tests/test_nemotron_rerank_v2.py b/nemo_retriever/tests/test_nemotron_rerank_v2.py new file mode 100644 index 000000000..4c6761a5b --- /dev/null +++ b/nemo_retriever/tests/test_nemotron_rerank_v2.py @@ -0,0 +1,608 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Unit tests for NemotronRerankV2 and the rerank module helpers. + +All heavy dependencies (torch, transformers, nemo_retriever.utils.hf_cache) +are stubbed via sys.modules injection so no GPU or model download is required. 
+""" + +from __future__ import annotations + +import sys +from types import ModuleType +from unittest.mock import MagicMock, patch + +import pytest + + +# --------------------------------------------------------------------------- +# Helpers to build lightweight torch / transformers stubs +# --------------------------------------------------------------------------- + + +def _make_tensor_stub(values: list[float]) -> MagicMock: + """Return a mock that mimics a 1-D torch.Tensor view(-1).cpu().tolist().""" + t = MagicMock() + t.view.return_value = t + t.cpu.return_value = t + t.tolist.return_value = values + return t + + +def _make_model_output_stub(logits_values: list[float]) -> MagicMock: + out = MagicMock() + out.logits = _make_tensor_stub(logits_values) + return out + + +def _build_torch_stub() -> MagicMock: + torch_mod = MagicMock() + torch_mod.cuda.is_available.return_value = False + torch_mod.bfloat16 = "bfloat16" + torch_mod.inference_mode.return_value.__enter__ = lambda s: None + torch_mod.inference_mode.return_value.__exit__ = MagicMock(return_value=False) + return torch_mod + + +def _build_transformers_stub(model_output_values: list[float]) -> tuple[MagicMock, MagicMock, MagicMock]: + """Return (transformers_mod, tokenizer_instance, model_instance).""" + tokenizer_inst = MagicMock() + tokenizer_inst.pad_token = "pad" + tokenizer_inst.eos_token_id = 0 + # __call__ on the tokenizer returns a dict of tensors + tokenizer_inst.return_value = {"input_ids": MagicMock(), "attention_mask": MagicMock()} + + model_inst = MagicMock() + model_inst.eval.return_value = model_inst + model_inst.to.return_value = model_inst + model_inst.config.pad_token_id = 1 + model_inst.return_value = _make_model_output_stub(model_output_values) + + AutoTokenizer = MagicMock() + AutoTokenizer.from_pretrained.return_value = tokenizer_inst + + AutoModelForSequenceClassification = MagicMock() + AutoModelForSequenceClassification.from_pretrained.return_value = model_inst + + transformers_mod = MagicMock() + transformers_mod.AutoTokenizer = AutoTokenizer + transformers_mod.AutoModelForSequenceClassification = AutoModelForSequenceClassification + + return transformers_mod, tokenizer_inst, model_inst + + +@pytest.fixture() +def _patch_heavy_deps(monkeypatch): + """Inject torch + transformers stubs and disable hf_cache setup.""" + torch_stub = _build_torch_stub() + transformers_stub, tok, mdl = _build_transformers_stub([1.5, -0.3]) + + monkeypatch.setitem(sys.modules, "torch", torch_stub) + monkeypatch.setitem(sys.modules, "transformers", transformers_stub) + + # Stub hf_cache so configure_global_hf_cache_base() is a no-op. + hf_cache_mod = ModuleType("nemo_retriever.utils.hf_cache") + hf_cache_mod.configure_global_hf_cache_base = MagicMock() + monkeypatch.setitem(sys.modules, "nemo_retriever.utils.hf_cache", hf_cache_mod) + + # Also stub the parent model module so BaseModel import works. + # We bypass by importing NemotronRerankV2 after patching. + yield torch_stub, transformers_stub, tok, mdl + + +# --------------------------------------------------------------------------- +# _prompt_template +# --------------------------------------------------------------------------- + + +def test_prompt_template_format(): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint # noqa: F401 — just ensure importable + from nemo_retriever.model.local.nemotron_rerank_v2 import _prompt_template + + result = _prompt_template("What is ML?", "Machine learning is a branch of AI.") + assert "question:What is ML?" 
in result + assert "passage:Machine learning is a branch of AI." in result + + +# --------------------------------------------------------------------------- +# NemotronRerankV2 — properties & initialisation +# --------------------------------------------------------------------------- + + +class TestNemotronRerankV2Properties: + """Test BaseModel properties without loading real weights.""" + + def _make_instance(self, model_name: str = "nvidia/llama-nemotron-rerank-1b-v2") -> object: + """Instantiate NemotronRerankV2 with all heavy ops mocked out.""" + from nemo_retriever.model.local import nemotron_rerank_v2 as mod + + with ( + patch.object(mod, "configure_global_hf_cache_base"), + patch("torch.cuda.is_available", return_value=False), + patch("transformers.AutoTokenizer") as MockTok, + patch("transformers.AutoModelForSequenceClassification") as MockModel, + ): + tok = MockTok.from_pretrained.return_value + tok.pad_token = "pad" + tok.eos_token_id = 0 + mdl = MockModel.from_pretrained.return_value + mdl.eval.return_value = mdl + mdl.to.return_value = mdl + mdl.config.pad_token_id = 1 + obj = mod.NemotronRerankV2(model_name=model_name) + return obj + + def test_model_name(self): + obj = self._make_instance() + assert obj.model_name == "nvidia/llama-nemotron-rerank-1b-v2" + + def test_model_type(self): + obj = self._make_instance() + assert obj.model_type == "reranker" + + def test_model_runmode(self): + obj = self._make_instance() + assert obj.model_runmode == "local" + + def test_input_batch_size(self): + obj = self._make_instance() + assert obj.input_batch_size == 32 + + def test_custom_model_name_stored(self): + obj = self._make_instance("my-org/my-reranker") + assert obj.model_name == "my-org/my-reranker" + + def test_device_defaults_to_cpu_when_no_cuda(self): + obj = self._make_instance() + assert obj._device == "cpu" + + +# --------------------------------------------------------------------------- +# NemotronRerankV2 — score() logic (batch chunking, empty input) +# --------------------------------------------------------------------------- + + +class TestNemotronRerankV2Score: + """Test score() and score_pairs() without real model weights.""" + + @pytest.fixture() + def reranker(self): + from nemo_retriever.model.local import nemotron_rerank_v2 as mod + + with ( + patch.object(mod, "configure_global_hf_cache_base"), + patch("torch.cuda.is_available", return_value=False), + patch("transformers.AutoTokenizer") as MockTok, + patch("transformers.AutoModelForSequenceClassification") as MockModel, + ): + tok_inst = MockTok.from_pretrained.return_value + tok_inst.pad_token = "pad" + tok_inst.eos_token_id = 0 + mdl_inst = MockModel.from_pretrained.return_value + mdl_inst.eval.return_value = mdl_inst + mdl_inst.to.return_value = mdl_inst + mdl_inst.config.pad_token_id = 1 + obj = mod.NemotronRerankV2() + + return obj + + def test_score_empty_documents_returns_empty(self, reranker): + assert reranker.score("q", []) == [] + + def test_score_pairs_empty_returns_empty(self, reranker): + assert reranker.score_pairs([]) == [] + + def test_score_calls_model_and_returns_flat_list(self, reranker): + """score() should return one float per document.""" + logit_tensor = MagicMock() + logit_tensor.view.return_value = logit_tensor + logit_tensor.cpu.return_value = logit_tensor + logit_tensor.tolist.return_value = [3.5, -1.2] + + model_out = MagicMock() + model_out.logits = logit_tensor + + reranker._tokenizer.return_value = {"input_ids": MagicMock(), "attention_mask": MagicMock()} + 
reranker._model.return_value = model_out + + with patch("torch.inference_mode") as inf_mode: + inf_mode.return_value.__enter__ = lambda s: None + inf_mode.return_value.__exit__ = MagicMock(return_value=False) + scores = reranker.score("What is ML?", ["Machine learning is…", "Paris is…"]) + + assert len(scores) == 2 + assert scores == [3.5, -1.2] + + def test_score_prompts_are_formatted_correctly(self, reranker): + """The tokenizer must receive the templated text, not the raw document.""" + captured_texts = [] + + def fake_tokenizer(texts, **kwargs): + captured_texts.extend(texts) + m = MagicMock() + m.items.return_value = [] + return m + + reranker._tokenizer.side_effect = fake_tokenizer + + logit_tensor = MagicMock() + logit_tensor.view.return_value = logit_tensor + logit_tensor.cpu.return_value = logit_tensor + logit_tensor.tolist.return_value = [0.0] + + model_out = MagicMock() + model_out.logits = logit_tensor + reranker._model.return_value = model_out + + with patch("torch.inference_mode") as inf_mode: + inf_mode.return_value.__enter__ = lambda s: None + inf_mode.return_value.__exit__ = MagicMock(return_value=False) + reranker.score("my query", ["my document"]) + + assert len(captured_texts) == 1 + assert "question:my query" in captured_texts[0] + assert "passage:my document" in captured_texts[0] + + def test_score_splits_into_batches(self, reranker): + """With batch_size=2 and 5 documents, model should be called 3 times.""" + call_count = [0] + + def fake_tokenizer(texts, **kwargs): + m = MagicMock() + m.items.return_value = [("input_ids", MagicMock())] + return m + + reranker._tokenizer.side_effect = fake_tokenizer + + def fake_model(**kwargs): + # Count items in the batch by inspecting how many texts were tokenized + call_count[0] += 1 + logit_tensor = MagicMock() + logit_tensor.view.return_value = logit_tensor + logit_tensor.cpu.return_value = logit_tensor + logit_tensor.tolist.return_value = [1.0] * 2 # Return 2 scores per call + out = MagicMock() + out.logits = logit_tensor + return out + + reranker._model.side_effect = fake_model + + with patch("torch.inference_mode") as inf_mode: + inf_mode.return_value.__enter__ = lambda s: None + inf_mode.return_value.__exit__ = MagicMock(return_value=False) + # 5 documents, batch_size=2 → ceil(5/2) = 3 forward passes + reranker.score("q", ["d1", "d2", "d3", "d4", "d5"], batch_size=2) + + assert call_count[0] == 3 + + def test_score_pairs_uses_query_per_pair(self, reranker): + """score_pairs() must use each pair's own query, not a shared one.""" + captured = [] + + def fake_tokenizer(texts, **kwargs): + captured.extend(texts) + m = MagicMock() + m.items.return_value = [] + return m + + reranker._tokenizer.side_effect = fake_tokenizer + + logit_tensor = MagicMock() + logit_tensor.view.return_value = logit_tensor + logit_tensor.cpu.return_value = logit_tensor + logit_tensor.tolist.return_value = [0.0, 0.0] + + model_out = MagicMock() + model_out.logits = logit_tensor + reranker._model.return_value = model_out + + with patch("torch.inference_mode") as inf_mode: + inf_mode.return_value.__enter__ = lambda s: None + inf_mode.return_value.__exit__ = MagicMock(return_value=False) + reranker.score_pairs([("q1", "doc A"), ("q2", "doc B")]) + + assert any("question:q1" in t for t in captured) + assert any("question:q2" in t for t in captured) + + +# --------------------------------------------------------------------------- +# rerank_hits() — standalone helper +# --------------------------------------------------------------------------- + + +class 
TestRerankHits: + """Test the public rerank_hits() convenience function.""" + + def _make_hits(self, n: int, prefix: str = "doc") -> list[dict]: + return [{"text": f"{prefix}{i}", "_distance": float(i)} for i in range(n)] + + def test_empty_hits_returns_empty(self): + from nemo_retriever.rerank import rerank_hits + + model = MagicMock() + assert rerank_hits("q", [], model=model) == [] + + def test_results_sorted_by_score_descending(self): + from nemo_retriever.rerank import rerank_hits + + hits = self._make_hits(3) + model = MagicMock() + model.score.return_value = [0.1, 5.0, -1.0] + + out = rerank_hits("q", hits, model=model) + + scores = [h["_rerank_score"] for h in out] + assert scores == sorted(scores, reverse=True) + + def test_rerank_score_added_to_each_hit(self): + from nemo_retriever.rerank import rerank_hits + + hits = [{"text": "hello"}, {"text": "world"}] + model = MagicMock() + model.score.return_value = [2.0, 3.0] + + out = rerank_hits("q", hits, model=model) + assert all("_rerank_score" in h for h in out) + + def test_top_n_truncates_output(self): + from nemo_retriever.rerank import rerank_hits + + hits = self._make_hits(5) + model = MagicMock() + model.score.return_value = [5.0, 4.0, 3.0, 2.0, 1.0] + + out = rerank_hits("q", hits, model=model, top_n=3) + assert len(out) == 3 + + def test_model_score_called_with_query_and_texts(self): + from nemo_retriever.rerank import rerank_hits + + hits = [{"text": "first"}, {"text": "second"}] + model = MagicMock() + model.score.return_value = [1.0, 2.0] + + rerank_hits("my query", hits, model=model) + + model.score.assert_called_once_with("my query", ["first", "second"], max_length=512, batch_size=32) + + def test_raises_without_model_or_endpoint(self): + from nemo_retriever.rerank import rerank_hits + + with pytest.raises(ValueError, match="model.*invoke_url"): + rerank_hits("q", [{"text": "doc"}]) + + def test_custom_text_key(self): + from nemo_retriever.rerank import rerank_hits + + hits = [{"content": "alpha"}, {"content": "beta"}] + model = MagicMock() + model.score.return_value = [1.0, 2.0] + + out = rerank_hits("q", hits, model=model, text_key="content") + assert len(out) == 2 + + def test_original_hit_keys_preserved(self): + from nemo_retriever.rerank import rerank_hits + + hits = [{"text": "t", "metadata": "m", "_distance": 0.5}] + model = MagicMock() + model.score.return_value = [7.0] + + out = rerank_hits("q", hits, model=model) + assert out[0]["metadata"] == "m" + assert out[0]["_distance"] == 0.5 + + +# --------------------------------------------------------------------------- +# _rerank_via_endpoint() +# --------------------------------------------------------------------------- + + +class TestRerankViaEndpoint: + def test_posts_to_rerank_url(self): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint + + mock_resp = MagicMock() + mock_resp.json.return_value = { + "results": [ + {"index": 0, "relevance_score": 0.9}, + {"index": 1, "relevance_score": 0.3}, + ] + } + mock_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_resp) as mock_post: + scores = _rerank_via_endpoint( + "What is ML?", + ["Machine learning is…", "Paris is…"], + endpoint="http://localhost:8000", + model_name="nvidia/llama-nemotron-rerank-1b-v2", + ) + + mock_post.assert_called_once() + call_kwargs = mock_post.call_args + assert call_kwargs[0][0] == "http://localhost:8000/rerank" + assert call_kwargs[1]["json"]["query"] == "What is ML?" 
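+        # Note (assumption about mock semantics, not part of the API under test): mock_post.call_args
+        # behaves like an (args, kwargs) tuple, so [0][0] is the URL and [1]["json"] is the request
+        # payload; both input documents should be forwarded in that payload unchanged.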
+ assert len(call_kwargs[1]["json"]["documents"]) == 2 + + assert scores == [0.9, 0.3] + + def test_scores_aligned_with_input_order(self): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint + + # Server returns results in reversed order + mock_resp = MagicMock() + mock_resp.json.return_value = { + "results": [ + {"index": 2, "relevance_score": 0.1}, + {"index": 0, "relevance_score": 0.8}, + {"index": 1, "relevance_score": 0.5}, + ] + } + mock_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_resp): + scores = _rerank_via_endpoint( + "q", + ["d0", "d1", "d2"], + endpoint="http://localhost:8000", + ) + + assert scores[0] == 0.8 # index 0 + assert scores[1] == 0.5 # index 1 + assert scores[2] == 0.1 # index 2 + + def test_authorization_header_sent_when_api_key_provided(self): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint + + mock_resp = MagicMock() + mock_resp.json.return_value = {"results": [{"index": 0, "relevance_score": 1.0}]} + mock_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_resp) as mock_post: + _rerank_via_endpoint( + "q", + ["d"], + endpoint="http://localhost:8000", + api_key="my-secret-key", + ) + + headers = mock_post.call_args[1]["headers"] + assert headers["Authorization"] == "Bearer my-secret-key" + + def test_trailing_slash_on_endpoint_normalized(self): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint + + mock_resp = MagicMock() + mock_resp.json.return_value = {"results": [{"index": 0, "relevance_score": 0.5}]} + mock_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_resp) as mock_post: + _rerank_via_endpoint("q", ["d"], endpoint="http://localhost:8000/") + + url = mock_post.call_args[0][0] + assert url == "http://localhost:8000/rerank" + + def test_top_n_sent_in_payload_when_specified(self): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint + + mock_resp = MagicMock() + mock_resp.json.return_value = {"results": [{"index": 0, "relevance_score": 0.5}]} + mock_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_resp) as mock_post: + _rerank_via_endpoint("q", ["d"], endpoint="http://localhost:8000", top_n=5) + + payload = mock_post.call_args[1]["json"] + assert payload["top_n"] == 5 + + def test_top_n_not_in_payload_when_not_specified(self): + from nemo_retriever.rerank.rerank import _rerank_via_endpoint + + mock_resp = MagicMock() + mock_resp.json.return_value = {"results": [{"index": 0, "relevance_score": 0.5}]} + mock_resp.raise_for_status = MagicMock() + + with patch("requests.post", return_value=mock_resp) as mock_post: + _rerank_via_endpoint("q", ["d"], endpoint="http://localhost:8000") + + payload = mock_post.call_args[1]["json"] + assert "top_n" not in payload + + +# --------------------------------------------------------------------------- +# NemotronRerankActor +# --------------------------------------------------------------------------- + + +class TestNemotronRerankActor: + """Test the Ray Data-compatible actor.""" + + def test_actor_with_invoke_url_skips_local_model(self): + from nemo_retriever.rerank.rerank import NemotronRerankActor + + actor = NemotronRerankActor(invoke_url="http://localhost:8000") + assert actor._model is None + + def test_actor_with_rerank_invoke_url_alias(self): + from nemo_retriever.rerank.rerank import NemotronRerankActor + + actor = NemotronRerankActor(rerank_invoke_url="http://localhost:8000") + assert actor._model is None + assert 
actor._kwargs.get("invoke_url") == "http://localhost:8000" + + def test_actor_call_scores_dataframe(self): + import pandas as pd + from nemo_retriever.rerank.rerank import NemotronRerankActor + + actor = NemotronRerankActor(invoke_url="http://localhost:8000") + + df = pd.DataFrame({"query": ["q1", "q2"], "text": ["doc A", "doc B"]}) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.side_effect = [ + {"results": [{"index": 0, "relevance_score": 0.9}]}, + {"results": [{"index": 0, "relevance_score": 0.4}]}, + ] + + with patch("requests.post", return_value=mock_resp): + out = actor(df) + + assert "rerank_score" in out.columns + assert len(out) == 2 + + def test_actor_call_sorts_descending_by_default(self): + import pandas as pd + from nemo_retriever.rerank.rerank import NemotronRerankActor + + actor = NemotronRerankActor(invoke_url="http://localhost:8000") + df = pd.DataFrame({"query": ["q", "q"], "text": ["low relevance", "high relevance"]}) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.side_effect = [ + {"results": [{"index": 0, "relevance_score": 0.1}]}, + {"results": [{"index": 0, "relevance_score": 0.9}]}, + ] + + with patch("requests.post", return_value=mock_resp): + out = actor(df) + + scores = out["rerank_score"].tolist() + assert scores == sorted(scores, reverse=True) + + def test_actor_call_returns_error_payload_on_exception(self): + import pandas as pd + from nemo_retriever.rerank.rerank import NemotronRerankActor + + actor = NemotronRerankActor(invoke_url="http://localhost:8000") + df = pd.DataFrame({"query": ["q"], "text": ["doc"]}) + + with patch("requests.post", side_effect=RuntimeError("connection failed")): + out = actor(df) + + # Should not raise; should return a DataFrame with error payload + assert isinstance(out, pd.DataFrame) + assert "rerank_score" in out.columns + payload = out["rerank_score"].iloc[0] + assert payload["status"] == "error" + + def test_actor_custom_score_column_name(self): + import pandas as pd + from nemo_retriever.rerank.rerank import NemotronRerankActor + + actor = NemotronRerankActor(invoke_url="http://localhost:8000", score_column="my_score") + df = pd.DataFrame({"query": ["q"], "text": ["doc"]}) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.return_value = {"results": [{"index": 0, "relevance_score": 0.7}]} + + with patch("requests.post", return_value=mock_resp): + out = actor(df) + + assert "my_score" in out.columns diff --git a/nemo_retriever/tests/test_pdf_render_scale.py b/nemo_retriever/tests/test_pdf_render_scale.py index 5c2bba5ec..6344c5d3b 100644 --- a/nemo_retriever/tests/test_pdf_render_scale.py +++ b/nemo_retriever/tests/test_pdf_render_scale.py @@ -54,7 +54,7 @@ def test_renders_at_full_dpi(self): dpi = 200 base_scale = dpi / 72.0 - _extract._render_page_to_base64(page, dpi=dpi) + _extract._render_page_to_base64(page, dpi=dpi, render_mode="full_dpi") render_call = page.render.call_args actual_scale = render_call.kwargs.get("scale", render_call.args[0] if render_call.args else None) diff --git a/nemo_retriever/tests/test_retriever_queries.py b/nemo_retriever/tests/test_retriever_queries.py new file mode 100644 index 000000000..b398c48ac --- /dev/null +++ b/nemo_retriever/tests/test_retriever_queries.py @@ -0,0 +1,372 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +""" +Unit tests for Retriever.queries() and Retriever.query(). + +All external I/O (LanceDB, embedders, requests) is mocked so the tests run +without any GPU, network, or database dependency. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_EMBED_DIM = 4 +_DUMMY_VECTOR = [0.1, 0.2, 0.3, 0.4] + + +def _make_hits(n: int, base_score: float = 0.5) -> list[dict]: + return [ + { + "text": f"passage {i}", + "metadata": "{}", + "source": "{}", + "page_number": i, + "_distance": base_score + i * 0.01, + } + for i in range(n) + ] + + +def _make_retriever(**overrides): + """Return a Retriever with reranker disabled by default and sane test values.""" + from nemo_retriever.retriever import Retriever + + defaults = dict( + reranker=None, + top_k=5, + nprobes=16, + ) + defaults.update(overrides) + return Retriever(**defaults) + + +# --------------------------------------------------------------------------- +# Retriever._resolve_embedding_endpoint +# --------------------------------------------------------------------------- + + +class TestResolveEmbeddingEndpoint: + def test_returns_none_when_no_endpoints_set(self): + r = _make_retriever() + assert r._resolve_embedding_endpoint() is None + + def test_http_endpoint_takes_priority(self): + r = _make_retriever( + embedding_http_endpoint="http://embed.example.com", + embedding_endpoint="http://other.example.com", + ) + assert r._resolve_embedding_endpoint() == "http://embed.example.com" + + def test_single_endpoint_returned_when_http(self): + r = _make_retriever(embedding_endpoint="http://embed.example.com") + assert r._resolve_embedding_endpoint() == "http://embed.example.com" + + def test_grpc_endpoint_raises(self): + r = _make_retriever(embedding_endpoint="grpc://embed.example.com") + with pytest.raises(ValueError, match="gRPC"): + r._resolve_embedding_endpoint() + + def test_whitespace_only_endpoint_treated_as_none(self): + r = _make_retriever(embedding_http_endpoint=" ") + assert r._resolve_embedding_endpoint() is None + + +# --------------------------------------------------------------------------- +# Retriever.queries() — basic (no reranking) +# --------------------------------------------------------------------------- + + +class TestQueriesNoReranking: + def _run_queries(self, retriever, query_texts, fake_vectors, fake_hits): + """Patch embed + search helpers and call queries().""" + with ( + patch.object(retriever, "_embed_queries_local_hf", return_value=fake_vectors), + patch.object(retriever, "_search_lancedb", return_value=fake_hits), + ): + return retriever.queries(query_texts) + + def test_empty_queries_returns_empty(self): + r = _make_retriever() + assert r.queries([]) == [] + + def test_single_query_returns_one_result_list(self): + r = _make_retriever() + hits = [_make_hits(5)] + result = self._run_queries(r, ["What is ML?"], [_DUMMY_VECTOR], hits) + assert len(result) == 1 + assert result[0] is hits[0] + + def test_multiple_queries_return_matching_result_count(self): + r = _make_retriever() + n_queries = 3 + fake_hits = [_make_hits(5)] * n_queries + result = self._run_queries( + r, + [f"query {i}" for i in range(n_queries)], + [_DUMMY_VECTOR] * n_queries, + fake_hits, + ) + assert len(result) == n_queries + + def test_embed_local_hf_called_with_query_texts(self): + r 
= _make_retriever() + with ( + patch.object(r, "_embed_queries_local_hf", return_value=[_DUMMY_VECTOR]) as mock_embed, + patch.object(r, "_search_lancedb", return_value=[_make_hits(5)]), + ): + r.queries(["hello world"]) + + mock_embed.assert_called_once_with(["hello world"], model_name=r.embedder) + + def test_embed_nim_called_when_endpoint_set(self): + r = _make_retriever(embedding_http_endpoint="http://nim.example.com") + with ( + patch.object(r, "_embed_queries_nim", return_value=[_DUMMY_VECTOR]) as mock_nim, + patch.object(r, "_search_lancedb", return_value=[_make_hits(5)]), + ): + r.queries(["hello"]) + + mock_nim.assert_called_once() + call_kwargs = mock_nim.call_args[1] + assert call_kwargs["endpoint"] == "http://nim.example.com" + + def test_search_lancedb_receives_vectors_and_texts(self): + r = _make_retriever() + vecs = [[0.1, 0.2, 0.3, 0.4]] + with ( + patch.object(r, "_embed_queries_local_hf", return_value=vecs), + patch.object(r, "_search_lancedb", return_value=[_make_hits(5)]) as mock_search, + ): + r.queries(["my query"]) + + kwargs = mock_search.call_args[1] + assert kwargs["query_vectors"] == vecs + assert kwargs["query_texts"] == ["my query"] + + def test_embedder_override_forwarded(self): + r = _make_retriever() + with ( + patch.object(r, "_embed_queries_local_hf", return_value=[_DUMMY_VECTOR]) as mock_embed, + patch.object(r, "_search_lancedb", return_value=[_make_hits(5)]), + ): + r.queries(["q"], embedder="custom/embedder") + + assert mock_embed.call_args[1]["model_name"] == "custom/embedder" + + def test_lancedb_uri_and_table_overrides_forwarded(self): + r = _make_retriever() + with ( + patch.object(r, "_embed_queries_local_hf", return_value=[_DUMMY_VECTOR]), + patch.object(r, "_search_lancedb", return_value=[_make_hits(5)]) as mock_search, + ): + r.queries(["q"], lancedb_uri="/tmp/db", lancedb_table="my-table") + + kwargs = mock_search.call_args[1] + assert kwargs["lancedb_uri"] == "/tmp/db" + assert kwargs["lancedb_table"] == "my-table" + + +# --------------------------------------------------------------------------- +# Retriever.query() — single-query convenience wrapper +# --------------------------------------------------------------------------- + + +class TestQuerySingleConvenience: + def test_query_delegates_to_queries_and_returns_first_element(self): + r = _make_retriever() + expected = _make_hits(5) + with patch.object(r, "queries", return_value=[expected]) as mock_queries: + result = r.query("find something") + + mock_queries.assert_called_once_with( + ["find something"], + embedder=None, + lancedb_uri=None, + lancedb_table=None, + ) + assert result is expected + + def test_query_passes_through_overrides(self): + r = _make_retriever() + with patch.object(r, "queries", return_value=[[]]) as mock_queries: + r.query("q", embedder="e", lancedb_uri="u", lancedb_table="t") + + mock_queries.assert_called_once_with(["q"], embedder="e", lancedb_uri="u", lancedb_table="t") + + +# --------------------------------------------------------------------------- +# Retriever.queries() — with reranking via remote endpoint +# --------------------------------------------------------------------------- + + +class TestQueriesWithEndpointReranking: + def _retriever_with_endpoint(self, top_k: int = 3, refine: int = 2) -> object: + return _make_retriever( + reranker="nvidia/llama-nemotron-rerank-1b-v2", + reranker_endpoint="http://rerank.example.com", + top_k=top_k, + reranker_refine_factor=refine, + ) + + def _fake_search_results(self, retriever) -> list[list[dict]]: + """Return 
the number of hits that satisfies the assertion check.""" + n = retriever.top_k * retriever.reranker_refine_factor + return [_make_hits(n)] + + def test_rerank_results_called_when_reranker_set(self): + r = self._retriever_with_endpoint() + fake_results = self._fake_search_results(r) + + with ( + patch.object(r, "_embed_queries_local_hf", return_value=[_DUMMY_VECTOR]), + patch.object(r, "_search_lancedb", return_value=fake_results), + patch.object(r, "_rerank_results", return_value=[_make_hits(3)]) as mock_rerank, + ): + r.queries(["q"]) + + mock_rerank.assert_called_once_with(["q"], fake_results) + + def test_rerank_not_called_when_reranker_is_none(self): + r = _make_retriever(reranker=None) + fake_results = [_make_hits(5)] + + with ( + patch.object(r, "_embed_queries_local_hf", return_value=[_DUMMY_VECTOR]), + patch.object(r, "_search_lancedb", return_value=fake_results), + patch.object(r, "_rerank_results") as mock_rerank, + ): + r.queries(["q"]) + + mock_rerank.assert_not_called() + + def test_reranked_results_are_returned(self): + r = self._retriever_with_endpoint() + fake_results = self._fake_search_results(r) + reranked = [_make_hits(3)] + + with ( + patch.object(r, "_embed_queries_local_hf", return_value=[_DUMMY_VECTOR]), + patch.object(r, "_search_lancedb", return_value=fake_results), + patch.object(r, "_rerank_results", return_value=reranked), + ): + out = r.queries(["q"]) + + assert out is reranked + + def test_rerank_results_uses_endpoint_not_local_model(self): + r = self._retriever_with_endpoint() + fake_hits = self._fake_search_results(r)[0] + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + # Return relevance scores in reverse original order + mock_resp.json.return_value = { + "results": [{"index": i, "relevance_score": float(len(fake_hits) - i)} for i in range(len(fake_hits))] + } + + with patch("requests.post", return_value=mock_resp) as mock_post: + out = r._rerank_results(["q"], [fake_hits]) + + mock_post.assert_called() + # Results should be sorted descending + scores = [h["_rerank_score"] for h in out[0]] + assert scores == sorted(scores, reverse=True) + + +# --------------------------------------------------------------------------- +# Retriever.queries() — with local reranking model +# --------------------------------------------------------------------------- + + +class TestQueriesWithLocalReranking: + + def test_rerank_results_with_local_model(self): + r = _make_retriever(reranker="nvidia/llama-nemotron-rerank-1b-v2") + hits = _make_hits(4) + fake_model = MagicMock() + fake_model.score.return_value = [0.1, 0.9, 0.5, 0.3] + + with patch.object(r, "_get_reranker_model", return_value=fake_model): + out = r._rerank_results(["q"], [hits]) + + scores = [h["_rerank_score"] for h in out[0]] + assert scores == sorted(scores, reverse=True) + assert max(scores) == 0.9 + + def test_rerank_results_respects_top_k(self): + r = _make_retriever(reranker="nvidia/llama-nemotron-rerank-1b-v2", top_k=2) + hits = _make_hits(4) + fake_model = MagicMock() + fake_model.score.return_value = [0.1, 0.9, 0.5, 0.3] + + with patch.object(r, "_get_reranker_model", return_value=fake_model): + out = r._rerank_results(["q"], [hits]) + + assert len(out[0]) == 2 + + def test_rerank_results_multiple_queries(self): + r = _make_retriever(reranker="nvidia/llama-nemotron-rerank-1b-v2", top_k=2) + hits_a = _make_hits(2) + hits_b = _make_hits(2) + fake_model = MagicMock() + fake_model.score.side_effect = [[0.2, 0.8], [0.6, 0.4]] + + with patch.object(r, "_get_reranker_model", 
return_value=fake_model): + out = r._rerank_results(["q1", "q2"], [hits_a, hits_b]) + + assert len(out) == 2 + # Each per-query list should be sorted descending + for per_query in out: + scores = [h["_rerank_score"] for h in per_query] + assert scores == sorted(scores, reverse=True) + + +# --------------------------------------------------------------------------- +# Retriever defaults: reranker field behaviour +# --------------------------------------------------------------------------- + + +class TestRetrieverDefaults: + def test_default_reranker_is_nemotron_model(self): + from nemo_retriever.retriever import Retriever + + r = Retriever() + assert r.reranker_model_name == "nvidia/llama-nemotron-rerank-1b-v2" + + def test_reranker_can_be_disabled(self): + r = _make_retriever(reranker=None) + assert r.reranker is None + + def test_reranker_refine_factor_default(self): + from nemo_retriever.retriever import Retriever + + r = Retriever() + assert r.reranker_refine_factor == 4 + + def test_reranker_max_length_default(self): + from nemo_retriever.retriever import Retriever + + r = Retriever() + assert r.reranker_max_length == 512 + + def test_reranker_model_not_initialized_at_construction(self): + from nemo_retriever.retriever import Retriever + + r = Retriever() + # Should be None until first use + assert r._reranker_model is None + + def test_retriever_alias_is_retriever_class(self): + from nemo_retriever.retriever import retriever, Retriever + + assert retriever is Retriever diff --git a/nemo_retriever/tests/test_txt_split.py b/nemo_retriever/tests/test_txt_split.py index 212c94813..966fff242 100644 --- a/nemo_retriever/tests/test_txt_split.py +++ b/nemo_retriever/tests/test_txt_split.py @@ -12,7 +12,7 @@ import pandas as pd import pytest -from nemo_retriever.txt.split import split_text_by_tokens, txt_file_to_chunks_df +from nemo_retriever.txt.split import split_text_by_tokens, txt_file_to_chunks_df, TextChunkParams class _MockTokenizer: @@ -63,11 +63,10 @@ def test_txt_file_to_chunks_df(tmp_path: Path): f.write_text("First paragraph here. 
Second paragraph there.", encoding="utf-8") df = txt_file_to_chunks_df( str(f), - max_tokens=512, - overlap_tokens=0, + params=TextChunkParams(max_tokens=512, overlap_tokens=0), ) assert isinstance(df, pd.DataFrame) - assert list(df.columns) == ["text", "path", "page_number", "metadata"] + assert list(df.columns) == ["text", "content", "path", "page_number", "metadata"] assert len(df) >= 1 assert df["path"].iloc[0] == str(f.resolve()) assert df["page_number"].iloc[0] >= 1 @@ -79,7 +78,7 @@ def test_txt_file_to_chunks_df_empty_file(tmp_path: Path): pytest.importorskip("transformers") f = tmp_path / "empty.txt" f.write_text("", encoding="utf-8") - df = txt_file_to_chunks_df(str(f), max_tokens=512) + df = txt_file_to_chunks_df(str(f), params=TextChunkParams(max_tokens=512)) assert isinstance(df, pd.DataFrame) assert list(df.columns) == ["text", "path", "page_number", "metadata"] assert len(df) == 0 diff --git a/retrieval-bench/README.md b/retrieval-bench/README.md index dc1ca6814..0bc7a7995 100644 --- a/retrieval-bench/README.md +++ b/retrieval-bench/README.md @@ -65,7 +65,7 @@ Backend-specific overrides can be passed via `--pipeline-args` JSON: retrieval-bench evaluate dense-retrieval \ --dataset-name bright/biology \ --backend llama-nv-embed-reasoning-3b \ - --pipeline-args '{"model_id":"~/checkpoints/my_model","scoring_batch_size":2048}' + --pipeline-args '{"model_id":"~/checkpoints/my_model","max_scoring_batch_size":2048}' ``` ## Agentic retrieval diff --git a/retrieval-bench/src/retrieval_bench/pipelines/backends.py b/retrieval-bench/src/retrieval_bench/pipelines/backends.py index a05189d20..bd0ce6f00 100644 --- a/retrieval-bench/src/retrieval_bench/pipelines/backends.py +++ b/retrieval-bench/src/retrieval_bench/pipelines/backends.py @@ -41,8 +41,7 @@ def infer_bright_task_key(dataset_name: Any) -> Optional[str]: "pooling": "mean", "score_scale": 100.0, "corpus_batch_size": 1, - "scoring_batch_size": 4096, - "preload_corpus_to_gpu": False, + "max_scoring_batch_size": 4096, "query_prefix_fallback": ( "Instruct: Given the following post, retrieve relevant passages that help answer the post.\nQuery:" ), @@ -51,8 +50,7 @@ def infer_bright_task_key(dataset_name: Any) -> Optional[str]: "model_id": "nvidia/llama-nemoretriever-colembed-3b-v1", "batch_size": 32, "corpus_batch_size": 32, - "corpus_chunk_size": 256, - "preload_corpus_to_gpu": True, + "max_scoring_batch_size": 256, }, "llama-nemotron-embed-vl-1b-v2": { "model_id": "nvidia/llama-nemotron-embed-vl-1b-v2", @@ -60,17 +58,16 @@ def infer_bright_task_key(dataset_name: Any) -> Optional[str]: "doc_modality": "image_text", "doc_max_length": "auto", "query_max_length": 10240, - "corpus_batch_size": 4, - "corpus_chunk_size": 4096, - "preload_corpus_to_gpu": False, + "corpus_batch_size": 32, + "max_scoring_batch_size": 4096, "max_input_tiles": 6, "use_thumbnail": True, }, "nemotron-colembed-vl-8b-v2": { "model_id": "nvidia/nemotron-colembed-vl-8b-v2", - "corpus_batch_size": 8, - "corpus_chunk_size": 256, - "preload_corpus_to_gpu": False, + "corpus_batch_size": 32, + "max_scoring_batch_size": 3000, + "scoring_chunk_size": 1311, "max_input_tiles": 8, "use_thumbnail": True, "cache_dir": "cache/nemotron_colembed_vl_v2", @@ -170,8 +167,10 @@ def init_backend( max_length = int(cfg.pop("max_length", 8192)) score_scale = float(cfg.pop("score_scale", 100.0)) corpus_batch_size = int(cfg.pop("corpus_batch_size", 1)) - scoring_batch_size = int(cfg.pop("scoring_batch_size", 4096)) - preload_corpus_to_gpu = bool(cfg.pop("preload_corpus_to_gpu", False)) + 
max_scoring_batch_size = int(cfg.pop("max_scoring_batch_size", 4096)) + + if cfg: + raise ValueError(f"Unknown pipeline arg(s) for backend {backend!r}: {', '.join(sorted(cfg))}") retriever.init( dataset_name=dataset_name, @@ -188,9 +187,8 @@ def init_backend( score_scale=score_scale, batch_size=1, corpus_batch_size=corpus_batch_size, - scoring_batch_size=scoring_batch_size, + max_scoring_batch_size=max_scoring_batch_size, cache_dir="cache/hf_dense", - preload_corpus_to_gpu=preload_corpus_to_gpu, ) init_info.update( { @@ -208,8 +206,10 @@ def init_backend( elif backend == "llama-nemoretriever-colembed-3b-v1": batch_size = int(cfg.pop("batch_size", 32)) corpus_batch_size = int(cfg.pop("corpus_batch_size", 32)) - corpus_chunk_size = int(cfg.pop("corpus_chunk_size", 256)) - preload_corpus_to_gpu = bool(cfg.pop("preload_corpus_to_gpu", True)) + max_scoring_batch_size = int(cfg.pop("max_scoring_batch_size", 256)) + + if cfg: + raise ValueError(f"Unknown pipeline arg(s) for backend {backend!r}: {', '.join(sorted(cfg))}") retriever.init( dataset_name=dataset_name, @@ -219,9 +219,8 @@ def init_backend( top_k=top_k, batch_size=batch_size, corpus_batch_size=corpus_batch_size, - corpus_chunk_size=corpus_chunk_size, + max_scoring_batch_size=max_scoring_batch_size, cache_dir="cache", - preload_corpus_to_gpu=preload_corpus_to_gpu, ) init_info.update({"model_id": model_id}) return retriever, model_id, init_info @@ -233,16 +232,18 @@ def init_backend( # Auto-detect: fall back to text-only when the corpus has no images # (e.g. BRIGHT text-only datasets). - if doc_modality != "text" and not any("image" in doc for doc in corpus[:5]): + if doc_modality != "text" and not any(doc.get("image") is not None for doc in corpus[:5]): doc_modality = "text" query_max_length = int(cfg.pop("query_max_length", 10240)) corpus_batch_size = int(cfg.pop("corpus_batch_size", 4)) - corpus_chunk_size = int(cfg.pop("corpus_chunk_size", 4096)) - preload_corpus_to_gpu = bool(cfg.pop("preload_corpus_to_gpu", False)) + max_scoring_batch_size = int(cfg.pop("max_scoring_batch_size", 4096)) max_input_tiles = int(cfg.pop("max_input_tiles", 6)) use_thumbnail = bool(cfg.pop("use_thumbnail", True)) + if cfg: + raise ValueError(f"Unknown pipeline arg(s) for backend {backend!r}: {', '.join(sorted(cfg))}") + retriever.init( dataset_name=dataset_name, corpus_ids=corpus_ids, @@ -254,9 +255,8 @@ def init_backend( doc_max_length=doc_max_length, query_max_length=query_max_length, corpus_batch_size=corpus_batch_size, - corpus_chunk_size=corpus_chunk_size, + max_scoring_batch_size=max_scoring_batch_size, cache_dir="cache/nemotron_vl_dense", - preload_corpus_to_gpu=preload_corpus_to_gpu, max_input_tiles=max_input_tiles, use_thumbnail=use_thumbnail, ) @@ -270,21 +270,23 @@ def init_backend( "max_input_tiles": max_input_tiles, "use_thumbnail": use_thumbnail, "corpus_batch_size": corpus_batch_size, - "corpus_chunk_size": corpus_chunk_size, - "preload_corpus_to_gpu": preload_corpus_to_gpu, + "max_scoring_batch_size": max_scoring_batch_size, } ) active = _NemotronEmbedVLAdapter(retriever) return active, model_id, init_info else: # nemotron-colembed-vl-8b-v2 - corpus_batch_size = int(cfg.pop("corpus_batch_size", 8)) - corpus_chunk_size = int(cfg.pop("corpus_chunk_size", 256)) - preload_corpus_to_gpu = bool(cfg.pop("preload_corpus_to_gpu", False)) + corpus_batch_size = int(cfg.pop("corpus_batch_size", 32)) + max_scoring_batch_size = int(cfg.pop("max_scoring_batch_size", 3000)) + scoring_chunk_size = int(cfg.pop("scoring_chunk_size", 1311)) max_input_tiles = 
int(cfg.pop("max_input_tiles", 8)) use_thumbnail = bool(cfg.pop("use_thumbnail", True)) cache_dir = str(cfg.pop("cache_dir", "cache/nemotron_colembed_vl_v2")) + if cfg: + raise ValueError(f"Unknown pipeline arg(s) for backend {backend!r}: {', '.join(sorted(cfg))}") + retriever.init( dataset_name=str(dataset_name), corpus_ids=corpus_ids, @@ -293,9 +295,9 @@ def init_backend( device="cuda", top_k=top_k, corpus_batch_size=corpus_batch_size, - corpus_chunk_size=corpus_chunk_size, + max_scoring_batch_size=max_scoring_batch_size, + scoring_chunk_size=scoring_chunk_size, cache_dir=cache_dir, - preload_corpus_to_gpu=preload_corpus_to_gpu, max_input_tiles=max_input_tiles, use_thumbnail=use_thumbnail, ) diff --git a/retrieval-bench/src/retrieval_bench/singletons/colembed_retriever.py b/retrieval-bench/src/retrieval_bench/singletons/colembed_retriever.py index 24a6972ae..680dc1017 100644 --- a/retrieval-bench/src/retrieval_bench/singletons/colembed_retriever.py +++ b/retrieval-bench/src/retrieval_bench/singletons/colembed_retriever.py @@ -34,8 +34,6 @@ "Please install at least: torch (and for actual retrieval: transformers, optionally flash-attn)." ) from e -from retrieval_bench.singletons._shared import try_preload_corpus_to_gpu as _try_preload_corpus_to_gpu - class _ColEmbedState: def __init__( @@ -43,7 +41,7 @@ def __init__( *, model_id: str, device: str, - corpus_chunk_size: int, + max_scoring_batch_size: int, batch_size: int, corpus_batch_size: int, top_k: int, @@ -51,7 +49,7 @@ def __init__( ) -> None: self.model_id = model_id self.device = device - self.corpus_chunk_size = corpus_chunk_size + self.max_scoring_batch_size = max_scoring_batch_size self.batch_size = batch_size self.corpus_batch_size = corpus_batch_size self.top_k = top_k @@ -149,6 +147,8 @@ def _load_or_build_corpus_embeddings( return emb except Exception: logger.debug("Cache load failed for %s, recomputing", cache_path, exc_info=True) + # fall through to recompute + pass t0 = time.time() emb = self._embed_corpus_batched(corpus) @@ -160,58 +160,60 @@ def _load_or_build_corpus_embeddings( return emb def _embed_query(self, query: str) -> torch.Tensor: - # Returns CPU tensor [seq_len, embed_dim] with torch.no_grad(): - q_emb = self.model.forward_queries([query], batch_size=1).cpu() - return q_emb[0] # [seq_len, dim] + q_emb = self.model.forward_queries([query], batch_size=1).detach() + return q_emb[0].to(self.device) # [seq_len, dim] on GPU - def _score_query(self, query_embedding_cpu: torch.Tensor) -> torch.Tensor: - if self.corpus_embeddings_cpu is None: - raise RuntimeError("corpus_embeddings_cpu is not set; call init() first") + def _score_query(self, query_embedding: torch.Tensor) -> torch.Tensor: + emb_gpu = self.corpus_embeddings_gpu + emb_cpu = self.corpus_embeddings_cpu + if emb_gpu is None and emb_cpu is None: + raise RuntimeError("No corpus embeddings available.") - num_corpus = self.corpus_embeddings_cpu.shape[0] - scores_cpu = torch.empty(num_corpus, dtype=torch.float32, device="cpu") - chunk = self.corpus_chunk_size + num_corpus = (emb_gpu if emb_gpu is not None else emb_cpu).shape[0] device = self.device + scores = torch.empty(num_corpus, dtype=torch.float32, device=device) + + chunk = max(1, int(self.max_scoring_batch_size)) with torch.no_grad(): - q_gpu = query_embedding_cpu.to(device, non_blocking=True) # [q_seq, dim] - q_t = q_gpu.transpose(0, 1) # [dim, q_seq] + q_t = query_embedding.transpose(0, 1) # [dim, q_seq] for c_start in range(0, num_corpus, chunk): c_end = min(c_start + chunk, num_corpus) - - if 
self.corpus_embeddings_gpu is not None: - c_gpu = self.corpus_embeddings_gpu[c_start:c_end] - else: - c_gpu = self.corpus_embeddings_cpu[c_start:c_end].to(device, non_blocking=True) - - token_sims = torch.matmul(c_gpu, q_t) # [chunk, c_seq, q_seq] + c_chunk = emb_gpu[c_start:c_end] if emb_gpu is not None else emb_cpu[c_start:c_end].to(device) + token_sims = torch.matmul(c_chunk, q_t) # [chunk, c_seq, q_seq] chunk_scores = token_sims.max(dim=1).values.float().sum(dim=1) # [chunk] - scores_cpu[c_start:c_end] = chunk_scores.cpu() + scores[c_start:c_end] = chunk_scores - return scores_cpu + return scores def retrieve_one( self, query: str, *, return_markdown: bool = False ) -> Union[Dict[str, float], Tuple[Dict[str, float], Dict[str, str]]]: - if self.corpus_ids is None or self.corpus_embeddings_cpu is None or self.corpus_markdown is None: + if ( + self.corpus_ids is None + or (self.corpus_embeddings_gpu is None and self.corpus_embeddings_cpu is None) + or self.corpus_markdown is None + ): raise RuntimeError("Retriever not initialized. Call retriever.init(...) first.") - query_embedding_cpu = self._embed_query(query) - scores_cpu = self._score_query(query_embedding_cpu) + q_emb = self._embed_query(query) + scores = self._score_query(q_emb) k = min(self.top_k, len(self.corpus_ids)) - topk_scores, topk_indices = torch.topk(scores_cpu, k) + topk_scores, topk_indices = torch.topk(scores, k) corpus_ids = self.corpus_ids - run = {corpus_ids[int(idx)]: float(score) for idx, score in zip(topk_indices.tolist(), topk_scores.tolist())} + topk_indices_cpu = topk_indices.cpu().tolist() + topk_scores_cpu = topk_scores.cpu().tolist() + run = {corpus_ids[int(idx)]: float(score) for idx, score in zip(topk_indices_cpu, topk_scores_cpu)} if not return_markdown: return run corpus_markdown = self.corpus_markdown - markdown_by_id = {corpus_ids[int(idx)]: corpus_markdown[int(idx)] for idx in topk_indices.tolist()} + markdown_by_id = {corpus_ids[int(idx)]: corpus_markdown[int(idx)] for idx in topk_indices_cpu} return run, markdown_by_id @@ -238,9 +240,8 @@ def init( top_k: int = 100, batch_size: int = 32, corpus_batch_size: int = 32, - corpus_chunk_size: int = 256, + max_scoring_batch_size: int = 256, cache_dir: str | Path = "cache", - preload_corpus_to_gpu: bool = True, ) -> None: """ Initialize (or re-initialize) the singleton for a given dataset/corpus. @@ -259,7 +260,7 @@ def init( self._state = _ColEmbedState( model_id=model_id, device=device, - corpus_chunk_size=corpus_chunk_size, + max_scoring_batch_size=max_scoring_batch_size, batch_size=batch_size, corpus_batch_size=corpus_batch_size, top_k=top_k, @@ -270,7 +271,7 @@ def init( self._state.top_k = top_k self._state.batch_size = batch_size self._state.corpus_batch_size = corpus_batch_size - self._state.corpus_chunk_size = corpus_chunk_size + self._state.max_scoring_batch_size = max_scoring_batch_size self._state.cache_dir = cache_dir # If already initialized for the same dataset with same corpus_ids length, keep as-is. @@ -294,12 +295,9 @@ def init( self._state.corpus_markdown = corpus_markdown self._state.corpus_embeddings_cpu = corpus_embeddings_cpu - # Optional preload to GPU for faster repeated retrieval. 
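+        # Preloading policy replacing preload_corpus_to_gpu (descriptive comment, mirrors the code
+        # below): keep the whole corpus resident on GPU only when it fits within a single
+        # max_scoring_batch_size; otherwise chunks are copied to the device on demand in _score_query.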
self._state.corpus_embeddings_gpu = None - if preload_corpus_to_gpu: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu( - corpus_embeddings_cpu, self._state.device - ) + if corpus_embeddings_cpu.shape[0] <= self._state.max_scoring_batch_size: + self._state.corpus_embeddings_gpu = corpus_embeddings_cpu.to(self._state.device) def retrieve( self, query: str, *, return_markdown: bool = False diff --git a/retrieval-bench/src/retrieval_bench/singletons/hf_dense_retriever.py b/retrieval-bench/src/retrieval_bench/singletons/hf_dense_retriever.py index 859f5b71c..047b53437 100644 --- a/retrieval-bench/src/retrieval_bench/singletons/hf_dense_retriever.py +++ b/retrieval-bench/src/retrieval_bench/singletons/hf_dense_retriever.py @@ -39,9 +39,18 @@ except ImportError as e: # pragma: no cover raise ImportError("Required dependencies not installed for HF dense retriever. Install: torch") from e -from retrieval_bench.singletons._shared import hash_corpus_ids10 as _hash_corpus_ids10 -from retrieval_bench.singletons._shared import slugify as _slugify -from retrieval_bench.singletons._shared import try_preload_corpus_to_gpu as _try_preload_corpus_to_gpu + +def _hash_corpus_ids10(corpus_ids: Sequence[str]) -> str: + h = hashlib.sha256() + for cid in corpus_ids: + h.update(str(cid).encode("utf-8")) + h.update(b"\n") + return h.hexdigest()[:10] + + +def _slugify(value: str) -> str: + v = (value or "").strip().replace("/", "__") + return v or "unnamed" def _last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: @@ -139,7 +148,7 @@ def __init__( score_scale: float, batch_size: int, corpus_batch_size: int, - scoring_batch_size: int, + max_scoring_batch_size: int, top_k: int, cache_dir: Path, ) -> None: @@ -153,7 +162,7 @@ def __init__( self.score_scale = float(score_scale) self.batch_size = int(batch_size) self.corpus_batch_size = int(corpus_batch_size) - self.scoring_batch_size = int(scoring_batch_size) + self.max_scoring_batch_size = int(max_scoring_batch_size) self.top_k = int(top_k) self.cache_dir = cache_dir @@ -322,6 +331,7 @@ def _load_or_build_corpus_embeddings( return emb except Exception: logger.debug("Cache load failed for %s, recomputing", emb_path, exc_info=True) + pass # Build from scratch. 
emb_path.parent.mkdir(parents=True, exist_ok=True) @@ -345,36 +355,40 @@ def embed_query(self, query_text: str) -> torch.Tensor: q = str(self.query_prefix) + str(query_text) else: q = _wrap_instruct(self.task_description, str(query_text)) - emb = self._embed_texts_batched([q], batch_size=1) - if emb.ndim != 2 or emb.shape[0] != 1: - raise RuntimeError(f"Unexpected query embedding shape: {tuple(emb.shape)}") - return emb[0] # [dim] on CPU - - def score_query(self, query_embedding_cpu: torch.Tensor) -> torch.Tensor: - if self.corpus_embeddings_cpu is None: - raise RuntimeError("corpus_embeddings_cpu is not set; call init() first") - num_docs = self.corpus_embeddings_cpu.shape[0] - scores_cpu = torch.empty((num_docs,), dtype=torch.float32, device="cpu") - - chunk = max(1, int(self.scoring_batch_size)) + + with torch.no_grad(): + batch = self._tokenize([q]) + outputs = self.model(**batch) + pooled = self._pool(outputs.last_hidden_state, batch["attention_mask"]) + mode = str(self.pooling or "last_token").strip().lower() + if mode not in ("mean", "avg", "average"): + pooled = F.normalize(pooled, p=2, dim=1) + + return pooled[0].detach() # [dim], stays on GPU + + def score_query(self, query_embedding: torch.Tensor) -> torch.Tensor: + emb_gpu = self.corpus_embeddings_gpu + emb_cpu = self.corpus_embeddings_cpu + if emb_gpu is None and emb_cpu is None: + raise RuntimeError("No corpus embeddings available.") + + num_docs = (emb_gpu if emb_gpu is not None else emb_cpu).shape[0] + device = str(self.device) + scores = torch.empty((num_docs,), dtype=torch.float32, device=device) + + chunk = max(1, int(self.max_scoring_batch_size)) scale = float(self.score_scale) with torch.no_grad(): - q_gpu = query_embedding_cpu.to(self.device, non_blocking=True) # [dim] - q_gpu = q_gpu.unsqueeze(1) # [dim, 1] + q_col = query_embedding.unsqueeze(1) # [dim, 1] for c_start in range(0, num_docs, chunk): c_end = min(c_start + chunk, num_docs) - if self.corpus_embeddings_gpu is not None: - c_gpu = self.corpus_embeddings_gpu[c_start:c_end] - else: - c_gpu = self.corpus_embeddings_cpu[c_start:c_end].to(self.device, non_blocking=True) - - # [chunk, dim] @ [dim, 1] -> [chunk, 1] - chunk_scores = torch.matmul(c_gpu, q_gpu).squeeze(1).float() * scale - scores_cpu[c_start:c_end] = chunk_scores.to("cpu") + c_chunk = emb_gpu[c_start:c_end] if emb_gpu is not None else emb_cpu[c_start:c_end].to(device) + chunk_scores = torch.matmul(c_chunk, q_col).squeeze(1).float() * scale + scores[c_start:c_end] = chunk_scores - return scores_cpu + return scores def retrieve_one( self, @@ -383,38 +397,40 @@ def retrieve_one( return_markdown: bool = False, excluded_ids: Optional[Sequence[str]] = None, ) -> Union[Dict[str, float], Tuple[Dict[str, float], Dict[str, str]]]: - if self.corpus_ids is None or self.corpus_embeddings_cpu is None or self.corpus_markdown is None: + if ( + self.corpus_ids is None + or (self.corpus_embeddings_gpu is None and self.corpus_embeddings_cpu is None) + or self.corpus_markdown is None + ): raise RuntimeError("Retriever not initialized. Call retriever.init(...) first.") - q_emb_cpu = self.embed_query(query) - scores_cpu = self.score_query(q_emb_cpu) + q_emb = self.embed_query(query) + scores = self.score_query(q_emb) - # Apply per-query excluded ids BEFORE top-k selection (BRIGHT semantics). - # This prevents excluded docs from "stealing" slots in top-k. 
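+        # Mask excluded ids to -inf BEFORE top-k selection (BRIGHT semantics) so excluded docs
+        # cannot occupy top-k slots.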
if excluded_ids and self.corpus_id_to_idx: + excluded_indices = [] for did in set(str(x) for x in excluded_ids): if did == "N/A": continue idx = self.corpus_id_to_idx.get(did, None) - if idx is None: - continue - try: - scores_cpu[int(idx)] = float("-inf") - except Exception: - # Ignore malformed indices; keep scoring robust. - pass + if idx is not None: + excluded_indices.append(int(idx)) + if excluded_indices: + scores[torch.tensor(excluded_indices, device=scores.device)] = float("-inf") k = min(int(self.top_k), len(self.corpus_ids)) - topk_scores, topk_indices = torch.topk(scores_cpu, k) + topk_scores, topk_indices = torch.topk(scores, k) ids = self.corpus_ids - run = {ids[int(idx)]: float(score) for idx, score in zip(topk_indices.tolist(), topk_scores.tolist())} + topk_indices_cpu = topk_indices.cpu().tolist() + topk_scores_cpu = topk_scores.cpu().tolist() + run = {ids[int(idx)]: float(score) for idx, score in zip(topk_indices_cpu, topk_scores_cpu)} if not return_markdown: return run md = self.corpus_markdown - markdown_by_id = {ids[int(idx)]: md[int(idx)] for idx in topk_indices.tolist()} + markdown_by_id = {ids[int(idx)]: md[int(idx)] for idx in topk_indices_cpu} return run, markdown_by_id @@ -444,9 +460,8 @@ def init( score_scale: float = 100.0, batch_size: int = 1, corpus_batch_size: int = 1, - scoring_batch_size: int = 4096, + max_scoring_batch_size: int = 4096, cache_dir: str | Path = "cache/hf_dense", - preload_corpus_to_gpu: bool = False, ) -> None: """ Initialize (or re-initialize) the singleton for a given dataset/corpus. @@ -476,7 +491,7 @@ def init( score_scale=float(score_scale), batch_size=int(batch_size), corpus_batch_size=int(corpus_batch_size), - scoring_batch_size=int(scoring_batch_size), + max_scoring_batch_size=int(max_scoring_batch_size), top_k=int(top_k), cache_dir=cache_dir, ) @@ -485,7 +500,7 @@ def init( self._state.top_k = int(top_k) self._state.batch_size = int(batch_size) self._state.corpus_batch_size = int(corpus_batch_size) - self._state.scoring_batch_size = int(scoring_batch_size) + self._state.max_scoring_batch_size = int(max_scoring_batch_size) self._state.cache_dir = cache_dir self._state.task_description = str(task_description) self._state.query_prefix = str(query_prefix) if isinstance(query_prefix, str) else None @@ -502,12 +517,10 @@ def init( and _hash_corpus_ids10(self._state.corpus_ids) == corpus_ids_hash10 and self._state.corpus_embeddings_cpu is not None ): - # Already initialized for the same corpus; only (possibly) update GPU preload. 
- if preload_corpus_to_gpu and self._state.corpus_embeddings_gpu is None: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu( - self._state.corpus_embeddings_cpu, self._state.device - ) - if (not preload_corpus_to_gpu) and self._state.corpus_embeddings_gpu is not None: + should_be_on_gpu = len(corpus_ids_list) <= self._state.max_scoring_batch_size + if should_be_on_gpu and self._state.corpus_embeddings_gpu is None: + self._state.corpus_embeddings_gpu = self._state.corpus_embeddings_cpu.to(self._state.device) + if (not should_be_on_gpu) and self._state.corpus_embeddings_gpu is not None: self._state.corpus_embeddings_gpu = None return @@ -524,8 +537,8 @@ def init( self._state.corpus_embeddings_cpu = emb_cpu self._state.corpus_embeddings_gpu = None - if preload_corpus_to_gpu: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu(emb_cpu, self._state.device) + if emb_cpu.shape[0] <= self._state.max_scoring_batch_size: + self._state.corpus_embeddings_gpu = emb_cpu.to(self._state.device) def retrieve( self, query: str, *, return_markdown: bool = False, excluded_ids: Optional[Sequence[str]] = None diff --git a/retrieval-bench/src/retrieval_bench/singletons/nemotron_colembed_vl_v2_retriever.py b/retrieval-bench/src/retrieval_bench/singletons/nemotron_colembed_vl_v2_retriever.py index 711e3d280..7f7f22ee4 100644 --- a/retrieval-bench/src/retrieval_bench/singletons/nemotron_colembed_vl_v2_retriever.py +++ b/retrieval-bench/src/retrieval_bench/singletons/nemotron_colembed_vl_v2_retriever.py @@ -23,6 +23,7 @@ import hashlib import logging +import math import os import time import threading @@ -39,8 +40,6 @@ "Please install at least: torch (and for actual retrieval: transformers, optionally flash-attn)." ) from e -from retrieval_bench.singletons._shared import try_preload_corpus_to_gpu as _try_preload_corpus_to_gpu - def _set_tiling_knobs_if_present(model: Any, *, max_input_tiles: int, use_thumbnail: bool) -> None: """ @@ -74,13 +73,21 @@ def _set_tiling_knobs_if_present(model: Any, *, max_input_tiles: int, use_thumbn pass +def _balanced_chunk_size(num_items: int, max_chunk: int) -> int: + if num_items <= max_chunk: + return num_items + k = math.ceil(num_items / max_chunk) + return math.ceil(num_items / k) + + class _NemotronColEmbedVLV2State: def __init__( self, *, model_id: str, device: str, - corpus_chunk_size: int, + max_scoring_batch_size: int, + scoring_chunk_size: int, corpus_batch_size: int, top_k: int, cache_dir: Path, @@ -89,7 +96,8 @@ def __init__( ) -> None: self.model_id = str(model_id) self.device = str(device) - self.corpus_chunk_size = int(corpus_chunk_size) + self.max_scoring_batch_size = int(max_scoring_batch_size) + self.scoring_chunk_size = int(scoring_chunk_size) self.corpus_batch_size = int(corpus_batch_size) self.top_k = int(top_k) self.cache_dir = cache_dir @@ -156,11 +164,7 @@ def _from_pretrained(*, attn_implementation: str): def _corpus_cache_path(self, dataset_name: str) -> Path: dataset_slug = str(dataset_name).replace("/", "__") model_slug = self.model_id.split("/")[-1].replace("/", "__") - key = ( - f"{dataset_name}::{self.model_id}::images" - f"::max_input_tiles={int(self.max_input_tiles)}" - f"::use_thumbnail={bool(self.use_thumbnail)}" - ) + key = f"{dataset_name}::{self.model_id}::images::max_input_tiles={int(self.max_input_tiles)}::use_thumbnail={bool(self.use_thumbnail)}" key_hash = hashlib.sha256(key.encode("utf-8")).hexdigest()[:10] filename = f"corpus_image_embeddings__{dataset_slug}__{model_slug}__{key_hash}.pt" return self.cache_dir / 
filename @@ -242,54 +246,68 @@ def _load_or_build_corpus_embeddings( return emb, lengths def _embed_query(self, query: str) -> torch.Tensor: - # Returns CPU tensor [q_seq, dim] with torch.no_grad(): - q_emb = self.model.forward_queries([str(query)], batch_size=1).detach().to("cpu") + q_emb = self.model.forward_queries([str(query)], batch_size=1).detach() if not isinstance(q_emb, torch.Tensor) or q_emb.ndim != 3 or q_emb.shape[0] != 1: raise RuntimeError(f"Unexpected query embedding shape: {getattr(q_emb, 'shape', None)}") - return q_emb[0] + return q_emb[0].to(self.device) - def _score_query(self, query_embedding_cpu: torch.Tensor) -> torch.Tensor: - if self.corpus_embeddings_cpu is None: - raise RuntimeError("corpus_embeddings_cpu is not set; call init() first") - if self.corpus_token_lengths_cpu is None: - raise RuntimeError("corpus_token_lengths_cpu is not set; call init() first") - - num_corpus = self.corpus_embeddings_cpu.shape[0] - scores_cpu = torch.empty((num_corpus,), dtype=torch.float32, device="cpu") - - chunk = max(1, int(self.corpus_chunk_size)) + def _score_maxsim_block( + self, + emb_block: torch.Tensor, + len_block: torch.Tensor, + q_t: torch.Tensor, + device: str, + ) -> torch.Tensor: + token_sims = torch.matmul(emb_block, q_t) # [block, c_seq, q_seq] + c_seq = int(token_sims.shape[1]) + pos = torch.arange(c_seq, device=device).unsqueeze(0) + valid = pos < len_block.unsqueeze(1) + token_sims = token_sims.masked_fill(~valid.unsqueeze(-1), float("-inf")) + return token_sims.max(dim=1).values.float().sum(dim=1) + + def _score_query(self, query_embedding: torch.Tensor) -> torch.Tensor: + emb_gpu = self.corpus_embeddings_gpu + emb_cpu = self.corpus_embeddings_cpu + len_gpu = self.corpus_token_lengths_gpu + len_cpu = self.corpus_token_lengths_cpu + if emb_gpu is None and emb_cpu is None: + raise RuntimeError("No corpus embeddings available.") + if len_gpu is None and len_cpu is None: + raise RuntimeError("No corpus token lengths available.") + + source = emb_gpu if emb_gpu is not None else emb_cpu + num_corpus = source.shape[0] device = str(self.device) + scores = torch.empty((num_corpus,), dtype=torch.float32, device=device) with torch.no_grad(): - q_gpu = query_embedding_cpu.to(device, non_blocking=True) # [q_seq, dim] - q_t = q_gpu.transpose(0, 1) # [dim, q_seq] + q_t = query_embedding.transpose(0, 1) # [dim, q_seq] - for c_start in range(0, num_corpus, chunk): - c_end = min(c_start + chunk, num_corpus) + if emb_gpu is not None: + scores[:] = self._score_maxsim_block(emb_gpu, len_gpu, q_t, device) + else: + transfer_chunk = _balanced_chunk_size(num_corpus, max(1, int(self.max_scoring_batch_size))) + score_chunk = max(1, int(self.scoring_chunk_size)) - if self.corpus_embeddings_gpu is not None: - c_gpu = self.corpus_embeddings_gpu[c_start:c_end] - else: - c_gpu = self.corpus_embeddings_cpu[c_start:c_end].to(device, non_blocking=True) + for t_start in range(0, num_corpus, transfer_chunk): + t_end = min(t_start + transfer_chunk, num_corpus) + t_emb = emb_cpu[t_start:t_end].to(device) + t_len = len_cpu[t_start:t_end].to(device) - if self.corpus_token_lengths_gpu is not None: - len_gpu = self.corpus_token_lengths_gpu[c_start:c_end] - else: - len_gpu = self.corpus_token_lengths_cpu[c_start:c_end].to(device, non_blocking=True) + for s_start in range(0, t_emb.shape[0], score_chunk): + s_end = min(s_start + score_chunk, t_emb.shape[0]) + block_scores = self._score_maxsim_block( + t_emb[s_start:s_end], + t_len[s_start:s_end], + q_t, + device, + ) + scores[t_start + s_start : t_start + 
s_end] = block_scores - # c_gpu: [chunk, c_seq, dim] - # q_t: [dim, q_seq] - token_sims = torch.matmul(c_gpu, q_t) # [chunk, c_seq, q_seq] - # Mask padded tokens so they can never win the max. - c_seq = int(token_sims.shape[1]) - pos = torch.arange(c_seq, device=device).unsqueeze(0) # [1, c_seq] - valid = pos < len_gpu.to(device=device).unsqueeze(1) # [chunk, c_seq] - token_sims = token_sims.masked_fill(~valid.unsqueeze(-1), float("-inf")) - chunk_scores = token_sims.max(dim=1).values.float().sum(dim=1) # [chunk] - scores_cpu[c_start:c_end] = chunk_scores.detach().to("cpu") + del t_emb, t_len - return scores_cpu + return scores def retrieve_one( self, @@ -298,35 +316,35 @@ def retrieve_one( return_markdown: bool = False, excluded_ids: Optional[Sequence[str]] = None, ) -> Union[Dict[str, float], Tuple[Dict[str, float], Dict[str, str]]]: - if self.corpus_ids is None or self.corpus_embeddings_cpu is None: + if self.corpus_ids is None or (self.corpus_embeddings_gpu is None and self.corpus_embeddings_cpu is None): raise RuntimeError("Retriever not initialized. Call retriever.init(...) first.") - q_emb_cpu = self._embed_query(str(query)) - scores_cpu = self._score_query(q_emb_cpu) + q_emb = self._embed_query(str(query)) + scores = self._score_query(q_emb) - # Apply per-query excluded ids BEFORE top-k selection (BRIGHT semantics). if excluded_ids and self.corpus_id_to_idx: + excluded_indices = [] for did in set(str(x) for x in excluded_ids): if did == "N/A": continue idx = self.corpus_id_to_idx.get(did, None) - if idx is None: - continue - try: - scores_cpu[int(idx)] = float("-inf") - except Exception: - pass + if idx is not None: + excluded_indices.append(int(idx)) + if excluded_indices: + scores[torch.tensor(excluded_indices, device=scores.device)] = float("-inf") k = min(int(self.top_k), len(self.corpus_ids)) - topk_scores, topk_indices = torch.topk(scores_cpu, k) + topk_scores, topk_indices = torch.topk(scores, k) ids = self.corpus_ids - run = {ids[int(idx)]: float(score) for idx, score in zip(topk_indices.tolist(), topk_scores.tolist())} + topk_indices_cpu = topk_indices.cpu().tolist() + topk_scores_cpu = topk_scores.cpu().tolist() + run = {ids[int(idx)]: float(score) for idx, score in zip(topk_indices_cpu, topk_scores_cpu)} if not return_markdown: return run md = self.corpus_markdown or [""] * len(ids) - markdown_by_id = {ids[int(idx)]: str(md[int(idx)]) for idx in topk_indices.tolist()} + markdown_by_id = {ids[int(idx)]: str(md[int(idx)]) for idx in topk_indices_cpu} return run, markdown_by_id @@ -348,10 +366,10 @@ def init( model_id: str = "nvidia/nemotron-colembed-vl-8b-v2", device: str = "cuda", top_k: int = 100, - corpus_batch_size: int = 8, - corpus_chunk_size: int = 256, + corpus_batch_size: int = 32, + max_scoring_batch_size: int = 3000, + scoring_chunk_size: int = 1311, cache_dir: str | Path = "cache/nemotron_colembed_vl_v2", - preload_corpus_to_gpu: bool = False, max_input_tiles: int = 8, use_thumbnail: bool = True, ) -> None: @@ -366,7 +384,8 @@ def init( self._state = _NemotronColEmbedVLV2State( model_id=str(model_id), device=str(device), - corpus_chunk_size=int(corpus_chunk_size), + max_scoring_batch_size=int(max_scoring_batch_size), + scoring_chunk_size=int(scoring_chunk_size), corpus_batch_size=int(corpus_batch_size), top_k=int(top_k), cache_dir=cache_dir_p, @@ -377,7 +396,8 @@ def init( # Update tunables. 
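+        # Tunables (descriptive comment): max_scoring_batch_size caps each CPU->GPU transfer chunk
+        # (and full-GPU residency when the corpus fits); scoring_chunk_size caps the per-matmul
+        # block scored within a transferred chunk.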
self._state.top_k = int(top_k) self._state.corpus_batch_size = int(corpus_batch_size) - self._state.corpus_chunk_size = int(corpus_chunk_size) + self._state.max_scoring_batch_size = int(max_scoring_batch_size) + self._state.scoring_chunk_size = int(scoring_chunk_size) self._state.cache_dir = cache_dir_p self._state.max_input_tiles = int(max_input_tiles) self._state.use_thumbnail = bool(use_thumbnail) @@ -390,18 +410,14 @@ def init( and self._state.corpus_embeddings_cpu is not None and self._state.corpus_token_lengths_cpu is not None ): - # Only adjust GPU preload. - if preload_corpus_to_gpu and self._state.corpus_embeddings_gpu is None: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu( - self._state.corpus_embeddings_cpu, self._state.device - ) - if preload_corpus_to_gpu and self._state.corpus_token_lengths_gpu is None: - self._state.corpus_token_lengths_gpu = self._state.corpus_token_lengths_cpu.to( - self._state.device, non_blocking=True - ) - if (not preload_corpus_to_gpu) and self._state.corpus_embeddings_gpu is not None: + should_be_on_gpu = len(corpus_ids) <= self._state.max_scoring_batch_size + if should_be_on_gpu and self._state.corpus_embeddings_gpu is None: + self._state.corpus_embeddings_gpu = self._state.corpus_embeddings_cpu.to(self._state.device) + if should_be_on_gpu and self._state.corpus_token_lengths_gpu is None: + self._state.corpus_token_lengths_gpu = self._state.corpus_token_lengths_cpu.to(self._state.device) + if (not should_be_on_gpu) and self._state.corpus_embeddings_gpu is not None: self._state.corpus_embeddings_gpu = None - if (not preload_corpus_to_gpu) and self._state.corpus_token_lengths_gpu is not None: + if (not should_be_on_gpu) and self._state.corpus_token_lengths_gpu is not None: self._state.corpus_token_lengths_gpu = None return @@ -424,11 +440,9 @@ def init( self._state.corpus_embeddings_gpu = None self._state.corpus_token_lengths_gpu = None - if preload_corpus_to_gpu: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu(emb_cpu, self._state.device) - self._state.corpus_token_lengths_gpu = self._state.corpus_token_lengths_cpu.to( - self._state.device, non_blocking=True - ) + if emb_cpu.shape[0] <= self._state.max_scoring_batch_size: + self._state.corpus_embeddings_gpu = emb_cpu.to(self._state.device) + self._state.corpus_token_lengths_gpu = lengths_cpu.to(self._state.device) def retrieve( self, diff --git a/retrieval-bench/src/retrieval_bench/singletons/nemotron_embed_vl_dense_retriever.py b/retrieval-bench/src/retrieval_bench/singletons/nemotron_embed_vl_dense_retriever.py index 4e6c256da..49a495a7c 100644 --- a/retrieval-bench/src/retrieval_bench/singletons/nemotron_embed_vl_dense_retriever.py +++ b/retrieval-bench/src/retrieval_bench/singletons/nemotron_embed_vl_dense_retriever.py @@ -38,7 +38,6 @@ from retrieval_bench.singletons._shared import hash_corpus_ids10 as _hash_corpus_ids10 from retrieval_bench.singletons._shared import slugify as _slugify -from retrieval_bench.singletons._shared import try_preload_corpus_to_gpu as _try_preload_corpus_to_gpu def _l2_normalize_fp32(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor: @@ -94,7 +93,7 @@ def __init__( doc_max_length: int, query_max_length: int, corpus_batch_size: int, - corpus_chunk_size: int, + max_scoring_batch_size: int, cache_dir: Path, max_input_tiles: int, use_thumbnail: bool, @@ -106,7 +105,7 @@ def __init__( self.doc_max_length = int(doc_max_length) self.query_max_length = int(query_max_length) self.corpus_batch_size = int(corpus_batch_size) - 
self.corpus_chunk_size = int(corpus_chunk_size) + self.max_scoring_batch_size = int(max_scoring_batch_size) self.cache_dir = cache_dir self.max_input_tiles = int(max_input_tiles) self.use_thumbnail = bool(use_thumbnail) @@ -156,11 +155,13 @@ def _from_pretrained(*, attn_implementation: str): **common_kwargs, ) - # Prefer FlashAttention2 when available; fall back to eager. try: model = _from_pretrained(attn_implementation="flash_attention_2") - except Exception: - model = _from_pretrained(attn_implementation="eager") + except Exception as e: + raise RuntimeError( + f"Failed to load {self.model_id} with flash_attention_2: {e}\n" + 'Install a compatible flash-attn: pip install "flash-attn>=2.6.3,<2.8" --no-build-isolation' + ) from e model.to("cuda") model.eval() @@ -288,33 +289,47 @@ def _embed_corpus_batched(self, corpus: Sequence[Dict[str, Any]]) -> torch.Tenso bs = max(1, int(self.corpus_batch_size)) out: List[torch.Tensor] = [] modality = str(self.doc_modality).strip().lower() + n_batches = (len(corpus) + bs - 1) // bs + total_preprocess_s = 0.0 + total_forward_s = 0.0 - # Set doc max length for document calls. self._set_processor_max_length_for_call(p_max_length=int(self.doc_max_length)) - with torch.inference_mode(): + with torch.no_grad(): for i in range(0, len(corpus), bs): batch = corpus[i : i + bs] + t0 = time.time() if modality == "image": images = [doc["image"].convert("RGB") for doc in batch] - emb = self.model.encode_documents(images=images) + examples = [{"image": img, "text": ""} for img in images] elif modality == "text": texts = [str(doc.get("markdown", "")) for doc in batch] - emb = self.model.encode_documents(texts=texts) + examples = [{"image": "", "text": t} for t in texts] else: # image_text images = [doc["image"].convert("RGB") for doc in batch] texts = [str(doc.get("markdown", "")) for doc in batch] - emb = self.model.encode_documents(images=images, texts=texts) + examples = [{"image": img, "text": t} for img, t in zip(images, texts)] - if not isinstance(emb, torch.Tensor): - raise RuntimeError(f"encode_documents returned unexpected type: {type(emb)}") - if emb.ndim != 2: - raise RuntimeError(f"Unexpected document embedding shape: {tuple(emb.shape)}") + docs_dict = self.model.processor.process_documents(examples) + t1 = time.time() + + emb = self.model._embed_batch(docs_dict) + torch.cuda.synchronize() + t2 = time.time() + + total_preprocess_s += t1 - t0 + total_forward_s += t2 - t1 - emb = _l2_normalize_fp32(emb).to(torch.float16).detach().to("cpu") - out.append(emb) + if not isinstance(emb, torch.Tensor) or emb.ndim != 2: + raise RuntimeError(f"Unexpected embedding: type={type(emb)}, shape={getattr(emb, 'shape', None)}") + out.append(_l2_normalize_fp32(emb).to(torch.float16).detach().to("cpu")) + + print( + f"[nemotron-vl-dense] corpus embedding: {n_batches} batches x {bs} docs, " + f"preprocess={total_preprocess_s:.1f}s, forward={total_forward_s:.1f}s" + ) return torch.cat(out, dim=0) if out else torch.empty((0, 0), dtype=torch.float16, device="cpu") def _load_or_build_corpus_embeddings( @@ -378,10 +393,9 @@ def _load_or_build_corpus_embeddings( return emb def embed_query(self, query_text: str) -> torch.Tensor: - # Set query max length for query call. 
self._set_processor_max_length_for_call(p_max_length=int(self.query_max_length)) - with torch.inference_mode(): + with torch.no_grad(): emb = self.model.encode_queries([str(query_text)]) if not isinstance(emb, torch.Tensor): @@ -390,34 +404,32 @@ def embed_query(self, query_text: str) -> torch.Tensor: raise RuntimeError(f"Unexpected query embedding shape: {tuple(emb.shape)}") emb1 = emb[0] - emb1 = _l2_normalize_fp32(emb1).to(torch.float16).detach().to("cpu") - return emb1 # [dim] on CPU + emb1 = _l2_normalize_fp32(emb1).to(torch.float16).detach() + return emb1 # [dim] on GPU - def score_query(self, query_embedding_cpu: torch.Tensor) -> torch.Tensor: - if self.corpus_embeddings_cpu is None: - raise RuntimeError("corpus_embeddings_cpu is not set; call init() first") + def score_query(self, query_embedding: torch.Tensor) -> torch.Tensor: + emb_gpu = self.corpus_embeddings_gpu + emb_cpu = self.corpus_embeddings_cpu + if emb_gpu is None and emb_cpu is None: + raise RuntimeError("No corpus embeddings available.") - num_docs = self.corpus_embeddings_cpu.shape[0] - scores_cpu = torch.empty((num_docs,), dtype=torch.float32, device="cpu") + with torch.no_grad(): + q_col = query_embedding.unsqueeze(1) # [dim, 1] - chunk = max(1, int(self.corpus_chunk_size)) - device = str(self.device) + if emb_gpu is not None: + return torch.matmul(emb_gpu, q_col).squeeze(1).float() - with torch.inference_mode(): - q_gpu = query_embedding_cpu.to(device, non_blocking=True) # [dim] - q_gpu = q_gpu.unsqueeze(1) # [dim, 1] + num_docs = emb_cpu.shape[0] + device = str(self.device) + scores = torch.empty((num_docs,), dtype=torch.float32, device=device) + chunk = max(1, int(self.max_scoring_batch_size)) for c_start in range(0, num_docs, chunk): c_end = min(c_start + chunk, num_docs) - if self.corpus_embeddings_gpu is not None: - c_gpu = self.corpus_embeddings_gpu[c_start:c_end] - else: - c_gpu = self.corpus_embeddings_cpu[c_start:c_end].to(device, non_blocking=True) - - chunk_scores = torch.matmul(c_gpu, q_gpu).squeeze(1).float() # [chunk] - scores_cpu[c_start:c_end] = chunk_scores.to("cpu") + c_chunk = emb_cpu[c_start:c_end].to(device) + scores[c_start:c_end] = torch.matmul(c_chunk, q_col).squeeze(1).float() - return scores_cpu + return scores def retrieve_one( self, @@ -426,35 +438,35 @@ def retrieve_one( return_markdown: bool = False, excluded_ids: Optional[Sequence[str]] = None, ) -> Union[Dict[str, float], Tuple[Dict[str, float], Dict[str, str]]]: - if self.corpus_ids is None or self.corpus_embeddings_cpu is None: + if self.corpus_ids is None or (self.corpus_embeddings_gpu is None and self.corpus_embeddings_cpu is None): raise RuntimeError("Retriever not initialized. Call retriever.init(...) first.") - q_emb_cpu = self.embed_query(str(query)) - scores_cpu = self.score_query(q_emb_cpu) + q_emb = self.embed_query(str(query)) + scores = self.score_query(q_emb) - # Apply per-query excluded ids BEFORE top-k selection (BRIGHT semantics). 
if excluded_ids and self.corpus_id_to_idx: + excluded_indices = [] for did in set(str(x) for x in excluded_ids): if did == "N/A": continue idx = self.corpus_id_to_idx.get(did, None) - if idx is None: - continue - try: - scores_cpu[int(idx)] = float("-inf") - except Exception: - pass + if idx is not None: + excluded_indices.append(int(idx)) + if excluded_indices: + scores[torch.tensor(excluded_indices, device=scores.device)] = float("-inf") k = min(int(self.top_k), len(self.corpus_ids)) - topk_scores, topk_indices = torch.topk(scores_cpu, k) + topk_scores, topk_indices = torch.topk(scores, k) ids = self.corpus_ids - run = {ids[int(idx)]: float(score) for idx, score in zip(topk_indices.tolist(), topk_scores.tolist())} + topk_indices_cpu = topk_indices.cpu().tolist() + topk_scores_cpu = topk_scores.cpu().tolist() + run = {ids[int(idx)]: float(score) for idx, score in zip(topk_indices_cpu, topk_scores_cpu)} if not return_markdown: return run md = self.corpus_markdown or [""] * len(ids) - markdown_by_id = {ids[int(idx)]: str(md[int(idx)]) for idx in topk_indices.tolist()} + markdown_by_id = {ids[int(idx)]: str(md[int(idx)]) for idx in topk_indices_cpu} return run, markdown_by_id @@ -475,10 +487,9 @@ def init( doc_modality: str = "image_text", doc_max_length: Union[int, str] = "auto", query_max_length: int = 10240, - corpus_batch_size: int = 4, - corpus_chunk_size: int = 4096, + corpus_batch_size: int = 32, + max_scoring_batch_size: int = 4096, cache_dir: str | Path = "cache/nemotron_vl_dense", - preload_corpus_to_gpu: bool = False, max_input_tiles: int = 6, use_thumbnail: bool = True, ) -> None: @@ -521,7 +532,7 @@ def init( doc_max_length=int(doc_max_length_eff), query_max_length=int(query_max_length), corpus_batch_size=int(corpus_batch_size), - corpus_chunk_size=int(corpus_chunk_size), + max_scoring_batch_size=int(max_scoring_batch_size), cache_dir=cache_dir_p, max_input_tiles=int(max_input_tiles), use_thumbnail=bool(use_thumbnail), @@ -530,7 +541,7 @@ def init( # Update tunables. self._state.top_k = int(top_k) self._state.corpus_batch_size = int(corpus_batch_size) - self._state.corpus_chunk_size = int(corpus_chunk_size) + self._state.max_scoring_batch_size = int(max_scoring_batch_size) self._state.cache_dir = cache_dir_p corpus_ids_list = [str(x) for x in corpus_ids] @@ -542,12 +553,10 @@ def init( and _hash_corpus_ids10(self._state.corpus_ids) == corpus_ids_hash10 and self._state.corpus_embeddings_cpu is not None ): - # Already initialized for the same corpus; only adjust GPU preload. 
- if preload_corpus_to_gpu and self._state.corpus_embeddings_gpu is None: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu( - self._state.corpus_embeddings_cpu, self._state.device - ) - if (not preload_corpus_to_gpu) and self._state.corpus_embeddings_gpu is not None: + should_be_on_gpu = len(corpus_ids_list) <= self._state.max_scoring_batch_size + if should_be_on_gpu and self._state.corpus_embeddings_gpu is None: + self._state.corpus_embeddings_gpu = self._state.corpus_embeddings_cpu.to(self._state.device) + if (not should_be_on_gpu) and self._state.corpus_embeddings_gpu is not None: self._state.corpus_embeddings_gpu = None return @@ -564,8 +573,8 @@ def init( self._state.corpus_embeddings_cpu = emb_cpu self._state.corpus_embeddings_gpu = None - if preload_corpus_to_gpu: - self._state.corpus_embeddings_gpu = _try_preload_corpus_to_gpu(emb_cpu, self._state.device) + if emb_cpu.shape[0] <= self._state.max_scoring_batch_size: + self._state.corpus_embeddings_gpu = emb_cpu.to(self._state.device) def retrieve( self, diff --git a/retrieval-bench/submissions/bright_agentic.md b/retrieval-bench/submissions/bright_agentic.md new file mode 100644 index 000000000..3296bdcc9 --- /dev/null +++ b/retrieval-bench/submissions/bright_agentic.md @@ -0,0 +1,146 @@ +

+# NVIDIA NeMo Retriever’s Agentic Retrieval Pipeline
+
+Rank 2 on the BRIGHT benchmark leaderboard
+
+NeMo Retriever's agentic retrieval pipeline is a high-performance retrieval pipeline, augmented with a reasoning retriever, developed by the NVIDIA NeMo Retriever team.
+It provides a strong, lightweight solution for complex reasoning-intensive tasks with minimal supervision.
+
+Introduction • Architecture • Synthetic Data • Agentic Pipeline • Usage • Performance • Acknowledgement
+## 📖 Introduction
+
+With the rapid development of large language models (LLMs), AI agents are able to handle increasingly complex tasks, and the paradigm of information retrieval has evolved accordingly. The BRIGHT benchmark introduces reasoning-intensive retrieval tasks, in which the queries require explicit reasoning to identify the correct documents. This benchmark moves beyond surface-level keyword matching and shifts toward complex intent-aware matching.
+
+NeMo Retriever's agentic retrieval pipeline addresses this challenge through agentic retrieval and a retrieval model designed specifically for reasoning-oriented tasks, introducing innovations in both reasoning and retrieval. Our method achieves second place on the BRIGHT benchmark, demonstrating its effectiveness and generalizability.
+
+## 🛠️ Model Architecture
+
+The framework is built around two highly independent, custom-designed modules.
+
+### 1. Retriever Model
+* **Model**: [llama-nv-embed-reasoning-3b](https://huggingface.co/nvidia/llama-nv-embed-reasoning-3b)
+* **Backbone**: LLaMA-3.2-3B (non-instruct, fine-tuned via an InfoNCE loss; see the sketch after this section)
+* **Function**: A dense retriever trained on large-scale LLM-generated synthetic data to align queries with relevant documents by bringing them closer in the representation space, without relying on keyword overlap.
+
+### 2. Agentic Retrieval Pipeline
+* **Model**: [pipeline](https://github.com/NVIDIA/NeMo-Retriever/tree/main/retrieval-bench#agentic-retrieval) with Claude Opus 4.5
+* **Principle**: Built around a commercial LLM, we develop an agentic RAG framework that uses carefully designed prompts to guide the LLM through multi-step retrieval and reasoning. The framework iteratively retrieves and answers sub-questions, then summarizes the retrieved evidence and selects the top-k documents to support answering the main query.
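+
+The retriever in module 1 is fine-tuned with an InfoNCE objective. As a rough illustration of what that objective computes, here is a minimal in-batch-negatives sketch; the function name, temperature, and batch construction are illustrative assumptions, not the actual training code:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def info_nce_loss(q_emb: torch.Tensor, d_emb: torch.Tensor, temperature: float = 0.05) -> torch.Tensor:
+    """InfoNCE with in-batch negatives: q_emb[i] is the query paired with positive d_emb[i].
+
+    Every other document embedding in the batch acts as a negative for that query.
+    The temperature value here is an illustrative choice, not the model's actual setting.
+    """
+    q = F.normalize(q_emb, dim=-1)
+    d = F.normalize(d_emb, dim=-1)
+    logits = q @ d.t() / temperature                      # [batch, batch] cosine similarities
+    targets = torch.arange(q.shape[0], device=q.device)   # diagonal entries are the positives
+    return F.cross_entropy(logits, targets)
+```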

+## 📦 Synthetic Training Data Generation for NeMo Retriever
+
+![Synthetic data generation pipeline](sdg.png)
+
+The data synthesis pipeline constructs high-quality query–document training data through LLM-driven query generation and retrieval-based positive and hard negative selection.
+
+(1) **Query Generation.**
+Starting from a large raw corpus (e.g., the BRIGHT corpus), a domain-specific filter is applied to obtain a set of relevant documents. In addition, to ensure that no test-set documents are leaked, we explicitly filtered out all positive documents associated with the test sets during the question generation process. Each filtered document is treated as an anchor document and used to retrieve its top-4 most similar documents from the corpus using a retrieval model ([*reason-embed-basic-qwen3-4b-0928*](https://huggingface.co/hanhainebula/reason-embed-basic-qwen3-4b-0928)). The anchor document together with its retrieved neighbors forms a document set. Conditioned on each document set, we use an LLM ([*Qwen3-235B-A22B*](https://huggingface.co/Qwen/Qwen3-235B-A22B)) to generate natural language queries of at most 300 tokens. The model is instructed to generate reasoning-intensive questions, such that answering a question requires non-trivial reasoning and genuinely depends on the associated documents.
+
+(2) **Positive Annotation and Hard Negative Mining.**
+Given the generated queries, query–document pairs are constructed by pairing each query with documents from the corpus. An LLM ([*Qwen3-next-80b-a3b-instruct*](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct)) is used to identify documents that genuinely support or answer the query, yielding a set of positive documents and their similarity distribution. To further improve training quality, hard negatives are mined using a retrieval model ([*reason-embed-basic-qwen3-4b-0928*](https://huggingface.co/hanhainebula/reason-embed-basic-qwen3-4b-0928)). For each query, documents are retrieved based on embedding similarity, and those with similarity scores slightly lower than the positives (i.e., $\text{Sim}_{neg} < \text{Sim}_{pos}$) are selected as hard negatives, as sketched below. This criterion ensures that negatives remain semantically close to the query while avoiding false positives.
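+
+A minimal sketch of that hard-negative criterion, assuming L2-normalized embeddings and an illustrative margin; the actual pipeline's thresholds and sampling are not specified here:
+
+```python
+import torch
+
+def mine_hard_negatives(query_emb: torch.Tensor,
+                        doc_embs: torch.Tensor,
+                        positive_idx: torch.Tensor,
+                        margin: float = 0.05,
+                        num_negatives: int = 8) -> list[int]:
+    """Select documents whose similarity falls just below the positives (Sim_neg < Sim_pos).
+
+    query_emb: [dim], doc_embs: [num_docs, dim], positive_idx: indices of annotated positives.
+    """
+    sims = doc_embs @ query_emb                                   # similarity to every document
+    pos_floor = sims[positive_idx].min()                          # weakest positive sets the ceiling
+    candidate = (sims < pos_floor) & (sims > pos_floor - margin)  # close to, but below, the positives
+    candidate[positive_idx] = False                               # never reuse an annotated positive
+    ranked = torch.argsort(sims, descending=True)
+    return [int(i) for i in ranked if candidate[i]][:num_negatives]
+```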

+After generating our synthetic data, we additionally incorporate the training sets from [ReasonEmbed](https://arxiv.org/abs/2510.08252), [ReasonAug](https://arxiv.org/abs/2505.15045), and [ReasonRank](https://arxiv.org/abs/2508.07050) to further train the retriever. In particular, based on empirical performance, we selected data from the following domains for training:
+
+- From **ReasonEmbed**: *biology*, *earth_science*, *economics*, *psychology*, *robotics*, *sustainable_living*, *stackoverflow*, *pony*, *theoremqa_questions*, and *theoremqa_theorems*.
+- From **ReasonAug**: *math* and *theorem*.
+- From **ReasonRank**: *biology*, *stackoverflow*, *math-qa*, *math-theorem*, *earth_science*, and *robotics*.
+
+## 🤖 NeMo Retriever’s Agentic Retrieval Pipeline
+
+### The Agentic Loop
+Our agentic retrieval pipeline relies on a ReAct architecture. Instead of a single "one-and-done" query, the agent iteratively searches, evaluates, and refines its approach. The agent uses built-in tools such as `think` to plan its approach and `final_results` to output the exact documents needed, alongside a `retrieve(query, top_k)` tool to explore the corpus. Through this loop, we observed successful search patterns emerge naturally:
+- Generating better queries: The agent dynamically adjusts its search queries based on newly discovered information.
+- Persistent rephrasing: It continually rephrases queries until useful information is found.
+- Breaking down complexity: It translates complex, multi-part queries into multiple simpler queries with clear goals.
+
+Finally, the agent calls the `final_results` tool to output the most relevant documents. As a safety net—for example, when the agent hits the maximum number of steps or the context length limit—the pipeline falls back to Reciprocal Rank Fusion (RRF), which scores documents based on their ranks across all retrieval attempts in the agent trajectory.
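+
+For reference, RRF scores each document as $\sum_{r} 1 / (k + \mathrm{rank}_r(d))$ over the ranked lists produced by the agent's retrieval calls. A minimal sketch of this fallback, assuming plain ranked lists of document ids and the conventional constant $k = 60$; the pipeline's exact implementation may differ:
+
+```python
+from collections import defaultdict
+from typing import Dict, List
+
+def rrf_fuse(rankings: List[List[str]], k: int = 60, top_k: int = 10) -> Dict[str, float]:
+    """Fuse ranked doc-id lists from every retrieve() call in the agent trajectory.
+
+    A document scores sum(1 / (k + rank)) across the lists it appears in, so documents
+    that repeatedly surface near the top win even when the agent never issues a verdict.
+    """
+    scores: Dict[str, float] = defaultdict(float)
+    for ranking in rankings:
+        for rank, doc_id in enumerate(ranking, start=1):
+            scores[doc_id] += 1.0 / (k + rank)
+    top = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
+    return dict(top)
+```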
+### Engineering for Speed and Scale
+Agentic workflows are notoriously slow. Initially, we used a Model Context Protocol (MCP) server to connect the retriever and the agent, but this architecture imposed a heavy "performance tax." The overhead of managing separate processes, loading GPU-resident embeddings for every run, and handling network latency created significant bottlenecks and frequent server freezes. To resolve this, we replaced the MCP server with a thread-safe singleton retriever that lives in-process.
+
+By loading the model and corpus once and protecting access with a reentrant lock, we achieved safe, shared GPU access without network serialization overhead. This single change eliminated deployment errors and dramatically improved both GPU utilization and experiment throughput.
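+
+A minimal sketch of that in-process pattern, assuming a generic `retrieve_fn` callable rather than the repository's actual retriever interface:
+
+```python
+import threading
+from typing import Callable, Dict
+
+class SharedRetriever:
+    """Hold one retrieval function in-process and guard GPU access with a reentrant lock."""
+
+    _instance = None
+    _create_lock = threading.Lock()
+
+    def __init__(self, retrieve_fn: Callable[[str, int], Dict[str, float]]):
+        self._retrieve_fn = retrieve_fn  # closes over a model and corpus loaded exactly once
+        self._lock = threading.RLock()   # reentrant, so nested calls on one thread don't deadlock
+
+    @classmethod
+    def get(cls, retrieve_fn: Callable[[str, int], Dict[str, float]]) -> "SharedRetriever":
+        with cls._create_lock:           # create the shared instance at most once across threads
+            if cls._instance is None:
+                cls._instance = cls(retrieve_fn)
+        return cls._instance
+
+    def retrieve(self, query: str, top_k: int = 10) -> Dict[str, float]:
+        with self._lock:                 # serialize GPU access; no MCP or network round-trip
+            return self._retrieve_fn(query, top_k)
+```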
+
+## 🚀 Usage
+To reproduce our results on BRIGHT, follow the [installation instructions](https://github.com/NVIDIA/NeMo-Retriever/tree/main/retrieval-bench), then run the following command:
+
+```bash
+retrieval-bench evaluate agentic-retrieval \
+    --dataset-name bright/biology \
+    --backend llama-embed-nemotron-reasoning-3b \
+    --llm-model openai/aws/anthropic/claude-opus-4-5 \
+    --num-concurrent 1
+```
+
+You can specify BRIGHT task names via `--dataset-name`. By default, the pipeline reads `OPENAI_API_KEY` and `OPENAI_BASE_URL` from environment variables; override these via `--pipeline-args`:
+
+```bash
+retrieval-bench evaluate agentic-retrieval \
+    --dataset-name bright/biology \
+    --backend llama-nv-embed-reasoning-3b \
+    --llm-model your-llm-model \
+    --pipeline-args '{"api_key":"os.environ/MY_KEY","base_url":"os.environ/MY_URL"}'
+```
+
+## 📊 Performance
+
+**BRIGHT** is a benchmark for reasoning-intensive information retrieval, where determining the relevance between a query and a document goes beyond lexical or semantic matching and requires deliberate, multi-step reasoning. It covers diverse and advanced domains such as economics, mathematics, programming, and natural sciences, with queries drawn from real human data and carefully curated sources. In BRIGHT, relevant documents often share underlying principles, theories, or algorithms with the query rather than surface-level similarity. As a result, state-of-the-art retrievers that perform well on traditional benchmarks like BEIR and MTEB show substantial performance drops on BRIGHT, highlighting its difficulty and its role in evaluating and advancing retrieval models with genuine reasoning capabilities.
+
+### Short-document setting in BRIGHT
+
+#### Results (nDCG@10) Across 12 Datasets for the NeMo Retriever’s Agentic Retrieval Pipeline
+
+| Model | Avg | Bio. | Earth. | Econ. | Psy. | Rob. | Stack. | Sus. | Leet. | Pony | AoPS | TheoQ. | TheoT. |
+| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| INF-X-Retriever | 63.4 | 79.8 | 70.9 | 69.9 | 73.3 | 57.7 | 64.3 | 61.9 | 56.1 | 54.5 | 51.9 | 53.1 | 67.9 |
+| **NeMo Retriever's Agentic Pipeline** | **50.9** | **72.8** | **66.0** | **48.7** | **59.6** | **52.5** | **47.1** | **50.2** | **49.3** | **42.1** | **21.0** | **53.3** | **48.0** |
+| DIVER (v3) | 46.8 | 66.0 | 63.7 | 42.4 | 55.0 | 40.6 | 44.7 | 50.4 | 32.5 | 47.3 | 17.2 | 46.4 | 55.6 |
+| BGE-Reasoner-0928* | 46.4 | 68.5 | 66.4 | 40.6 | 53.1 | 43.2 | 44.1 | 47.8 | 29.0 | 41.6 | 17.2 | 46.5 | 58.4 |
+| LATTICE | 42.1 | 64.4 | 62.4 | 45.4 | 57.4 | 47.6 | 37.6 | 46.4 | 19.9 | 34.0 | 12.0 | 30.1 | 47.8 |
+| ReasonRank | 40.8 | 62.7 | 55.5 | 36.7 | 54.6 | 35.7 | 38.0 | 44.8 | 29.5 | 25.6 | 14.4 | 42.0 | 50.1 |
+| XDR2 | 40.3 | 63.1 | 55.4 | 38.5 | 52.9 | 37.1 | 38.2 | 44.6 | 21.9 | 35.0 | 15.7 | 34.4 | 46.2 |
+
+#### Results (nDCG@10) Across 12 Datasets for the llama-nv-embed-reasoning-3b model
+
+| Model | Avg | Bio. | Earth. | Econ. | Psy. | Rob. | Stack. | Sus. | Leet. | Pony | AoPS | TheoQ. | TheoT. |
+| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| **llama-nv-embed-reasoning-3b** | **38.3** | **63.4** | **60.2** | **39.5** | **45.5** | **32.6** | **34.0** | **43.3** | **37.5** | **15.0** | **10.5** | **39.5** | **38.5** |
+| ReasonEmbed-Qwen3-8B (Redapter) | 38.1 | 55.5 | 56.6 | 36.2 | 47.4 | 35.3 | 36.6 | 39.1 | 33.6 | 16.4 | 12.5 | 41.4 | 47.2 |
+| ReasonEmbed-Qwen3-4B (Redapter) | 37.1 | 55.4 | 54.5 | 34.9 | 46.9 | 34.0 | 36.1 | 37.4 | 34.5 | 13.6 | 11.3 | 41.4 | 45.1 |
+| ReasonEmbed-Llama-3.1-8B (Redapter) | 36.2 | 55.4 | 56.2 | 35.2 | 48.5 | 32.1 | 37.3 | 41.1 | 28.8 | 16.8 | 9.1 | 37.9 | 36.6 |
+| DIVER-Retriever | 28.9 | 41.8 | 43.7 | 21.7 | 35.3 | 21.0 | 21.2 | 25.1 | 37.6 | 13.2 | 10.7 | 38.4 | 37.3 |
+| Seed-1.5-Embedding | 27.2 | 34.8 | 46.9 | 23.4 | 31.6 | 19.1 | 25.4 | 21.0 | 43.2 | 4.9 | 12.2 | 33.3 | 30.5 |
+| RaDeR-gte-Qwen2-7B | 25.5 | 34.6 | 38.9 | 22.1 | 33.0 | 14.8 | 22.5 | 23.7 | 37.3 | 5.0 | 10.2 | 28.4 | 35.1 |
+| ReasonIR-8B | 24.4 | 26.2 | 31.4 | 23.3 | 30.0 | 18.0 | 23.9 | 20.5 | 35.0 | 10.5 | 14.7 | 31.9 | 27.2 |
+| Qwen3-Embedding-8B | 22.8 | 21.0 | 33.0 | 18.4 | 26.1 | 15.7 | 19.4 | 17.3 | 33.8 | 1.2 | 9.4 | 39.2 | 39.3 |
+| llama-nemotron-embed-3b-v2 | 22.3 | 31.1 | 36.7 | 22.3 | 28.4 | 18.0 | 18.4 | 20.3 | 32.1 | 6.8 | 12.1 | 25.1 | 16.5 |
+| Qwen3-Embedding-4B | 21.8 | 17.8 | 34.7 | 16.9 | 23.3 | 12.5 | 16.2 | 16.8 | 35.7 | 1.4 | 9.8 | 35.5 | 41.5 |
+| BM25 | 14.5 | 18.9 | 27.2 | 14.9 | 12.5 | 13.6 | 18.4 | 15.0 | 24.4 | 7.9 | 6.2 | 10.4 | 4.9 |
+
+## ✨ Acknowledgement
+This pipeline is based on work done by [Jie He](https://probe2.github.io/) (retriever model) and [Reza Esfandiarpoor](https://reza.website/) (agentic pipeline) during their internships at NVIDIA.
diff --git a/retrieval-bench/submissions/sdg.png b/retrieval-bench/submissions/sdg.png new file mode 100644 index 000000000..0484f0606 Binary files /dev/null and b/retrieval-bench/submissions/sdg.png differ diff --git a/src/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py b/src/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py index c96977ec2..f40dbeca6 100644 --- a/src/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +++ b/src/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py @@ -64,8 +64,8 @@ def get_instance() -> "RedisIngestService": redis_task_queue: str = os.getenv("REDIS_INGEST_TASK_QUEUE", "ingest_task_queue") fetch_mode: "FetchMode" = get_fetch_mode_from_env() - result_data_ttl: int = int(os.getenv("RESULT_DATA_TTL_SECONDS", "3600")) - state_ttl: int = int(os.getenv("STATE_TTL_SECONDS", "7200")) + result_data_ttl: int = int(os.getenv("RESULT_DATA_TTL_SECONDS", "172800")) + state_ttl: int = int(os.getenv("STATE_TTL_SECONDS", "172800")) cache_config: Dict[str, Any] = { "directory": os.getenv("FETCH_CACHE_DIR", "./.fetch_cache"), diff --git a/src/pyproject.toml b/src/pyproject.toml index be01e194c..817af365b 100644 --- a/src/pyproject.toml +++ b/src/pyproject.toml @@ -13,10 +13,10 @@ readme = "README.md" authors = [ {name = "Jeremy Dyer", email = "jdyer@nvidia.com"} ] -license = {file = "LICENSE"} +license = "Apache-2.0" +license-files = ["LICENSE"] classifiers = [ "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] dependencies = [ diff --git a/tools/harness/README.md b/tools/harness/README.md index ee8989d92..63934a845 100644 --- a/tools/harness/README.md +++ b/tools/harness/README.md @@ -377,6 +377,7 @@ Recall testing evaluates retrieval accuracy against ground truth query sets. Two - Creates multimodal collection during ingestion - Evaluates recall immediately after ingestion - Combines ingestion metrics with recall metrics +- With **`--minimize-vram`** (managed only): keeps only the ingestion stack up for e2e, then stops ingestion services and starts retrieval services (e.g. reranker) before recall to reduce VRAM usage. Mutually exclusive with `--keep-up`. 
### Reranker Configuration @@ -775,6 +776,9 @@ uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed --no-build # Keep services running after test (useful for multi-test scenarios) uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed --keep-up +# Minimize VRAM during e2e_recall: stop ingestion-only services between e2e and recall, start reranker only when needed (managed + e2e_recall only; mutually exclusive with --keep-up) +uv run nv-ingest-harness-run --case=e2e_recall --dataset=bo767 --managed --minimize-vram + # Use GPU-specific configuration (A10G, L40S, A100-40GB) uv run nv-ingest-harness-run --case=e2e --dataset=bo767 --managed --sku=a10g diff --git a/tools/harness/nightly_config.yaml b/tools/harness/nightly_config.yaml index f0113b315..38ef89921 100644 --- a/tools/harness/nightly_config.yaml +++ b/tools/harness/nightly_config.yaml @@ -26,7 +26,7 @@ runs: # Recall configuration (applies to all e2e_recall runs) recall: reranker_mode: both # Options: "none", "with", "both" - reranker_endpoint: http://localhost:8020/v1/ranking # Local container + reranker_endpoint: http://localhost:8015/v1/ranking # Local container top_k: 10 # Sinks configuration diff --git a/tools/harness/src/nv_ingest_harness/cases/recall.py b/tools/harness/src/nv_ingest_harness/cases/recall.py index 7b1788b9d..2c00ecb99 100644 --- a/tools/harness/src/nv_ingest_harness/cases/recall.py +++ b/tools/harness/src/nv_ingest_harness/cases/recall.py @@ -138,8 +138,9 @@ def main(config=None, log_path: str = "test_results") -> int: "top_k": recall_top_k, "gpu_search": gpu_search, "ground_truth_dir": ground_truth_dir, + "dataset_dir": config.dataset_dir, "vdb_backend": vdb_backend, - "nv_ranker_endpoint": f"http://{hostname}:8020/v1/ranking", + "nv_ranker_endpoint": f"http://{hostname}:8015/v1/ranking", "nv_ranker_model_name": "nvidia/llama-nemotron-rerank-1b-v2", } if vdb_backend == "lancedb": diff --git a/tools/harness/src/nv_ingest_harness/cli/nightly.py b/tools/harness/src/nv_ingest_harness/cli/nightly.py index 68a41f9e1..2e4541cb1 100644 --- a/tools/harness/src/nv_ingest_harness/cli/nightly.py +++ b/tools/harness/src/nv_ingest_harness/cli/nightly.py @@ -39,8 +39,9 @@ def run_harness( managed: bool = False, sku: str | None = None, test_config_path: str | None = None, + minimize_vram: bool = False, ) -> tuple[int, Path | None]: - """Run a single harness test.""" + """Run a single harness test (or multiple datasets when dataset is comma-separated and case is e2e_recall).""" cmd = [ sys.executable, "-m", @@ -53,6 +54,9 @@ def run_harness( if managed: cmd.append("--managed") + if minimize_vram: + cmd.append("--minimize-vram") + if session_dir: cmd.append(f"--session-dir={str(session_dir)}") @@ -142,6 +146,12 @@ def load_results(artifact_dir: Path) -> dict: is_flag=True, help="Keep services running after nightly run completes (only with --managed)", ) +@click.option( + "--minimize-vram", + is_flag=True, + help="Minimize VRAM during e2e_recall by stopping ingestion services between e2e and recall (managed only; " + "`--managed` required; mutually exclusive with `--keep-up`)", +) @click.option( "--skip-slack", is_flag=True, @@ -199,6 +209,7 @@ def main( deployment_type: str, managed: bool, keep_up: bool, + minimize_vram: bool, skip_slack: bool, skip_history: bool, skip_fresh_start: bool, @@ -210,6 +221,10 @@ def main( test_config_path: str | None, ): """Run nightly benchmarks and post results.""" + if keep_up and minimize_vram: + print("Error: --keep-up and --minimize-vram are mutually exclusive.", 
file=sys.stderr) + return 1 + if replay_dirs: return _replay_results(replay_dirs) @@ -264,7 +279,7 @@ def main( os.environ["RERANKER_MODE"] = reranker_mode # Use local reranker container instead of build API - reranker_endpoint = recall_config.get("reranker_endpoint", "http://localhost:8020/v1/ranking") + reranker_endpoint = recall_config.get("reranker_endpoint", "http://localhost:8015/v1/ranking") os.environ["RERANKER_NIM_ENDPOINT"] = reranker_endpoint # Pass recall_top_k from nightly config to harness recall_top_k = recall_config.get("top_k", 10) @@ -342,8 +357,9 @@ def main( print("Failed to start services") return 1 - # Wait for readiness - if not service_manager.check_readiness(service_config.readiness_timeout): + # Wait for readiness (skip Milvus check when using LanceDB) + check_milvus = service_config.vdb_backend == "milvus" + if not service_manager.check_readiness(service_config.readiness_timeout, check_milvus=check_milvus): print("Services failed to become ready") service_manager.stop() return 1 @@ -370,22 +386,63 @@ def main( for sink in sinks: sink.process_result(result) - for dataset in recall_datasets: - print(f"\n--- Running e2e_recall for {dataset} ---") - rc, artifact_dir = run_harness( - dataset, + if minimize_vram and recall_datasets: + # Run all recall datasets in one harness invocation so run_datasets can do + # e2e -> stop ingestion -> recall -> start ingestion per dataset. + # Pass managed=True so the run CLI owns service lifecycle and stage control. + print(f"\n--- Running e2e_recall (minimize VRAM) for {', '.join(recall_datasets)} ---") + rc, _ = run_harness( + ",".join(recall_datasets), case="e2e_recall", session_dir=session_dir, deployment_type=deployment_type, - managed=False, # Don't manage per-dataset, already managed at nightly level + managed=True, sku=sku, test_config_path=test_config_path, + minimize_vram=True, ) - result = _process_result(dataset, rc, artifact_dir, case="e2e_recall") - all_results.append(result) + # Per-dataset results: read artifact_paths and each dataset's results.json for return_code + artifact_paths_file = session_dir / ".artifact_paths.json" + if artifact_paths_file.exists(): + with open(artifact_paths_file) as f: + artifact_paths = json.load(f) + for dataset in recall_datasets: + artifact_dir_str = artifact_paths.get(dataset) + artifact_dir = Path(artifact_dir_str) if artifact_dir_str else None + dataset_rc = rc + if artifact_dir and (artifact_dir / "results.json").exists(): + try: + with open(artifact_dir / "results.json") as rf: + dataset_rc = json.load(rf).get("return_code", rc) + except (json.JSONDecodeError, IOError): + pass + result = _process_result(dataset, dataset_rc, artifact_dir, case="e2e_recall") + all_results.append(result) + for sink in sinks: + sink.process_result(result) + else: + for dataset in recall_datasets: + result = _process_result(dataset, rc, None, case="e2e_recall") + all_results.append(result) + for sink in sinks: + sink.process_result(result) + else: + for dataset in recall_datasets: + print(f"\n--- Running e2e_recall for {dataset} ---") + rc, artifact_dir = run_harness( + dataset, + case="e2e_recall", + session_dir=session_dir, + deployment_type=deployment_type, + managed=False, # Don't manage per-dataset, already managed at nightly level + sku=sku, + test_config_path=test_config_path, + ) + result = _process_result(dataset, rc, artifact_dir, case="e2e_recall") + all_results.append(result) - for sink in sinks: - sink.process_result(result) + for sink in sinks: + sink.process_result(result) # Cleanup 
services and port forwards if needed if service_manager: diff --git a/tools/harness/src/nv_ingest_harness/cli/run.py b/tools/harness/src/nv_ingest_harness/cli/run.py index 6c13e55b0..51d903fd5 100644 --- a/tools/harness/src/nv_ingest_harness/cli/run.py +++ b/tools/harness/src/nv_ingest_harness/cli/run.py @@ -39,6 +39,7 @@ def run_datasets( sku: str | None = None, dump_logs: bool = True, config_file: str | None = None, + minimize_vram: bool = False, ) -> int: """Run test for one or more datasets sequentially.""" results = [] @@ -63,135 +64,324 @@ def run_datasets( print("Failed to start services") return 1 - # Wait for readiness - if not service_manager.check_readiness(first_config.readiness_timeout): - print("Services failed to become ready") + # When minimize_vram and e2e_recall: stop non-ingestion services before readiness check + # so only the ingestion stack must be ready (avoids VRAM contention blocking readiness) + if case == "e2e_recall" and minimize_vram: + service_manager.stop_non_ingestion_services() + + # Wait for readiness (skip Milvus check when using LanceDB) + print("Checking service readiness...") + check_milvus = first_config.vdb_backend == "milvus" + if not service_manager.check_readiness(first_config.readiness_timeout, check_milvus=check_milvus): + print("Services failed to become ready (see above for which services were not ready)") service_manager.stop() return 1 - # Run each dataset - for dataset_name in dataset_list: - print(f"\n{'='*60}") - print(f"Running {case} for dataset: {dataset_name}") - print(f"{'='*60}\n") - - # Load config for this dataset (applies dataset-specific extraction configs) - try: - config = load_config( - config_file=config_file or "test_configs.yaml", - case=case, - dataset=dataset_name, - deployment_type=deployment_type, - ) - except (FileNotFoundError, ValueError) as e: - print(f"Configuration error for {dataset_name}: {e}", file=sys.stderr) - results.append({"dataset": dataset_name, "status": "config_error", "rc": 1, "artifact_dir": "N/A"}) - continue - - # Determine artifact name - artifact_name = config.test_name - if not artifact_name: - artifact_name = os.path.basename(config.dataset_dir.rstrip("/")) - - out_dir = get_artifact_path( - session_dir, - artifact_name, - base_dir=config.artifacts_dir, - deployment_type=deployment_type if managed else None, - ) - stdout_path = os.path.join(out_dir, "stdout.txt") + # When recall/e2e_recall without minimize_vram: reranker is already up; wait for it once if any dataset uses it + if case in ("recall", "e2e_recall") and not minimize_vram: + reranker_needed = False + for dataset_name in dataset_list: + try: + cfg = load_config( + config_file=config_file or "test_configs.yaml", + case=case, + dataset=dataset_name, + deployment_type=deployment_type, + ) + if getattr(cfg, "reranker_mode", "none") in ("with", "both"): + reranker_needed = True + break + except (FileNotFoundError, ValueError): + pass + if reranker_needed: + timeout = getattr(first_config, "readiness_timeout", 600) + if not service_manager.wait_for_reranker_readiness(timeout, verbose=True): + print("Reranker did not become ready.", file=sys.stderr) + service_manager.stop() + return 1 + + # Branch: e2e_recall with minimize_vram runs e2e then stop_ingestion then start_retrieval then recall per dataset + if case == "e2e_recall" and managed and minimize_vram: + from nv_ingest_harness.utils.recall import get_recall_collection_name + + for idx, dataset_name in enumerate(dataset_list): + print(f"\n{'='*60}") + print(f"Running e2e_recall 
(minimize VRAM) for dataset: {dataset_name}") + print(f"{'='*60}\n") - print(f"Dataset: {config.dataset_dir}") - print(f"Artifacts: {out_dir}") - print() + try: + config = load_config( + config_file=config_file or "test_configs.yaml", + case="e2e_recall", + dataset=dataset_name, + deployment_type=deployment_type, + ) + except (FileNotFoundError, ValueError) as e: + print(f"Configuration error for {dataset_name}: {e}", file=sys.stderr) + results.append({"dataset": dataset_name, "status": "config_error", "rc": 1, "artifact_dir": "N/A"}) + continue - # For recall case, validate recall_dataset and set collection_name - if case in ("recall", "e2e_recall"): recall_dataset = getattr(config, "recall_dataset", None) if not recall_dataset: print(f"ERROR: Dataset '{dataset_name}' does not have recall_dataset configured", file=sys.stderr) - print(f" This dataset cannot be used with --case={case}", file=sys.stderr) - print( - " Set recall_dataset in test_configs.yaml datasets section or use a different dataset", - file=sys.stderr, - ) results.append({"dataset": dataset_name, "status": "config_error", "rc": 1, "artifact_dir": "N/A"}) continue - # Default to local reranker if not explicitly configured if not os.environ.get("RERANKER_NIM_ENDPOINT"): - os.environ["RERANKER_NIM_ENDPOINT"] = "http://localhost:8020/v1/ranking" + os.environ["RERANKER_NIM_ENDPOINT"] = "http://localhost:8015/v1/ranking" - # Set collection_name from dataset if not set - if case == "recall" and not config.collection_name: - from nv_ingest_harness.utils.recall import get_recall_collection_name + test_name = config.test_name or os.path.basename(config.dataset_dir.rstrip("/")) + config.collection_name = get_recall_collection_name(test_name) - # Use same logic as recall.py: test_name from config, or basename of dataset_dir - test_name_for_collection = config.test_name or os.path.basename(config.dataset_dir.rstrip("/")) - config.collection_name = get_recall_collection_name(test_name_for_collection) + artifact_name = config.test_name or os.path.basename(config.dataset_dir.rstrip("/")) + out_dir = get_artifact_path( + session_dir, + artifact_name, + base_dir=config.artifacts_dir, + deployment_type=deployment_type, + ) + stdout_path = os.path.join(out_dir, "stdout.txt") - # Run the test case - if case in CASES: - rc = run_case(case, stdout_path, config, doc_analysis) - else: - print(f"Unknown case: {case}") - rc = 2 - - # Consolidate runner metadata + test results into single results.json - consolidated = { - "case": case, - "timestamp": now_timestr(), - "latest_commit": last_commit(), - "infrastructure": "managed" if managed else "attach", - "api_version": config.api_version, - "pdf_split_page_count": config.pdf_split_page_count, - "return_code": rc, - } - - if managed: - consolidated["profiles"] = config.profiles - - # Merge test results if available - test_results_file = os.path.join(out_dir, "_test_results.json") - if os.path.exists(test_results_file): + print(f"Dataset: {config.dataset_dir}") + print(f"Artifacts: {out_dir}\n") + + # Step 1: Run e2e + rc = run_case("e2e", stdout_path, config, doc_analysis) + if rc != 0: + results.append({"dataset": dataset_name, "artifact_dir": str(out_dir), "rc": rc, "status": "failed"}) + if idx < len(dataset_list) - 1: + service_manager.start_ingestion_services() + continue + + # Load e2e results before recall overwrites _test_results.json + e2e_results = {} + test_results_file = os.path.join(out_dir, "_test_results.json") + if os.path.exists(test_results_file): + try: + with 
open(test_results_file) as f: + e2e_data = json.load(f) + e2e_results = { + "test_config": e2e_data.get("test_config", {}), + "results": e2e_data.get("results", {}), + } + except (json.JSONDecodeError, IOError): + pass + + # Step 2: Stop ingestion-only services + service_manager.stop_ingestion_services() + + # Step 3: Start retrieval services (reranker) if config requires it + reranker_needed = getattr(config, "reranker_mode", "none") in ("with", "both") + service_manager.start_retrieval_services(reranker=reranker_needed) + if reranker_needed: + timeout = getattr(config, "readiness_timeout", 600) + if not service_manager.wait_for_reranker_readiness(timeout, verbose=True): + print("Reranker did not become ready; skipping recall for this dataset.", file=sys.stderr) + results.append( + {"dataset": dataset_name, "artifact_dir": str(out_dir), "rc": 1, "status": "reranker_not_ready"} + ) + if idx < len(dataset_list) - 1: + service_manager.start_ingestion_services() + continue + + # Step 4: Run recall + rc = run_case("recall", stdout_path, config, doc_analysis) + + # Load recall results and build combined e2e_recall output + recall_results = {} + if os.path.exists(test_results_file): + try: + with open(test_results_file) as f: + recall_data = json.load(f) + recall_results = recall_data.get("recall_results", {}) + except (json.JSONDecodeError, IOError): + pass + + test_results = { + "test_type": "e2e_recall", + "test_config": { + "test_name": test_name, + "collection_name": config.collection_name, + "recall_dataset": recall_dataset, + }, + "ingestion_results": e2e_results.get("results", {}), + "recall_results": recall_results, + } + for key in ["api_version", "dataset_dir", "hostname", "model_name", "dense_dim", "sparse", "gpu_search"]: + if key in e2e_results.get("test_config", {}): + test_results["test_config"][key] = e2e_results["test_config"][key] + + with open(test_results_file, "w") as f: + json.dump(test_results, f, indent=2) + + consolidated = { + "case": "e2e_recall", + "timestamp": now_timestr(), + "latest_commit": last_commit(), + "infrastructure": "managed", + "api_version": config.api_version, + "pdf_split_page_count": getattr(config, "pdf_split_page_count", None), + "return_code": rc, + } + if managed: + consolidated["profiles"] = config.profiles + consolidated.update(test_results) + + results_path = os.path.join(out_dir, "results.json") + with open(results_path, "w") as f: + json.dump(consolidated, f, indent=2) + + if session_dir: + artifact_paths_file = Path(session_dir) / ".artifact_paths.json" + artifact_paths = {} + if artifact_paths_file.exists(): + with open(artifact_paths_file) as f: + artifact_paths = json.load(f) + artifact_paths[dataset_name] = str(out_dir) + with open(artifact_paths_file, "w") as f: + json.dump(artifact_paths, f, indent=2) + + print(f"\n{'='*60}") + print(f"Results written to: {results_path}") + print(f"{'='*60}") + + results.append( + { + "dataset": dataset_name, + "artifact_dir": str(out_dir), + "rc": rc, + "status": "success" if rc == 0 else "failed", + } + ) + + # Step 5: If more datasets, start ingestion services again for next e2e + if idx < len(dataset_list) - 1: + service_manager.start_ingestion_services() + + else: + # Run each dataset (standard path) + for dataset_name in dataset_list: + print(f"\n{'='*60}") + print(f"Running {case} for dataset: {dataset_name}") + print(f"{'='*60}\n") + + # Load config for this dataset (applies dataset-specific extraction configs) try: - with open(test_results_file) as f: - test_data = json.load(f) - 
consolidated.update(test_data) - # Clean up intermediate file - os.remove(test_results_file) - except (json.JSONDecodeError, IOError) as e: - print(f"Warning: Could not read test results: {e}") - - # Write consolidated results.json - results_path = os.path.join(out_dir, "results.json") - with open(results_path, "w") as f: - json.dump(consolidated, f, indent=2) - - # Write artifact path to session directory for parent processes (e.g., nightly runner) - if session_dir: - artifact_paths_file = Path(session_dir) / ".artifact_paths.json" - artifact_paths = {} - if artifact_paths_file.exists(): - with open(artifact_paths_file) as f: - artifact_paths = json.load(f) - artifact_paths[dataset_name] = str(out_dir) - with open(artifact_paths_file, "w") as f: - json.dump(artifact_paths, f, indent=2) - - print(f"\n{'='*60}") - print(f"Results written to: {results_path}") - print(f"{'='*60}") - - # Collect results - results.append( - { - "dataset": dataset_name, - "artifact_dir": str(out_dir), - "rc": rc, - "status": "success" if rc == 0 else "failed", + config = load_config( + config_file=config_file or "test_configs.yaml", + case=case, + dataset=dataset_name, + deployment_type=deployment_type, + ) + except (FileNotFoundError, ValueError) as e: + print(f"Configuration error for {dataset_name}: {e}", file=sys.stderr) + results.append({"dataset": dataset_name, "status": "config_error", "rc": 1, "artifact_dir": "N/A"}) + continue + + # Determine artifact name + artifact_name = config.test_name + if not artifact_name: + artifact_name = os.path.basename(config.dataset_dir.rstrip("/")) + + out_dir = get_artifact_path( + session_dir, + artifact_name, + base_dir=config.artifacts_dir, + deployment_type=deployment_type if managed else None, + ) + stdout_path = os.path.join(out_dir, "stdout.txt") + + print(f"Dataset: {config.dataset_dir}") + print(f"Artifacts: {out_dir}") + print() + + # For recall case, validate recall_dataset and set collection_name + if case in ("recall", "e2e_recall"): + recall_dataset = getattr(config, "recall_dataset", None) + if not recall_dataset: + print(f"ERROR: Dataset '{dataset_name}' does not have recall_dataset configured", file=sys.stderr) + print(f" This dataset cannot be used with --case={case}", file=sys.stderr) + print( + " Set recall_dataset in test_configs.yaml datasets section or use a different dataset", + file=sys.stderr, + ) + results.append({"dataset": dataset_name, "status": "config_error", "rc": 1, "artifact_dir": "N/A"}) + continue + + # Default to local reranker if not explicitly configured + if not os.environ.get("RERANKER_NIM_ENDPOINT"): + os.environ["RERANKER_NIM_ENDPOINT"] = "http://localhost:8015/v1/ranking" + + # Set collection_name from dataset if not set + if case == "recall" and not config.collection_name: + from nv_ingest_harness.utils.recall import get_recall_collection_name + + # Use same logic as recall.py: test_name from config, or basename of dataset_dir + test_name_for_collection = config.test_name or os.path.basename(config.dataset_dir.rstrip("/")) + config.collection_name = get_recall_collection_name(test_name_for_collection) + + # Run the test case + if case in CASES: + rc = run_case(case, stdout_path, config, doc_analysis) + else: + print(f"Unknown case: {case}") + rc = 2 + + # Consolidate runner metadata + test results into single results.json + consolidated = { + "case": case, + "timestamp": now_timestr(), + "latest_commit": last_commit(), + "infrastructure": "managed" if managed else "attach", + "api_version": config.api_version, + 
"pdf_split_page_count": config.pdf_split_page_count, + "return_code": rc, } - ) + + if managed: + consolidated["profiles"] = config.profiles + + # Merge test results if available + test_results_file = os.path.join(out_dir, "_test_results.json") + if os.path.exists(test_results_file): + try: + with open(test_results_file) as f: + test_data = json.load(f) + consolidated.update(test_data) + # Clean up intermediate file + os.remove(test_results_file) + except (json.JSONDecodeError, IOError) as e: + print(f"Warning: Could not read test results: {e}") + + # Write consolidated results.json + results_path = os.path.join(out_dir, "results.json") + with open(results_path, "w") as f: + json.dump(consolidated, f, indent=2) + + # Write artifact path to session directory for parent processes (e.g., nightly runner) + if session_dir: + artifact_paths_file = Path(session_dir) / ".artifact_paths.json" + artifact_paths = {} + if artifact_paths_file.exists(): + with open(artifact_paths_file) as f: + artifact_paths = json.load(f) + artifact_paths[dataset_name] = str(out_dir) + with open(artifact_paths_file, "w") as f: + json.dump(artifact_paths, f, indent=2) + + print(f"\n{'='*60}") + print(f"Results written to: {results_path}") + print(f"{'='*60}") + + # Collect results + results.append( + { + "dataset": dataset_name, + "artifact_dir": str(out_dir), + "rc": rc, + "status": "success" if rc == 0 else "failed", + } + ) # Cleanup managed services if managed and service_manager: @@ -337,6 +527,12 @@ def close(self): ) @click.option("--no-build", is_flag=True, help="Skip building Docker images (managed mode only)") @click.option("--keep-up", is_flag=True, help="Keep services running after test (managed mode only)") +@click.option( + "--minimize-vram", + is_flag=True, + help="Between e2e and recall, stop ingestion-only services to free VRAM (e2e_recall + managed only; mutually " + "exclusive with --keep-up)", +) @click.option("--doc-analysis", is_flag=True, help="Show per-document element breakdown") @click.option( "--session-dir", @@ -376,6 +572,7 @@ def main( dataset, no_build, keep_up, + minimize_vram, doc_analysis, session_dir, session_name, @@ -383,6 +580,9 @@ def main( dump_logs, test_config_path, ): + if keep_up and minimize_vram: + print("Error: --keep-up and --minimize-vram are mutually exclusive.", file=sys.stderr) + sys.exit(1) if not dataset: print("Error: --dataset is required. Use --dataset= or --dataset=,", file=sys.stderr) @@ -417,6 +617,7 @@ def main( sku=sku, dump_logs=dump_logs, config_file=test_config_path, + minimize_vram=minimize_vram, ) diff --git a/tools/harness/src/nv_ingest_harness/service_manager/base.py b/tools/harness/src/nv_ingest_harness/service_manager/base.py index 02368d635..809f766d2 100644 --- a/tools/harness/src/nv_ingest_harness/service_manager/base.py +++ b/tools/harness/src/nv_ingest_harness/service_manager/base.py @@ -45,7 +45,9 @@ def stop(self, clean: bool = False) -> int: pass @abstractmethod - def check_readiness(self, timeout_s: int, check_milvus: bool = True, check_embedding: bool = True) -> bool: + def check_readiness( + self, timeout_s: int, check_milvus: bool = True, check_embedding: bool = True, verbose: bool = True + ) -> bool: """ Check if services are ready. 
@@ -53,6 +55,7 @@ def check_readiness(self, timeout_s: int, check_milvus: bool = True, check_embed timeout_s: Timeout in seconds check_milvus: If True, also check Milvus health endpoint check_embedding: If True, also check embedding service health endpoint + verbose: If True, print waiting message and per-service readiness status Returns: True when ready, False on timeout @@ -117,3 +120,52 @@ def restart(self, build: bool = False, clean: bool = True, timeout: int = 600) - print("Services restarted successfully!") return 0 + + def stop_ingestion_services(self) -> int: + """ + Stop only ingestion-related services (ingest API + doc-parsing NIMs). + Used after e2e when minimize_vram to free VRAM before recall. + Default: no-op (return 0). + """ + return 0 + + def start_ingestion_services(self) -> int: + """ + Start ingestion-related services (ingest API + doc-parsing NIMs). + Used before the next dataset's e2e when minimize_vram. + Default: no-op (return 0). + """ + return 0 + + def stop_non_ingestion_services(self) -> int: + """ + Stop services not needed for ingestion (e.g. reranker, attu). + Called after initial start() when minimize_vram so only ingestion stack runs before e2e. + Default: no-op (return 0). + """ + return 0 + + def start_retrieval_services(self, reranker: bool = False) -> int: + """ + Start recall-required services; if reranker is True, bring up reranker. + Called before recall when minimize_vram. + Default: no-op (return 0). + + Args: + reranker: If True, start/scale up the reranker service. + """ + return 0 + + def wait_for_reranker_readiness(self, timeout_s: int, verbose: bool = True) -> bool: + """ + Wait for the reranker service to become ready (e.g. after start_retrieval_services). + Default: no-op (return True). + + Args: + timeout_s: Timeout in seconds + verbose: If True, print waiting message + + Returns: + True when ready, False on timeout + """ + return True diff --git a/tools/harness/src/nv_ingest_harness/service_manager/docker_compose.py b/tools/harness/src/nv_ingest_harness/service_manager/docker_compose.py index 0a0a1285a..b04942741 100644 --- a/tools/harness/src/nv_ingest_harness/service_manager/docker_compose.py +++ b/tools/harness/src/nv_ingest_harness/service_manager/docker_compose.py @@ -50,6 +50,27 @@ def _build_compose_cmd(self, base_cmd: list[str]) -> list[str]: cmd += ["-f", self.override_file] return cmd + def _compose_cmd_with_profiles(self, subcmd: str, *args: str) -> list[str]: + """Build compose command with same files and profiles as start(), for stop/start by service name.""" + profile_list = self.config.profiles or [] + cmd = self._build_compose_cmd(["docker", "compose"]) + for p in profile_list: + cmd += ["--profile", p] + cmd += [subcmd] + cmd += list(args) + return cmd + + # Ingestion-only services (per STAGE-BASED-DEPLOYMENT / DOCKER-COMPOSE-SERVICES-BY-STAGE) + _INGESTION_SERVICES = ( + "nv-ingest-ms-runtime", + "page-elements", + "graphic-elements", + "table-structure", + "ocr", + ) + # Non-ingestion services to stop after initial start when minimize_vram (recall-only, etc.) + _NON_INGESTION_SERVICES = ("reranker",) + def start(self, no_build: bool = False) -> int: """ Start Docker Compose services with profiles. 
@@ -112,7 +133,9 @@ def stop(self, clean: bool = False) -> int: return 0 - def check_readiness(self, timeout_s: int, check_milvus: bool = True, check_embedding: bool = True) -> bool: + def check_readiness( + self, timeout_s: int, check_milvus: bool = True, check_embedding: bool = True, verbose: bool = True + ) -> bool: """ Poll the health endpoint until ready. @@ -120,47 +143,75 @@ def check_readiness(self, timeout_s: int, check_milvus: bool = True, check_embed timeout_s: Timeout in seconds check_milvus: If True, also check Milvus health endpoint check_embedding: If True, also check embedding service health endpoint + verbose: If True, print waiting message and per-service readiness status Returns: True if ready, False on timeout """ url = self.get_service_url("health") deadline = time.time() + timeout_s + hostname = getattr(self.config, "hostname", "localhost") + last_status_time = 0.0 + status_interval = 10.0 # print status every 10s when verbose + + if verbose: + print(f"Waiting for services to become ready (timeout: {timeout_s}s)...") while time.time() < deadline: + now = time.time() + main_ready = False + milvus_ready = not check_milvus + embedding_ready = not check_embedding + try: - # Check main service health with urllib.request.urlopen(url, timeout=5) as resp: - if resp.status == 200: - all_services_ready = True - - # If Milvus check is enabled, verify it's also ready - if check_milvus: - hostname = getattr(self.config, "hostname", "localhost") - milvus_url = f"http://{hostname}:9091/healthz" - try: - with urllib.request.urlopen(milvus_url, timeout=5) as milvus_resp: - if milvus_resp.status != 200: - all_services_ready = False - except Exception: - all_services_ready = False - - # If embedding check is enabled, verify it's also ready - if check_embedding: - hostname = getattr(self.config, "hostname", "localhost") - embedding_url = f"http://{hostname}:8012/v1/health/ready" - try: - with urllib.request.urlopen(embedding_url, timeout=5) as embedding_resp: - if embedding_resp.status != 200: - all_services_ready = False - except Exception: - all_services_ready = False - - if all_services_ready: - return True + main_ready = resp.status == 200 except Exception: pass + + if main_ready and check_milvus: + milvus_url = f"http://{hostname}:9091/healthz" + try: + with urllib.request.urlopen(milvus_url, timeout=5) as milvus_resp: + milvus_ready = milvus_resp.status == 200 + except Exception: + pass + + if main_ready and check_embedding: + embedding_url = f"http://{hostname}:8012/v1/health/ready" + try: + with urllib.request.urlopen(embedding_url, timeout=5) as embedding_resp: + embedding_ready = embedding_resp.status == 200 + except Exception: + pass + + all_services_ready = main_ready and milvus_ready and embedding_ready + if all_services_ready: + if verbose: + print("All services ready.") + return True + + if verbose and (now - last_status_time >= status_interval or last_status_time == 0): + last_status_time = now + parts = [f"main (7670): {'ready' if main_ready else 'not ready'}"] + if main_ready: + if check_milvus: + parts.append(f"milvus (9091): {'ready' if milvus_ready else 'not ready'}") + if check_embedding: + parts.append(f"embedding (8012): {'ready' if embedding_ready else 'not ready'}") + print(" " + " | ".join(parts)) + time.sleep(3) + + if verbose: + parts = [] + if not main_ready: + parts.append("main (7670)") + if main_ready and check_milvus and not milvus_ready: + parts.append("milvus (9091)") + if main_ready and check_embedding and not embedding_ready: + 
parts.append("embedding (8012)") + print(f"Readiness timeout. Not ready: {', '.join(parts) or 'unknown'}") return False def get_service_url(self, service: str = "api") -> str: @@ -277,3 +328,73 @@ def dump_logs(self, artifacts_dir: Path) -> int: except Exception as e: print(f"Error: Failed to dump logs: {e}") return 1 + + def stop_ingestion_services(self) -> int: + """Stop only ingestion-related services to free VRAM before recall.""" + if not self.config.profiles: + print("No profiles specified; skipping stop_ingestion_services") + return 0 + print("Stopping ingestion-only services (minimize VRAM)...") + cmd = self._compose_cmd_with_profiles("stop", *self._INGESTION_SERVICES) + return run_cmd(cmd) + + def start_ingestion_services(self) -> int: + """Start ingestion-related services before the next dataset's e2e.""" + if not self.config.profiles: + print("No profiles specified; skipping start_ingestion_services") + return 0 + print("Starting ingestion-only services...") + cmd = self._compose_cmd_with_profiles("start", *self._INGESTION_SERVICES) + return run_cmd(cmd) + + def stop_non_ingestion_services(self) -> int: + """Stop reranker and other non-ingestion services so only ingestion stack runs before e2e.""" + if not self.config.profiles: + print("No profiles specified; skipping stop_non_ingestion_services") + return 0 + print("Stopping non-ingestion services (reranker, etc.)...") + cmd = self._compose_cmd_with_profiles("stop", *self._NON_INGESTION_SERVICES) + return run_cmd(cmd) + + def wait_for_reranker_readiness(self, timeout_s: int, verbose: bool = True) -> bool: + """Wait for reranker NIM to become ready (poll /v1/health/ready on port 8015).""" + hostname = getattr(self.config, "hostname", "localhost") + url = f"http://{hostname}:8015/v1/health/ready" + deadline = time.time() + timeout_s + last_status_time = 0.0 + status_interval = 10.0 + + if verbose: + print(f"Waiting for reranker to become ready (timeout: {timeout_s}s)...") + + while time.time() < deadline: + now = time.time() + try: + with urllib.request.urlopen(url, timeout=5) as resp: + if resp.status == 200: + if verbose: + print("Reranker ready.") + return True + except Exception: + pass + + if verbose and (now - last_status_time >= status_interval or last_status_time == 0): + last_status_time = now + print(" reranker (8015): not ready") + + time.sleep(3) + + if verbose: + print("Readiness timeout. Reranker (8015) did not become ready.") + return False + + def start_retrieval_services(self, reranker: bool = False) -> int: + """Start recall-required services; if reranker is True, bring up reranker.""" + if not reranker: + return 0 + if not self.config.profiles: + print("No profiles specified; skipping start_retrieval_services") + return 0 + print("Starting retrieval services (reranker)...") + cmd = self._compose_cmd_with_profiles("start", "reranker") + return run_cmd(cmd) diff --git a/tools/harness/src/nv_ingest_harness/service_manager/helm.py b/tools/harness/src/nv_ingest_harness/service_manager/helm.py index 0fc8d7ba1..e1028ac7d 100644 --- a/tools/harness/src/nv_ingest_harness/service_manager/helm.py +++ b/tools/harness/src/nv_ingest_harness/service_manager/helm.py @@ -225,6 +225,41 @@ def _find_services_by_pattern(self, pattern: str) -> list[str]: print(f"Warning: Error finding services: {e}") return [] + def _wait_for_services_by_pattern(self, pattern: str, timeout_s: int = 120, interval_s: int = 5) -> list[str]: + """ + Wait for at least one service matching the pattern to appear, then return all matches. 
+
+        For patterns without wildcards, returns [pattern] immediately (no wait).
+        For wildcard patterns, polls until matches are found or timeout.
+
+        Args:
+            pattern: Service name or pattern (e.g., "nv-ingest", "*embed*")
+            timeout_s: Maximum time to wait in seconds
+            interval_s: Time between poll attempts in seconds
+
+        Returns:
+            List of matching service names (may be empty if timeout)
+        """
+        if "*" not in pattern:
+            return [pattern]
+
+        deadline = time.time() + timeout_s
+        attempt = 0
+        while time.time() < deadline:
+            attempt += 1
+            service_names = self._find_services_by_pattern(pattern)
+            if service_names:
+                if attempt > 1:
+                    print(f" Found {len(service_names)} service(s) matching '{pattern}' after {attempt} attempt(s)")
+                return service_names
+            if attempt == 1:
+                print(
+                    f"Waiting for service(s) matching '{pattern}' (timeout: {timeout_s}s, poll every {interval_s}s)..."
+                )
+            time.sleep(interval_s)
+
+        return []
+
     def _start_port_forwards(self) -> None:
         """Start port forwarding for all configured services."""
         # Get port forward configuration
@@ -253,8 +288,8 @@ def _start_port_forwards(self) -> None:
                 print(f"Warning: Invalid port forward config: {pf_config}")
                 continue
 
-            # Find matching services
-            service_names = self._find_services_by_pattern(service_pattern)
+            # Find matching services (wait for pattern-matched services to appear)
+            service_names = self._wait_for_services_by_pattern(service_pattern)
 
             if not service_names:
                 print(f"Warning: No services found matching pattern '{service_pattern}'")
@@ -606,7 +641,9 @@ def stop(self, clean: bool = False) -> int:
 
         return 0
 
-    def check_readiness(self, timeout_s: int, check_milvus: bool = True, check_embedding: bool = True) -> bool:
+    def check_readiness(
+        self, timeout_s: int, check_milvus: bool = True, check_embedding: bool = True, verbose: bool = True
+    ) -> bool:
         """
         Check readiness by polling HTTP endpoint.
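Note: both service managers now use the same non-fatal probe pattern per endpoint — attempt the request, treat any exception as "not ready", and keep per-service booleans so the periodic status line and the timeout summary can name the lagging service. A minimal standalone sketch of that probe (the helper name _probe is hypothetical, not part of this change):

    import urllib.request

    def _probe(url: str, timeout: float = 5.0) -> bool:
        """Return True if the endpoint answers HTTP 200; any error counts as not ready."""
        try:
            with urllib.request.urlopen(url, timeout=timeout) as resp:
                return resp.status == 200
        except Exception:
            return False

    # Usage sketch, mirroring the endpoints in this diff:
    #   main_ready      = _probe(manager.get_service_url("health"))
    #   milvus_ready    = _probe("http://localhost:9091/healthz")
    #   embedding_ready = _probe("http://localhost:8012/v1/health/ready")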
@@ -614,47 +651,75 @@ def check_readiness(self, timeout_s: int, check_milvus: bool = True, check_embed
             timeout_s: Timeout in seconds
             check_milvus: If True, also check Milvus health endpoint
             check_embedding: If True, also check embedding service health endpoint
+            verbose: If True, print waiting message and per-service readiness status
 
         Returns:
             True if ready, False on timeout
         """
         url = self.get_service_url("health")
         deadline = time.time() + timeout_s
+        hostname = getattr(self.config, "hostname", "localhost")
+        last_status_time = 0.0
+        status_interval = 10.0  # print status every 10s when verbose
+
+        if verbose:
+            print(f"Waiting for services to become ready (timeout: {timeout_s}s)...")
 
         while time.time() < deadline:
+            now = time.time()
+            main_ready = False
+            milvus_ready = not check_milvus
+            embedding_ready = not check_embedding
+
             try:
-                # Check main service health
                 with urllib.request.urlopen(url, timeout=5) as resp:
-                    if resp.status == 200:
-                        all_services_ready = True
-
-                        # If Milvus check is enabled, verify it's also ready
-                        if check_milvus:
-                            hostname = getattr(self.config, "hostname", "localhost")
-                            milvus_url = f"http://{hostname}:9091/healthz"
-                            try:
-                                with urllib.request.urlopen(milvus_url, timeout=5) as milvus_resp:
-                                    if milvus_resp.status != 200:
-                                        all_services_ready = False
-                            except Exception:
-                                all_services_ready = False
-
-                        # If embedding check is enabled, verify it's also ready
-                        if check_embedding:
-                            hostname = getattr(self.config, "hostname", "localhost")
-                            embedding_url = f"http://{hostname}:8012/v1/health/ready"
-                            try:
-                                with urllib.request.urlopen(embedding_url, timeout=5) as embedding_resp:
-                                    if embedding_resp.status != 200:
-                                        all_services_ready = False
-                            except Exception:
-                                all_services_ready = False
-
-                        if all_services_ready:
-                            return True
+                    main_ready = resp.status == 200
             except Exception:
                 pass
+
+            if main_ready and check_milvus:
+                milvus_url = f"http://{hostname}:9091/healthz"
+                try:
+                    with urllib.request.urlopen(milvus_url, timeout=5) as milvus_resp:
+                        milvus_ready = milvus_resp.status == 200
+                except Exception:
+                    pass
+
+            if main_ready and check_embedding:
+                embedding_url = f"http://{hostname}:8012/v1/health/ready"
+                try:
+                    with urllib.request.urlopen(embedding_url, timeout=5) as embedding_resp:
+                        embedding_ready = embedding_resp.status == 200
+                except Exception:
+                    pass
+
+            all_services_ready = main_ready and milvus_ready and embedding_ready
+            if all_services_ready:
+                if verbose:
+                    print("All services ready.")
+                return True
+
+            if verbose and (now - last_status_time >= status_interval or last_status_time == 0):
+                last_status_time = now
+                parts = [f"main (7670): {'ready' if main_ready else 'not ready'}"]
+                if main_ready:
+                    if check_milvus:
+                        parts.append(f"milvus (9091): {'ready' if milvus_ready else 'not ready'}")
+                    if check_embedding:
+                        parts.append(f"embedding (8012): {'ready' if embedding_ready else 'not ready'}")
+                print(" " + " | ".join(parts))
+
             time.sleep(3)
+
+        if verbose:
+            parts = []
+            if not main_ready:
+                parts.append("main (7670)")
+            if main_ready and check_milvus and not milvus_ready:
+                parts.append("milvus (9091)")
+            if main_ready and check_embedding and not embedding_ready:
+                parts.append("embedding (8012)")
+            print(f"Readiness timeout. Not ready: {', '.join(parts) or 'unknown'}")
         return False
 
     def get_service_url(self, service: str = "api") -> str:
@@ -882,3 +947,132 @@ def dump_logs(self, artifacts_dir: Path) -> int:
         except Exception as e:
             print(f"Error: Failed to dump logs: {e}")
             return 1
+
+    # Deployment names for stage-based control (per chart NIMService names and main deploy)
+    _INGESTION_DEPLOYMENTS = (
+        # main deploy uses release name (e.g. nv-ingest); set at runtime
+        "nemotron-page-elements-v3",
+        "nemotron-graphic-elements-v1",
+        "nemotron-table-structure-v1",
+        "nemotron-ocr-v1",
+    )
+    _NON_INGESTION_DEPLOYMENTS = ("llama-nemotron-rerank-1b-v2",)
+
+    def _get_existing_deployments(self) -> set[str]:
+        """Return set of deployment names that exist in the release namespace."""
+        cmd = self.kubectl_cmd + [
+            "get",
+            "deployments",
+            "-n",
+            self.namespace,
+            "-o",
+            "jsonpath={range .items[*]}{.metadata.name}{'\\n'}{end}",
+        ]
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+            if result.returncode != 0:
+                return set()
+            return {n.strip() for n in result.stdout.strip().split("\n") if n.strip()}
+        except Exception:
+            return set()
+
+    def _scale_deployment(self, name: str, replicas: int) -> int:
+        """Scale a deployment to the given replica count. Returns 0 on success."""
+        cmd = self.kubectl_cmd + [
+            "scale",
+            "deployment",
+            name,
+            "-n",
+            self.namespace,
+            "--replicas",
+            str(replicas),
+        ]
+        return run_cmd(cmd)
+
+    def stop_ingestion_services(self) -> int:
+        """Stop only ingestion-related services (scale to 0) to free VRAM before recall."""
+        existing = self._get_existing_deployments()
+        main_name = self.release_name
+        to_stop = ([main_name] if main_name in existing else []) + [
+            d for d in self._INGESTION_DEPLOYMENTS if d in existing
+        ]
+        if not to_stop:
+            return 0
+        print("Stopping ingestion-only services (minimize VRAM)...")
+        for name in to_stop:
+            rc = self._scale_deployment(name, 0)
+            if rc != 0:
+                print(f"Warning: Failed to scale {name} to 0")
+        return 0
+
+    def start_ingestion_services(self) -> int:
+        """Start ingestion-related services (scale to 1) before the next dataset's e2e."""
+        existing = self._get_existing_deployments()
+        main_name = self.release_name
+        to_start = ([main_name] if main_name in existing else []) + [
+            d for d in self._INGESTION_DEPLOYMENTS if d in existing
+        ]
+        if not to_start:
+            return 0
+        print("Starting ingestion-only services...")
+        for name in to_start:
+            rc = self._scale_deployment(name, 1)
+            if rc != 0:
+                print(f"Warning: Failed to scale {name} to 1")
+        return 0
+
+    def stop_non_ingestion_services(self) -> int:
+        """Stop reranker and other non-ingestion services (scale to 0) after initial start."""
+        existing = self._get_existing_deployments()
+        to_stop = [d for d in self._NON_INGESTION_DEPLOYMENTS if d in existing]
+        if not to_stop:
+            return 0
+        print("Stopping non-ingestion services (reranker, etc.)...")
+        for name in to_stop:
+            rc = self._scale_deployment(name, 0)
+            if rc != 0:
+                print(f"Warning: Failed to scale {name} to 0")
+        return 0
+
+    def start_retrieval_services(self, reranker: bool = False) -> int:
+        """Start recall-required services; if reranker is True, scale reranker to 1."""
+        if not reranker:
+            return 0
+        existing = self._get_existing_deployments()
+        reranker_name = "llama-nemotron-rerank-1b-v2"
+        if reranker_name not in existing:
+            return 0
+        print("Starting retrieval services (reranker)...")
+        return self._scale_deployment(reranker_name, 1)
+
+    def wait_for_reranker_readiness(self, timeout_s: int, verbose: bool = True) -> bool:
+        """Wait for reranker NIM to become ready (poll /v1/health/ready on port 8015)."""
+        hostname = getattr(self.config, "hostname", "localhost")
+        url = f"http://{hostname}:8015/v1/health/ready"
+        deadline = time.time() + timeout_s
+        last_status_time = 0.0
+        status_interval = 10.0
+
+        if verbose:
+            print(f"Waiting for reranker to become ready (timeout: {timeout_s}s)...")
+
+        while time.time() < deadline:
+            now = time.time()
+            try:
+                with urllib.request.urlopen(url, timeout=5) as resp:
+                    if resp.status == 200:
+                        if verbose:
+                            print("Reranker ready.")
+                        return True
+            except Exception:
+                pass
+
+            if verbose and (now - last_status_time >= status_interval or last_status_time == 0):
+                last_status_time = now
+                print(" reranker (8015): not ready")
+
+            time.sleep(3)
+
+        if verbose:
+            print("Readiness timeout. Reranker (8015) did not become ready.")
+        return False
diff --git a/tools/harness/src/nv_ingest_harness/utils/recall.py b/tools/harness/src/nv_ingest_harness/utils/recall.py
index d5b20d863..c22f79813 100644
--- a/tools/harness/src/nv_ingest_harness/utils/recall.py
+++ b/tools/harness/src/nv_ingest_harness/utils/recall.py
@@ -377,6 +377,7 @@ def bo767_recall(
     gpu_search: bool = False,
     nv_ranker: bool = False,
     ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
     nv_ranker_endpoint: Optional[str] = None,
     nv_ranker_model_name: Optional[str] = None,
     vdb_backend: str = "milvus",
@@ -482,6 +483,7 @@ def finance_bench_recall(
     gpu_search: bool = False,
     nv_ranker: bool = False,
     ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
     nv_ranker_endpoint: Optional[str] = None,
     nv_ranker_model_name: Optional[str] = None,
     vdb_backend: str = "milvus",
@@ -585,6 +587,7 @@ def earnings_recall(
     gpu_search: bool = False,
     nv_ranker: bool = False,
    ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
     nv_ranker_endpoint: Optional[str] = None,
     nv_ranker_model_name: Optional[str] = None,
     vdb_backend: str = "milvus",
@@ -637,11 +640,17 @@ def audio_recall(
     collection_name: str,
     hostname: str = "localhost",
     sparse: bool = True,
+    hybrid: bool = False,
     model_name: str = None,
     top_k: int = 10,
     gpu_search: bool = False,
     nv_ranker: bool = False,
     ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
+    nv_ranker_endpoint: Optional[str] = None,
+    nv_ranker_model_name: Optional[str] = None,
+    vdb_backend: str = "milvus",
+    table_path: Optional[str] = None,
 ) -> Dict[int, float]:
     """
     Audio dataset recall evaluator (stub, multimodal-only).
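Note: the recall evaluators now share one keyword surface (dataset_dir, nv_ranker_endpoint, nv_ranker_model_name, vdb_backend, table_path, ...), so a caller can fan the same kwargs out to any of them; most evaluators simply accept and ignore dataset_dir, while jp20 (below) actually uses it for filtering. A hedged usage sketch — the collection name and paths are illustrative, and only the keywords visible in these hunks are assumed to exist:

    from nv_ingest_harness.utils.recall import bo767_recall

    # Sketch only — values are placeholders, not taken from this change.
    common = dict(
        ground_truth_dir="/data/ground_truth",
        dataset_dir="/data/datasets/bo767",   # accepted uniformly; unused by most evaluators today
        vdb_backend="milvus",
        nv_ranker=False,
    )
    scores = bo767_recall(collection_name="bo767_multimodal", **common)
    print({k: round(v, 3) for k, v in scores.items()})   # recall@k per cutoff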
@@ -673,11 +682,15 @@ def bo10k_load_ground_truth(ground_truth_dir: Optional[str] = None) -> pd.DataFr
     return df.reset_index(drop=True)
 
 
-def jp20_load_ground_truth(ground_truth_dir: Optional[str] = None) -> pd.DataFrame:
+def jp20_load_ground_truth(
+    ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
+) -> pd.DataFrame:
     """Load bo10k ground truth filtered to jp20 documents."""
     df = bo10k_load_ground_truth(ground_truth_dir=ground_truth_dir)
-    dataset_dir = os.environ.get("JP20_DATASET_DIR") or os.path.join(get_repo_root(), "data", "jp20")
+    if dataset_dir is None:
+        dataset_dir = os.environ.get("JP20_DATASET_DIR") or os.path.join(get_repo_root(), "data", "jp20")
     jp20_pdfs = {os.path.splitext(name)[0] for name in os.listdir(dataset_dir) if name.lower().endswith(".pdf")}
 
     filtered = df[df["pdf"].isin(jp20_pdfs)].reset_index(drop=True)
@@ -694,6 +707,7 @@ def bo10k_recall(
     gpu_search: bool = False,
     nv_ranker: bool = False,
     ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
     nv_ranker_endpoint: Optional[str] = None,
     nv_ranker_model_name: Optional[str] = None,
     vdb_backend: str = "milvus",
@@ -729,14 +743,19 @@ def jp20_recall(
     gpu_search: bool = False,
     nv_ranker: bool = False,
     ground_truth_dir: Optional[str] = None,
+    dataset_dir: Optional[str] = None,
     nv_ranker_endpoint: Optional[str] = None,
     nv_ranker_model_name: Optional[str] = None,
     vdb_backend: str = "milvus",
     table_path: Optional[str] = None,
 ) -> Dict[int, float]:
     """Evaluate recall@k for jp20 dataset (bo10k subset)."""
+
+    def loader(gt_dir):
+        return jp20_load_ground_truth(gt_dir, dataset_dir=dataset_dir)
+
     return evaluate_recall_orchestrator(
-        loader_func=jp20_load_ground_truth,
+        loader_func=loader,
         scorer_func=get_recall_scores,
         collection_name=collection_name,
         hostname=hostname,
diff --git a/tools/harness/test_configs.yaml b/tools/harness/test_configs.yaml
index 00bfb374d..fe98c3e8a 100644
--- a/tools/harness/test_configs.yaml
+++ b/tools/harness/test_configs.yaml
@@ -50,10 +50,10 @@ active:
       local_port: 8012
       remote_port: 8000
     - service: "*rerank*"  # Wildcard pattern to match reranker services
-      local_port: 8020
+      local_port: 8015
       remote_port: 8000
   values:  # inline Helm values
-    nimOperator.llama_3_2_nv_rerankqa_1b_v2.enabled: true
+    nimOperator.rerankqa.enabled: true
 
 # Runtime configuration
 sparse: false  # Use sparse embeddings (Milvus only)
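Note: taken together, the new ServiceManager hooks support a VRAM-constrained ordering of e2e and recall, as described in the base-class docstrings above. A sketch of that sequencing under stated assumptions — the driver function and the run_e2e/run_recall callables are hypothetical, not part of this change; "manager" is any ServiceManager implementation:

    # Hedged sketch of the minimize_vram flow the hooks are designed for.
    def run_dataset(manager, dataset, use_reranker, run_e2e, run_recall):
        manager.stop_non_ingestion_services()      # after start(): only the ingestion stack stays up
        run_e2e(dataset)                           # ingestion / e2e step (hypothetical callable)
        manager.stop_ingestion_services()          # free VRAM before recall
        manager.start_retrieval_services(reranker=use_reranker)
        if use_reranker:
            manager.wait_for_reranker_readiness(timeout_s=600)
        run_recall(dataset)                        # recall evaluation step (hypothetical callable)
        manager.start_ingestion_services()         # bring ingestion back for the next dataset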