From 52344391ed28feae8404879a7df2d3e0e20dffdc Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 11:22:46 +0000 Subject: [PATCH 01/15] Add SDMX enrichment item selector --- .../sdmx/find_enrichment_items.py | 275 ++++++++++++++++++ .../templates/find_enrichment_items_prompt.j2 | 49 ++++ 2 files changed, 324 insertions(+) create mode 100644 tools/agentic_import/sdmx/find_enrichment_items.py create mode 100644 tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/find_enrichment_items.py new file mode 100644 index 0000000000..2f47ac5063 --- /dev/null +++ b/tools/agentic_import/sdmx/find_enrichment_items.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import shutil +import subprocess +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +from absl import app +from absl import flags +from absl import logging +from jinja2 import Environment, FileSystemLoader + +_FLAGS = flags.FLAGS +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def _define_flags(): + try: + flags.DEFINE_string( + 'input_metadata_json', None, + 'Path to input SDMX metadata JSON (required)') + flags.mark_flag_as_required('input_metadata_json') + + flags.DEFINE_string('output_path', None, + 'Path to output items JSON (required)') + flags.mark_flag_as_required('output_path') + + flags.DEFINE_boolean('dry_run', False, + 'Generate prompt only without calling Gemini CLI') + + flags.DEFINE_boolean( + 'skip_confirmation', False, + 'Skip user confirmation before running Gemini CLI') + + flags.DEFINE_boolean( + 'enable_sandboxing', + platform.system() == 'Darwin', + 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' + ) + + flags.DEFINE_string( + 'gemini_cli', 'gemini', + 'Custom path or command to invoke Gemini CLI. ' + 'Example: "/usr/local/bin/gemini". ' + 'WARNING: This value is executed in a shell - use only with trusted input.' 
+ ) + + flags.DEFINE_string( + 'working_dir', None, + 'Working directory for the run (default: current directory)') + except flags.DuplicateFlagError: + pass + + +@dataclass +class Config: + input_metadata_json: str + output_path: str + dry_run: bool = False + skip_confirmation: bool = False + enable_sandboxing: bool = False + gemini_cli: Optional[str] = None + working_dir: Optional[str] = None + + +@dataclass +class RunResult: + run_id: str + run_dir: Path + prompt_path: Path + gemini_log_path: Path + gemini_command: str + sandbox_enabled: bool + + +class EnrichmentItemsFinder: + def __init__(self, config: Config): + self._config = config + self._working_dir = Path( + config.working_dir).resolve() if config.working_dir else Path.cwd() + self._input_path = self._resolve_path(config.input_metadata_json) + self._output_path = self._resolve_path(config.output_path) + + if not self._input_path.exists(): + raise FileNotFoundError( + f"input_metadata_json does not exist: {self._input_path}") + + self._output_path.parent.mkdir(parents=True, exist_ok=True) + + self._datacommons_dir = self._working_dir / '.datacommons' + self._datacommons_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self._run_id = f"gemini_{timestamp}" + self._run_dir = self._datacommons_dir / 'runs' / self._run_id + self._run_dir.mkdir(parents=True, exist_ok=True) + + def find_items_to_enrich(self) -> RunResult: + prompt_file = self._generate_prompt() + gemini_log_file = self._run_dir / 'gemini_cli.log' + gemini_command = self._build_gemini_command(prompt_file, + gemini_log_file) + + result = RunResult(run_id=self._run_id, + run_dir=self._run_dir, + prompt_path=prompt_file, + gemini_log_path=gemini_log_file, + gemini_command=gemini_command, + sandbox_enabled=self._config.enable_sandboxing) + + if self._config.dry_run: + logging.info( + "Dry run mode: Prompt file generated at %s. " + "Skipping Gemini CLI execution.", prompt_file) + return result + + if not self._config.skip_confirmation: + if not self._get_user_confirmation(prompt_file): + logging.info("Enrichment item selection cancelled by user.") + return result + + if not self._check_gemini_cli_available(): + logging.warning( + "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." 
+ ) + + logging.info( + "Launching gemini (cwd: %s): %s", self._working_dir, gemini_command) + logging.info("Gemini output will be saved to: %s", gemini_log_file) + + exit_code = self._run_subprocess(gemini_command) + if exit_code == 0: + logging.info("Gemini CLI completed successfully") + return result + + raise RuntimeError( + f"Gemini CLI execution failed with exit code {exit_code}") + + def _resolve_path(self, path: str) -> Path: + resolved = Path(path).expanduser() + if not resolved.is_absolute(): + resolved = self._working_dir / resolved + return resolved.resolve() + + def _generate_prompt(self) -> Path: + template_dir = os.path.join(_SCRIPT_DIR, 'templates') + env = Environment(loader=FileSystemLoader(template_dir)) + template = env.get_template('find_enrichment_items_prompt.j2') + + rendered_prompt = template.render( + input_metadata_abs=str(self._input_path), + output_path_abs=str(self._output_path), + ) + + output_file = self._run_dir / 'find_enrichment_items_prompt.md' + with open(output_file, 'w') as f: + f.write(rendered_prompt) + + logging.info("Generated prompt written to: %s", output_file) + return output_file + + def _get_user_confirmation(self, prompt_file: Path) -> bool: + print("\n" + "=" * 60) + print("SDMX ENRICHMENT ITEM SELECTION SUMMARY") + print("=" * 60) + print(f"Input metadata file: {self._input_path}") + print(f"Output items file: {self._output_path}") + print(f"Prompt file: {prompt_file}") + print(f"Working directory: {self._working_dir}") + print( + f"Sandboxing: {'Enabled' if self._config.enable_sandboxing else 'Disabled'}" + ) + if not self._config.enable_sandboxing: + print( + "WARNING: Sandboxing is disabled. Gemini will run without safety restrictions." + ) + print("=" * 60) + + while True: + try: + response = input( + "Ready to run Gemini for enrichment item selection? 
(y/n): " + ).strip().lower() + if response in ['y', 'yes']: + return True + if response in ['n', 'no']: + print("Selection cancelled by user.") + return False + print("Please enter 'y' or 'n'.") + except KeyboardInterrupt: + print("\nSelection cancelled by user.") + return False + + def _check_gemini_cli_available(self) -> bool: + if self._config.gemini_cli: + return True + return shutil.which('gemini') is not None + + def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: + prompt_path = prompt_file.resolve() + log_path = log_file.resolve() + gemini_cmd = self._config.gemini_cli or 'gemini' + sandbox_flag = "--sandbox" if self._config.enable_sandboxing else "" + return ( + f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" + ) + + def _run_subprocess(self, command: str) -> int: + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) + + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.rstrip()) + + return process.wait() + except Exception as e: + logging.error("Error running subprocess: %s", str(e)) + return 1 + + +def prepare_config() -> Config: + return Config(input_metadata_json=_FLAGS.input_metadata_json, + output_path=_FLAGS.output_path, + dry_run=_FLAGS.dry_run, + skip_confirmation=_FLAGS.skip_confirmation, + enable_sandboxing=_FLAGS.enable_sandboxing, + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir) + + +def main(_): + config = prepare_config() + logging.info("Loaded config for enrichment item selection") + + finder = EnrichmentItemsFinder(config) + finder.find_items_to_enrich() + + logging.info("Enrichment item selection completed.") + return 0 + + +if __name__ == '__main__': + _define_flags() + app.run(main) diff --git a/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 b/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 new file mode 100644 index 0000000000..11e965df39 --- /dev/null +++ b/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 @@ -0,0 +1,49 @@ +You are an expert SDMX metadata analyst. Your task is to select only the SDMX +codes and concepts that need enrichment and to craft precise web search queries +for them. + +# INPUT +- Full extractor JSON: {{ input_metadata_abs }} + +# OUTPUT +- Write JSON to: {{ output_path_abs }} +- Output MUST be valid JSON only. No extra text. + +# CRITICAL RULES +- Process the ENTIRE input file. Do not read only the first lines. +- Do not add `enriched_name` anywhere. +- Do not include `name` or `description` fields in the output. +- Skip place names (countries, regions, cities, etc.). +- Skip popular/self-explanatory terms when clear (e.g., GDP, Population). +- Use full context (dataflow name/description, codelist name, dimension name, + concept name, code name/description) to decide and to build queries. +- Example: Interpret HICP in the context of the dataset and codelist, not alone. + +# TASK +1) Read the full JSON from the input path. +2) Select only items that truly need enrichment. +3) For each selected item, add an `enrichment_query` string that reflects the + full context needed for web search. +4) Produce a PRUNED JSON that preserves the original structure but ONLY keeps + the selected items and their necessary parent structure. 
+ +# OUTPUT SHAPE (pruned) +- Keep `dataflows` array. +- For each kept dataflow: include `id` and only the substructures that contain + selected items. +- For code items, keep them under their original `representation.codelist.codes`. +- For concept items, keep them under their original `concept` (components) and/or + `referenced_concept_schemes[*].concepts`. +- Remove all unselected items and any parent objects left empty. + +# FIELD MINIMUMS (do not add name/description) +- dataflow: `id` +- data_structure_definition: `id` +- component (dimension/attribute/measure): `id` +- concept: `id`, `concept_scheme_id`, `enrichment_query` +- representation: `type` +- codelist: `id` +- code: `id`, `enrichment_query` +- referenced_concept_schemes: `id` + +Write ONLY the JSON file to the output path. From 328e130bfcd7e29f8bbef1153aaa55e737b6d2b2 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 11:40:56 +0000 Subject: [PATCH 02/15] Add SDMX enrichment data fetcher --- .../sdmx/fetch_enrichment_data.py | 274 ++++++++++++++++++ .../templates/fetch_enrichment_data_prompt.j2 | 37 +++ 2 files changed, 311 insertions(+) create mode 100644 tools/agentic_import/sdmx/fetch_enrichment_data.py create mode 100644 tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/fetch_enrichment_data.py new file mode 100644 index 0000000000..1ecf61c1be --- /dev/null +++ b/tools/agentic_import/sdmx/fetch_enrichment_data.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import shutil +import subprocess +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +from absl import app +from absl import flags +from absl import logging +from jinja2 import Environment, FileSystemLoader + +_FLAGS = flags.FLAGS +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def _define_flags(): + try: + flags.DEFINE_string('input_items_json', None, + 'Path to input items JSON (required)') + flags.mark_flag_as_required('input_items_json') + + flags.DEFINE_string('output_path', None, + 'Path to output items JSON (required)') + flags.mark_flag_as_required('output_path') + + flags.DEFINE_boolean('dry_run', False, + 'Generate prompt only without calling Gemini CLI') + + flags.DEFINE_boolean( + 'skip_confirmation', False, + 'Skip user confirmation before running Gemini CLI') + + flags.DEFINE_boolean( + 'enable_sandboxing', + platform.system() == 'Darwin', + 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' + ) + + flags.DEFINE_string( + 'gemini_cli', 'gemini', + 'Custom path or command to invoke Gemini CLI. ' + 'Example: "/usr/local/bin/gemini". ' + 'WARNING: This value is executed in a shell - use only with trusted input.' 
+ ) + + flags.DEFINE_string( + 'working_dir', None, + 'Working directory for the run (default: current directory)') + except flags.DuplicateFlagError: + pass + + +@dataclass +class Config: + input_items_json: str + output_path: str + dry_run: bool = False + skip_confirmation: bool = False + enable_sandboxing: bool = False + gemini_cli: Optional[str] = None + working_dir: Optional[str] = None + + +@dataclass +class RunResult: + run_id: str + run_dir: Path + prompt_path: Path + gemini_log_path: Path + gemini_command: str + sandbox_enabled: bool + + +class EnrichmentDataFetcher: + def __init__(self, config: Config): + self._config = config + self._working_dir = Path( + config.working_dir).resolve() if config.working_dir else Path.cwd() + self._input_path = self._resolve_path(config.input_items_json) + self._output_path = self._resolve_path(config.output_path) + + if not self._input_path.exists(): + raise FileNotFoundError( + f"input_items_json does not exist: {self._input_path}") + + self._output_path.parent.mkdir(parents=True, exist_ok=True) + + self._datacommons_dir = self._working_dir / '.datacommons' + self._datacommons_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self._run_id = f"gemini_{timestamp}" + self._run_dir = self._datacommons_dir / 'runs' / self._run_id + self._run_dir.mkdir(parents=True, exist_ok=True) + + def fetch_enrichment_data(self) -> RunResult: + prompt_file = self._generate_prompt() + gemini_log_file = self._run_dir / 'gemini_cli.log' + gemini_command = self._build_gemini_command(prompt_file, + gemini_log_file) + + result = RunResult(run_id=self._run_id, + run_dir=self._run_dir, + prompt_path=prompt_file, + gemini_log_path=gemini_log_file, + gemini_command=gemini_command, + sandbox_enabled=self._config.enable_sandboxing) + + if self._config.dry_run: + logging.info( + "Dry run mode: Prompt file generated at %s. " + "Skipping Gemini CLI execution.", prompt_file) + return result + + if not self._config.skip_confirmation: + if not self._get_user_confirmation(prompt_file): + logging.info("Enrichment data fetch cancelled by user.") + return result + + if not self._check_gemini_cli_available(): + logging.warning( + "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." 
+ ) + + logging.info("Launching gemini (cwd: %s): %s", self._working_dir, + gemini_command) + logging.info("Gemini output will be saved to: %s", gemini_log_file) + + exit_code = self._run_subprocess(gemini_command) + if exit_code == 0: + logging.info("Gemini CLI completed successfully") + return result + + raise RuntimeError( + f"Gemini CLI execution failed with exit code {exit_code}") + + def _resolve_path(self, path: str) -> Path: + resolved = Path(path).expanduser() + if not resolved.is_absolute(): + resolved = self._working_dir / resolved + return resolved.resolve() + + def _generate_prompt(self) -> Path: + template_dir = os.path.join(_SCRIPT_DIR, 'templates') + env = Environment(loader=FileSystemLoader(template_dir)) + template = env.get_template('fetch_enrichment_data_prompt.j2') + + rendered_prompt = template.render( + input_items_abs=str(self._input_path), + output_path_abs=str(self._output_path), + ) + + output_file = self._run_dir / 'fetch_enrichment_data_prompt.md' + with open(output_file, 'w') as f: + f.write(rendered_prompt) + + logging.info("Generated prompt written to: %s", output_file) + return output_file + + def _get_user_confirmation(self, prompt_file: Path) -> bool: + print("\n" + "=" * 60) + print("SDMX ENRICHMENT DATA FETCH SUMMARY") + print("=" * 60) + print(f"Input items file: {self._input_path}") + print(f"Output items file: {self._output_path}") + print(f"Prompt file: {prompt_file}") + print(f"Working directory: {self._working_dir}") + print( + f"Sandboxing: {'Enabled' if self._config.enable_sandboxing else 'Disabled'}" + ) + if not self._config.enable_sandboxing: + print( + "WARNING: Sandboxing is disabled. Gemini will run without safety restrictions." + ) + print("=" * 60) + + while True: + try: + response = input( + "Ready to run Gemini for enrichment data fetch? 
(y/n): " + ).strip().lower() + if response in ['y', 'yes']: + return True + if response in ['n', 'no']: + print("Data fetch cancelled by user.") + return False + print("Please enter 'y' or 'n'.") + except KeyboardInterrupt: + print("\nData fetch cancelled by user.") + return False + + def _check_gemini_cli_available(self) -> bool: + if self._config.gemini_cli: + return True + return shutil.which('gemini') is not None + + def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: + prompt_path = prompt_file.resolve() + log_path = log_file.resolve() + gemini_cmd = self._config.gemini_cli or 'gemini' + sandbox_flag = "--sandbox" if self._config.enable_sandboxing else "" + return ( + f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" + ) + + def _run_subprocess(self, command: str) -> int: + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) + + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.rstrip()) + + return process.wait() + except Exception as e: + logging.error("Error running subprocess: %s", str(e)) + return 1 + + +def prepare_config() -> Config: + return Config(input_items_json=_FLAGS.input_items_json, + output_path=_FLAGS.output_path, + dry_run=_FLAGS.dry_run, + skip_confirmation=_FLAGS.skip_confirmation, + enable_sandboxing=_FLAGS.enable_sandboxing, + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir) + + +def main(_): + config = prepare_config() + logging.info("Loaded config for enrichment data fetch") + + fetcher = EnrichmentDataFetcher(config) + fetcher.fetch_enrichment_data() + + logging.info("Enrichment data fetch completed.") + return 0 + + +if __name__ == '__main__': + _define_flags() + app.run(main) diff --git a/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 b/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 new file mode 100644 index 0000000000..8f3ada99d8 --- /dev/null +++ b/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 @@ -0,0 +1,37 @@ +You are an expert SDMX metadata analyst. Your task is to enrich selected SDMX +codes and concepts using web search and provide concise descriptions. + +# INPUT +- Items-to-enrich JSON: {{ input_items_abs }} + +# OUTPUT +- Write JSON to: {{ output_path_abs }} +- Output MUST be valid JSON only. No extra text. + +# CRITICAL RULES +- Process the ENTIRE input file. Do not read only the first lines. +- Use web search for each item, batching multiple items per web call when possible. +- Do not add `enriched_name` anywhere. +- Do not include `name` or `description` fields in the output. +- Do not include `enrichment_query` in the output. +- Ground descriptions in search results and dataset context. +- Keep `enriched_description` concise (<= 240 chars). + +# TASK +1) Read the full JSON from the input path. +2) For each selected item, use its `enrichment_query` to search the web. +3) Produce an `enriched_description` for each item. +4) Output the SAME pruned JSON structure as input, but remove + `enrichment_query` and add `enriched_description`. 
+ +# FIELD MINIMUMS (do not add name/description) +- dataflow: `id` +- data_structure_definition: `id` +- component (dimension/attribute/measure): `id` +- concept: `id`, `concept_scheme_id`, `enriched_description` +- representation: `type` +- codelist: `id` +- code: `id`, `enriched_description` +- referenced_concept_schemes: `id` + +Write ONLY the JSON file to the output path. From b4089df45a9f180a5db5d7c395abd8700af3e3f6 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 13:01:36 +0000 Subject: [PATCH 03/15] Add SDMX enrichment merge tool --- .../sdmx/sdmx_enrichment_merge.py | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tools/agentic_import/sdmx/sdmx_enrichment_merge.py diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py new file mode 100644 index 0000000000..41cb38183c --- /dev/null +++ b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + +from absl import app +from absl import flags +from absl import logging + +_FLAGS = flags.FLAGS + + +def _define_flags(): + try: + flags.DEFINE_string('input_metadata_json', None, + 'Path to base SDMX metadata JSON (required)') + flags.mark_flag_as_required('input_metadata_json') + + flags.DEFINE_string('input_enriched_items_json', None, + 'Path to enriched items JSON (required)') + flags.mark_flag_as_required('input_enriched_items_json') + + flags.DEFINE_string('output_path', None, + 'Path to output enriched metadata JSON (required)') + flags.mark_flag_as_required('output_path') + except flags.DuplicateFlagError: + pass + + +@dataclass(frozen=True) +class MergeTarget: + path: str + match_key: str = 'id' + + +_MERGE_TARGETS = [ + MergeTarget('dataflows'), + MergeTarget('dataflows.data_structure_definition.dimensions'), + MergeTarget('dataflows.data_structure_definition.attributes'), + MergeTarget('dataflows.data_structure_definition.measures'), + MergeTarget('dataflows.data_structure_definition.dimensions.concept'), + MergeTarget('dataflows.data_structure_definition.attributes.concept'), + MergeTarget('dataflows.data_structure_definition.measures.concept'), + MergeTarget( + 'dataflows.data_structure_definition.dimensions.representation.codelist.codes' + ), + MergeTarget( + 'dataflows.data_structure_definition.attributes.representation.codelist.codes' + ), + MergeTarget( + 'dataflows.data_structure_definition.measures.representation.codelist.codes' + ), + MergeTarget('dataflows.referenced_concept_schemes'), + MergeTarget('dataflows.referenced_concept_schemes.concepts'), +] + + +class EnrichmentMerger: + def __init__(self, base_data: Dict[str, Any], + enriched_data: Dict[str, Any]): + self._base = base_data + self._enriched = enriched_data + self._targets = _MERGE_TARGETS + + def merge(self) -> 
Dict[str, Any]: + self._merge_targets() + return self._base + + def _merge_targets(self) -> None: + for target in self._targets: + base_nodes = list(self._find_nodes(self._base, target.path)) + enriched_nodes = list(self._find_nodes(self._enriched, target.path)) + if not base_nodes and enriched_nodes: + logging.warning( + "Enriched data has path '%s' not present in base JSON", + target.path) + continue + + base_by_key = { + node.get(target.match_key): node + for node in base_nodes + if isinstance(node, dict) and node.get(target.match_key) + } + for enriched_node in enriched_nodes: + if not isinstance(enriched_node, dict): + continue + match_value = enriched_node.get(target.match_key) + if not match_value: + continue + base_node = base_by_key.get(match_value) + if not base_node: + logging.warning( + "No base match for %s='%s' at path '%s'", target.match_key, + match_value, target.path) + continue + self._merge_node(base_node, enriched_node, target.path) + + def _merge_node(self, base_node: Dict[str, Any], + enriched_node: Dict[str, Any], path: str) -> None: + if 'enriched_description' in enriched_node: + if 'enriched_description' in base_node: + logging.warning( + "Overwriting enriched_description at %s id=%s", path, + base_node.get('id')) + base_node['enriched_description'] = enriched_node[ + 'enriched_description'] + + def _find_nodes(self, data: Dict[str, Any], + path: str) -> Iterable[Dict[str, Any]]: + parts = path.split('.') + current = [data] + for part in parts: + next_level = [] + for node in current: + if not isinstance(node, dict): + continue + value = node.get(part) + if isinstance(value, list): + next_level.extend([item for item in value if isinstance( + item, dict)]) + elif isinstance(value, dict): + next_level.append(value) + current = next_level + if not current: + break + return current + + +def _load_json(path: Path) -> Dict[str, Any]: + with open(path, 'r') as f: + return json.load(f) + + +def _write_json(path: Path, data: Dict[str, Any]) -> None: + with open(path, 'w') as f: + json.dump(data, f, indent=2) + + +def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, + output_path: str) -> None: + base_data = _load_json(Path(input_metadata_json)) + enriched_data = _load_json(Path(input_enriched_items_json)) + merged = EnrichmentMerger(base_data, enriched_data).merge() + _write_json(Path(output_path), merged) + + +def main(_): + merge_enrichment(_FLAGS.input_metadata_json, + _FLAGS.input_enriched_items_json, _FLAGS.output_path) + logging.info("Merged enriched descriptions into base metadata JSON") + return 0 + + +if __name__ == '__main__': + _define_flags() + app.run(main) From 108fef6ad8ade9248d3ee11e168dc762ddc378b7 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 13:10:07 +0000 Subject: [PATCH 04/15] lint changes --- .../sdmx/fetch_enrichment_data.py | 20 ++++++------- .../sdmx/find_enrichment_items.py | 29 +++++++++---------- .../sdmx/sdmx_enrichment_merge.py | 19 ++++++------ 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/fetch_enrichment_data.py index 1ecf61c1be..ab2878e664 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data.py @@ -91,6 +91,7 @@ class RunResult: class EnrichmentDataFetcher: + def __init__(self, config: Config): self._config = config self._working_dir = Path( @@ -224,16 +225,15 @@ def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: 
def _run_subprocess(self, command: str) -> int: try: - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) + process = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) while True: output = process.stdout.readline() diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/find_enrichment_items.py index 2f47ac5063..9da2221f03 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items.py +++ b/tools/agentic_import/sdmx/find_enrichment_items.py @@ -34,9 +34,8 @@ def _define_flags(): try: - flags.DEFINE_string( - 'input_metadata_json', None, - 'Path to input SDMX metadata JSON (required)') + flags.DEFINE_string('input_metadata_json', None, + 'Path to input SDMX metadata JSON (required)') flags.mark_flag_as_required('input_metadata_json') flags.DEFINE_string('output_path', None, @@ -92,6 +91,7 @@ class RunResult: class EnrichmentItemsFinder: + def __init__(self, config: Config): self._config = config self._working_dir = Path( @@ -142,8 +142,8 @@ def find_items_to_enrich(self) -> RunResult: "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." ) - logging.info( - "Launching gemini (cwd: %s): %s", self._working_dir, gemini_command) + logging.info("Launching gemini (cwd: %s): %s", self._working_dir, + gemini_command) logging.info("Gemini output will be saved to: %s", gemini_log_file) exit_code = self._run_subprocess(gemini_command) @@ -225,16 +225,15 @@ def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: def _run_subprocess(self, command: str) -> int: try: - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) + process = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) while True: output = process.stdout.readline() diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py index 41cb38183c..14320dbf99 100644 --- a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py +++ b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py @@ -72,8 +72,9 @@ class MergeTarget: class EnrichmentMerger: - def __init__(self, base_data: Dict[str, Any], - enriched_data: Dict[str, Any]): + + def __init__(self, base_data: Dict[str, Any], enriched_data: Dict[str, + Any]): self._base = base_data self._enriched = enriched_data self._targets = _MERGE_TARGETS @@ -105,9 +106,8 @@ def _merge_targets(self) -> None: continue base_node = base_by_key.get(match_value) if not base_node: - logging.warning( - "No base match for %s='%s' at path '%s'", target.match_key, - match_value, target.path) + logging.warning("No base match for %s='%s' at path '%s'", + target.match_key, match_value, target.path) continue self._merge_node(base_node, enriched_node, target.path) @@ -115,9 +115,8 @@ def _merge_node(self, base_node: Dict[str, Any], enriched_node: Dict[str, Any], path: str) -> None: if 'enriched_description' in enriched_node: if 'enriched_description' in base_node: - logging.warning( - 
"Overwriting enriched_description at %s id=%s", path, - base_node.get('id')) + logging.warning("Overwriting enriched_description at %s id=%s", + path, base_node.get('id')) base_node['enriched_description'] = enriched_node[ 'enriched_description'] @@ -132,8 +131,8 @@ def _find_nodes(self, data: Dict[str, Any], continue value = node.get(part) if isinstance(value, list): - next_level.extend([item for item in value if isinstance( - item, dict)]) + next_level.extend( + [item for item in value if isinstance(item, dict)]) elif isinstance(value, dict): next_level.append(value) current = next_level From b1850c3e07638d94a9da746b11ed37e47015e1b0 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 13:11:44 +0000 Subject: [PATCH 05/15] Document SDMX enrichment tools --- tools/agentic_import/sdmx/README.md | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tools/agentic_import/sdmx/README.md diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md new file mode 100644 index 0000000000..ac7dfd0434 --- /dev/null +++ b/tools/agentic_import/sdmx/README.md @@ -0,0 +1,53 @@ +# SDMX Enrichment Tools + +This folder contains three standalone tools for SDMX metadata enrichment. +Each tool supports CLI usage and can be called programmatically. + +## 1) find_enrichment_items.py +Selects which SDMX codes/concepts need enrichment and generates +`enrichment_query` values using full dataset context. + +CLI usage: +``` +python tools/agentic_import/sdmx/find_enrichment_items.py \ + --input_metadata_json="/path/to/metadata.json" \ + --output_path="/path/to/items_to_enrich.json" \ + --gemini_cli="gemini" \ + --enable_sandboxing +``` + +Output: +- A pruned JSON that preserves the original structure but keeps only selected + items with `enrichment_query`. Name/description fields are omitted. + +## 2) fetch_enrichment_data.py +Uses Gemini CLI web search to populate `enriched_description` for each selected +item. + +CLI usage: +``` +python tools/agentic_import/sdmx/fetch_enrichment_data.py \ + --input_items_json="/path/to/items_to_enrich.json" \ + --output_path="/path/to/enriched_items.json" \ + --gemini_cli="gemini" \ + --enable_sandboxing +``` + +Output: +- A pruned JSON in the same structure as the input, with `enriched_description` + added and `enrichment_query` removed. + +## 3) sdmx_enrichment_merge.py +Merges `enriched_description` into the base metadata JSON. + +CLI usage: +``` +python tools/agentic_import/sdmx/sdmx_enrichment_merge.py \ + --input_metadata_json="/path/to/metadata.json" \ + --input_enriched_items_json="/path/to/enriched_items.json" \ + --output_path="/path/to/metadata_enriched.json" +``` + +Output: +- A full metadata JSON with `enriched_description` merged into the matching + codes and concepts. From dfc3b86fb047d614c41f9de3f2dcecb901096601 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 16:20:35 +0000 Subject: [PATCH 06/15] Require dataset prefix for Gemini runs Add dataset_prefix to SDMX tools and docs. 
--- tools/agentic_import/sdmx/README.md | 2 ++ tools/agentic_import/sdmx/fetch_enrichment_data.py | 12 +++++++++++- tools/agentic_import/sdmx/find_enrichment_items.py | 12 +++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md index ac7dfd0434..c9c4416ccb 100644 --- a/tools/agentic_import/sdmx/README.md +++ b/tools/agentic_import/sdmx/README.md @@ -11,6 +11,7 @@ CLI usage: ``` python tools/agentic_import/sdmx/find_enrichment_items.py \ --input_metadata_json="/path/to/metadata.json" \ + --dataset_prefix="oecd_prices" \ --output_path="/path/to/items_to_enrich.json" \ --gemini_cli="gemini" \ --enable_sandboxing @@ -28,6 +29,7 @@ CLI usage: ``` python tools/agentic_import/sdmx/fetch_enrichment_data.py \ --input_items_json="/path/to/items_to_enrich.json" \ + --dataset_prefix="oecd_prices" \ --output_path="/path/to/enriched_items.json" \ --gemini_cli="gemini" \ --enable_sandboxing diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/fetch_enrichment_data.py index ab2878e664..22e3e89b55 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data.py @@ -38,6 +38,10 @@ def _define_flags(): 'Path to input items JSON (required)') flags.mark_flag_as_required('input_items_json') + flags.DEFINE_string('dataset_prefix', None, + 'Dataset prefix for run id (required, non-empty)') + flags.mark_flag_as_required('dataset_prefix') + flags.DEFINE_string('output_path', None, 'Path to output items JSON (required)') flags.mark_flag_as_required('output_path') @@ -72,6 +76,7 @@ def _define_flags(): @dataclass class Config: input_items_json: str + dataset_prefix: str output_path: str dry_run: bool = False skip_confirmation: bool = False @@ -98,6 +103,10 @@ def __init__(self, config: Config): config.working_dir).resolve() if config.working_dir else Path.cwd() self._input_path = self._resolve_path(config.input_items_json) self._output_path = self._resolve_path(config.output_path) + self._dataset_prefix = (config.dataset_prefix or '').strip() + + if not self._dataset_prefix: + raise ValueError("dataset_prefix must be a non-empty string.") if not self._input_path.exists(): raise FileNotFoundError( @@ -109,7 +118,7 @@ def __init__(self, config: Config): self._datacommons_dir.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"gemini_{timestamp}" + self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" self._run_dir = self._datacommons_dir / 'runs' / self._run_id self._run_dir.mkdir(parents=True, exist_ok=True) @@ -250,6 +259,7 @@ def _run_subprocess(self, command: str) -> int: def prepare_config() -> Config: return Config(input_items_json=_FLAGS.input_items_json, + dataset_prefix=_FLAGS.dataset_prefix, output_path=_FLAGS.output_path, dry_run=_FLAGS.dry_run, skip_confirmation=_FLAGS.skip_confirmation, diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/find_enrichment_items.py index 9da2221f03..8807b397ae 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items.py +++ b/tools/agentic_import/sdmx/find_enrichment_items.py @@ -38,6 +38,10 @@ def _define_flags(): 'Path to input SDMX metadata JSON (required)') flags.mark_flag_as_required('input_metadata_json') + flags.DEFINE_string('dataset_prefix', None, + 'Dataset prefix for run id (required, non-empty)') + flags.mark_flag_as_required('dataset_prefix') + 
flags.DEFINE_string('output_path', None, 'Path to output items JSON (required)') flags.mark_flag_as_required('output_path') @@ -72,6 +76,7 @@ def _define_flags(): @dataclass class Config: input_metadata_json: str + dataset_prefix: str output_path: str dry_run: bool = False skip_confirmation: bool = False @@ -98,6 +103,10 @@ def __init__(self, config: Config): config.working_dir).resolve() if config.working_dir else Path.cwd() self._input_path = self._resolve_path(config.input_metadata_json) self._output_path = self._resolve_path(config.output_path) + self._dataset_prefix = (config.dataset_prefix or '').strip() + + if not self._dataset_prefix: + raise ValueError("dataset_prefix must be a non-empty string.") if not self._input_path.exists(): raise FileNotFoundError( @@ -109,7 +118,7 @@ def __init__(self, config: Config): self._datacommons_dir.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"gemini_{timestamp}" + self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" self._run_dir = self._datacommons_dir / 'runs' / self._run_id self._run_dir.mkdir(parents=True, exist_ok=True) @@ -250,6 +259,7 @@ def _run_subprocess(self, command: str) -> int: def prepare_config() -> Config: return Config(input_metadata_json=_FLAGS.input_metadata_json, + dataset_prefix=_FLAGS.dataset_prefix, output_path=_FLAGS.output_path, dry_run=_FLAGS.dry_run, skip_confirmation=_FLAGS.skip_confirmation, From 1a7b29df90da7f16946b8aa9d047ff041c770688 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 06:26:31 +0000 Subject: [PATCH 07/15] test: add sdmx enrichment fixtures Cover SDMX merge and dry-run flows --- .../sdmx/fetch_enrichment_data_test.py | 59 +++++++++++++ .../sdmx/find_enrichment_items_test.py | 58 +++++++++++++ .../sdmx/sdmx_enrichment_merge_test.py | 49 +++++++++++ .../sdmx/testdata/sample_enriched_items.json | 84 ++++++++++++++++++ .../sdmx/testdata/sample_metadata.json | 56 ++++++++++++ .../sample_metadata_enriched_expected.json | 85 +++++++++++++++++++ 6 files changed, 391 insertions(+) create mode 100644 tools/agentic_import/sdmx/fetch_enrichment_data_test.py create mode 100644 tools/agentic_import/sdmx/find_enrichment_items_test.py create mode 100644 tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py create mode 100644 tools/agentic_import/sdmx/testdata/sample_enriched_items.json create mode 100644 tools/agentic_import/sdmx/testdata/sample_metadata.json create mode 100644 tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py new file mode 100644 index 0000000000..cfb1c2a974 --- /dev/null +++ b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
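+
+"""Tests for EnrichmentDataFetcher covering the dry-run flow: prompt
+generation, run directory layout, and the generated Gemini command."""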
+ +import json +import os +import tempfile +import unittest +from pathlib import Path + +from tools.agentic_import.sdmx.fetch_enrichment_data import (Config, + EnrichmentDataFetcher) + + +class EnrichmentDataFetcherTest(unittest.TestCase): + + def test_dry_run_creates_prompt_and_run_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + input_path = Path(tmpdir) / 'items.json' + input_path.write_text(json.dumps({"items": []})) + output_path = Path(tmpdir) / 'out' / 'items_enriched.json' + + config = Config( + input_items_json=str(input_path), + dataset_prefix='demo', + output_path=str(output_path), + dry_run=True, + skip_confirmation=True, + enable_sandboxing=False, + working_dir=tmpdir, + ) + + fetcher = EnrichmentDataFetcher(config) + result = fetcher.fetch_enrichment_data() + + self.assertTrue(result.run_id.startswith('demo_gemini_')) + self.assertTrue(result.run_dir.is_dir()) + self.assertTrue(result.prompt_path.is_file()) + self.assertTrue(result.gemini_log_path.is_absolute()) + self.assertEqual(result.prompt_path.parent, result.run_dir) + self.assertIn(str(result.prompt_path), result.gemini_command) + self.assertIn(str(result.gemini_log_path), result.gemini_command) + self.assertTrue(output_path.parent.is_dir()) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/find_enrichment_items_test.py new file mode 100644 index 0000000000..ec07983e9e --- /dev/null +++ b/tools/agentic_import/sdmx/find_enrichment_items_test.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
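+
+"""Tests for EnrichmentItemsFinder covering the dry-run flow: prompt
+generation, run directory layout, and the generated Gemini command."""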
+ +import json +import tempfile +import unittest +from pathlib import Path + +from tools.agentic_import.sdmx.find_enrichment_items import (Config, + EnrichmentItemsFinder) + + +class EnrichmentItemsFinderTest(unittest.TestCase): + + def test_dry_run_creates_prompt_and_run_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + input_path = Path(tmpdir) / 'metadata.json' + input_path.write_text(json.dumps({"dataflows": []})) + output_path = Path(tmpdir) / 'out' / 'items.json' + + config = Config( + input_metadata_json=str(input_path), + dataset_prefix='demo', + output_path=str(output_path), + dry_run=True, + skip_confirmation=True, + enable_sandboxing=False, + working_dir=tmpdir, + ) + + finder = EnrichmentItemsFinder(config) + result = finder.find_items_to_enrich() + + self.assertTrue(result.run_id.startswith('demo_gemini_')) + self.assertTrue(result.run_dir.is_dir()) + self.assertTrue(result.prompt_path.is_file()) + self.assertTrue(result.gemini_log_path.is_absolute()) + self.assertEqual(result.prompt_path.parent, result.run_dir) + self.assertIn(str(result.prompt_path), result.gemini_command) + self.assertIn(str(result.gemini_log_path), result.gemini_command) + self.assertTrue(output_path.parent.is_dir()) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py b/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py new file mode 100644 index 0000000000..6b424a3e54 --- /dev/null +++ b/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
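+
+"""Tests for merge_enrichment, verifying enriched descriptions are merged
+into the base SDMX metadata across all supported list paths."""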
+ +import json +import os +import tempfile +import unittest +from pathlib import Path + +from deepdiff.diff import DeepDiff + +from tools.agentic_import.sdmx.sdmx_enrichment_merge import merge_enrichment + +_TESTDATA_DIR = Path(os.path.dirname(__file__)) / 'testdata' +_BASE_JSON = _TESTDATA_DIR / 'sample_metadata.json' +_ENRICHED_JSON = _TESTDATA_DIR / 'sample_enriched_items.json' +_EXPECTED_JSON = _TESTDATA_DIR / 'sample_metadata_enriched_expected.json' + + +class EnrichmentMergeTest(unittest.TestCase): + + def test_merge_enriched_description_across_lists(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + output_path = Path(tmpdir) / 'merged.json' + merge_enrichment(str(_BASE_JSON), str(_ENRICHED_JSON), + str(output_path)) + + merged = json.loads(output_path.read_text()) + + expected = json.loads(_EXPECTED_JSON.read_text()) + diff = DeepDiff(expected, merged, ignore_order=True) + self.assertFalse(diff, msg=str(diff)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/testdata/sample_enriched_items.json b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json new file mode 100644 index 0000000000..6fad7d28ad --- /dev/null +++ b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json @@ -0,0 +1,84 @@ +{ + "dataflows": [ + { + "id": "DF1", + "enriched_description": "Flow One enriched", + "data_structure_definition": { + "dimensions": [ + { + "id": "DIM1", + "enriched_description": "Dimension enriched", + "concept": { + "id": "C1", + "enriched_description": "Concept C1 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "CODE1", + "enriched_description": "Code 1 enriched" + }, + { + "id": "CODE2", + "enriched_description": "Code 2 enriched" + } + ] + } + } + } + ], + "attributes": [ + { + "id": "ATTR1", + "enriched_description": "Attribute enriched", + "concept": { + "id": "C2", + "enriched_description": "Concept C2 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "ACODE1", + "enriched_description": "Attr code enriched" + } + ] + } + } + } + ], + "measures": [ + { + "id": "MEAS1", + "enriched_description": "Measure enriched", + "concept": { + "id": "C3", + "enriched_description": "Concept C3 enriched" + } + } + ] + }, + "referenced_concept_schemes": [ + { + "id": "CS1", + "enriched_description": "Scheme enriched", + "concepts": [ + { + "id": "CON1", + "enriched_description": "Concept 1 enriched" + }, + { + "id": "CON2", + "enriched_description": "Concept 2 enriched" + } + ] + } + ] + }, + { + "id": "DF3", + "enriched_description": "No base match" + } + ] +} diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata.json b/tools/agentic_import/sdmx/testdata/sample_metadata.json new file mode 100644 index 0000000000..e121b00df6 --- /dev/null +++ b/tools/agentic_import/sdmx/testdata/sample_metadata.json @@ -0,0 +1,56 @@ +{ + "dataflows": [ + { + "id": "DF1", + "name": "Flow One", + "data_structure_definition": { + "dimensions": [ + { + "id": "DIM1", + "concept": {"id": "C1"}, + "representation": { + "codelist": { + "codes": [ + {"id": "CODE1"}, + {"id": "CODE2"} + ] + } + } + } + ], + "attributes": [ + { + "id": "ATTR1", + "concept": {"id": "C2"}, + "representation": { + "codelist": { + "codes": [ + {"id": "ACODE1"} + ] + } + } + } + ], + "measures": [ + { + "id": "MEAS1", + "concept": {"id": "C3"} + } + ] + }, + "referenced_concept_schemes": [ + { + "id": "CS1", + "concepts": [ + {"id": "CON1"}, + {"id": "CON2"} + ] + } + ] + }, + { + "id": "DF2", + "name": "Flow Two" + } 
+ ] +} diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json new file mode 100644 index 0000000000..1828b0d023 --- /dev/null +++ b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json @@ -0,0 +1,85 @@ +{ + "dataflows": [ + { + "id": "DF1", + "name": "Flow One", + "enriched_description": "Flow One enriched", + "data_structure_definition": { + "dimensions": [ + { + "id": "DIM1", + "concept": { + "id": "C1", + "enriched_description": "Concept C1 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "CODE1", + "enriched_description": "Code 1 enriched" + }, + { + "id": "CODE2", + "enriched_description": "Code 2 enriched" + } + ] + } + }, + "enriched_description": "Dimension enriched" + } + ], + "attributes": [ + { + "id": "ATTR1", + "concept": { + "id": "C2", + "enriched_description": "Concept C2 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "ACODE1", + "enriched_description": "Attr code enriched" + } + ] + } + }, + "enriched_description": "Attribute enriched" + } + ], + "measures": [ + { + "id": "MEAS1", + "concept": { + "id": "C3", + "enriched_description": "Concept C3 enriched" + }, + "enriched_description": "Measure enriched" + } + ] + }, + "referenced_concept_schemes": [ + { + "id": "CS1", + "concepts": [ + { + "id": "CON1", + "enriched_description": "Concept 1 enriched" + }, + { + "id": "CON2", + "enriched_description": "Concept 2 enriched" + } + ], + "enriched_description": "Scheme enriched" + } + ] + }, + { + "id": "DF2", + "name": "Flow Two" + } + ] +} From 45bf22f543537677d512fbbe7f0930d6988a39f3 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 09:02:45 +0000 Subject: [PATCH 08/15] test: assert SDMX prompt params --- .../sdmx/fetch_enrichment_data_test.py | 24 ++++++++++++++++--- .../sdmx/find_enrichment_items_test.py | 24 ++++++++++++++++--- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py index cfb1c2a974..541a8c7e8d 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py @@ -19,6 +19,9 @@ import tempfile import unittest from pathlib import Path +from unittest import mock + +from jinja2 import Template from tools.agentic_import.sdmx.fetch_enrichment_data import (Config, EnrichmentDataFetcher) @@ -43,17 +46,32 @@ def test_dry_run_creates_prompt_and_run_dir(self) -> None: ) fetcher = EnrichmentDataFetcher(config) - result = fetcher.fetch_enrichment_data() + with mock.patch("jinja2.environment.Template.render", + autospec=True, + side_effect=Template.render) as render_mock: + result = fetcher.fetch_enrichment_data() self.assertTrue(result.run_id.startswith('demo_gemini_')) self.assertTrue(result.run_dir.is_dir()) self.assertTrue(result.prompt_path.is_file()) self.assertTrue(result.gemini_log_path.is_absolute()) self.assertEqual(result.prompt_path.parent, result.run_dir) - self.assertIn(str(result.prompt_path), result.gemini_command) - self.assertIn(str(result.gemini_log_path), result.gemini_command) + expected_command = ( + f"cat '{result.prompt_path.resolve()}' | " + f"{config.gemini_cli or 'gemini'} " + f"{'--sandbox' if config.enable_sandboxing else ''} " + f"-y 2>&1 | tee '{result.gemini_log_path.resolve()}'") + self.assertEqual(result.gemini_command, expected_command) 
self.assertTrue(output_path.parent.is_dir()) + self.assertEqual(render_mock.call_count, 1) + _, render_kwargs = render_mock.call_args + self.assertEqual( + render_kwargs, { + "input_items_abs": str(input_path.resolve()), + "output_path_abs": str(output_path.resolve()), + }) + if __name__ == '__main__': unittest.main() diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/find_enrichment_items_test.py index ec07983e9e..d135be3d0b 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items_test.py +++ b/tools/agentic_import/sdmx/find_enrichment_items_test.py @@ -18,6 +18,9 @@ import tempfile import unittest from pathlib import Path +from unittest import mock + +from jinja2 import Template from tools.agentic_import.sdmx.find_enrichment_items import (Config, EnrichmentItemsFinder) @@ -42,17 +45,32 @@ def test_dry_run_creates_prompt_and_run_dir(self) -> None: ) finder = EnrichmentItemsFinder(config) - result = finder.find_items_to_enrich() + with mock.patch("jinja2.environment.Template.render", + autospec=True, + side_effect=Template.render) as render_mock: + result = finder.find_items_to_enrich() self.assertTrue(result.run_id.startswith('demo_gemini_')) self.assertTrue(result.run_dir.is_dir()) self.assertTrue(result.prompt_path.is_file()) self.assertTrue(result.gemini_log_path.is_absolute()) self.assertEqual(result.prompt_path.parent, result.run_dir) - self.assertIn(str(result.prompt_path), result.gemini_command) - self.assertIn(str(result.gemini_log_path), result.gemini_command) + expected_command = ( + f"cat '{result.prompt_path.resolve()}' | " + f"{config.gemini_cli or 'gemini'} " + f"{'--sandbox' if config.enable_sandboxing else ''} " + f"-y 2>&1 | tee '{result.gemini_log_path.resolve()}'") + self.assertEqual(result.gemini_command, expected_command) self.assertTrue(output_path.parent.is_dir()) + self.assertEqual(render_mock.call_count, 1) + _, render_kwargs = render_mock.call_args + self.assertEqual( + render_kwargs, { + "input_metadata_abs": str(input_path.resolve()), + "output_path_abs": str(output_path.resolve()), + }) + if __name__ == '__main__': unittest.main() From bda8cbd39c5d9453c26f733716a98794ebd68fb7 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 09:03:43 +0000 Subject: [PATCH 09/15] lint fix --- tools/agentic_import/sdmx/fetch_enrichment_data_test.py | 4 ++-- tools/agentic_import/sdmx/find_enrichment_items_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py index 541a8c7e8d..fb1b1609c3 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py @@ -23,8 +23,8 @@ from jinja2 import Template -from tools.agentic_import.sdmx.fetch_enrichment_data import (Config, - EnrichmentDataFetcher) +from tools.agentic_import.sdmx.fetch_enrichment_data import ( + Config, EnrichmentDataFetcher) class EnrichmentDataFetcherTest(unittest.TestCase): diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/find_enrichment_items_test.py index d135be3d0b..3c0865178f 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items_test.py +++ b/tools/agentic_import/sdmx/find_enrichment_items_test.py @@ -22,8 +22,8 @@ from jinja2 import Template -from tools.agentic_import.sdmx.find_enrichment_items import (Config, - EnrichmentItemsFinder) +from tools.agentic_import.sdmx.find_enrichment_items import 
( + Config, EnrichmentItemsFinder) class EnrichmentItemsFinderTest(unittest.TestCase): From f8692f5c507c98b8cf1a79cc6930d387e7754e21 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 09:57:24 +0000 Subject: [PATCH 10/15] Rename SDMX metadata enricher tools --- tools/agentic_import/sdmx/README.md | 12 ++++++------ ...enrichment_data.py => metadata_enricher_fetch.py} | 4 ++-- ..._data_test.py => metadata_enricher_fetch_test.py} | 2 +- ...enrichment_items.py => metadata_enricher_find.py} | 4 ++-- ..._items_test.py => metadata_enricher_find_test.py} | 2 +- ...nrichment_merge.py => metadata_enricher_merge.py} | 0 ...merge_test.py => metadata_enricher_merge_test.py} | 2 +- ...a_prompt.j2 => metadata_enricher_fetch_prompt.j2} | 0 ...ms_prompt.j2 => metadata_enricher_find_prompt.j2} | 0 9 files changed, 13 insertions(+), 13 deletions(-) rename tools/agentic_import/sdmx/{fetch_enrichment_data.py => metadata_enricher_fetch.py} (98%) rename tools/agentic_import/sdmx/{fetch_enrichment_data_test.py => metadata_enricher_fetch_test.py} (97%) rename tools/agentic_import/sdmx/{find_enrichment_items.py => metadata_enricher_find.py} (98%) rename tools/agentic_import/sdmx/{find_enrichment_items_test.py => metadata_enricher_find_test.py} (97%) rename tools/agentic_import/sdmx/{sdmx_enrichment_merge.py => metadata_enricher_merge.py} (100%) rename tools/agentic_import/sdmx/{sdmx_enrichment_merge_test.py => metadata_enricher_merge_test.py} (95%) rename tools/agentic_import/sdmx/templates/{fetch_enrichment_data_prompt.j2 => metadata_enricher_fetch_prompt.j2} (100%) rename tools/agentic_import/sdmx/templates/{find_enrichment_items_prompt.j2 => metadata_enricher_find_prompt.j2} (100%) diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md index c9c4416ccb..8fdb06b21d 100644 --- a/tools/agentic_import/sdmx/README.md +++ b/tools/agentic_import/sdmx/README.md @@ -3,13 +3,13 @@ This folder contains three standalone tools for SDMX metadata enrichment. Each tool supports CLI usage and can be called programmatically. -## 1) find_enrichment_items.py +## 1) metadata_enricher_find.py Selects which SDMX codes/concepts need enrichment and generates `enrichment_query` values using full dataset context. CLI usage: ``` -python tools/agentic_import/sdmx/find_enrichment_items.py \ +python tools/agentic_import/sdmx/metadata_enricher_find.py \ --input_metadata_json="/path/to/metadata.json" \ --dataset_prefix="oecd_prices" \ --output_path="/path/to/items_to_enrich.json" \ @@ -21,13 +21,13 @@ Output: - A pruned JSON that preserves the original structure but keeps only selected items with `enrichment_query`. Name/description fields are omitted. -## 2) fetch_enrichment_data.py +## 2) metadata_enricher_fetch.py Uses Gemini CLI web search to populate `enriched_description` for each selected item. CLI usage: ``` -python tools/agentic_import/sdmx/fetch_enrichment_data.py \ +python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ --input_items_json="/path/to/items_to_enrich.json" \ --dataset_prefix="oecd_prices" \ --output_path="/path/to/enriched_items.json" \ @@ -39,12 +39,12 @@ Output: - A pruned JSON in the same structure as the input, with `enriched_description` added and `enrichment_query` removed. -## 3) sdmx_enrichment_merge.py +## 3) metadata_enricher_merge.py Merges `enriched_description` into the base metadata JSON. 
CLI usage: ``` -python tools/agentic_import/sdmx/sdmx_enrichment_merge.py \ +python tools/agentic_import/sdmx/metadata_enricher_merge.py \ --input_metadata_json="/path/to/metadata.json" \ --input_enriched_items_json="/path/to/enriched_items.json" \ --output_path="/path/to/metadata_enriched.json" diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/metadata_enricher_fetch.py similarity index 98% rename from tools/agentic_import/sdmx/fetch_enrichment_data.py rename to tools/agentic_import/sdmx/metadata_enricher_fetch.py index 22e3e89b55..1fdaee6684 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch.py @@ -172,14 +172,14 @@ def _resolve_path(self, path: str) -> Path: def _generate_prompt(self) -> Path: template_dir = os.path.join(_SCRIPT_DIR, 'templates') env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('fetch_enrichment_data_prompt.j2') + template = env.get_template('metadata_enricher_fetch_prompt.j2') rendered_prompt = template.render( input_items_abs=str(self._input_path), output_path_abs=str(self._output_path), ) - output_file = self._run_dir / 'fetch_enrichment_data_prompt.md' + output_file = self._run_dir / 'metadata_enricher_fetch_prompt.md' with open(output_file, 'w') as f: f.write(rendered_prompt) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/metadata_enricher_fetch_test.py similarity index 97% rename from tools/agentic_import/sdmx/fetch_enrichment_data_test.py rename to tools/agentic_import/sdmx/metadata_enricher_fetch_test.py index fb1b1609c3..494381f72e 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch_test.py @@ -23,7 +23,7 @@ from jinja2 import Template -from tools.agentic_import.sdmx.fetch_enrichment_data import ( +from tools.agentic_import.sdmx.metadata_enricher_fetch import ( Config, EnrichmentDataFetcher) diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/metadata_enricher_find.py similarity index 98% rename from tools/agentic_import/sdmx/find_enrichment_items.py rename to tools/agentic_import/sdmx/metadata_enricher_find.py index 8807b397ae..e9c610eca6 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find.py @@ -172,14 +172,14 @@ def _resolve_path(self, path: str) -> Path: def _generate_prompt(self) -> Path: template_dir = os.path.join(_SCRIPT_DIR, 'templates') env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('find_enrichment_items_prompt.j2') + template = env.get_template('metadata_enricher_find_prompt.j2') rendered_prompt = template.render( input_metadata_abs=str(self._input_path), output_path_abs=str(self._output_path), ) - output_file = self._run_dir / 'find_enrichment_items_prompt.md' + output_file = self._run_dir / 'metadata_enricher_find_prompt.md' with open(output_file, 'w') as f: f.write(rendered_prompt) diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/metadata_enricher_find_test.py similarity index 97% rename from tools/agentic_import/sdmx/find_enrichment_items_test.py rename to tools/agentic_import/sdmx/metadata_enricher_find_test.py index 3c0865178f..4f45a227cf 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find_test.py @@ -22,7 +22,7 @@ 
from jinja2 import Template -from tools.agentic_import.sdmx.find_enrichment_items import ( +from tools.agentic_import.sdmx.metadata_enricher_find import ( Config, EnrichmentItemsFinder) diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py similarity index 100% rename from tools/agentic_import/sdmx/sdmx_enrichment_merge.py rename to tools/agentic_import/sdmx/metadata_enricher_merge.py diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py similarity index 95% rename from tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py rename to tools/agentic_import/sdmx/metadata_enricher_merge_test.py index 6b424a3e54..d9670efbd5 100644 --- a/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py @@ -22,7 +22,7 @@ from deepdiff.diff import DeepDiff -from tools.agentic_import.sdmx.sdmx_enrichment_merge import merge_enrichment +from tools.agentic_import.sdmx.metadata_enricher_merge import merge_enrichment _TESTDATA_DIR = Path(os.path.dirname(__file__)) / 'testdata' _BASE_JSON = _TESTDATA_DIR / 'sample_metadata.json' diff --git a/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 b/tools/agentic_import/sdmx/templates/metadata_enricher_fetch_prompt.j2 similarity index 100% rename from tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 rename to tools/agentic_import/sdmx/templates/metadata_enricher_fetch_prompt.j2 diff --git a/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 b/tools/agentic_import/sdmx/templates/metadata_enricher_find_prompt.j2 similarity index 100% rename from tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 rename to tools/agentic_import/sdmx/templates/metadata_enricher_find_prompt.j2 From 7c85eff8fb33cb66ed0ba68fb3e5b4da4ad125ce Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 16:12:50 +0000 Subject: [PATCH 11/15] Extract gemini prompt runner Refactor SDMX metadata scripts to reuse runner --- .../common/gemini_prompt_runner.py | 186 ++++++++++++++++++ .../sdmx/metadata_enricher_fetch.py | 146 ++++---------- .../sdmx/metadata_enricher_find.py | 146 ++++---------- 3 files changed, 252 insertions(+), 226 deletions(-) create mode 100644 tools/agentic_import/common/gemini_prompt_runner.py diff --git a/tools/agentic_import/common/gemini_prompt_runner.py b/tools/agentic_import/common/gemini_prompt_runner.py new file mode 100644 index 0000000000..19ae073d8f --- /dev/null +++ b/tools/agentic_import/common/gemini_prompt_runner.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
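A usage sketch for the runner this module defines (the dataset prefix, template name, and output path below are placeholders, not part of the API):

```
from pathlib import Path

from tools.agentic_import.common.gemini_prompt_runner import GeminiPromptRunner

# dry_run=True renders the prompt and builds the command without
# invoking the Gemini CLI.
runner = GeminiPromptRunner(dataset_prefix='my_dataset', dry_run=True)
prompt = runner.render_prompt(template_dir=Path('templates'),
                              template_name='my_prompt.j2',
                              context={'output_path_abs': '/tmp/out.json'},
                              prompt_filename='my_prompt.md')
result = runner.run(prompt)
print(result.gemini_command)
```
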
+ +import shutil +import subprocess +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Callable, Mapping, Optional + +from absl import logging +from jinja2 import Environment, FileSystemLoader + + +@dataclass +class GeminiRunResult: + run_id: str + run_dir: Path + prompt_path: Path + gemini_log_path: Path + gemini_command: str + sandbox_enabled: bool + + +class GeminiPromptRunner: + + def __init__(self, + dataset_prefix: str, + working_dir: Optional[str] = None, + run_root: str = '.datacommons/runs', + dry_run: bool = False, + skip_confirmation: bool = False, + enable_sandboxing: bool = False, + gemini_cli: Optional[str] = None): + self._working_dir = Path( + working_dir).resolve() if working_dir else Path.cwd() + self._dataset_prefix = (dataset_prefix or '').strip() + if not self._dataset_prefix: + raise ValueError("dataset_prefix must be a non-empty string.") + + self._run_root = run_root + self._dry_run = dry_run + self._skip_confirmation = skip_confirmation + self._enable_sandboxing = enable_sandboxing + self._gemini_cli = gemini_cli + + self._run_id = self._build_run_id() + self._run_dir = self._create_run_dir() + + @property + def run_id(self) -> str: + return self._run_id + + @property + def run_dir(self) -> Path: + return self._run_dir + + @property + def working_dir(self) -> Path: + return self._working_dir + + def render_prompt(self, template_dir: Path, template_name: str, + context: Mapping[str, str], prompt_filename: str) -> Path: + # If other LLM runners are added later, extract rendering into a separate utility. + env = Environment(loader=FileSystemLoader(str(template_dir))) + template = env.get_template(template_name) + + rendered_prompt = template.render(**context) + output_file = self._run_dir / prompt_filename + with open(output_file, 'w') as f: + f.write(rendered_prompt) + + logging.info("Generated prompt written to: %s", output_file) + return output_file + + def run(self, + prompt_file: Path, + log_filename: str = 'gemini_cli.log', + log_path_override: Optional[Path] = None, + confirm_fn: Optional[Callable[[Path], bool]] = None, + cancel_log_message: Optional[str] = None) -> GeminiRunResult: + gemini_log_path = (log_path_override.resolve() if log_path_override else + (self._run_dir / log_filename)) + gemini_command = self._build_gemini_command(prompt_file, + gemini_log_path) + + result = GeminiRunResult(run_id=self._run_id, + run_dir=self._run_dir, + prompt_path=prompt_file, + gemini_log_path=gemini_log_path, + gemini_command=gemini_command, + sandbox_enabled=self._enable_sandboxing) + + if self._dry_run: + logging.info( + "Dry run mode: Prompt file generated at %s. " + "Skipping Gemini CLI execution.", prompt_file) + return result + + if not self._skip_confirmation and confirm_fn is not None: + if not confirm_fn(prompt_file): + if cancel_log_message: + logging.info(cancel_log_message) + return result + + if not self._check_gemini_cli_available(): + logging.warning( + "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." 
+ ) + + logging.info("Launching gemini (cwd: %s): %s", self._working_dir, + gemini_command) + logging.info("Gemini output will be saved to: %s", gemini_log_path) + + exit_code = self._run_subprocess(gemini_command) + if exit_code == 0: + logging.info("Gemini CLI completed successfully") + return result + + raise RuntimeError( + f"Gemini CLI execution failed with exit code {exit_code}") + + def _build_run_id(self) -> str: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{self._dataset_prefix}_gemini_{timestamp}" + + def _create_run_dir(self) -> Path: + run_root = Path(self._run_root).expanduser() + if not run_root.is_absolute(): + run_root = self._working_dir / run_root + run_root.mkdir(parents=True, exist_ok=True) + + run_dir = run_root / self._run_id + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + def _check_gemini_cli_available(self) -> bool: + if self._gemini_cli: + return True + return shutil.which('gemini') is not None + + def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: + prompt_path = prompt_file.resolve() + log_path = log_file.resolve() + gemini_cmd = self._gemini_cli or 'gemini' + sandbox_flag = "--sandbox" if self._enable_sandboxing else "" + return ( + f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" + ) + + def _run_subprocess(self, command: str) -> int: + try: + process = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) + + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.rstrip()) + + return process.wait() + except Exception as e: + logging.error("Error running subprocess: %s", str(e)) + return 1 diff --git a/tools/agentic_import/sdmx/metadata_enricher_fetch.py b/tools/agentic_import/sdmx/metadata_enricher_fetch.py index 1fdaee6684..505184b6d8 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_fetch.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch.py @@ -16,20 +16,24 @@ import os import platform -import shutil -import subprocess +import sys from dataclasses import dataclass -from datetime import datetime from pathlib import Path from typing import Optional +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = Path(_SCRIPT_DIR).resolve().parents[3] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + from absl import app from absl import flags from absl import logging -from jinja2 import Environment, FileSystemLoader + +from tools.agentic_import.common.gemini_prompt_runner import ( + GeminiPromptRunner, GeminiRunResult) _FLAGS = flags.FLAGS -_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) def _define_flags(): @@ -85,16 +89,6 @@ class Config: working_dir: Optional[str] = None -@dataclass -class RunResult: - run_id: str - run_dir: Path - prompt_path: Path - gemini_log_path: Path - gemini_command: str - sandbox_enabled: bool - - class EnrichmentDataFetcher: def __init__(self, config: Config): @@ -114,54 +108,23 @@ def __init__(self, config: Config): self._output_path.parent.mkdir(parents=True, exist_ok=True) - self._datacommons_dir = self._working_dir / '.datacommons' - self._datacommons_dir.mkdir(parents=True, exist_ok=True) - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" - self._run_dir = self._datacommons_dir / 'runs' / 
self._run_id - self._run_dir.mkdir(parents=True, exist_ok=True) + self._runner = GeminiPromptRunner( + dataset_prefix=self._dataset_prefix, + working_dir=str(self._working_dir), + dry_run=config.dry_run, + skip_confirmation=config.skip_confirmation, + enable_sandboxing=config.enable_sandboxing, + gemini_cli=config.gemini_cli, + ) - def fetch_enrichment_data(self) -> RunResult: + def fetch_enrichment_data(self) -> GeminiRunResult: prompt_file = self._generate_prompt() - gemini_log_file = self._run_dir / 'gemini_cli.log' - gemini_command = self._build_gemini_command(prompt_file, - gemini_log_file) - - result = RunResult(run_id=self._run_id, - run_dir=self._run_dir, - prompt_path=prompt_file, - gemini_log_path=gemini_log_file, - gemini_command=gemini_command, - sandbox_enabled=self._config.enable_sandboxing) - - if self._config.dry_run: - logging.info( - "Dry run mode: Prompt file generated at %s. " - "Skipping Gemini CLI execution.", prompt_file) - return result - - if not self._config.skip_confirmation: - if not self._get_user_confirmation(prompt_file): - logging.info("Enrichment data fetch cancelled by user.") - return result - - if not self._check_gemini_cli_available(): - logging.warning( - "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." - ) - - logging.info("Launching gemini (cwd: %s): %s", self._working_dir, - gemini_command) - logging.info("Gemini output will be saved to: %s", gemini_log_file) - - exit_code = self._run_subprocess(gemini_command) - if exit_code == 0: - logging.info("Gemini CLI completed successfully") - return result - - raise RuntimeError( - f"Gemini CLI execution failed with exit code {exit_code}") + return self._runner.run( + prompt_file, + log_filename='gemini_cli.log', + confirm_fn=self._get_user_confirmation, + cancel_log_message="Enrichment data fetch cancelled by user.", + ) def _resolve_path(self, path: str) -> Path: resolved = Path(path).expanduser() @@ -170,22 +133,17 @@ def _resolve_path(self, path: str) -> Path: return resolved.resolve() def _generate_prompt(self) -> Path: - template_dir = os.path.join(_SCRIPT_DIR, 'templates') - env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('metadata_enricher_fetch_prompt.j2') - - rendered_prompt = template.render( - input_items_abs=str(self._input_path), - output_path_abs=str(self._output_path), + template_dir = Path(_SCRIPT_DIR) / 'templates' + return self._runner.render_prompt( + template_dir=template_dir, + template_name='metadata_enricher_fetch_prompt.j2', + context={ + "input_items_abs": str(self._input_path), + "output_path_abs": str(self._output_path), + }, + prompt_filename='metadata_enricher_fetch_prompt.md', ) - output_file = self._run_dir / 'metadata_enricher_fetch_prompt.md' - with open(output_file, 'w') as f: - f.write(rendered_prompt) - - logging.info("Generated prompt written to: %s", output_file) - return output_file - def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\n" + "=" * 60) print("SDMX ENRICHMENT DATA FETCH SUMMARY") @@ -218,44 +176,6 @@ def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\nData fetch cancelled by user.") return False - def _check_gemini_cli_available(self) -> bool: - if self._config.gemini_cli: - return True - return shutil.which('gemini') is not None - - def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: - prompt_path = prompt_file.resolve() - log_path = log_file.resolve() - gemini_cmd = self._config.gemini_cli or 'gemini' - sandbox_flag = 
"--sandbox" if self._config.enable_sandboxing else "" - return ( - f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" - ) - - def _run_subprocess(self, command: str) -> int: - try: - process = subprocess.Popen(command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) - - while True: - output = process.stdout.readline() - if output == '' and process.poll() is not None: - break - if output: - print(output.rstrip()) - - return process.wait() - except Exception as e: - logging.error("Error running subprocess: %s", str(e)) - return 1 - def prepare_config() -> Config: return Config(input_items_json=_FLAGS.input_items_json, diff --git a/tools/agentic_import/sdmx/metadata_enricher_find.py b/tools/agentic_import/sdmx/metadata_enricher_find.py index e9c610eca6..222c7eda13 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_find.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find.py @@ -16,20 +16,24 @@ import os import platform -import shutil -import subprocess +import sys from dataclasses import dataclass -from datetime import datetime from pathlib import Path from typing import Optional +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = Path(_SCRIPT_DIR).resolve().parents[3] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + from absl import app from absl import flags from absl import logging -from jinja2 import Environment, FileSystemLoader + +from tools.agentic_import.common.gemini_prompt_runner import ( + GeminiPromptRunner, GeminiRunResult) _FLAGS = flags.FLAGS -_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) def _define_flags(): @@ -85,16 +89,6 @@ class Config: working_dir: Optional[str] = None -@dataclass -class RunResult: - run_id: str - run_dir: Path - prompt_path: Path - gemini_log_path: Path - gemini_command: str - sandbox_enabled: bool - - class EnrichmentItemsFinder: def __init__(self, config: Config): @@ -114,54 +108,23 @@ def __init__(self, config: Config): self._output_path.parent.mkdir(parents=True, exist_ok=True) - self._datacommons_dir = self._working_dir / '.datacommons' - self._datacommons_dir.mkdir(parents=True, exist_ok=True) - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" - self._run_dir = self._datacommons_dir / 'runs' / self._run_id - self._run_dir.mkdir(parents=True, exist_ok=True) + self._runner = GeminiPromptRunner( + dataset_prefix=self._dataset_prefix, + working_dir=str(self._working_dir), + dry_run=config.dry_run, + skip_confirmation=config.skip_confirmation, + enable_sandboxing=config.enable_sandboxing, + gemini_cli=config.gemini_cli, + ) - def find_items_to_enrich(self) -> RunResult: + def find_items_to_enrich(self) -> GeminiRunResult: prompt_file = self._generate_prompt() - gemini_log_file = self._run_dir / 'gemini_cli.log' - gemini_command = self._build_gemini_command(prompt_file, - gemini_log_file) - - result = RunResult(run_id=self._run_id, - run_dir=self._run_dir, - prompt_path=prompt_file, - gemini_log_path=gemini_log_file, - gemini_command=gemini_command, - sandbox_enabled=self._config.enable_sandboxing) - - if self._config.dry_run: - logging.info( - "Dry run mode: Prompt file generated at %s. 
" - "Skipping Gemini CLI execution.", prompt_file) - return result - - if not self._config.skip_confirmation: - if not self._get_user_confirmation(prompt_file): - logging.info("Enrichment item selection cancelled by user.") - return result - - if not self._check_gemini_cli_available(): - logging.warning( - "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." - ) - - logging.info("Launching gemini (cwd: %s): %s", self._working_dir, - gemini_command) - logging.info("Gemini output will be saved to: %s", gemini_log_file) - - exit_code = self._run_subprocess(gemini_command) - if exit_code == 0: - logging.info("Gemini CLI completed successfully") - return result - - raise RuntimeError( - f"Gemini CLI execution failed with exit code {exit_code}") + return self._runner.run( + prompt_file, + log_filename='gemini_cli.log', + confirm_fn=self._get_user_confirmation, + cancel_log_message="Enrichment item selection cancelled by user.", + ) def _resolve_path(self, path: str) -> Path: resolved = Path(path).expanduser() @@ -170,22 +133,17 @@ def _resolve_path(self, path: str) -> Path: return resolved.resolve() def _generate_prompt(self) -> Path: - template_dir = os.path.join(_SCRIPT_DIR, 'templates') - env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('metadata_enricher_find_prompt.j2') - - rendered_prompt = template.render( - input_metadata_abs=str(self._input_path), - output_path_abs=str(self._output_path), + template_dir = Path(_SCRIPT_DIR) / 'templates' + return self._runner.render_prompt( + template_dir=template_dir, + template_name='metadata_enricher_find_prompt.j2', + context={ + "input_metadata_abs": str(self._input_path), + "output_path_abs": str(self._output_path), + }, + prompt_filename='metadata_enricher_find_prompt.md', ) - output_file = self._run_dir / 'metadata_enricher_find_prompt.md' - with open(output_file, 'w') as f: - f.write(rendered_prompt) - - logging.info("Generated prompt written to: %s", output_file) - return output_file - def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\n" + "=" * 60) print("SDMX ENRICHMENT ITEM SELECTION SUMMARY") @@ -218,44 +176,6 @@ def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\nSelection cancelled by user.") return False - def _check_gemini_cli_available(self) -> bool: - if self._config.gemini_cli: - return True - return shutil.which('gemini') is not None - - def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: - prompt_path = prompt_file.resolve() - log_path = log_file.resolve() - gemini_cmd = self._config.gemini_cli or 'gemini' - sandbox_flag = "--sandbox" if self._config.enable_sandboxing else "" - return ( - f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" - ) - - def _run_subprocess(self, command: str) -> int: - try: - process = subprocess.Popen(command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) - - while True: - output = process.stdout.readline() - if output == '' and process.poll() is not None: - break - if output: - print(output.rstrip()) - - return process.wait() - except Exception as e: - logging.error("Error running subprocess: %s", str(e)) - return 1 - def prepare_config() -> Config: return Config(input_metadata_json=_FLAGS.input_metadata_json, From e50a67ae77149561bbe8c871c6cf8ea2700e7cc1 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 21 Jan 2026 
04:04:46 +0000 Subject: [PATCH 12/15] Add json merge helper and tests Use insert-only merge for SDMX fixtures --- tools/agentic_import/common/json_merge.py | 152 ++++++++++++++++++ .../agentic_import/common/json_merge_test.py | 150 +++++++++++++++++ .../sdmx/metadata_enricher_merge.py | 108 +------------ .../sdmx/testdata/sample_enriched_items.json | 25 +++ .../sdmx/testdata/sample_metadata.json | 19 ++- .../sample_metadata_enriched_expected.json | 29 ++++ 6 files changed, 380 insertions(+), 103 deletions(-) create mode 100644 tools/agentic_import/common/json_merge.py create mode 100644 tools/agentic_import/common/json_merge_test.py diff --git a/tools/agentic_import/common/json_merge.py b/tools/agentic_import/common/json_merge.py new file mode 100644 index 0000000000..fb9e142bf8 --- /dev/null +++ b/tools/agentic_import/common/json_merge.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List + +from absl import logging + + +def merge_json(base: Any, + incoming: Any, + key_field: str = 'id', + allow_overwrite: bool = False) -> Any: + """Merges incoming JSON into base, mutating base where possible.""" + return _merge_value(base, + incoming, + key_field=key_field, + allow_overwrite=allow_overwrite, + path='') + + +def _merge_value(base: Any, incoming: Any, key_field: str, + allow_overwrite: bool, path: str) -> Any: + # Dispatch by type to preserve structure and scope merges correctly. + if isinstance(base, dict) and isinstance(incoming, dict): + return _merge_dict(base, + incoming, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + if isinstance(base, list) and isinstance(incoming, list): + return _merge_list(base, + incoming, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + return _merge_leaf(base, incoming, allow_overwrite, path) + + +def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], key_field: str, + allow_overwrite: bool, path: str) -> Dict[str, Any]: + for key, incoming_value in incoming.items(): + next_path = _join_path(path, key) + if key not in base: + base[key] = incoming_value + continue + + base_value = base[key] + base[key] = _merge_value(base_value, + incoming_value, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=next_path) + return base + + +def _merge_list(base: List[Any], incoming: List[Any], key_field: str, + allow_overwrite: bool, path: str) -> List[Any]: + # Keep base ordering; append unmatched items to avoid data loss. + # Build a keyed index for scoped merges inside this list. + base_by_key: Dict[Any, Dict[str, Any]] = {} + for index, item in enumerate(base): + if not isinstance(item, dict): + logging.warning( + f"Base list item at {path}[index={index}] is not a dict; skipping keyed merge." + ) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Base list item at {path}[index={index}] missing key '{key_field}'; skipping keyed merge." 
+ ) + continue + if key_value in base_by_key: + logging.warning( + f"Duplicate key '{key_value}' in base list at {path}; using first occurrence." + ) + continue + base_by_key[key_value] = item + + seen_incoming_keys = set() + # Merge incoming items by key; append when a match is not possible. + for index, item in enumerate(incoming): + if not isinstance(item, dict): + logging.warning( + f"Incoming list item at {path}[index={index}] is not a dict; appending." + ) + base.append(item) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Incoming list item at {path}[index={index}] missing key '{key_field}'; appending." + ) + base.append(item) + continue + if key_value in seen_incoming_keys: + logging.warning( + f"Duplicate key '{key_value}' in incoming list at {path}; merging again." + ) + seen_incoming_keys.add(key_value) + + base_item = base_by_key.get(key_value) + if base_item is None: + base.append(item) + base_by_key[key_value] = item + continue + + item_path = _list_item_path(path, key_field, key_value) + _merge_dict(base_item, + item, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=item_path) + return base + + +def _merge_leaf(base: Any, incoming: Any, allow_overwrite: bool, + path: str) -> Any: + # Leaf values follow the overwrite policy to avoid accidental data loss. + if allow_overwrite: + if base != incoming: + logging.warning( + f"Overwriting value at {path} from {base!r} to {incoming!r}.") + return incoming + + if base != incoming: + logging.warning( + f"Preserving base value at {path}; incoming value ignored.") + return base + + +def _join_path(path: str, key: str) -> str: + if not path: + return key + return f"{path}.{key}" + + +def _list_item_path(path: str, key_field: str, key_value: Any) -> str: + return f"{path}[{key_field}={key_value}]" diff --git a/tools/agentic_import/common/json_merge_test.py b/tools/agentic_import/common/json_merge_test.py new file mode 100644 index 0000000000..e13452fa9b --- /dev/null +++ b/tools/agentic_import/common/json_merge_test.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
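In brief, the merge semantics these tests exercise, as a worked example (values are illustrative):

```
from tools.agentic_import.common.json_merge import merge_json

base = {"a": 1, "items": [{"id": "x"}]}
incoming = {"a": 2, "b": 3, "items": [{"id": "x", "d": 4}, {"id": "y"}]}

# Insert-only by default: "a" keeps its base value, "b" is added,
# keyed list items merge by "id", and unmatched items are appended.
merge_json(base, incoming)
assert base == {"a": 1, "b": 3, "items": [{"id": "x", "d": 4}, {"id": "y"}]}
```
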
+ +import unittest + +from tools.agentic_import.common.json_merge import merge_json + + +class JsonMergeTest(unittest.TestCase): + + def test_insert_only_preserves_existing_leaf_values(self) -> None: + base = {"a": 1, "b": {"c": 2}} + incoming = {"a": 3, "b": {"d": 4}} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(merged["a"], 1) + self.assertEqual(merged["b"]["c"], 2) + self.assertEqual(merged["b"]["d"], 4) + + def test_allow_overwrite_updates_leaf_values(self) -> None: + base = {"a": 1} + incoming = {"a": 2} + + merged = merge_json(base, incoming, allow_overwrite=True) + + self.assertEqual(merged["a"], 2) + + def test_insert_only_keeps_existing_name_and_adds_new_fields(self) -> None: + base = {"item": {"name": "Base"}} + incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(merged["item"]["name"], "Base") + self.assertEqual(merged["item"]["enriched_description"], "New") + + def test_keyed_list_merge_respects_hierarchy(self) -> None: + base = { + "codelists": [ + { + "id": "CL1", + "codes": [{ + "id": "A" + },], + }, + { + "id": "CL2", + "codes": [{ + "id": "A" + },], + }, + ] + } + incoming = { + "codelists": [ + { + "id": + "CL1", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL1", + },], + }, + { + "id": + "CL2", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL2", + },], + }, + ] + } + + merged = merge_json(base, incoming, allow_overwrite=True) + + cl1_code = merged["codelists"][0]["codes"][0] + cl2_code = merged["codelists"][1]["codes"][0] + self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") + self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") + + def test_keyed_list_merge_with_custom_key(self) -> None: + base = {"items": [{"code": "X", "value": 1}]} + incoming = { + "items": [ + { + "code": "X", + "extra": 2 + }, + { + "code": "Y", + "value": 3 + }, + ] + } + + merged = merge_json(base, incoming, key_field="code") + + self.assertEqual(merged["items"][0]["extra"], 2) + self.assertEqual(merged["items"][1]["code"], "Y") + + def test_type_mismatch_respects_overwrite_policy(self) -> None: + base_keep = {"a": {"b": 1}} + base_replace = {"a": {"b": 1}} + incoming = {"a": [1, 2]} + + merged_keep = merge_json(base_keep, incoming, allow_overwrite=False) + merged_replace = merge_json(base_replace, + incoming, + allow_overwrite=True) + + self.assertEqual(merged_keep["a"], {"b": 1}) + self.assertEqual(merged_replace["a"], [1, 2]) + + def test_list_items_without_key_are_appended(self) -> None: + base = {"items": [{"id": "x"}]} + incoming = {"items": [{"name": "no_id"}]} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(len(merged["items"]), 2) + self.assertEqual(merged["items"][1]["name"], "no_id") + + def test_base_items_without_key_do_not_block_append(self) -> None: + base = {"items": [{"name": "base-only"}]} + incoming = {"items": [{"id": "x", "value": 1}]} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(len(merged["items"]), 2) + self.assertEqual(merged["items"][0]["name"], "base-only") + self.assertEqual(merged["items"][1]["id"], "x") + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py index 14320dbf99..65296f71e1 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge.py +++ 
b/tools/agentic_import/sdmx/metadata_enricher_merge.py @@ -15,14 +15,15 @@ # limitations under the License. import json -from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict from absl import app from absl import flags from absl import logging +from tools.agentic_import.common.json_merge import merge_json + _FLAGS = flags.FLAGS @@ -43,104 +44,6 @@ def _define_flags(): pass -@dataclass(frozen=True) -class MergeTarget: - path: str - match_key: str = 'id' - - -_MERGE_TARGETS = [ - MergeTarget('dataflows'), - MergeTarget('dataflows.data_structure_definition.dimensions'), - MergeTarget('dataflows.data_structure_definition.attributes'), - MergeTarget('dataflows.data_structure_definition.measures'), - MergeTarget('dataflows.data_structure_definition.dimensions.concept'), - MergeTarget('dataflows.data_structure_definition.attributes.concept'), - MergeTarget('dataflows.data_structure_definition.measures.concept'), - MergeTarget( - 'dataflows.data_structure_definition.dimensions.representation.codelist.codes' - ), - MergeTarget( - 'dataflows.data_structure_definition.attributes.representation.codelist.codes' - ), - MergeTarget( - 'dataflows.data_structure_definition.measures.representation.codelist.codes' - ), - MergeTarget('dataflows.referenced_concept_schemes'), - MergeTarget('dataflows.referenced_concept_schemes.concepts'), -] - - -class EnrichmentMerger: - - def __init__(self, base_data: Dict[str, Any], enriched_data: Dict[str, - Any]): - self._base = base_data - self._enriched = enriched_data - self._targets = _MERGE_TARGETS - - def merge(self) -> Dict[str, Any]: - self._merge_targets() - return self._base - - def _merge_targets(self) -> None: - for target in self._targets: - base_nodes = list(self._find_nodes(self._base, target.path)) - enriched_nodes = list(self._find_nodes(self._enriched, target.path)) - if not base_nodes and enriched_nodes: - logging.warning( - "Enriched data has path '%s' not present in base JSON", - target.path) - continue - - base_by_key = { - node.get(target.match_key): node - for node in base_nodes - if isinstance(node, dict) and node.get(target.match_key) - } - for enriched_node in enriched_nodes: - if not isinstance(enriched_node, dict): - continue - match_value = enriched_node.get(target.match_key) - if not match_value: - continue - base_node = base_by_key.get(match_value) - if not base_node: - logging.warning("No base match for %s='%s' at path '%s'", - target.match_key, match_value, target.path) - continue - self._merge_node(base_node, enriched_node, target.path) - - def _merge_node(self, base_node: Dict[str, Any], - enriched_node: Dict[str, Any], path: str) -> None: - if 'enriched_description' in enriched_node: - if 'enriched_description' in base_node: - logging.warning("Overwriting enriched_description at %s id=%s", - path, base_node.get('id')) - base_node['enriched_description'] = enriched_node[ - 'enriched_description'] - - def _find_nodes(self, data: Dict[str, Any], - path: str) -> Iterable[Dict[str, Any]]: - parts = path.split('.') - current = [data] - for part in parts: - next_level = [] - for node in current: - if not isinstance(node, dict): - continue - value = node.get(part) - if isinstance(value, list): - next_level.extend( - [item for item in value if isinstance(item, dict)]) - elif isinstance(value, dict): - next_level.append(value) - current = next_level - if not current: - break - return current - - def _load_json(path: Path) -> Dict[str, Any]: with open(path, 
'r') as f: return json.load(f) @@ -155,7 +58,10 @@ def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, output_path: str) -> None: base_data = _load_json(Path(input_metadata_json)) enriched_data = _load_json(Path(input_enriched_items_json)) - merged = EnrichmentMerger(base_data, enriched_data).merge() + merged = merge_json(base_data, + enriched_data, + key_field='id', + allow_overwrite=False) _write_json(Path(output_path), merged) diff --git a/tools/agentic_import/sdmx/testdata/sample_enriched_items.json b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json index 6fad7d28ad..d566992a50 100644 --- a/tools/agentic_import/sdmx/testdata/sample_enriched_items.json +++ b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json @@ -14,18 +14,43 @@ }, "representation": { "codelist": { + "id": "CL1", + "name": "Enriched Codelist One", "codes": [ { "id": "CODE1", + "name": "Enriched Code1 CL1", "enriched_description": "Code 1 enriched" }, { "id": "CODE2", + "name": "Enriched Code2 CL1", "enriched_description": "Code 2 enriched" } ] } } + }, + { + "id": "DIM2", + "enriched_description": "Dimension two enriched", + "concept": { + "id": "C4", + "enriched_description": "Concept C4 enriched" + }, + "representation": { + "codelist": { + "id": "CL2", + "name": "Enriched Codelist Two", + "codes": [ + { + "id": "CODE1", + "name": "Enriched Code1 CL2", + "enriched_description": "Code 1 enriched CL2" + } + ] + } + } } ], "attributes": [ diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata.json b/tools/agentic_import/sdmx/testdata/sample_metadata.json index e121b00df6..5770feaf3a 100644 --- a/tools/agentic_import/sdmx/testdata/sample_metadata.json +++ b/tools/agentic_import/sdmx/testdata/sample_metadata.json @@ -10,9 +10,24 @@ "concept": {"id": "C1"}, "representation": { "codelist": { + "id": "CL1", + "name": "Base Codelist One", "codes": [ - {"id": "CODE1"}, - {"id": "CODE2"} + {"id": "CODE1", "name": "Base Code1 CL1"}, + {"id": "CODE2", "name": "Base Code2 CL1"} + ] + } + } + }, + { + "id": "DIM2", + "concept": {"id": "C4"}, + "representation": { + "codelist": { + "id": "CL2", + "name": "Base Codelist Two", + "codes": [ + {"id": "CODE1", "name": "Base Code1 CL2"} ] } } diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json index 1828b0d023..76b33b8c2b 100644 --- a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json +++ b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json @@ -14,19 +14,44 @@ }, "representation": { "codelist": { + "id": "CL1", + "name": "Base Codelist One", "codes": [ { "id": "CODE1", + "name": "Base Code1 CL1", "enriched_description": "Code 1 enriched" }, { "id": "CODE2", + "name": "Base Code2 CL1", "enriched_description": "Code 2 enriched" } ] } }, "enriched_description": "Dimension enriched" + }, + { + "id": "DIM2", + "concept": { + "id": "C4", + "enriched_description": "Concept C4 enriched" + }, + "representation": { + "codelist": { + "id": "CL2", + "name": "Base Codelist Two", + "codes": [ + { + "id": "CODE1", + "name": "Base Code1 CL2", + "enriched_description": "Code 1 enriched CL2" + } + ] + } + }, + "enriched_description": "Dimension two enriched" } ], "attributes": [ @@ -80,6 +105,10 @@ { "id": "DF2", "name": "Flow Two" + }, + { + "id": "DF3", + "enriched_description": "No base match" } ] } From 9ffdfa697ef7b3fa21be4b31d8153b3fb4c8cb75 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: 
Wed, 21 Jan 2026 06:03:30 +0000 Subject: [PATCH 13/15] Add field-whitelist JSON merge helper Rename module/tests and update SDMX merge usage --- .../agentic_import/common/json_merge_test.py | 150 --------------- .../{json_merge.py => merge_json_fields.py} | 95 +++++---- .../common/merge_json_fields_test.py | 182 ++++++++++++++++++ .../sdmx/metadata_enricher_merge.py | 11 +- .../sample_metadata_enriched_expected.json | 4 - 5 files changed, 249 insertions(+), 193 deletions(-) delete mode 100644 tools/agentic_import/common/json_merge_test.py rename tools/agentic_import/common/{json_merge.py => merge_json_fields.py} (60%) create mode 100644 tools/agentic_import/common/merge_json_fields_test.py diff --git a/tools/agentic_import/common/json_merge_test.py b/tools/agentic_import/common/json_merge_test.py deleted file mode 100644 index e13452fa9b..0000000000 --- a/tools/agentic_import/common/json_merge_test.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from tools.agentic_import.common.json_merge import merge_json - - -class JsonMergeTest(unittest.TestCase): - - def test_insert_only_preserves_existing_leaf_values(self) -> None: - base = {"a": 1, "b": {"c": 2}} - incoming = {"a": 3, "b": {"d": 4}} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(merged["a"], 1) - self.assertEqual(merged["b"]["c"], 2) - self.assertEqual(merged["b"]["d"], 4) - - def test_allow_overwrite_updates_leaf_values(self) -> None: - base = {"a": 1} - incoming = {"a": 2} - - merged = merge_json(base, incoming, allow_overwrite=True) - - self.assertEqual(merged["a"], 2) - - def test_insert_only_keeps_existing_name_and_adds_new_fields(self) -> None: - base = {"item": {"name": "Base"}} - incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(merged["item"]["name"], "Base") - self.assertEqual(merged["item"]["enriched_description"], "New") - - def test_keyed_list_merge_respects_hierarchy(self) -> None: - base = { - "codelists": [ - { - "id": "CL1", - "codes": [{ - "id": "A" - },], - }, - { - "id": "CL2", - "codes": [{ - "id": "A" - },], - }, - ] - } - incoming = { - "codelists": [ - { - "id": - "CL1", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL1", - },], - }, - { - "id": - "CL2", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL2", - },], - }, - ] - } - - merged = merge_json(base, incoming, allow_overwrite=True) - - cl1_code = merged["codelists"][0]["codes"][0] - cl2_code = merged["codelists"][1]["codes"][0] - self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") - self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") - - def test_keyed_list_merge_with_custom_key(self) -> None: - base = {"items": [{"code": "X", "value": 1}]} - incoming = { - "items": [ - { - "code": "X", - "extra": 2 - }, - { - "code": "Y", - "value": 3 - }, - 
] - } - - merged = merge_json(base, incoming, key_field="code") - - self.assertEqual(merged["items"][0]["extra"], 2) - self.assertEqual(merged["items"][1]["code"], "Y") - - def test_type_mismatch_respects_overwrite_policy(self) -> None: - base_keep = {"a": {"b": 1}} - base_replace = {"a": {"b": 1}} - incoming = {"a": [1, 2]} - - merged_keep = merge_json(base_keep, incoming, allow_overwrite=False) - merged_replace = merge_json(base_replace, - incoming, - allow_overwrite=True) - - self.assertEqual(merged_keep["a"], {"b": 1}) - self.assertEqual(merged_replace["a"], [1, 2]) - - def test_list_items_without_key_are_appended(self) -> None: - base = {"items": [{"id": "x"}]} - incoming = {"items": [{"name": "no_id"}]} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(len(merged["items"]), 2) - self.assertEqual(merged["items"][1]["name"], "no_id") - - def test_base_items_without_key_do_not_block_append(self) -> None: - base = {"items": [{"name": "base-only"}]} - incoming = {"items": [{"id": "x", "value": 1}]} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(len(merged["items"]), 2) - self.assertEqual(merged["items"][0]["name"], "base-only") - self.assertEqual(merged["items"][1]["id"], "x") - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/agentic_import/common/json_merge.py b/tools/agentic_import/common/merge_json_fields.py similarity index 60% rename from tools/agentic_import/common/json_merge.py rename to tools/agentic_import/common/merge_json_fields.py index fb9e142bf8..f0634c2715 100644 --- a/tools/agentic_import/common/json_merge.py +++ b/tools/agentic_import/common/merge_json_fields.py @@ -13,63 +13,85 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Merge helper for JSON-like data. -from typing import Any, Dict, List +Only fields listed in fields_to_update can be added or updated. +Traversal only follows containers already present in base and never creates +new dicts or lists along the way. +List items are matched by key (default id); unmatched items are ignored. +Container type mismatches are skipped. +When allow_overwrite is False, existing values are preserved. +""" + +from typing import Any, Dict, List, Set from absl import logging -def merge_json(base: Any, - incoming: Any, - key_field: str = 'id', - allow_overwrite: bool = False) -> Any: - """Merges incoming JSON into base, mutating base where possible.""" +def merge_json_fields(base: Any, + incoming: Any, + fields_to_update: List[str], + key_field: str = 'id', + allow_overwrite: bool = False) -> Any: + """Merges selected fields from incoming JSON into base.""" return _merge_value(base, incoming, + fields_to_update=set(fields_to_update), key_field=key_field, allow_overwrite=allow_overwrite, path='') -def _merge_value(base: Any, incoming: Any, key_field: str, - allow_overwrite: bool, path: str) -> Any: - # Dispatch by type to preserve structure and scope merges correctly. +def _merge_value(base: Any, incoming: Any, fields_to_update: Set[str], + key_field: str, allow_overwrite: bool, path: str) -> Any: + # Only traverse matching container types; leave base untouched otherwise. 
if isinstance(base, dict) and isinstance(incoming, dict): return _merge_dict(base, incoming, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=path) if isinstance(base, list) and isinstance(incoming, list): return _merge_list(base, incoming, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=path) - return _merge_leaf(base, incoming, allow_overwrite, path) + if type(base) != type(incoming): + location = path or 'root' + logging.warning(f"Type mismatch at {location}; skipping.") + return base -def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], key_field: str, +def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], + fields_to_update: Set[str], key_field: str, allow_overwrite: bool, path: str) -> Dict[str, Any]: for key, incoming_value in incoming.items(): next_path = _join_path(path, key) + if key in fields_to_update: + _merge_field(base, + key, + incoming_value, + allow_overwrite=allow_overwrite, + path=next_path) + continue if key not in base: - base[key] = incoming_value continue - - base_value = base[key] - base[key] = _merge_value(base_value, + base[key] = _merge_value(base[key], incoming_value, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=next_path) return base -def _merge_list(base: List[Any], incoming: List[Any], key_field: str, +def _merge_list(base: List[Any], incoming: List[Any], + fields_to_update: Set[str], key_field: str, allow_overwrite: bool, path: str) -> List[Any]: - # Keep base ordering; append unmatched items to avoid data loss. - # Build a keyed index for scoped merges inside this list. + # Keep base ordering; only merge keyed items already present in base. base_by_key: Dict[Any, Dict[str, Any]] = {} for index, item in enumerate(base): if not isinstance(item, dict): @@ -91,20 +113,17 @@ def _merge_list(base: List[Any], incoming: List[Any], key_field: str, base_by_key[key_value] = item seen_incoming_keys = set() - # Merge incoming items by key; append when a match is not possible. for index, item in enumerate(incoming): if not isinstance(item, dict): logging.warning( - f"Incoming list item at {path}[index={index}] is not a dict; appending." + f"Incoming list item at {path}[index={index}] is not a dict; ignoring." ) - base.append(item) continue key_value = item.get(key_field) if key_value is None: logging.warning( - f"Incoming list item at {path}[index={index}] missing key '{key_field}'; appending." + f"Incoming list item at {path}[index={index}] missing key '{key_field}'; ignoring." ) - base.append(item) continue if key_value in seen_incoming_keys: logging.warning( @@ -114,30 +133,38 @@ def _merge_list(base: List[Any], incoming: List[Any], key_field: str, base_item = base_by_key.get(key_value) if base_item is None: - base.append(item) - base_by_key[key_value] = item + item_path = _list_item_path(path, key_field, key_value) + logging.warning( + f"No base match for {item_path}; ignoring incoming list item.") continue item_path = _list_item_path(path, key_field, key_value) _merge_dict(base_item, item, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=item_path) return base -def _merge_leaf(base: Any, incoming: Any, allow_overwrite: bool, - path: str) -> Any: - # Leaf values follow the overwrite policy to avoid accidental data loss. 
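The net effect of the field whitelist, as a worked example (data is illustrative):

```
from tools.agentic_import.common.merge_json_fields import merge_json_fields

base = {"dims": [{"id": "D1", "name": "Base"}]}
incoming = {"dims": [{"id": "D1", "name": "New", "enriched_description": "d"},
                     {"id": "D2", "enriched_description": "e"}]}

# Only whitelisted fields change: "name" is preserved, and the
# unmatched "D2" item is ignored rather than appended.
merge_json_fields(base, incoming, fields_to_update=["enriched_description"])
assert base == {"dims": [{"id": "D1", "name": "Base",
                          "enriched_description": "d"}]}
```
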
+def _merge_field(base: Dict[str, Any], key: str, incoming_value: Any, + allow_overwrite: bool, path: str) -> Any: + if key not in base: + base[key] = incoming_value + return base + + base_value = base[key] if allow_overwrite: - if base != incoming: - logging.warning( - f"Overwriting value at {path} from {base!r} to {incoming!r}.") - return incoming + if base_value != incoming_value: + logging.info( + f"Overwriting value at {path} from {base_value!r} to {incoming_value!r}." + ) + base[key] = incoming_value + return base - if base != incoming: - logging.warning( + if base_value != incoming_value: + logging.info( f"Preserving base value at {path}; incoming value ignored.") return base diff --git a/tools/agentic_import/common/merge_json_fields_test.py b/tools/agentic_import/common/merge_json_fields_test.py new file mode 100644 index 0000000000..4c24bde5aa --- /dev/null +++ b/tools/agentic_import/common/merge_json_fields_test.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from tools.agentic_import.common.merge_json_fields import merge_json_fields + + +class JsonMergeTest(unittest.TestCase): + + def test_updates_only_listed_fields(self) -> None: + base = {"item": {"name": "Base"}} + incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + self.assertEqual(merged["item"]["name"], "Base") + self.assertEqual(merged["item"]["enriched_description"], "New") + + def test_overwrite_policy_for_listed_fields(self) -> None: + base_keep = {"item": {"enriched_description": "Old"}} + base_overwrite = {"item": {"enriched_description": "Old"}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = merge_json_fields( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = merge_json_fields( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], "Old") + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + def test_type_mismatch_on_listed_field_respects_overwrite(self) -> None: + base_keep = {"item": {"enriched_description": {"a": 1}}} + base_overwrite = {"item": {"enriched_description": {"a": 1}}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = merge_json_fields( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = merge_json_fields( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], {"a": 1}) + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + def test_traversal_type_mismatch_is_skipped(self) -> None: + base = {"item": {"details": {"a": 1}}} 
+ incoming = {"item": {"details": ["x"]}} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(merged["item"]["details"], {"a": 1}) + + def test_keyed_list_merge_respects_hierarchy(self) -> None: + base = { + "codelists": [ + { + "id": "CL1", + "codes": [{ + "id": "A" + },], + }, + { + "id": "CL2", + "codes": [{ + "id": "A" + },], + }, + ] + } + incoming = { + "codelists": [ + { + "id": + "CL1", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL1", + },], + }, + { + "id": + "CL2", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL2", + },], + }, + ] + } + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + cl1_code = merged["codelists"][0]["codes"][0] + cl2_code = merged["codelists"][1]["codes"][0] + self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") + self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") + + def test_keyed_list_merge_with_custom_key(self) -> None: + base = {"items": [{"code": "X", "value": 1}]} + incoming = { + "items": [ + { + "code": "X", + "enriched_description": "X desc" + }, + { + "code": "Y", + "enriched_description": "Y desc" + }, + ] + } + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"], + key_field="code") + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["enriched_description"], "X desc") + + def test_list_items_without_key_are_ignored(self) -> None: + base = {"items": [{"id": "x"}]} + incoming = {"items": [{"name": "no_id"}]} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0], {"id": "x"}) + + def test_base_items_without_key_are_ignored(self) -> None: + base = {"items": [{"name": "base-only"}]} + incoming = {"items": [{"id": "x", "enriched_description": "desc"}]} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["name"], "base-only") + self.assertNotIn("enriched_description", merged["items"][0]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py index 65296f71e1..c997f8c7b3 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge.py @@ -22,7 +22,7 @@ from absl import flags from absl import logging -from tools.agentic_import.common.json_merge import merge_json +from tools.agentic_import.common.merge_json_fields import merge_json_fields _FLAGS = flags.FLAGS @@ -58,10 +58,11 @@ def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, output_path: str) -> None: base_data = _load_json(Path(input_metadata_json)) enriched_data = _load_json(Path(input_enriched_items_json)) - merged = merge_json(base_data, - enriched_data, - key_field='id', - allow_overwrite=False) + merged = merge_json_fields(base_data, + enriched_data, + fields_to_update=['enriched_description'], + key_field='id', + allow_overwrite=False) _write_json(Path(output_path), merged) diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json index 76b33b8c2b..ce6b3978db 100644 --- 
a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json +++ b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json @@ -105,10 +105,6 @@ { "id": "DF2", "name": "Flow Two" - }, - { - "id": "DF3", - "enriched_description": "No base match" } ] } From 565ae084e1bc0eee98d4c74a8c8686d3900f5cc5 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 21 Jan 2026 07:52:57 +0000 Subject: [PATCH 14/15] refactor: move collection merge into sdmx tool --- .../common/gemini_prompt_runner.py | 1 + .../common/merge_json_fields.py | 179 ----------------- .../common/merge_json_fields_test.py | 182 ----------------- .../sdmx/metadata_enricher_fetch.py | 1 + .../sdmx/metadata_enricher_find.py | 1 + .../sdmx/metadata_enricher_merge.py | 189 +++++++++++++++++- .../sdmx/metadata_enricher_merge_test.py | 163 +++++++++++++++ 7 files changed, 347 insertions(+), 369 deletions(-) delete mode 100644 tools/agentic_import/common/merge_json_fields.py delete mode 100644 tools/agentic_import/common/merge_json_fields_test.py diff --git a/tools/agentic_import/common/gemini_prompt_runner.py b/tools/agentic_import/common/gemini_prompt_runner.py index 19ae073d8f..29b696730a 100644 --- a/tools/agentic_import/common/gemini_prompt_runner.py +++ b/tools/agentic_import/common/gemini_prompt_runner.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Render prompts and run the Gemini CLI with tracked run outputs.""" import shutil import subprocess diff --git a/tools/agentic_import/common/merge_json_fields.py b/tools/agentic_import/common/merge_json_fields.py deleted file mode 100644 index f0634c2715..0000000000 --- a/tools/agentic_import/common/merge_json_fields.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Merge helper for JSON-like data. - -Only fields listed in fields_to_update can be added or updated. -Traversal only follows containers already present in base and never creates -new dicts or lists along the way. -List items are matched by key (default id); unmatched items are ignored. -Container type mismatches are skipped. -When allow_overwrite is False, existing values are preserved. -""" - -from typing import Any, Dict, List, Set - -from absl import logging - - -def merge_json_fields(base: Any, - incoming: Any, - fields_to_update: List[str], - key_field: str = 'id', - allow_overwrite: bool = False) -> Any: - """Merges selected fields from incoming JSON into base.""" - return _merge_value(base, - incoming, - fields_to_update=set(fields_to_update), - key_field=key_field, - allow_overwrite=allow_overwrite, - path='') - - -def _merge_value(base: Any, incoming: Any, fields_to_update: Set[str], - key_field: str, allow_overwrite: bool, path: str) -> Any: - # Only traverse matching container types; leave base untouched otherwise. 
- if isinstance(base, dict) and isinstance(incoming, dict): - return _merge_dict(base, - incoming, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=path) - if isinstance(base, list) and isinstance(incoming, list): - return _merge_list(base, - incoming, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=path) - if type(base) != type(incoming): - location = path or 'root' - logging.warning(f"Type mismatch at {location}; skipping.") - return base - - -def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], - fields_to_update: Set[str], key_field: str, - allow_overwrite: bool, path: str) -> Dict[str, Any]: - for key, incoming_value in incoming.items(): - next_path = _join_path(path, key) - if key in fields_to_update: - _merge_field(base, - key, - incoming_value, - allow_overwrite=allow_overwrite, - path=next_path) - continue - if key not in base: - continue - base[key] = _merge_value(base[key], - incoming_value, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=next_path) - return base - - -def _merge_list(base: List[Any], incoming: List[Any], - fields_to_update: Set[str], key_field: str, - allow_overwrite: bool, path: str) -> List[Any]: - # Keep base ordering; only merge keyed items already present in base. - base_by_key: Dict[Any, Dict[str, Any]] = {} - for index, item in enumerate(base): - if not isinstance(item, dict): - logging.warning( - f"Base list item at {path}[index={index}] is not a dict; skipping keyed merge." - ) - continue - key_value = item.get(key_field) - if key_value is None: - logging.warning( - f"Base list item at {path}[index={index}] missing key '{key_field}'; skipping keyed merge." - ) - continue - if key_value in base_by_key: - logging.warning( - f"Duplicate key '{key_value}' in base list at {path}; using first occurrence." - ) - continue - base_by_key[key_value] = item - - seen_incoming_keys = set() - for index, item in enumerate(incoming): - if not isinstance(item, dict): - logging.warning( - f"Incoming list item at {path}[index={index}] is not a dict; ignoring." - ) - continue - key_value = item.get(key_field) - if key_value is None: - logging.warning( - f"Incoming list item at {path}[index={index}] missing key '{key_field}'; ignoring." - ) - continue - if key_value in seen_incoming_keys: - logging.warning( - f"Duplicate key '{key_value}' in incoming list at {path}; merging again." - ) - seen_incoming_keys.add(key_value) - - base_item = base_by_key.get(key_value) - if base_item is None: - item_path = _list_item_path(path, key_field, key_value) - logging.warning( - f"No base match for {item_path}; ignoring incoming list item.") - continue - - item_path = _list_item_path(path, key_field, key_value) - _merge_dict(base_item, - item, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=item_path) - return base - - -def _merge_field(base: Dict[str, Any], key: str, incoming_value: Any, - allow_overwrite: bool, path: str) -> Any: - if key not in base: - base[key] = incoming_value - return base - - base_value = base[key] - if allow_overwrite: - if base_value != incoming_value: - logging.info( - f"Overwriting value at {path} from {base_value!r} to {incoming_value!r}." 
- ) - base[key] = incoming_value - return base - - if base_value != incoming_value: - logging.info( - f"Preserving base value at {path}; incoming value ignored.") - return base - - -def _join_path(path: str, key: str) -> str: - if not path: - return key - return f"{path}.{key}" - - -def _list_item_path(path: str, key_field: str, key_value: Any) -> str: - return f"{path}[{key_field}={key_value}]" diff --git a/tools/agentic_import/common/merge_json_fields_test.py b/tools/agentic_import/common/merge_json_fields_test.py deleted file mode 100644 index 4c24bde5aa..0000000000 --- a/tools/agentic_import/common/merge_json_fields_test.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from tools.agentic_import.common.merge_json_fields import merge_json_fields - - -class JsonMergeTest(unittest.TestCase): - - def test_updates_only_listed_fields(self) -> None: - base = {"item": {"name": "Base"}} - incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - - self.assertEqual(merged["item"]["name"], "Base") - self.assertEqual(merged["item"]["enriched_description"], "New") - - def test_overwrite_policy_for_listed_fields(self) -> None: - base_keep = {"item": {"enriched_description": "Old"}} - base_overwrite = {"item": {"enriched_description": "Old"}} - incoming = {"item": {"enriched_description": "New"}} - - merged_keep = merge_json_fields( - base_keep, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - merged_overwrite = merge_json_fields( - base_overwrite, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=True) - - self.assertEqual(merged_keep["item"]["enriched_description"], "Old") - self.assertEqual(merged_overwrite["item"]["enriched_description"], - "New") - - def test_type_mismatch_on_listed_field_respects_overwrite(self) -> None: - base_keep = {"item": {"enriched_description": {"a": 1}}} - base_overwrite = {"item": {"enriched_description": {"a": 1}}} - incoming = {"item": {"enriched_description": "New"}} - - merged_keep = merge_json_fields( - base_keep, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - merged_overwrite = merge_json_fields( - base_overwrite, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=True) - - self.assertEqual(merged_keep["item"]["enriched_description"], {"a": 1}) - self.assertEqual(merged_overwrite["item"]["enriched_description"], - "New") - - def test_traversal_type_mismatch_is_skipped(self) -> None: - base = {"item": {"details": {"a": 1}}} - incoming = {"item": {"details": ["x"]}} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"]) - - self.assertEqual(merged["item"]["details"], {"a": 1}) - - def test_keyed_list_merge_respects_hierarchy(self) -> None: - base = { - "codelists": [ - { 
- "id": "CL1", - "codes": [{ - "id": "A" - },], - }, - { - "id": "CL2", - "codes": [{ - "id": "A" - },], - }, - ] - } - incoming = { - "codelists": [ - { - "id": - "CL1", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL1", - },], - }, - { - "id": - "CL2", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL2", - },], - }, - ] - } - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - - cl1_code = merged["codelists"][0]["codes"][0] - cl2_code = merged["codelists"][1]["codes"][0] - self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") - self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") - - def test_keyed_list_merge_with_custom_key(self) -> None: - base = {"items": [{"code": "X", "value": 1}]} - incoming = { - "items": [ - { - "code": "X", - "enriched_description": "X desc" - }, - { - "code": "Y", - "enriched_description": "Y desc" - }, - ] - } - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"], - key_field="code") - - self.assertEqual(len(merged["items"]), 1) - self.assertEqual(merged["items"][0]["enriched_description"], "X desc") - - def test_list_items_without_key_are_ignored(self) -> None: - base = {"items": [{"id": "x"}]} - incoming = {"items": [{"name": "no_id"}]} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"]) - - self.assertEqual(len(merged["items"]), 1) - self.assertEqual(merged["items"][0], {"id": "x"}) - - def test_base_items_without_key_are_ignored(self) -> None: - base = {"items": [{"name": "base-only"}]} - incoming = {"items": [{"id": "x", "enriched_description": "desc"}]} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"]) - - self.assertEqual(len(merged["items"]), 1) - self.assertEqual(merged["items"][0]["name"], "base-only") - self.assertNotIn("enriched_description", merged["items"][0]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/agentic_import/sdmx/metadata_enricher_fetch.py b/tools/agentic_import/sdmx/metadata_enricher_fetch.py index 505184b6d8..eed3da1568 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_fetch.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Fetch enriched descriptions for selected SDMX items with Gemini CLI.""" import os import platform diff --git a/tools/agentic_import/sdmx/metadata_enricher_find.py b/tools/agentic_import/sdmx/metadata_enricher_find.py index 222c7eda13..eabaff845b 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_find.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Select SDMX items to enrich and generate enrichment queries.""" import os import platform diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py index c997f8c7b3..aba343aff0 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge.py @@ -13,19 +13,191 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Merge enriched SDMX descriptions into base metadata files."""
 
 import json
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Dict, List, Set, Union
 
 from absl import app
 from absl import flags
 from absl import logging
 
-from tools.agentic_import.common.merge_json_fields import merge_json_fields
-
 _FLAGS = flags.FLAGS
 
+DictOrList = Union[Dict[str, Any], List[Any]]
+
+
+class CollectionMerger:
+    """Merges selected fields from an incoming nested collection into a base one."""
+
+    def merge(self,
+              base: DictOrList,
+              incoming: DictOrList,
+              fields_to_update: List[str],
+              key_field: str = 'id',
+              allow_overwrite: bool = False) -> DictOrList:
+        """Merges values from `incoming` into `base` for a controlled set of fields.
+
+        The merge walks `base` and only descends into dicts/lists that already
+        exist in `base` (it does not create new containers). For dicts, only keys
+        listed in `fields_to_update` may be added or updated; other keys are
+        merged only when they already exist in `base`. For lists, items are
+        matched by `key_field` (default: "id"); incoming items with no match in
+        `base` are ignored. Container type mismatches are skipped.
+
+        Args:
+            base: Nested dict/list structure to update. Modified in place.
+            incoming: Nested dict/list structure providing candidate updates.
+            fields_to_update: Dict keys that are allowed to be added/updated.
+            key_field: Dict key used to match list items when merging lists.
+            allow_overwrite: If True, overwrite existing values for
+                `fields_to_update`. If False, existing `base` values are preserved
+                and incoming values are ignored.
+
+        Returns:
+            The updated `base` object.
+        """
+        return self._merge_value(base,
+                                 incoming,
+                                 fields_to_update=set(fields_to_update),
+                                 key_field=key_field,
+                                 allow_overwrite=allow_overwrite,
+                                 path='')
+
+    def _merge_value(self, base: Any, incoming: Any,
+                     fields_to_update: Set[str], key_field: str,
+                     allow_overwrite: bool, path: str) -> Any:
+        # Only traverse matching container types; leave base untouched otherwise.
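+        # Dicts only merge with dicts and lists only with lists; any other
+        # pairing returns the base value unchanged, with a warning logged
+        # when the two types differ.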
+ if isinstance(base, dict) and isinstance(incoming, dict): + return self._merge_dict(base, + incoming, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + if isinstance(base, list) and isinstance(incoming, list): + return self._merge_list(base, + incoming, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + if type(base) != type(incoming): + location = path or 'root' + logging.warning(f"Type mismatch at {location}; skipping.") + return base + + def _merge_dict(self, base: Dict[str, Any], incoming: Dict[str, Any], + fields_to_update: Set[str], key_field: str, + allow_overwrite: bool, path: str) -> Dict[str, Any]: + for key, incoming_value in incoming.items(): + next_path = self._join_path(path, key) + if key in fields_to_update: + self._merge_field(base, + key, + incoming_value, + allow_overwrite=allow_overwrite, + path=next_path) + continue + if key not in base: + continue + base[key] = self._merge_value(base[key], + incoming_value, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=next_path) + return base + + def _merge_list(self, base: List[Any], incoming: List[Any], + fields_to_update: Set[str], key_field: str, + allow_overwrite: bool, path: str) -> List[Any]: + # Keep base ordering; only merge keyed items already present in base. + base_by_key: Dict[Any, Dict[str, Any]] = {} + for index, item in enumerate(base): + if not isinstance(item, dict): + logging.warning( + f"Base list item at {path}[index={index}] is not a dict; skipping keyed merge." + ) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Base list item at {path}[index={index}] missing key '{key_field}'; skipping keyed merge." + ) + continue + if key_value in base_by_key: + logging.warning( + f"Duplicate key '{key_value}' in base list at {path}; using first occurrence." + ) + continue + base_by_key[key_value] = item + + seen_incoming_keys = set() + for index, item in enumerate(incoming): + if not isinstance(item, dict): + logging.warning( + f"Incoming list item at {path}[index={index}] is not a dict; ignoring." + ) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Incoming list item at {path}[index={index}] missing key '{key_field}'; ignoring." + ) + continue + if key_value in seen_incoming_keys: + logging.warning( + f"Duplicate key '{key_value}' in incoming list at {path}; merging again." + ) + seen_incoming_keys.add(key_value) + + base_item = base_by_key.get(key_value) + if base_item is None: + item_path = self._list_item_path(path, key_field, key_value) + logging.warning( + f"No base match for {item_path}; ignoring incoming list item." + ) + continue + + item_path = self._list_item_path(path, key_field, key_value) + self._merge_dict(base_item, + item, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=item_path) + return base + + def _merge_field(self, base: Dict[str, Any], key: str, incoming_value: Any, + allow_overwrite: bool, path: str) -> Any: + if key not in base: + base[key] = incoming_value + return base + + base_value = base[key] + if allow_overwrite: + if base_value != incoming_value: + logging.info( + f"Overwriting value at {path} from {base_value!r} to {incoming_value!r}." 
+ ) + base[key] = incoming_value + return base + + if base_value != incoming_value: + logging.info( + f"Preserving base value at {path}; incoming value ignored.") + return base + + def _join_path(self, path: str, key: str) -> str: + if not path: + return key + return f"{path}.{key}" + + def _list_item_path(self, path: str, key_field: str, key_value: Any) -> str: + return f"{path}[{key_field}={key_value}]" + def _define_flags(): try: @@ -58,11 +230,12 @@ def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, output_path: str) -> None: base_data = _load_json(Path(input_metadata_json)) enriched_data = _load_json(Path(input_enriched_items_json)) - merged = merge_json_fields(base_data, - enriched_data, - fields_to_update=['enriched_description'], - key_field='id', - allow_overwrite=False) + merger = CollectionMerger() + merged = merger.merge(base_data, + enriched_data, + fields_to_update=['enriched_description'], + key_field='id', + allow_overwrite=False) _write_json(Path(output_path), merged) diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge_test.py b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py index d9670efbd5..1de45a7fff 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py @@ -22,6 +22,7 @@ from deepdiff.diff import DeepDiff +from tools.agentic_import.sdmx.metadata_enricher_merge import CollectionMerger from tools.agentic_import.sdmx.metadata_enricher_merge import merge_enrichment _TESTDATA_DIR = Path(os.path.dirname(__file__)) / 'testdata' @@ -30,6 +31,168 @@ _EXPECTED_JSON = _TESTDATA_DIR / 'sample_metadata_enriched_expected.json' +class CollectionMergerTest(unittest.TestCase): + + def setUp(self) -> None: + self._merger = CollectionMerger() + + def test_updates_only_listed_fields(self) -> None: + base = {"item": {"name": "Base"}} + incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + self.assertEqual(merged["item"]["name"], "Base") + self.assertEqual(merged["item"]["enriched_description"], "New") + + def test_overwrite_policy_for_listed_fields(self) -> None: + base_keep = {"item": {"enriched_description": "Old"}} + base_overwrite = {"item": {"enriched_description": "Old"}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = self._merger.merge( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = self._merger.merge( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], "Old") + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + def test_type_mismatch_on_listed_field_respects_overwrite(self) -> None: + base_keep = {"item": {"enriched_description": {"a": 1}}} + base_overwrite = {"item": {"enriched_description": {"a": 1}}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = self._merger.merge( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = self._merger.merge( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], {"a": 1}) + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + 
def test_traversal_type_mismatch_is_skipped(self) -> None: + base = {"item": {"details": {"a": 1}}} + incoming = {"item": {"details": ["x"]}} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(merged["item"]["details"], {"a": 1}) + + def test_keyed_list_merge_respects_hierarchy(self) -> None: + base = { + "codelists": [ + { + "id": "CL1", + "codes": [{ + "id": "A" + },], + }, + { + "id": "CL2", + "codes": [{ + "id": "A" + },], + }, + ] + } + incoming = { + "codelists": [ + { + "id": + "CL1", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL1", + },], + }, + { + "id": + "CL2", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL2", + },], + }, + ] + } + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + cl1_code = merged["codelists"][0]["codes"][0] + cl2_code = merged["codelists"][1]["codes"][0] + self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") + self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") + + def test_keyed_list_merge_with_custom_key(self) -> None: + base = {"items": [{"code": "X", "value": 1}]} + incoming = { + "items": [ + { + "code": "X", + "enriched_description": "X desc" + }, + { + "code": "Y", + "enriched_description": "Y desc" + }, + ] + } + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"], + key_field="code") + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["enriched_description"], "X desc") + + def test_list_items_without_key_are_ignored(self) -> None: + base = {"items": [{"id": "x"}]} + incoming = {"items": [{"name": "no_id"}]} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0], {"id": "x"}) + + def test_base_items_without_key_are_ignored(self) -> None: + base = {"items": [{"name": "base-only"}]} + incoming = {"items": [{"id": "x", "enriched_description": "desc"}]} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["name"], "base-only") + self.assertNotIn("enriched_description", merged["items"][0]) + + class EnrichmentMergeTest(unittest.TestCase): def test_merge_enriched_description_across_lists(self) -> None: From d9149dadded6d663498b5e6be53b5c70e6bfe612 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 21 Jan 2026 08:28:01 +0000 Subject: [PATCH 15/15] docs: Refactor the SDMX enrichment README to detail the metadata enrichment pipeline steps and tool usage. --- tools/agentic_import/sdmx/README.md | 91 +++++++++++++------ .../sdmx/metadata_enricher_merge.py | 24 ++--- 2 files changed, 75 insertions(+), 40 deletions(-) diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md index 8fdb06b21d..a66c828331 100644 --- a/tools/agentic_import/sdmx/README.md +++ b/tools/agentic_import/sdmx/README.md @@ -1,14 +1,15 @@ -# SDMX Enrichment Tools +# SDMX Metadata Enrichment Pipeline -This folder contains three standalone tools for SDMX metadata enrichment. -Each tool supports CLI usage and can be called programmatically. +The enrichment process is organized into three distinct steps: Discovery, Fetching, and Integration. 
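+Each step is a standalone tool that supports CLI usage and can be called
+programmatically; the output of one step feeds into the next.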
-## 1) metadata_enricher_find.py -Selects which SDMX codes/concepts need enrichment and generates -`enrichment_query` values using full dataset context. +--- -CLI usage: -``` +## Step 1: Discovery (`metadata_enricher_find.py`) + +**Role**: Analyzes the base SDMX metadata using Gemini CLI to identify codes and concepts that require enrichment. It generates context-aware search queries (`enrichment_query`) for these items while preserving the original dataset structure. + +**Command**: +```bash python tools/agentic_import/sdmx/metadata_enricher_find.py \ --input_metadata_json="/path/to/metadata.json" \ --dataset_prefix="oecd_prices" \ @@ -17,16 +18,20 @@ python tools/agentic_import/sdmx/metadata_enricher_find.py \ --enable_sandboxing ``` -Output: -- A pruned JSON that preserves the original structure but keeps only selected - items with `enrichment_query`. Name/description fields are omitted. +**Input**: +- Base SDMX `metadata.json` file. -## 2) metadata_enricher_fetch.py -Uses Gemini CLI web search to populate `enriched_description` for each selected -item. +**Output**: +- `items_to_enrich.json`: A pruned JSON structure containing only selected items with their generated `enrichment_query`. -CLI usage: -``` +--- + +## Step 2: Fetching (`metadata_enricher_fetch.py`) + +**Role**: Orchestrates external web searches (via Gemini CLI) to populate detailed descriptions (`enriched_description`) for the items identified in the previous step. + +**Command**: +```bash python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ --input_items_json="/path/to/items_to_enrich.json" \ --dataset_prefix="oecd_prices" \ @@ -35,21 +40,55 @@ python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ --enable_sandboxing ``` -Output: -- A pruned JSON in the same structure as the input, with `enriched_description` - added and `enrichment_query` removed. +**Input**: +- `items_to_enrich.json` (from Step 1). -## 3) metadata_enricher_merge.py -Merges `enriched_description` into the base metadata JSON. +**Output**: +- `enriched_items.json`: A pruned JSON structure with `enriched_description` added for each item. -CLI usage: -``` +--- + +## Step 3: Integration (`metadata_enricher_merge.py`) + +**Role**: Merges the fetched descriptions back into the original SDMX metadata JSON, resulting in a complete, enriched metadata file. + +**Command**: +```bash python tools/agentic_import/sdmx/metadata_enricher_merge.py \ --input_metadata_json="/path/to/metadata.json" \ --input_enriched_items_json="/path/to/enriched_items.json" \ --output_path="/path/to/metadata_enriched.json" ``` -Output: -- A full metadata JSON with `enriched_description` merged into the matching - codes and concepts. +**Input**: +- Base SDMX `metadata.json`. +- `enriched_items.json` (from Step 2). + +**Output**: +- `metadata_enriched.json`: The final, full metadata JSON with `enriched_description` fields merged into the matching codes and concepts. + +--- + +## Full Pipeline Example + +To run the entire enrichment pipeline for a dataset: + +```bash +# 1. Discover items to enrich +python tools/agentic_import/sdmx/metadata_enricher_find.py \ + --input_metadata_json="metadata.json" \ + --dataset_prefix="my_dataset" \ + --output_path="items_to_enrich.json" + +# 2. Fetch enriched descriptions +python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ + --input_items_json="items_to_enrich.json" \ + --dataset_prefix="my_dataset" \ + --output_path="enriched_items.json" + +# 3. 
Merge results into the original metadata
+python tools/agentic_import/sdmx/metadata_enricher_merge.py \
+  --input_metadata_json="metadata.json" \
+  --input_enriched_items_json="enriched_items.json" \
+  --output_path="metadata_enriched.json"
+```
diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py
index aba343aff0..4ae702ee16 100644
--- a/tools/agentic_import/sdmx/metadata_enricher_merge.py
+++ b/tools/agentic_import/sdmx/metadata_enricher_merge.py
@@ -37,23 +37,19 @@ def merge(self,
               fields_to_update: List[str],
               key_field: str = 'id',
               allow_overwrite: bool = False) -> DictOrList:
-        """Merges values from `incoming` into `base` for a controlled set of fields.
+        """Merges specific fields from `incoming` into `base` in-place.
 
-        The merge walks `base` and only descends into dicts/lists that already
-        exist in `base` (it does not create new containers). For dicts, only keys
-        listed in `fields_to_update` may be added or updated; other keys are
-        merged only when they already exist in `base`. For lists, items are
-        matched by `key_field` (default: "id"); incoming items with no match in
-        `base` are ignored. Container type mismatches are skipped.
+        This performs a structure-aware merge:
+        1. Only keys in `fields_to_update` are added or updated.
+        2. Lists are merged by matching items on `key_field` (e.g., "id").
+        3. `base` structure is preserved; unmatched incoming items are ignored.
 
         Args:
-            base: Nested dict/list structure to update. Modified in place.
-            incoming: Nested dict/list structure providing candidate updates.
-            fields_to_update: Dict keys that are allowed to be added/updated.
-            key_field: Dict key used to match list items when merging lists.
-            allow_overwrite: If True, overwrite existing values for
-                `fields_to_update`. If False, existing `base` values are preserved
-                and incoming values are ignored.
+            base: The target dictionary or list to update.
+            incoming: The source dictionary or list with updates.
+            fields_to_update: Keys allowed to be changed or added.
+            key_field: Key used to match list items.
+            allow_overwrite: Whether to overwrite existing values.
 
         Returns:
             The updated `base` object.
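For readers wiring the merge step into their own Python code rather than the
CLI, a minimal sketch follows. This is an illustration only: it assumes the
module path introduced in this patch series, and the file names
(`metadata.json`, `enriched_items.json`, `metadata_enriched.json`) are
placeholders.

```python
import json

# Module path as added by this patch series.
from tools.agentic_import.sdmx.metadata_enricher_merge import CollectionMerger

# Placeholder paths; substitute the real metadata and enriched-items files.
with open('metadata.json') as f:
    base = json.load(f)
with open('enriched_items.json') as f:
    incoming = json.load(f)

# Mirrors merge_enrichment(): only 'enriched_description' may be added, list
# items are matched on 'id', and existing base values are preserved.
merger = CollectionMerger()
merged = merger.merge(base,
                      incoming,
                      fields_to_update=['enriched_description'],
                      key_field='id',
                      allow_overwrite=False)

with open('metadata_enriched.json', 'w') as f:
    json.dump(merged, f, indent=2)
```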