From 52344391ed28feae8404879a7df2d3e0e20dffdc Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 11:22:46 +0000 Subject: [PATCH 01/15] Add SDMX enrichment item selector --- .../sdmx/find_enrichment_items.py | 275 ++++++++++++++++++ .../templates/find_enrichment_items_prompt.j2 | 49 ++++ 2 files changed, 324 insertions(+) create mode 100644 tools/agentic_import/sdmx/find_enrichment_items.py create mode 100644 tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/find_enrichment_items.py new file mode 100644 index 0000000000..2f47ac5063 --- /dev/null +++ b/tools/agentic_import/sdmx/find_enrichment_items.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import shutil +import subprocess +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +from absl import app +from absl import flags +from absl import logging +from jinja2 import Environment, FileSystemLoader + +_FLAGS = flags.FLAGS +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def _define_flags(): + try: + flags.DEFINE_string( + 'input_metadata_json', None, + 'Path to input SDMX metadata JSON (required)') + flags.mark_flag_as_required('input_metadata_json') + + flags.DEFINE_string('output_path', None, + 'Path to output items JSON (required)') + flags.mark_flag_as_required('output_path') + + flags.DEFINE_boolean('dry_run', False, + 'Generate prompt only without calling Gemini CLI') + + flags.DEFINE_boolean( + 'skip_confirmation', False, + 'Skip user confirmation before running Gemini CLI') + + flags.DEFINE_boolean( + 'enable_sandboxing', + platform.system() == 'Darwin', + 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' + ) + + flags.DEFINE_string( + 'gemini_cli', 'gemini', + 'Custom path or command to invoke Gemini CLI. ' + 'Example: "/usr/local/bin/gemini". ' + 'WARNING: This value is executed in a shell - use only with trusted input.' 
+ ) + + flags.DEFINE_string( + 'working_dir', None, + 'Working directory for the run (default: current directory)') + except flags.DuplicateFlagError: + pass + + +@dataclass +class Config: + input_metadata_json: str + output_path: str + dry_run: bool = False + skip_confirmation: bool = False + enable_sandboxing: bool = False + gemini_cli: Optional[str] = None + working_dir: Optional[str] = None + + +@dataclass +class RunResult: + run_id: str + run_dir: Path + prompt_path: Path + gemini_log_path: Path + gemini_command: str + sandbox_enabled: bool + + +class EnrichmentItemsFinder: + def __init__(self, config: Config): + self._config = config + self._working_dir = Path( + config.working_dir).resolve() if config.working_dir else Path.cwd() + self._input_path = self._resolve_path(config.input_metadata_json) + self._output_path = self._resolve_path(config.output_path) + + if not self._input_path.exists(): + raise FileNotFoundError( + f"input_metadata_json does not exist: {self._input_path}") + + self._output_path.parent.mkdir(parents=True, exist_ok=True) + + self._datacommons_dir = self._working_dir / '.datacommons' + self._datacommons_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self._run_id = f"gemini_{timestamp}" + self._run_dir = self._datacommons_dir / 'runs' / self._run_id + self._run_dir.mkdir(parents=True, exist_ok=True) + + def find_items_to_enrich(self) -> RunResult: + prompt_file = self._generate_prompt() + gemini_log_file = self._run_dir / 'gemini_cli.log' + gemini_command = self._build_gemini_command(prompt_file, + gemini_log_file) + + result = RunResult(run_id=self._run_id, + run_dir=self._run_dir, + prompt_path=prompt_file, + gemini_log_path=gemini_log_file, + gemini_command=gemini_command, + sandbox_enabled=self._config.enable_sandboxing) + + if self._config.dry_run: + logging.info( + "Dry run mode: Prompt file generated at %s. " + "Skipping Gemini CLI execution.", prompt_file) + return result + + if not self._config.skip_confirmation: + if not self._get_user_confirmation(prompt_file): + logging.info("Enrichment item selection cancelled by user.") + return result + + if not self._check_gemini_cli_available(): + logging.warning( + "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." 
+ ) + + logging.info( + "Launching gemini (cwd: %s): %s", self._working_dir, gemini_command) + logging.info("Gemini output will be saved to: %s", gemini_log_file) + + exit_code = self._run_subprocess(gemini_command) + if exit_code == 0: + logging.info("Gemini CLI completed successfully") + return result + + raise RuntimeError( + f"Gemini CLI execution failed with exit code {exit_code}") + + def _resolve_path(self, path: str) -> Path: + resolved = Path(path).expanduser() + if not resolved.is_absolute(): + resolved = self._working_dir / resolved + return resolved.resolve() + + def _generate_prompt(self) -> Path: + template_dir = os.path.join(_SCRIPT_DIR, 'templates') + env = Environment(loader=FileSystemLoader(template_dir)) + template = env.get_template('find_enrichment_items_prompt.j2') + + rendered_prompt = template.render( + input_metadata_abs=str(self._input_path), + output_path_abs=str(self._output_path), + ) + + output_file = self._run_dir / 'find_enrichment_items_prompt.md' + with open(output_file, 'w') as f: + f.write(rendered_prompt) + + logging.info("Generated prompt written to: %s", output_file) + return output_file + + def _get_user_confirmation(self, prompt_file: Path) -> bool: + print("\n" + "=" * 60) + print("SDMX ENRICHMENT ITEM SELECTION SUMMARY") + print("=" * 60) + print(f"Input metadata file: {self._input_path}") + print(f"Output items file: {self._output_path}") + print(f"Prompt file: {prompt_file}") + print(f"Working directory: {self._working_dir}") + print( + f"Sandboxing: {'Enabled' if self._config.enable_sandboxing else 'Disabled'}" + ) + if not self._config.enable_sandboxing: + print( + "WARNING: Sandboxing is disabled. Gemini will run without safety restrictions." + ) + print("=" * 60) + + while True: + try: + response = input( + "Ready to run Gemini for enrichment item selection? 
(y/n): " + ).strip().lower() + if response in ['y', 'yes']: + return True + if response in ['n', 'no']: + print("Selection cancelled by user.") + return False + print("Please enter 'y' or 'n'.") + except KeyboardInterrupt: + print("\nSelection cancelled by user.") + return False + + def _check_gemini_cli_available(self) -> bool: + if self._config.gemini_cli: + return True + return shutil.which('gemini') is not None + + def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: + prompt_path = prompt_file.resolve() + log_path = log_file.resolve() + gemini_cmd = self._config.gemini_cli or 'gemini' + sandbox_flag = "--sandbox" if self._config.enable_sandboxing else "" + return ( + f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" + ) + + def _run_subprocess(self, command: str) -> int: + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) + + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.rstrip()) + + return process.wait() + except Exception as e: + logging.error("Error running subprocess: %s", str(e)) + return 1 + + +def prepare_config() -> Config: + return Config(input_metadata_json=_FLAGS.input_metadata_json, + output_path=_FLAGS.output_path, + dry_run=_FLAGS.dry_run, + skip_confirmation=_FLAGS.skip_confirmation, + enable_sandboxing=_FLAGS.enable_sandboxing, + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir) + + +def main(_): + config = prepare_config() + logging.info("Loaded config for enrichment item selection") + + finder = EnrichmentItemsFinder(config) + finder.find_items_to_enrich() + + logging.info("Enrichment item selection completed.") + return 0 + + +if __name__ == '__main__': + _define_flags() + app.run(main) diff --git a/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 b/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 new file mode 100644 index 0000000000..11e965df39 --- /dev/null +++ b/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 @@ -0,0 +1,49 @@ +You are an expert SDMX metadata analyst. Your task is to select only the SDMX +codes and concepts that need enrichment and to craft precise web search queries +for them. + +# INPUT +- Full extractor JSON: {{ input_metadata_abs }} + +# OUTPUT +- Write JSON to: {{ output_path_abs }} +- Output MUST be valid JSON only. No extra text. + +# CRITICAL RULES +- Process the ENTIRE input file. Do not read only the first lines. +- Do not add `enriched_name` anywhere. +- Do not include `name` or `description` fields in the output. +- Skip place names (countries, regions, cities, etc.). +- Skip popular/self-explanatory terms when clear (e.g., GDP, Population). +- Use full context (dataflow name/description, codelist name, dimension name, + concept name, code name/description) to decide and to build queries. +- Example: Interpret HICP in the context of the dataset and codelist, not alone. + +# TASK +1) Read the full JSON from the input path. +2) Select only items that truly need enrichment. +3) For each selected item, add an `enrichment_query` string that reflects the + full context needed for web search. +4) Produce a PRUNED JSON that preserves the original structure but ONLY keeps + the selected items and their necessary parent structure. 
+ +# OUTPUT SHAPE (pruned) +- Keep `dataflows` array. +- For each kept dataflow: include `id` and only the substructures that contain + selected items. +- For code items, keep them under their original `representation.codelist.codes`. +- For concept items, keep them under their original `concept` (components) and/or + `referenced_concept_schemes[*].concepts`. +- Remove all unselected items and any parent objects left empty. + +# FIELD MINIMUMS (do not add name/description) +- dataflow: `id` +- data_structure_definition: `id` +- component (dimension/attribute/measure): `id` +- concept: `id`, `concept_scheme_id`, `enrichment_query` +- representation: `type` +- codelist: `id` +- code: `id`, `enrichment_query` +- referenced_concept_schemes: `id` + +Write ONLY the JSON file to the output path. From 328e130bfcd7e29f8bbef1153aaa55e737b6d2b2 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 11:40:56 +0000 Subject: [PATCH 02/15] Add SDMX enrichment data fetcher --- .../sdmx/fetch_enrichment_data.py | 274 ++++++++++++++++++ .../templates/fetch_enrichment_data_prompt.j2 | 37 +++ 2 files changed, 311 insertions(+) create mode 100644 tools/agentic_import/sdmx/fetch_enrichment_data.py create mode 100644 tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/fetch_enrichment_data.py new file mode 100644 index 0000000000..1ecf61c1be --- /dev/null +++ b/tools/agentic_import/sdmx/fetch_enrichment_data.py @@ -0,0 +1,274 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import platform +import shutil +import subprocess +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Optional + +from absl import app +from absl import flags +from absl import logging +from jinja2 import Environment, FileSystemLoader + +_FLAGS = flags.FLAGS +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def _define_flags(): + try: + flags.DEFINE_string('input_items_json', None, + 'Path to input items JSON (required)') + flags.mark_flag_as_required('input_items_json') + + flags.DEFINE_string('output_path', None, + 'Path to output items JSON (required)') + flags.mark_flag_as_required('output_path') + + flags.DEFINE_boolean('dry_run', False, + 'Generate prompt only without calling Gemini CLI') + + flags.DEFINE_boolean( + 'skip_confirmation', False, + 'Skip user confirmation before running Gemini CLI') + + flags.DEFINE_boolean( + 'enable_sandboxing', + platform.system() == 'Darwin', + 'Enable sandboxing for Gemini CLI (default: True on macOS, False elsewhere)' + ) + + flags.DEFINE_string( + 'gemini_cli', 'gemini', + 'Custom path or command to invoke Gemini CLI. ' + 'Example: "/usr/local/bin/gemini". ' + 'WARNING: This value is executed in a shell - use only with trusted input.' 
+ ) + + flags.DEFINE_string( + 'working_dir', None, + 'Working directory for the run (default: current directory)') + except flags.DuplicateFlagError: + pass + + +@dataclass +class Config: + input_items_json: str + output_path: str + dry_run: bool = False + skip_confirmation: bool = False + enable_sandboxing: bool = False + gemini_cli: Optional[str] = None + working_dir: Optional[str] = None + + +@dataclass +class RunResult: + run_id: str + run_dir: Path + prompt_path: Path + gemini_log_path: Path + gemini_command: str + sandbox_enabled: bool + + +class EnrichmentDataFetcher: + def __init__(self, config: Config): + self._config = config + self._working_dir = Path( + config.working_dir).resolve() if config.working_dir else Path.cwd() + self._input_path = self._resolve_path(config.input_items_json) + self._output_path = self._resolve_path(config.output_path) + + if not self._input_path.exists(): + raise FileNotFoundError( + f"input_items_json does not exist: {self._input_path}") + + self._output_path.parent.mkdir(parents=True, exist_ok=True) + + self._datacommons_dir = self._working_dir / '.datacommons' + self._datacommons_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + self._run_id = f"gemini_{timestamp}" + self._run_dir = self._datacommons_dir / 'runs' / self._run_id + self._run_dir.mkdir(parents=True, exist_ok=True) + + def fetch_enrichment_data(self) -> RunResult: + prompt_file = self._generate_prompt() + gemini_log_file = self._run_dir / 'gemini_cli.log' + gemini_command = self._build_gemini_command(prompt_file, + gemini_log_file) + + result = RunResult(run_id=self._run_id, + run_dir=self._run_dir, + prompt_path=prompt_file, + gemini_log_path=gemini_log_file, + gemini_command=gemini_command, + sandbox_enabled=self._config.enable_sandboxing) + + if self._config.dry_run: + logging.info( + "Dry run mode: Prompt file generated at %s. " + "Skipping Gemini CLI execution.", prompt_file) + return result + + if not self._config.skip_confirmation: + if not self._get_user_confirmation(prompt_file): + logging.info("Enrichment data fetch cancelled by user.") + return result + + if not self._check_gemini_cli_available(): + logging.warning( + "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." 
+ ) + + logging.info("Launching gemini (cwd: %s): %s", self._working_dir, + gemini_command) + logging.info("Gemini output will be saved to: %s", gemini_log_file) + + exit_code = self._run_subprocess(gemini_command) + if exit_code == 0: + logging.info("Gemini CLI completed successfully") + return result + + raise RuntimeError( + f"Gemini CLI execution failed with exit code {exit_code}") + + def _resolve_path(self, path: str) -> Path: + resolved = Path(path).expanduser() + if not resolved.is_absolute(): + resolved = self._working_dir / resolved + return resolved.resolve() + + def _generate_prompt(self) -> Path: + template_dir = os.path.join(_SCRIPT_DIR, 'templates') + env = Environment(loader=FileSystemLoader(template_dir)) + template = env.get_template('fetch_enrichment_data_prompt.j2') + + rendered_prompt = template.render( + input_items_abs=str(self._input_path), + output_path_abs=str(self._output_path), + ) + + output_file = self._run_dir / 'fetch_enrichment_data_prompt.md' + with open(output_file, 'w') as f: + f.write(rendered_prompt) + + logging.info("Generated prompt written to: %s", output_file) + return output_file + + def _get_user_confirmation(self, prompt_file: Path) -> bool: + print("\n" + "=" * 60) + print("SDMX ENRICHMENT DATA FETCH SUMMARY") + print("=" * 60) + print(f"Input items file: {self._input_path}") + print(f"Output items file: {self._output_path}") + print(f"Prompt file: {prompt_file}") + print(f"Working directory: {self._working_dir}") + print( + f"Sandboxing: {'Enabled' if self._config.enable_sandboxing else 'Disabled'}" + ) + if not self._config.enable_sandboxing: + print( + "WARNING: Sandboxing is disabled. Gemini will run without safety restrictions." + ) + print("=" * 60) + + while True: + try: + response = input( + "Ready to run Gemini for enrichment data fetch? 
(y/n): " + ).strip().lower() + if response in ['y', 'yes']: + return True + if response in ['n', 'no']: + print("Data fetch cancelled by user.") + return False + print("Please enter 'y' or 'n'.") + except KeyboardInterrupt: + print("\nData fetch cancelled by user.") + return False + + def _check_gemini_cli_available(self) -> bool: + if self._config.gemini_cli: + return True + return shutil.which('gemini') is not None + + def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: + prompt_path = prompt_file.resolve() + log_path = log_file.resolve() + gemini_cmd = self._config.gemini_cli or 'gemini' + sandbox_flag = "--sandbox" if self._config.enable_sandboxing else "" + return ( + f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" + ) + + def _run_subprocess(self, command: str) -> int: + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) + + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.rstrip()) + + return process.wait() + except Exception as e: + logging.error("Error running subprocess: %s", str(e)) + return 1 + + +def prepare_config() -> Config: + return Config(input_items_json=_FLAGS.input_items_json, + output_path=_FLAGS.output_path, + dry_run=_FLAGS.dry_run, + skip_confirmation=_FLAGS.skip_confirmation, + enable_sandboxing=_FLAGS.enable_sandboxing, + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir) + + +def main(_): + config = prepare_config() + logging.info("Loaded config for enrichment data fetch") + + fetcher = EnrichmentDataFetcher(config) + fetcher.fetch_enrichment_data() + + logging.info("Enrichment data fetch completed.") + return 0 + + +if __name__ == '__main__': + _define_flags() + app.run(main) diff --git a/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 b/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 new file mode 100644 index 0000000000..8f3ada99d8 --- /dev/null +++ b/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 @@ -0,0 +1,37 @@ +You are an expert SDMX metadata analyst. Your task is to enrich selected SDMX +codes and concepts using web search and provide concise descriptions. + +# INPUT +- Items-to-enrich JSON: {{ input_items_abs }} + +# OUTPUT +- Write JSON to: {{ output_path_abs }} +- Output MUST be valid JSON only. No extra text. + +# CRITICAL RULES +- Process the ENTIRE input file. Do not read only the first lines. +- Use web search for each item, batching multiple items per web call when possible. +- Do not add `enriched_name` anywhere. +- Do not include `name` or `description` fields in the output. +- Do not include `enrichment_query` in the output. +- Ground descriptions in search results and dataset context. +- Keep `enriched_description` concise (<= 240 chars). + +# TASK +1) Read the full JSON from the input path. +2) For each selected item, use its `enrichment_query` to search the web. +3) Produce an `enriched_description` for each item. +4) Output the SAME pruned JSON structure as input, but remove + `enrichment_query` and add `enriched_description`. 
+ +# FIELD MINIMUMS (do not add name/description) +- dataflow: `id` +- data_structure_definition: `id` +- component (dimension/attribute/measure): `id` +- concept: `id`, `concept_scheme_id`, `enriched_description` +- representation: `type` +- codelist: `id` +- code: `id`, `enriched_description` +- referenced_concept_schemes: `id` + +Write ONLY the JSON file to the output path. From b4089df45a9f180a5db5d7c395abd8700af3e3f6 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 13:01:36 +0000 Subject: [PATCH 03/15] Add SDMX enrichment merge tool --- .../sdmx/sdmx_enrichment_merge.py | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tools/agentic_import/sdmx/sdmx_enrichment_merge.py diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py new file mode 100644 index 0000000000..41cb38183c --- /dev/null +++ b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional + +from absl import app +from absl import flags +from absl import logging + +_FLAGS = flags.FLAGS + + +def _define_flags(): + try: + flags.DEFINE_string('input_metadata_json', None, + 'Path to base SDMX metadata JSON (required)') + flags.mark_flag_as_required('input_metadata_json') + + flags.DEFINE_string('input_enriched_items_json', None, + 'Path to enriched items JSON (required)') + flags.mark_flag_as_required('input_enriched_items_json') + + flags.DEFINE_string('output_path', None, + 'Path to output enriched metadata JSON (required)') + flags.mark_flag_as_required('output_path') + except flags.DuplicateFlagError: + pass + + +@dataclass(frozen=True) +class MergeTarget: + path: str + match_key: str = 'id' + + +_MERGE_TARGETS = [ + MergeTarget('dataflows'), + MergeTarget('dataflows.data_structure_definition.dimensions'), + MergeTarget('dataflows.data_structure_definition.attributes'), + MergeTarget('dataflows.data_structure_definition.measures'), + MergeTarget('dataflows.data_structure_definition.dimensions.concept'), + MergeTarget('dataflows.data_structure_definition.attributes.concept'), + MergeTarget('dataflows.data_structure_definition.measures.concept'), + MergeTarget( + 'dataflows.data_structure_definition.dimensions.representation.codelist.codes' + ), + MergeTarget( + 'dataflows.data_structure_definition.attributes.representation.codelist.codes' + ), + MergeTarget( + 'dataflows.data_structure_definition.measures.representation.codelist.codes' + ), + MergeTarget('dataflows.referenced_concept_schemes'), + MergeTarget('dataflows.referenced_concept_schemes.concepts'), +] + + +class EnrichmentMerger: + def __init__(self, base_data: Dict[str, Any], + enriched_data: Dict[str, Any]): + self._base = base_data + self._enriched = enriched_data + self._targets = _MERGE_TARGETS + + def merge(self) -> 
Dict[str, Any]: + self._merge_targets() + return self._base + + def _merge_targets(self) -> None: + for target in self._targets: + base_nodes = list(self._find_nodes(self._base, target.path)) + enriched_nodes = list(self._find_nodes(self._enriched, target.path)) + if not base_nodes and enriched_nodes: + logging.warning( + "Enriched data has path '%s' not present in base JSON", + target.path) + continue + + base_by_key = { + node.get(target.match_key): node + for node in base_nodes + if isinstance(node, dict) and node.get(target.match_key) + } + for enriched_node in enriched_nodes: + if not isinstance(enriched_node, dict): + continue + match_value = enriched_node.get(target.match_key) + if not match_value: + continue + base_node = base_by_key.get(match_value) + if not base_node: + logging.warning( + "No base match for %s='%s' at path '%s'", target.match_key, + match_value, target.path) + continue + self._merge_node(base_node, enriched_node, target.path) + + def _merge_node(self, base_node: Dict[str, Any], + enriched_node: Dict[str, Any], path: str) -> None: + if 'enriched_description' in enriched_node: + if 'enriched_description' in base_node: + logging.warning( + "Overwriting enriched_description at %s id=%s", path, + base_node.get('id')) + base_node['enriched_description'] = enriched_node[ + 'enriched_description'] + + def _find_nodes(self, data: Dict[str, Any], + path: str) -> Iterable[Dict[str, Any]]: + parts = path.split('.') + current = [data] + for part in parts: + next_level = [] + for node in current: + if not isinstance(node, dict): + continue + value = node.get(part) + if isinstance(value, list): + next_level.extend([item for item in value if isinstance( + item, dict)]) + elif isinstance(value, dict): + next_level.append(value) + current = next_level + if not current: + break + return current + + +def _load_json(path: Path) -> Dict[str, Any]: + with open(path, 'r') as f: + return json.load(f) + + +def _write_json(path: Path, data: Dict[str, Any]) -> None: + with open(path, 'w') as f: + json.dump(data, f, indent=2) + + +def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, + output_path: str) -> None: + base_data = _load_json(Path(input_metadata_json)) + enriched_data = _load_json(Path(input_enriched_items_json)) + merged = EnrichmentMerger(base_data, enriched_data).merge() + _write_json(Path(output_path), merged) + + +def main(_): + merge_enrichment(_FLAGS.input_metadata_json, + _FLAGS.input_enriched_items_json, _FLAGS.output_path) + logging.info("Merged enriched descriptions into base metadata JSON") + return 0 + + +if __name__ == '__main__': + _define_flags() + app.run(main) From 108fef6ad8ade9248d3ee11e168dc762ddc378b7 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 13:10:07 +0000 Subject: [PATCH 04/15] lint changes --- .../sdmx/fetch_enrichment_data.py | 20 ++++++------- .../sdmx/find_enrichment_items.py | 29 +++++++++---------- .../sdmx/sdmx_enrichment_merge.py | 19 ++++++------ 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/fetch_enrichment_data.py index 1ecf61c1be..ab2878e664 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data.py @@ -91,6 +91,7 @@ class RunResult: class EnrichmentDataFetcher: + def __init__(self, config: Config): self._config = config self._working_dir = Path( @@ -224,16 +225,15 @@ def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: 
def _run_subprocess(self, command: str) -> int: try: - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) + process = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) while True: output = process.stdout.readline() diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/find_enrichment_items.py index 2f47ac5063..9da2221f03 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items.py +++ b/tools/agentic_import/sdmx/find_enrichment_items.py @@ -34,9 +34,8 @@ def _define_flags(): try: - flags.DEFINE_string( - 'input_metadata_json', None, - 'Path to input SDMX metadata JSON (required)') + flags.DEFINE_string('input_metadata_json', None, + 'Path to input SDMX metadata JSON (required)') flags.mark_flag_as_required('input_metadata_json') flags.DEFINE_string('output_path', None, @@ -92,6 +91,7 @@ class RunResult: class EnrichmentItemsFinder: + def __init__(self, config: Config): self._config = config self._working_dir = Path( @@ -142,8 +142,8 @@ def find_items_to_enrich(self) -> RunResult: "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." ) - logging.info( - "Launching gemini (cwd: %s): %s", self._working_dir, gemini_command) + logging.info("Launching gemini (cwd: %s): %s", self._working_dir, + gemini_command) logging.info("Gemini output will be saved to: %s", gemini_log_file) exit_code = self._run_subprocess(gemini_command) @@ -225,16 +225,15 @@ def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: def _run_subprocess(self, command: str) -> int: try: - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) + process = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) while True: output = process.stdout.readline() diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py index 41cb38183c..14320dbf99 100644 --- a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py +++ b/tools/agentic_import/sdmx/sdmx_enrichment_merge.py @@ -72,8 +72,9 @@ class MergeTarget: class EnrichmentMerger: - def __init__(self, base_data: Dict[str, Any], - enriched_data: Dict[str, Any]): + + def __init__(self, base_data: Dict[str, Any], enriched_data: Dict[str, + Any]): self._base = base_data self._enriched = enriched_data self._targets = _MERGE_TARGETS @@ -105,9 +106,8 @@ def _merge_targets(self) -> None: continue base_node = base_by_key.get(match_value) if not base_node: - logging.warning( - "No base match for %s='%s' at path '%s'", target.match_key, - match_value, target.path) + logging.warning("No base match for %s='%s' at path '%s'", + target.match_key, match_value, target.path) continue self._merge_node(base_node, enriched_node, target.path) @@ -115,9 +115,8 @@ def _merge_node(self, base_node: Dict[str, Any], enriched_node: Dict[str, Any], path: str) -> None: if 'enriched_description' in enriched_node: if 'enriched_description' in base_node: - logging.warning( - 
"Overwriting enriched_description at %s id=%s", path, - base_node.get('id')) + logging.warning("Overwriting enriched_description at %s id=%s", + path, base_node.get('id')) base_node['enriched_description'] = enriched_node[ 'enriched_description'] @@ -132,8 +131,8 @@ def _find_nodes(self, data: Dict[str, Any], continue value = node.get(part) if isinstance(value, list): - next_level.extend([item for item in value if isinstance( - item, dict)]) + next_level.extend( + [item for item in value if isinstance(item, dict)]) elif isinstance(value, dict): next_level.append(value) current = next_level From b1850c3e07638d94a9da746b11ed37e47015e1b0 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 13:11:44 +0000 Subject: [PATCH 05/15] Document SDMX enrichment tools --- tools/agentic_import/sdmx/README.md | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tools/agentic_import/sdmx/README.md diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md new file mode 100644 index 0000000000..ac7dfd0434 --- /dev/null +++ b/tools/agentic_import/sdmx/README.md @@ -0,0 +1,53 @@ +# SDMX Enrichment Tools + +This folder contains three standalone tools for SDMX metadata enrichment. +Each tool supports CLI usage and can be called programmatically. + +## 1) find_enrichment_items.py +Selects which SDMX codes/concepts need enrichment and generates +`enrichment_query` values using full dataset context. + +CLI usage: +``` +python tools/agentic_import/sdmx/find_enrichment_items.py \ + --input_metadata_json="/path/to/metadata.json" \ + --output_path="/path/to/items_to_enrich.json" \ + --gemini_cli="gemini" \ + --enable_sandboxing +``` + +Output: +- A pruned JSON that preserves the original structure but keeps only selected + items with `enrichment_query`. Name/description fields are omitted. + +## 2) fetch_enrichment_data.py +Uses Gemini CLI web search to populate `enriched_description` for each selected +item. + +CLI usage: +``` +python tools/agentic_import/sdmx/fetch_enrichment_data.py \ + --input_items_json="/path/to/items_to_enrich.json" \ + --output_path="/path/to/enriched_items.json" \ + --gemini_cli="gemini" \ + --enable_sandboxing +``` + +Output: +- A pruned JSON in the same structure as the input, with `enriched_description` + added and `enrichment_query` removed. + +## 3) sdmx_enrichment_merge.py +Merges `enriched_description` into the base metadata JSON. + +CLI usage: +``` +python tools/agentic_import/sdmx/sdmx_enrichment_merge.py \ + --input_metadata_json="/path/to/metadata.json" \ + --input_enriched_items_json="/path/to/enriched_items.json" \ + --output_path="/path/to/metadata_enriched.json" +``` + +Output: +- A full metadata JSON with `enriched_description` merged into the matching + codes and concepts. From dfc3b86fb047d614c41f9de3f2dcecb901096601 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Mon, 19 Jan 2026 16:20:35 +0000 Subject: [PATCH 06/15] Require dataset prefix for Gemini runs Add dataset_prefix to SDMX tools and docs. 
--- tools/agentic_import/sdmx/README.md | 2 ++ tools/agentic_import/sdmx/fetch_enrichment_data.py | 12 +++++++++++- tools/agentic_import/sdmx/find_enrichment_items.py | 12 +++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md index ac7dfd0434..c9c4416ccb 100644 --- a/tools/agentic_import/sdmx/README.md +++ b/tools/agentic_import/sdmx/README.md @@ -11,6 +11,7 @@ CLI usage: ``` python tools/agentic_import/sdmx/find_enrichment_items.py \ --input_metadata_json="/path/to/metadata.json" \ + --dataset_prefix="oecd_prices" \ --output_path="/path/to/items_to_enrich.json" \ --gemini_cli="gemini" \ --enable_sandboxing @@ -28,6 +29,7 @@ CLI usage: ``` python tools/agentic_import/sdmx/fetch_enrichment_data.py \ --input_items_json="/path/to/items_to_enrich.json" \ + --dataset_prefix="oecd_prices" \ --output_path="/path/to/enriched_items.json" \ --gemini_cli="gemini" \ --enable_sandboxing diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/fetch_enrichment_data.py index ab2878e664..22e3e89b55 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data.py @@ -38,6 +38,10 @@ def _define_flags(): 'Path to input items JSON (required)') flags.mark_flag_as_required('input_items_json') + flags.DEFINE_string('dataset_prefix', None, + 'Dataset prefix for run id (required, non-empty)') + flags.mark_flag_as_required('dataset_prefix') + flags.DEFINE_string('output_path', None, 'Path to output items JSON (required)') flags.mark_flag_as_required('output_path') @@ -72,6 +76,7 @@ def _define_flags(): @dataclass class Config: input_items_json: str + dataset_prefix: str output_path: str dry_run: bool = False skip_confirmation: bool = False @@ -98,6 +103,10 @@ def __init__(self, config: Config): config.working_dir).resolve() if config.working_dir else Path.cwd() self._input_path = self._resolve_path(config.input_items_json) self._output_path = self._resolve_path(config.output_path) + self._dataset_prefix = (config.dataset_prefix or '').strip() + + if not self._dataset_prefix: + raise ValueError("dataset_prefix must be a non-empty string.") if not self._input_path.exists(): raise FileNotFoundError( @@ -109,7 +118,7 @@ def __init__(self, config: Config): self._datacommons_dir.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"gemini_{timestamp}" + self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" self._run_dir = self._datacommons_dir / 'runs' / self._run_id self._run_dir.mkdir(parents=True, exist_ok=True) @@ -250,6 +259,7 @@ def _run_subprocess(self, command: str) -> int: def prepare_config() -> Config: return Config(input_items_json=_FLAGS.input_items_json, + dataset_prefix=_FLAGS.dataset_prefix, output_path=_FLAGS.output_path, dry_run=_FLAGS.dry_run, skip_confirmation=_FLAGS.skip_confirmation, diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/find_enrichment_items.py index 9da2221f03..8807b397ae 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items.py +++ b/tools/agentic_import/sdmx/find_enrichment_items.py @@ -38,6 +38,10 @@ def _define_flags(): 'Path to input SDMX metadata JSON (required)') flags.mark_flag_as_required('input_metadata_json') + flags.DEFINE_string('dataset_prefix', None, + 'Dataset prefix for run id (required, non-empty)') + flags.mark_flag_as_required('dataset_prefix') + 
flags.DEFINE_string('output_path', None, 'Path to output items JSON (required)') flags.mark_flag_as_required('output_path') @@ -72,6 +76,7 @@ def _define_flags(): @dataclass class Config: input_metadata_json: str + dataset_prefix: str output_path: str dry_run: bool = False skip_confirmation: bool = False @@ -98,6 +103,10 @@ def __init__(self, config: Config): config.working_dir).resolve() if config.working_dir else Path.cwd() self._input_path = self._resolve_path(config.input_metadata_json) self._output_path = self._resolve_path(config.output_path) + self._dataset_prefix = (config.dataset_prefix or '').strip() + + if not self._dataset_prefix: + raise ValueError("dataset_prefix must be a non-empty string.") if not self._input_path.exists(): raise FileNotFoundError( @@ -109,7 +118,7 @@ def __init__(self, config: Config): self._datacommons_dir.mkdir(parents=True, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"gemini_{timestamp}" + self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" self._run_dir = self._datacommons_dir / 'runs' / self._run_id self._run_dir.mkdir(parents=True, exist_ok=True) @@ -250,6 +259,7 @@ def _run_subprocess(self, command: str) -> int: def prepare_config() -> Config: return Config(input_metadata_json=_FLAGS.input_metadata_json, + dataset_prefix=_FLAGS.dataset_prefix, output_path=_FLAGS.output_path, dry_run=_FLAGS.dry_run, skip_confirmation=_FLAGS.skip_confirmation, From 1a7b29df90da7f16946b8aa9d047ff041c770688 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 06:26:31 +0000 Subject: [PATCH 07/15] test: add sdmx enrichment fixtures Cover SDMX merge and dry-run flows --- .../sdmx/fetch_enrichment_data_test.py | 59 +++++++++++++ .../sdmx/find_enrichment_items_test.py | 58 +++++++++++++ .../sdmx/sdmx_enrichment_merge_test.py | 49 +++++++++++ .../sdmx/testdata/sample_enriched_items.json | 84 ++++++++++++++++++ .../sdmx/testdata/sample_metadata.json | 56 ++++++++++++ .../sample_metadata_enriched_expected.json | 85 +++++++++++++++++++ 6 files changed, 391 insertions(+) create mode 100644 tools/agentic_import/sdmx/fetch_enrichment_data_test.py create mode 100644 tools/agentic_import/sdmx/find_enrichment_items_test.py create mode 100644 tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py create mode 100644 tools/agentic_import/sdmx/testdata/sample_enriched_items.json create mode 100644 tools/agentic_import/sdmx/testdata/sample_metadata.json create mode 100644 tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py new file mode 100644 index 0000000000..cfb1c2a974 --- /dev/null +++ b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
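+
+"""Tests for EnrichmentDataFetcher covering the dry-run flow: prompt
+generation, run directory layout, and the generated Gemini command."""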
+ +import json +import os +import tempfile +import unittest +from pathlib import Path + +from tools.agentic_import.sdmx.fetch_enrichment_data import (Config, + EnrichmentDataFetcher) + + +class EnrichmentDataFetcherTest(unittest.TestCase): + + def test_dry_run_creates_prompt_and_run_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + input_path = Path(tmpdir) / 'items.json' + input_path.write_text(json.dumps({"items": []})) + output_path = Path(tmpdir) / 'out' / 'items_enriched.json' + + config = Config( + input_items_json=str(input_path), + dataset_prefix='demo', + output_path=str(output_path), + dry_run=True, + skip_confirmation=True, + enable_sandboxing=False, + working_dir=tmpdir, + ) + + fetcher = EnrichmentDataFetcher(config) + result = fetcher.fetch_enrichment_data() + + self.assertTrue(result.run_id.startswith('demo_gemini_')) + self.assertTrue(result.run_dir.is_dir()) + self.assertTrue(result.prompt_path.is_file()) + self.assertTrue(result.gemini_log_path.is_absolute()) + self.assertEqual(result.prompt_path.parent, result.run_dir) + self.assertIn(str(result.prompt_path), result.gemini_command) + self.assertIn(str(result.gemini_log_path), result.gemini_command) + self.assertTrue(output_path.parent.is_dir()) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/find_enrichment_items_test.py new file mode 100644 index 0000000000..ec07983e9e --- /dev/null +++ b/tools/agentic_import/sdmx/find_enrichment_items_test.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
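+
+"""Tests for EnrichmentItemsFinder covering the dry-run flow: prompt
+generation, run directory layout, and the generated Gemini command."""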
+ +import json +import tempfile +import unittest +from pathlib import Path + +from tools.agentic_import.sdmx.find_enrichment_items import (Config, + EnrichmentItemsFinder) + + +class EnrichmentItemsFinderTest(unittest.TestCase): + + def test_dry_run_creates_prompt_and_run_dir(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + input_path = Path(tmpdir) / 'metadata.json' + input_path.write_text(json.dumps({"dataflows": []})) + output_path = Path(tmpdir) / 'out' / 'items.json' + + config = Config( + input_metadata_json=str(input_path), + dataset_prefix='demo', + output_path=str(output_path), + dry_run=True, + skip_confirmation=True, + enable_sandboxing=False, + working_dir=tmpdir, + ) + + finder = EnrichmentItemsFinder(config) + result = finder.find_items_to_enrich() + + self.assertTrue(result.run_id.startswith('demo_gemini_')) + self.assertTrue(result.run_dir.is_dir()) + self.assertTrue(result.prompt_path.is_file()) + self.assertTrue(result.gemini_log_path.is_absolute()) + self.assertEqual(result.prompt_path.parent, result.run_dir) + self.assertIn(str(result.prompt_path), result.gemini_command) + self.assertIn(str(result.gemini_log_path), result.gemini_command) + self.assertTrue(output_path.parent.is_dir()) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py b/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py new file mode 100644 index 0000000000..6b424a3e54 --- /dev/null +++ b/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
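+
+"""Tests for merge_enrichment, verifying enriched descriptions are merged
+into the base SDMX metadata across all supported list paths."""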
+ +import json +import os +import tempfile +import unittest +from pathlib import Path + +from deepdiff.diff import DeepDiff + +from tools.agentic_import.sdmx.sdmx_enrichment_merge import merge_enrichment + +_TESTDATA_DIR = Path(os.path.dirname(__file__)) / 'testdata' +_BASE_JSON = _TESTDATA_DIR / 'sample_metadata.json' +_ENRICHED_JSON = _TESTDATA_DIR / 'sample_enriched_items.json' +_EXPECTED_JSON = _TESTDATA_DIR / 'sample_metadata_enriched_expected.json' + + +class EnrichmentMergeTest(unittest.TestCase): + + def test_merge_enriched_description_across_lists(self) -> None: + with tempfile.TemporaryDirectory() as tmpdir: + output_path = Path(tmpdir) / 'merged.json' + merge_enrichment(str(_BASE_JSON), str(_ENRICHED_JSON), + str(output_path)) + + merged = json.loads(output_path.read_text()) + + expected = json.loads(_EXPECTED_JSON.read_text()) + diff = DeepDiff(expected, merged, ignore_order=True) + self.assertFalse(diff, msg=str(diff)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/testdata/sample_enriched_items.json b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json new file mode 100644 index 0000000000..6fad7d28ad --- /dev/null +++ b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json @@ -0,0 +1,84 @@ +{ + "dataflows": [ + { + "id": "DF1", + "enriched_description": "Flow One enriched", + "data_structure_definition": { + "dimensions": [ + { + "id": "DIM1", + "enriched_description": "Dimension enriched", + "concept": { + "id": "C1", + "enriched_description": "Concept C1 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "CODE1", + "enriched_description": "Code 1 enriched" + }, + { + "id": "CODE2", + "enriched_description": "Code 2 enriched" + } + ] + } + } + } + ], + "attributes": [ + { + "id": "ATTR1", + "enriched_description": "Attribute enriched", + "concept": { + "id": "C2", + "enriched_description": "Concept C2 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "ACODE1", + "enriched_description": "Attr code enriched" + } + ] + } + } + } + ], + "measures": [ + { + "id": "MEAS1", + "enriched_description": "Measure enriched", + "concept": { + "id": "C3", + "enriched_description": "Concept C3 enriched" + } + } + ] + }, + "referenced_concept_schemes": [ + { + "id": "CS1", + "enriched_description": "Scheme enriched", + "concepts": [ + { + "id": "CON1", + "enriched_description": "Concept 1 enriched" + }, + { + "id": "CON2", + "enriched_description": "Concept 2 enriched" + } + ] + } + ] + }, + { + "id": "DF3", + "enriched_description": "No base match" + } + ] +} diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata.json b/tools/agentic_import/sdmx/testdata/sample_metadata.json new file mode 100644 index 0000000000..e121b00df6 --- /dev/null +++ b/tools/agentic_import/sdmx/testdata/sample_metadata.json @@ -0,0 +1,56 @@ +{ + "dataflows": [ + { + "id": "DF1", + "name": "Flow One", + "data_structure_definition": { + "dimensions": [ + { + "id": "DIM1", + "concept": {"id": "C1"}, + "representation": { + "codelist": { + "codes": [ + {"id": "CODE1"}, + {"id": "CODE2"} + ] + } + } + } + ], + "attributes": [ + { + "id": "ATTR1", + "concept": {"id": "C2"}, + "representation": { + "codelist": { + "codes": [ + {"id": "ACODE1"} + ] + } + } + } + ], + "measures": [ + { + "id": "MEAS1", + "concept": {"id": "C3"} + } + ] + }, + "referenced_concept_schemes": [ + { + "id": "CS1", + "concepts": [ + {"id": "CON1"}, + {"id": "CON2"} + ] + } + ] + }, + { + "id": "DF2", + "name": "Flow Two" + } 
+ ] +} diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json new file mode 100644 index 0000000000..1828b0d023 --- /dev/null +++ b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json @@ -0,0 +1,85 @@ +{ + "dataflows": [ + { + "id": "DF1", + "name": "Flow One", + "enriched_description": "Flow One enriched", + "data_structure_definition": { + "dimensions": [ + { + "id": "DIM1", + "concept": { + "id": "C1", + "enriched_description": "Concept C1 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "CODE1", + "enriched_description": "Code 1 enriched" + }, + { + "id": "CODE2", + "enriched_description": "Code 2 enriched" + } + ] + } + }, + "enriched_description": "Dimension enriched" + } + ], + "attributes": [ + { + "id": "ATTR1", + "concept": { + "id": "C2", + "enriched_description": "Concept C2 enriched" + }, + "representation": { + "codelist": { + "codes": [ + { + "id": "ACODE1", + "enriched_description": "Attr code enriched" + } + ] + } + }, + "enriched_description": "Attribute enriched" + } + ], + "measures": [ + { + "id": "MEAS1", + "concept": { + "id": "C3", + "enriched_description": "Concept C3 enriched" + }, + "enriched_description": "Measure enriched" + } + ] + }, + "referenced_concept_schemes": [ + { + "id": "CS1", + "concepts": [ + { + "id": "CON1", + "enriched_description": "Concept 1 enriched" + }, + { + "id": "CON2", + "enriched_description": "Concept 2 enriched" + } + ], + "enriched_description": "Scheme enriched" + } + ] + }, + { + "id": "DF2", + "name": "Flow Two" + } + ] +} From 45bf22f543537677d512fbbe7f0930d6988a39f3 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 09:02:45 +0000 Subject: [PATCH 08/15] test: assert SDMX prompt params --- .../sdmx/fetch_enrichment_data_test.py | 24 ++++++++++++++++--- .../sdmx/find_enrichment_items_test.py | 24 ++++++++++++++++--- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py index cfb1c2a974..541a8c7e8d 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py @@ -19,6 +19,9 @@ import tempfile import unittest from pathlib import Path +from unittest import mock + +from jinja2 import Template from tools.agentic_import.sdmx.fetch_enrichment_data import (Config, EnrichmentDataFetcher) @@ -43,17 +46,32 @@ def test_dry_run_creates_prompt_and_run_dir(self) -> None: ) fetcher = EnrichmentDataFetcher(config) - result = fetcher.fetch_enrichment_data() + with mock.patch("jinja2.environment.Template.render", + autospec=True, + side_effect=Template.render) as render_mock: + result = fetcher.fetch_enrichment_data() self.assertTrue(result.run_id.startswith('demo_gemini_')) self.assertTrue(result.run_dir.is_dir()) self.assertTrue(result.prompt_path.is_file()) self.assertTrue(result.gemini_log_path.is_absolute()) self.assertEqual(result.prompt_path.parent, result.run_dir) - self.assertIn(str(result.prompt_path), result.gemini_command) - self.assertIn(str(result.gemini_log_path), result.gemini_command) + expected_command = ( + f"cat '{result.prompt_path.resolve()}' | " + f"{config.gemini_cli or 'gemini'} " + f"{'--sandbox' if config.enable_sandboxing else ''} " + f"-y 2>&1 | tee '{result.gemini_log_path.resolve()}'") + self.assertEqual(result.gemini_command, expected_command) 
self.assertTrue(output_path.parent.is_dir()) + self.assertEqual(render_mock.call_count, 1) + _, render_kwargs = render_mock.call_args + self.assertEqual( + render_kwargs, { + "input_items_abs": str(input_path.resolve()), + "output_path_abs": str(output_path.resolve()), + }) + if __name__ == '__main__': unittest.main() diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/find_enrichment_items_test.py index ec07983e9e..d135be3d0b 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items_test.py +++ b/tools/agentic_import/sdmx/find_enrichment_items_test.py @@ -18,6 +18,9 @@ import tempfile import unittest from pathlib import Path +from unittest import mock + +from jinja2 import Template from tools.agentic_import.sdmx.find_enrichment_items import (Config, EnrichmentItemsFinder) @@ -42,17 +45,32 @@ def test_dry_run_creates_prompt_and_run_dir(self) -> None: ) finder = EnrichmentItemsFinder(config) - result = finder.find_items_to_enrich() + with mock.patch("jinja2.environment.Template.render", + autospec=True, + side_effect=Template.render) as render_mock: + result = finder.find_items_to_enrich() self.assertTrue(result.run_id.startswith('demo_gemini_')) self.assertTrue(result.run_dir.is_dir()) self.assertTrue(result.prompt_path.is_file()) self.assertTrue(result.gemini_log_path.is_absolute()) self.assertEqual(result.prompt_path.parent, result.run_dir) - self.assertIn(str(result.prompt_path), result.gemini_command) - self.assertIn(str(result.gemini_log_path), result.gemini_command) + expected_command = ( + f"cat '{result.prompt_path.resolve()}' | " + f"{config.gemini_cli or 'gemini'} " + f"{'--sandbox' if config.enable_sandboxing else ''} " + f"-y 2>&1 | tee '{result.gemini_log_path.resolve()}'") + self.assertEqual(result.gemini_command, expected_command) self.assertTrue(output_path.parent.is_dir()) + self.assertEqual(render_mock.call_count, 1) + _, render_kwargs = render_mock.call_args + self.assertEqual( + render_kwargs, { + "input_metadata_abs": str(input_path.resolve()), + "output_path_abs": str(output_path.resolve()), + }) + if __name__ == '__main__': unittest.main() From bda8cbd39c5d9453c26f733716a98794ebd68fb7 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 09:03:43 +0000 Subject: [PATCH 09/15] lint fix --- tools/agentic_import/sdmx/fetch_enrichment_data_test.py | 4 ++-- tools/agentic_import/sdmx/find_enrichment_items_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py index 541a8c7e8d..fb1b1609c3 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py +++ b/tools/agentic_import/sdmx/fetch_enrichment_data_test.py @@ -23,8 +23,8 @@ from jinja2 import Template -from tools.agentic_import.sdmx.fetch_enrichment_data import (Config, - EnrichmentDataFetcher) +from tools.agentic_import.sdmx.fetch_enrichment_data import ( + Config, EnrichmentDataFetcher) class EnrichmentDataFetcherTest(unittest.TestCase): diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/find_enrichment_items_test.py index d135be3d0b..3c0865178f 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items_test.py +++ b/tools/agentic_import/sdmx/find_enrichment_items_test.py @@ -22,8 +22,8 @@ from jinja2 import Template -from tools.agentic_import.sdmx.find_enrichment_items import (Config, - EnrichmentItemsFinder) +from tools.agentic_import.sdmx.find_enrichment_items import 
( + Config, EnrichmentItemsFinder) class EnrichmentItemsFinderTest(unittest.TestCase): From f8692f5c507c98b8cf1a79cc6930d387e7754e21 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 09:57:24 +0000 Subject: [PATCH 10/15] Rename SDMX metadata enricher tools --- tools/agentic_import/sdmx/README.md | 12 ++++++------ ...enrichment_data.py => metadata_enricher_fetch.py} | 4 ++-- ..._data_test.py => metadata_enricher_fetch_test.py} | 2 +- ...enrichment_items.py => metadata_enricher_find.py} | 4 ++-- ..._items_test.py => metadata_enricher_find_test.py} | 2 +- ...nrichment_merge.py => metadata_enricher_merge.py} | 0 ...merge_test.py => metadata_enricher_merge_test.py} | 2 +- ...a_prompt.j2 => metadata_enricher_fetch_prompt.j2} | 0 ...ms_prompt.j2 => metadata_enricher_find_prompt.j2} | 0 9 files changed, 13 insertions(+), 13 deletions(-) rename tools/agentic_import/sdmx/{fetch_enrichment_data.py => metadata_enricher_fetch.py} (98%) rename tools/agentic_import/sdmx/{fetch_enrichment_data_test.py => metadata_enricher_fetch_test.py} (97%) rename tools/agentic_import/sdmx/{find_enrichment_items.py => metadata_enricher_find.py} (98%) rename tools/agentic_import/sdmx/{find_enrichment_items_test.py => metadata_enricher_find_test.py} (97%) rename tools/agentic_import/sdmx/{sdmx_enrichment_merge.py => metadata_enricher_merge.py} (100%) rename tools/agentic_import/sdmx/{sdmx_enrichment_merge_test.py => metadata_enricher_merge_test.py} (95%) rename tools/agentic_import/sdmx/templates/{fetch_enrichment_data_prompt.j2 => metadata_enricher_fetch_prompt.j2} (100%) rename tools/agentic_import/sdmx/templates/{find_enrichment_items_prompt.j2 => metadata_enricher_find_prompt.j2} (100%) diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md index c9c4416ccb..8fdb06b21d 100644 --- a/tools/agentic_import/sdmx/README.md +++ b/tools/agentic_import/sdmx/README.md @@ -3,13 +3,13 @@ This folder contains three standalone tools for SDMX metadata enrichment. Each tool supports CLI usage and can be called programmatically. -## 1) find_enrichment_items.py +## 1) metadata_enricher_find.py Selects which SDMX codes/concepts need enrichment and generates `enrichment_query` values using full dataset context. CLI usage: ``` -python tools/agentic_import/sdmx/find_enrichment_items.py \ +python tools/agentic_import/sdmx/metadata_enricher_find.py \ --input_metadata_json="/path/to/metadata.json" \ --dataset_prefix="oecd_prices" \ --output_path="/path/to/items_to_enrich.json" \ @@ -21,13 +21,13 @@ Output: - A pruned JSON that preserves the original structure but keeps only selected items with `enrichment_query`. Name/description fields are omitted. -## 2) fetch_enrichment_data.py +## 2) metadata_enricher_fetch.py Uses Gemini CLI web search to populate `enriched_description` for each selected item. CLI usage: ``` -python tools/agentic_import/sdmx/fetch_enrichment_data.py \ +python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ --input_items_json="/path/to/items_to_enrich.json" \ --dataset_prefix="oecd_prices" \ --output_path="/path/to/enriched_items.json" \ @@ -39,12 +39,12 @@ Output: - A pruned JSON in the same structure as the input, with `enriched_description` added and `enrichment_query` removed. -## 3) sdmx_enrichment_merge.py +## 3) metadata_enricher_merge.py Merges `enriched_description` into the base metadata JSON. 
CLI usage: ``` -python tools/agentic_import/sdmx/sdmx_enrichment_merge.py \ +python tools/agentic_import/sdmx/metadata_enricher_merge.py \ --input_metadata_json="/path/to/metadata.json" \ --input_enriched_items_json="/path/to/enriched_items.json" \ --output_path="/path/to/metadata_enriched.json" diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data.py b/tools/agentic_import/sdmx/metadata_enricher_fetch.py similarity index 98% rename from tools/agentic_import/sdmx/fetch_enrichment_data.py rename to tools/agentic_import/sdmx/metadata_enricher_fetch.py index 22e3e89b55..1fdaee6684 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch.py @@ -172,14 +172,14 @@ def _resolve_path(self, path: str) -> Path: def _generate_prompt(self) -> Path: template_dir = os.path.join(_SCRIPT_DIR, 'templates') env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('fetch_enrichment_data_prompt.j2') + template = env.get_template('metadata_enricher_fetch_prompt.j2') rendered_prompt = template.render( input_items_abs=str(self._input_path), output_path_abs=str(self._output_path), ) - output_file = self._run_dir / 'fetch_enrichment_data_prompt.md' + output_file = self._run_dir / 'metadata_enricher_fetch_prompt.md' with open(output_file, 'w') as f: f.write(rendered_prompt) diff --git a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py b/tools/agentic_import/sdmx/metadata_enricher_fetch_test.py similarity index 97% rename from tools/agentic_import/sdmx/fetch_enrichment_data_test.py rename to tools/agentic_import/sdmx/metadata_enricher_fetch_test.py index fb1b1609c3..494381f72e 100644 --- a/tools/agentic_import/sdmx/fetch_enrichment_data_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch_test.py @@ -23,7 +23,7 @@ from jinja2 import Template -from tools.agentic_import.sdmx.fetch_enrichment_data import ( +from tools.agentic_import.sdmx.metadata_enricher_fetch import ( Config, EnrichmentDataFetcher) diff --git a/tools/agentic_import/sdmx/find_enrichment_items.py b/tools/agentic_import/sdmx/metadata_enricher_find.py similarity index 98% rename from tools/agentic_import/sdmx/find_enrichment_items.py rename to tools/agentic_import/sdmx/metadata_enricher_find.py index 8807b397ae..e9c610eca6 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find.py @@ -172,14 +172,14 @@ def _resolve_path(self, path: str) -> Path: def _generate_prompt(self) -> Path: template_dir = os.path.join(_SCRIPT_DIR, 'templates') env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('find_enrichment_items_prompt.j2') + template = env.get_template('metadata_enricher_find_prompt.j2') rendered_prompt = template.render( input_metadata_abs=str(self._input_path), output_path_abs=str(self._output_path), ) - output_file = self._run_dir / 'find_enrichment_items_prompt.md' + output_file = self._run_dir / 'metadata_enricher_find_prompt.md' with open(output_file, 'w') as f: f.write(rendered_prompt) diff --git a/tools/agentic_import/sdmx/find_enrichment_items_test.py b/tools/agentic_import/sdmx/metadata_enricher_find_test.py similarity index 97% rename from tools/agentic_import/sdmx/find_enrichment_items_test.py rename to tools/agentic_import/sdmx/metadata_enricher_find_test.py index 3c0865178f..4f45a227cf 100644 --- a/tools/agentic_import/sdmx/find_enrichment_items_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find_test.py @@ -22,7 +22,7 @@ 
from jinja2 import Template -from tools.agentic_import.sdmx.find_enrichment_items import ( +from tools.agentic_import.sdmx.metadata_enricher_find import ( Config, EnrichmentItemsFinder) diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py similarity index 100% rename from tools/agentic_import/sdmx/sdmx_enrichment_merge.py rename to tools/agentic_import/sdmx/metadata_enricher_merge.py diff --git a/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py similarity index 95% rename from tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py rename to tools/agentic_import/sdmx/metadata_enricher_merge_test.py index 6b424a3e54..d9670efbd5 100644 --- a/tools/agentic_import/sdmx/sdmx_enrichment_merge_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py @@ -22,7 +22,7 @@ from deepdiff.diff import DeepDiff -from tools.agentic_import.sdmx.sdmx_enrichment_merge import merge_enrichment +from tools.agentic_import.sdmx.metadata_enricher_merge import merge_enrichment _TESTDATA_DIR = Path(os.path.dirname(__file__)) / 'testdata' _BASE_JSON = _TESTDATA_DIR / 'sample_metadata.json' diff --git a/tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 b/tools/agentic_import/sdmx/templates/metadata_enricher_fetch_prompt.j2 similarity index 100% rename from tools/agentic_import/sdmx/templates/fetch_enrichment_data_prompt.j2 rename to tools/agentic_import/sdmx/templates/metadata_enricher_fetch_prompt.j2 diff --git a/tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 b/tools/agentic_import/sdmx/templates/metadata_enricher_find_prompt.j2 similarity index 100% rename from tools/agentic_import/sdmx/templates/find_enrichment_items_prompt.j2 rename to tools/agentic_import/sdmx/templates/metadata_enricher_find_prompt.j2 From 7c85eff8fb33cb66ed0ba68fb3e5b4da4ad125ce Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Tue, 20 Jan 2026 16:12:50 +0000 Subject: [PATCH 11/15] Extract gemini prompt runner Refactor SDMX metadata scripts to reuse runner --- .../common/gemini_prompt_runner.py | 186 ++++++++++++++++++ .../sdmx/metadata_enricher_fetch.py | 146 ++++---------- .../sdmx/metadata_enricher_find.py | 146 ++++---------- 3 files changed, 252 insertions(+), 226 deletions(-) create mode 100644 tools/agentic_import/common/gemini_prompt_runner.py diff --git a/tools/agentic_import/common/gemini_prompt_runner.py b/tools/agentic_import/common/gemini_prompt_runner.py new file mode 100644 index 0000000000..19ae073d8f --- /dev/null +++ b/tools/agentic_import/common/gemini_prompt_runner.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
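A usage sketch for the runner this module defines (the dataset prefix, template name, and output path below are placeholders, not part of the API):

```
from pathlib import Path

from tools.agentic_import.common.gemini_prompt_runner import GeminiPromptRunner

# dry_run=True renders the prompt and builds the command without
# invoking the Gemini CLI.
runner = GeminiPromptRunner(dataset_prefix='my_dataset', dry_run=True)
prompt = runner.render_prompt(template_dir=Path('templates'),
                              template_name='my_prompt.j2',
                              context={'output_path_abs': '/tmp/out.json'},
                              prompt_filename='my_prompt.md')
result = runner.run(prompt)
print(result.gemini_command)
```
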
+ +import shutil +import subprocess +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Callable, Mapping, Optional + +from absl import logging +from jinja2 import Environment, FileSystemLoader + + +@dataclass +class GeminiRunResult: + run_id: str + run_dir: Path + prompt_path: Path + gemini_log_path: Path + gemini_command: str + sandbox_enabled: bool + + +class GeminiPromptRunner: + + def __init__(self, + dataset_prefix: str, + working_dir: Optional[str] = None, + run_root: str = '.datacommons/runs', + dry_run: bool = False, + skip_confirmation: bool = False, + enable_sandboxing: bool = False, + gemini_cli: Optional[str] = None): + self._working_dir = Path( + working_dir).resolve() if working_dir else Path.cwd() + self._dataset_prefix = (dataset_prefix or '').strip() + if not self._dataset_prefix: + raise ValueError("dataset_prefix must be a non-empty string.") + + self._run_root = run_root + self._dry_run = dry_run + self._skip_confirmation = skip_confirmation + self._enable_sandboxing = enable_sandboxing + self._gemini_cli = gemini_cli + + self._run_id = self._build_run_id() + self._run_dir = self._create_run_dir() + + @property + def run_id(self) -> str: + return self._run_id + + @property + def run_dir(self) -> Path: + return self._run_dir + + @property + def working_dir(self) -> Path: + return self._working_dir + + def render_prompt(self, template_dir: Path, template_name: str, + context: Mapping[str, str], prompt_filename: str) -> Path: + # If other LLM runners are added later, extract rendering into a separate utility. + env = Environment(loader=FileSystemLoader(str(template_dir))) + template = env.get_template(template_name) + + rendered_prompt = template.render(**context) + output_file = self._run_dir / prompt_filename + with open(output_file, 'w') as f: + f.write(rendered_prompt) + + logging.info("Generated prompt written to: %s", output_file) + return output_file + + def run(self, + prompt_file: Path, + log_filename: str = 'gemini_cli.log', + log_path_override: Optional[Path] = None, + confirm_fn: Optional[Callable[[Path], bool]] = None, + cancel_log_message: Optional[str] = None) -> GeminiRunResult: + gemini_log_path = (log_path_override.resolve() if log_path_override else + (self._run_dir / log_filename)) + gemini_command = self._build_gemini_command(prompt_file, + gemini_log_path) + + result = GeminiRunResult(run_id=self._run_id, + run_dir=self._run_dir, + prompt_path=prompt_file, + gemini_log_path=gemini_log_path, + gemini_command=gemini_command, + sandbox_enabled=self._enable_sandboxing) + + if self._dry_run: + logging.info( + "Dry run mode: Prompt file generated at %s. " + "Skipping Gemini CLI execution.", prompt_file) + return result + + if not self._skip_confirmation and confirm_fn is not None: + if not confirm_fn(prompt_file): + if cancel_log_message: + logging.info(cancel_log_message) + return result + + if not self._check_gemini_cli_available(): + logging.warning( + "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." 
+ ) + + logging.info("Launching gemini (cwd: %s): %s", self._working_dir, + gemini_command) + logging.info("Gemini output will be saved to: %s", gemini_log_path) + + exit_code = self._run_subprocess(gemini_command) + if exit_code == 0: + logging.info("Gemini CLI completed successfully") + return result + + raise RuntimeError( + f"Gemini CLI execution failed with exit code {exit_code}") + + def _build_run_id(self) -> str: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + return f"{self._dataset_prefix}_gemini_{timestamp}" + + def _create_run_dir(self) -> Path: + run_root = Path(self._run_root).expanduser() + if not run_root.is_absolute(): + run_root = self._working_dir / run_root + run_root.mkdir(parents=True, exist_ok=True) + + run_dir = run_root / self._run_id + run_dir.mkdir(parents=True, exist_ok=True) + return run_dir + + def _check_gemini_cli_available(self) -> bool: + if self._gemini_cli: + return True + return shutil.which('gemini') is not None + + def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: + prompt_path = prompt_file.resolve() + log_path = log_file.resolve() + gemini_cmd = self._gemini_cli or 'gemini' + sandbox_flag = "--sandbox" if self._enable_sandboxing else "" + return ( + f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" + ) + + def _run_subprocess(self, command: str) -> int: + try: + process = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + cwd=self._working_dir, + encoding='utf-8', + errors='replace', + bufsize=1, + universal_newlines=True) + + while True: + output = process.stdout.readline() + if output == '' and process.poll() is not None: + break + if output: + print(output.rstrip()) + + return process.wait() + except Exception as e: + logging.error("Error running subprocess: %s", str(e)) + return 1 diff --git a/tools/agentic_import/sdmx/metadata_enricher_fetch.py b/tools/agentic_import/sdmx/metadata_enricher_fetch.py index 1fdaee6684..505184b6d8 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_fetch.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch.py @@ -16,20 +16,24 @@ import os import platform -import shutil -import subprocess +import sys from dataclasses import dataclass -from datetime import datetime from pathlib import Path from typing import Optional +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = Path(_SCRIPT_DIR).resolve().parents[3] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + from absl import app from absl import flags from absl import logging -from jinja2 import Environment, FileSystemLoader + +from tools.agentic_import.common.gemini_prompt_runner import ( + GeminiPromptRunner, GeminiRunResult) _FLAGS = flags.FLAGS -_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) def _define_flags(): @@ -85,16 +89,6 @@ class Config: working_dir: Optional[str] = None -@dataclass -class RunResult: - run_id: str - run_dir: Path - prompt_path: Path - gemini_log_path: Path - gemini_command: str - sandbox_enabled: bool - - class EnrichmentDataFetcher: def __init__(self, config: Config): @@ -114,54 +108,23 @@ def __init__(self, config: Config): self._output_path.parent.mkdir(parents=True, exist_ok=True) - self._datacommons_dir = self._working_dir / '.datacommons' - self._datacommons_dir.mkdir(parents=True, exist_ok=True) - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" - self._run_dir = self._datacommons_dir / 'runs' / 
self._run_id - self._run_dir.mkdir(parents=True, exist_ok=True) + self._runner = GeminiPromptRunner( + dataset_prefix=self._dataset_prefix, + working_dir=str(self._working_dir), + dry_run=config.dry_run, + skip_confirmation=config.skip_confirmation, + enable_sandboxing=config.enable_sandboxing, + gemini_cli=config.gemini_cli, + ) - def fetch_enrichment_data(self) -> RunResult: + def fetch_enrichment_data(self) -> GeminiRunResult: prompt_file = self._generate_prompt() - gemini_log_file = self._run_dir / 'gemini_cli.log' - gemini_command = self._build_gemini_command(prompt_file, - gemini_log_file) - - result = RunResult(run_id=self._run_id, - run_dir=self._run_dir, - prompt_path=prompt_file, - gemini_log_path=gemini_log_file, - gemini_command=gemini_command, - sandbox_enabled=self._config.enable_sandboxing) - - if self._config.dry_run: - logging.info( - "Dry run mode: Prompt file generated at %s. " - "Skipping Gemini CLI execution.", prompt_file) - return result - - if not self._config.skip_confirmation: - if not self._get_user_confirmation(prompt_file): - logging.info("Enrichment data fetch cancelled by user.") - return result - - if not self._check_gemini_cli_available(): - logging.warning( - "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." - ) - - logging.info("Launching gemini (cwd: %s): %s", self._working_dir, - gemini_command) - logging.info("Gemini output will be saved to: %s", gemini_log_file) - - exit_code = self._run_subprocess(gemini_command) - if exit_code == 0: - logging.info("Gemini CLI completed successfully") - return result - - raise RuntimeError( - f"Gemini CLI execution failed with exit code {exit_code}") + return self._runner.run( + prompt_file, + log_filename='gemini_cli.log', + confirm_fn=self._get_user_confirmation, + cancel_log_message="Enrichment data fetch cancelled by user.", + ) def _resolve_path(self, path: str) -> Path: resolved = Path(path).expanduser() @@ -170,22 +133,17 @@ def _resolve_path(self, path: str) -> Path: return resolved.resolve() def _generate_prompt(self) -> Path: - template_dir = os.path.join(_SCRIPT_DIR, 'templates') - env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('metadata_enricher_fetch_prompt.j2') - - rendered_prompt = template.render( - input_items_abs=str(self._input_path), - output_path_abs=str(self._output_path), + template_dir = Path(_SCRIPT_DIR) / 'templates' + return self._runner.render_prompt( + template_dir=template_dir, + template_name='metadata_enricher_fetch_prompt.j2', + context={ + "input_items_abs": str(self._input_path), + "output_path_abs": str(self._output_path), + }, + prompt_filename='metadata_enricher_fetch_prompt.md', ) - output_file = self._run_dir / 'metadata_enricher_fetch_prompt.md' - with open(output_file, 'w') as f: - f.write(rendered_prompt) - - logging.info("Generated prompt written to: %s", output_file) - return output_file - def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\n" + "=" * 60) print("SDMX ENRICHMENT DATA FETCH SUMMARY") @@ -218,44 +176,6 @@ def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\nData fetch cancelled by user.") return False - def _check_gemini_cli_available(self) -> bool: - if self._config.gemini_cli: - return True - return shutil.which('gemini') is not None - - def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: - prompt_path = prompt_file.resolve() - log_path = log_file.resolve() - gemini_cmd = self._config.gemini_cli or 'gemini' - sandbox_flag = 
"--sandbox" if self._config.enable_sandboxing else "" - return ( - f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" - ) - - def _run_subprocess(self, command: str) -> int: - try: - process = subprocess.Popen(command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) - - while True: - output = process.stdout.readline() - if output == '' and process.poll() is not None: - break - if output: - print(output.rstrip()) - - return process.wait() - except Exception as e: - logging.error("Error running subprocess: %s", str(e)) - return 1 - def prepare_config() -> Config: return Config(input_items_json=_FLAGS.input_items_json, diff --git a/tools/agentic_import/sdmx/metadata_enricher_find.py b/tools/agentic_import/sdmx/metadata_enricher_find.py index e9c610eca6..222c7eda13 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_find.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find.py @@ -16,20 +16,24 @@ import os import platform -import shutil -import subprocess +import sys from dataclasses import dataclass -from datetime import datetime from pathlib import Path from typing import Optional +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_REPO_ROOT = Path(_SCRIPT_DIR).resolve().parents[3] +if str(_REPO_ROOT) not in sys.path: + sys.path.insert(0, str(_REPO_ROOT)) + from absl import app from absl import flags from absl import logging -from jinja2 import Environment, FileSystemLoader + +from tools.agentic_import.common.gemini_prompt_runner import ( + GeminiPromptRunner, GeminiRunResult) _FLAGS = flags.FLAGS -_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) def _define_flags(): @@ -85,16 +89,6 @@ class Config: working_dir: Optional[str] = None -@dataclass -class RunResult: - run_id: str - run_dir: Path - prompt_path: Path - gemini_log_path: Path - gemini_command: str - sandbox_enabled: bool - - class EnrichmentItemsFinder: def __init__(self, config: Config): @@ -114,54 +108,23 @@ def __init__(self, config: Config): self._output_path.parent.mkdir(parents=True, exist_ok=True) - self._datacommons_dir = self._working_dir / '.datacommons' - self._datacommons_dir.mkdir(parents=True, exist_ok=True) - - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - self._run_id = f"{self._dataset_prefix}_gemini_{timestamp}" - self._run_dir = self._datacommons_dir / 'runs' / self._run_id - self._run_dir.mkdir(parents=True, exist_ok=True) + self._runner = GeminiPromptRunner( + dataset_prefix=self._dataset_prefix, + working_dir=str(self._working_dir), + dry_run=config.dry_run, + skip_confirmation=config.skip_confirmation, + enable_sandboxing=config.enable_sandboxing, + gemini_cli=config.gemini_cli, + ) - def find_items_to_enrich(self) -> RunResult: + def find_items_to_enrich(self) -> GeminiRunResult: prompt_file = self._generate_prompt() - gemini_log_file = self._run_dir / 'gemini_cli.log' - gemini_command = self._build_gemini_command(prompt_file, - gemini_log_file) - - result = RunResult(run_id=self._run_id, - run_dir=self._run_dir, - prompt_path=prompt_file, - gemini_log_path=gemini_log_file, - gemini_command=gemini_command, - sandbox_enabled=self._config.enable_sandboxing) - - if self._config.dry_run: - logging.info( - "Dry run mode: Prompt file generated at %s. 
" - "Skipping Gemini CLI execution.", prompt_file) - return result - - if not self._config.skip_confirmation: - if not self._get_user_confirmation(prompt_file): - logging.info("Enrichment item selection cancelled by user.") - return result - - if not self._check_gemini_cli_available(): - logging.warning( - "Gemini CLI not found in PATH. Will attempt to run anyway (may work if aliased)." - ) - - logging.info("Launching gemini (cwd: %s): %s", self._working_dir, - gemini_command) - logging.info("Gemini output will be saved to: %s", gemini_log_file) - - exit_code = self._run_subprocess(gemini_command) - if exit_code == 0: - logging.info("Gemini CLI completed successfully") - return result - - raise RuntimeError( - f"Gemini CLI execution failed with exit code {exit_code}") + return self._runner.run( + prompt_file, + log_filename='gemini_cli.log', + confirm_fn=self._get_user_confirmation, + cancel_log_message="Enrichment item selection cancelled by user.", + ) def _resolve_path(self, path: str) -> Path: resolved = Path(path).expanduser() @@ -170,22 +133,17 @@ def _resolve_path(self, path: str) -> Path: return resolved.resolve() def _generate_prompt(self) -> Path: - template_dir = os.path.join(_SCRIPT_DIR, 'templates') - env = Environment(loader=FileSystemLoader(template_dir)) - template = env.get_template('metadata_enricher_find_prompt.j2') - - rendered_prompt = template.render( - input_metadata_abs=str(self._input_path), - output_path_abs=str(self._output_path), + template_dir = Path(_SCRIPT_DIR) / 'templates' + return self._runner.render_prompt( + template_dir=template_dir, + template_name='metadata_enricher_find_prompt.j2', + context={ + "input_metadata_abs": str(self._input_path), + "output_path_abs": str(self._output_path), + }, + prompt_filename='metadata_enricher_find_prompt.md', ) - output_file = self._run_dir / 'metadata_enricher_find_prompt.md' - with open(output_file, 'w') as f: - f.write(rendered_prompt) - - logging.info("Generated prompt written to: %s", output_file) - return output_file - def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\n" + "=" * 60) print("SDMX ENRICHMENT ITEM SELECTION SUMMARY") @@ -218,44 +176,6 @@ def _get_user_confirmation(self, prompt_file: Path) -> bool: print("\nSelection cancelled by user.") return False - def _check_gemini_cli_available(self) -> bool: - if self._config.gemini_cli: - return True - return shutil.which('gemini') is not None - - def _build_gemini_command(self, prompt_file: Path, log_file: Path) -> str: - prompt_path = prompt_file.resolve() - log_path = log_file.resolve() - gemini_cmd = self._config.gemini_cli or 'gemini' - sandbox_flag = "--sandbox" if self._config.enable_sandboxing else "" - return ( - f"cat '{prompt_path}' | {gemini_cmd} {sandbox_flag} -y 2>&1 | tee '{log_path}'" - ) - - def _run_subprocess(self, command: str) -> int: - try: - process = subprocess.Popen(command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - shell=True, - cwd=self._working_dir, - encoding='utf-8', - errors='replace', - bufsize=1, - universal_newlines=True) - - while True: - output = process.stdout.readline() - if output == '' and process.poll() is not None: - break - if output: - print(output.rstrip()) - - return process.wait() - except Exception as e: - logging.error("Error running subprocess: %s", str(e)) - return 1 - def prepare_config() -> Config: return Config(input_metadata_json=_FLAGS.input_metadata_json, From e50a67ae77149561bbe8c871c6cf8ea2700e7cc1 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 21 Jan 2026 
04:04:46 +0000 Subject: [PATCH 12/15] Add json merge helper and tests Use insert-only merge for SDMX fixtures --- tools/agentic_import/common/json_merge.py | 152 ++++++++++++++++++ .../agentic_import/common/json_merge_test.py | 150 +++++++++++++++++ .../sdmx/metadata_enricher_merge.py | 108 +------------ .../sdmx/testdata/sample_enriched_items.json | 25 +++ .../sdmx/testdata/sample_metadata.json | 19 ++- .../sample_metadata_enriched_expected.json | 29 ++++ 6 files changed, 380 insertions(+), 103 deletions(-) create mode 100644 tools/agentic_import/common/json_merge.py create mode 100644 tools/agentic_import/common/json_merge_test.py diff --git a/tools/agentic_import/common/json_merge.py b/tools/agentic_import/common/json_merge.py new file mode 100644 index 0000000000..fb9e142bf8 --- /dev/null +++ b/tools/agentic_import/common/json_merge.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List + +from absl import logging + + +def merge_json(base: Any, + incoming: Any, + key_field: str = 'id', + allow_overwrite: bool = False) -> Any: + """Merges incoming JSON into base, mutating base where possible.""" + return _merge_value(base, + incoming, + key_field=key_field, + allow_overwrite=allow_overwrite, + path='') + + +def _merge_value(base: Any, incoming: Any, key_field: str, + allow_overwrite: bool, path: str) -> Any: + # Dispatch by type to preserve structure and scope merges correctly. + if isinstance(base, dict) and isinstance(incoming, dict): + return _merge_dict(base, + incoming, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + if isinstance(base, list) and isinstance(incoming, list): + return _merge_list(base, + incoming, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + return _merge_leaf(base, incoming, allow_overwrite, path) + + +def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], key_field: str, + allow_overwrite: bool, path: str) -> Dict[str, Any]: + for key, incoming_value in incoming.items(): + next_path = _join_path(path, key) + if key not in base: + base[key] = incoming_value + continue + + base_value = base[key] + base[key] = _merge_value(base_value, + incoming_value, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=next_path) + return base + + +def _merge_list(base: List[Any], incoming: List[Any], key_field: str, + allow_overwrite: bool, path: str) -> List[Any]: + # Keep base ordering; append unmatched items to avoid data loss. + # Build a keyed index for scoped merges inside this list. + base_by_key: Dict[Any, Dict[str, Any]] = {} + for index, item in enumerate(base): + if not isinstance(item, dict): + logging.warning( + f"Base list item at {path}[index={index}] is not a dict; skipping keyed merge." + ) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Base list item at {path}[index={index}] missing key '{key_field}'; skipping keyed merge." 
+ ) + continue + if key_value in base_by_key: + logging.warning( + f"Duplicate key '{key_value}' in base list at {path}; using first occurrence." + ) + continue + base_by_key[key_value] = item + + seen_incoming_keys = set() + # Merge incoming items by key; append when a match is not possible. + for index, item in enumerate(incoming): + if not isinstance(item, dict): + logging.warning( + f"Incoming list item at {path}[index={index}] is not a dict; appending." + ) + base.append(item) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Incoming list item at {path}[index={index}] missing key '{key_field}'; appending." + ) + base.append(item) + continue + if key_value in seen_incoming_keys: + logging.warning( + f"Duplicate key '{key_value}' in incoming list at {path}; merging again." + ) + seen_incoming_keys.add(key_value) + + base_item = base_by_key.get(key_value) + if base_item is None: + base.append(item) + base_by_key[key_value] = item + continue + + item_path = _list_item_path(path, key_field, key_value) + _merge_dict(base_item, + item, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=item_path) + return base + + +def _merge_leaf(base: Any, incoming: Any, allow_overwrite: bool, + path: str) -> Any: + # Leaf values follow the overwrite policy to avoid accidental data loss. + if allow_overwrite: + if base != incoming: + logging.warning( + f"Overwriting value at {path} from {base!r} to {incoming!r}.") + return incoming + + if base != incoming: + logging.warning( + f"Preserving base value at {path}; incoming value ignored.") + return base + + +def _join_path(path: str, key: str) -> str: + if not path: + return key + return f"{path}.{key}" + + +def _list_item_path(path: str, key_field: str, key_value: Any) -> str: + return f"{path}[{key_field}={key_value}]" diff --git a/tools/agentic_import/common/json_merge_test.py b/tools/agentic_import/common/json_merge_test.py new file mode 100644 index 0000000000..e13452fa9b --- /dev/null +++ b/tools/agentic_import/common/json_merge_test.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
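In brief, the merge semantics these tests exercise, as a worked example (values are illustrative):

```
from tools.agentic_import.common.json_merge import merge_json

base = {"a": 1, "items": [{"id": "x"}]}
incoming = {"a": 2, "b": 3, "items": [{"id": "x", "d": 4}, {"id": "y"}]}

# Insert-only by default: "a" keeps its base value, "b" is added,
# keyed list items merge by "id", and unmatched items are appended.
merge_json(base, incoming)
assert base == {"a": 1, "b": 3, "items": [{"id": "x", "d": 4}, {"id": "y"}]}
```
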
+ +import unittest + +from tools.agentic_import.common.json_merge import merge_json + + +class JsonMergeTest(unittest.TestCase): + + def test_insert_only_preserves_existing_leaf_values(self) -> None: + base = {"a": 1, "b": {"c": 2}} + incoming = {"a": 3, "b": {"d": 4}} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(merged["a"], 1) + self.assertEqual(merged["b"]["c"], 2) + self.assertEqual(merged["b"]["d"], 4) + + def test_allow_overwrite_updates_leaf_values(self) -> None: + base = {"a": 1} + incoming = {"a": 2} + + merged = merge_json(base, incoming, allow_overwrite=True) + + self.assertEqual(merged["a"], 2) + + def test_insert_only_keeps_existing_name_and_adds_new_fields(self) -> None: + base = {"item": {"name": "Base"}} + incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(merged["item"]["name"], "Base") + self.assertEqual(merged["item"]["enriched_description"], "New") + + def test_keyed_list_merge_respects_hierarchy(self) -> None: + base = { + "codelists": [ + { + "id": "CL1", + "codes": [{ + "id": "A" + },], + }, + { + "id": "CL2", + "codes": [{ + "id": "A" + },], + }, + ] + } + incoming = { + "codelists": [ + { + "id": + "CL1", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL1", + },], + }, + { + "id": + "CL2", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL2", + },], + }, + ] + } + + merged = merge_json(base, incoming, allow_overwrite=True) + + cl1_code = merged["codelists"][0]["codes"][0] + cl2_code = merged["codelists"][1]["codes"][0] + self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") + self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") + + def test_keyed_list_merge_with_custom_key(self) -> None: + base = {"items": [{"code": "X", "value": 1}]} + incoming = { + "items": [ + { + "code": "X", + "extra": 2 + }, + { + "code": "Y", + "value": 3 + }, + ] + } + + merged = merge_json(base, incoming, key_field="code") + + self.assertEqual(merged["items"][0]["extra"], 2) + self.assertEqual(merged["items"][1]["code"], "Y") + + def test_type_mismatch_respects_overwrite_policy(self) -> None: + base_keep = {"a": {"b": 1}} + base_replace = {"a": {"b": 1}} + incoming = {"a": [1, 2]} + + merged_keep = merge_json(base_keep, incoming, allow_overwrite=False) + merged_replace = merge_json(base_replace, + incoming, + allow_overwrite=True) + + self.assertEqual(merged_keep["a"], {"b": 1}) + self.assertEqual(merged_replace["a"], [1, 2]) + + def test_list_items_without_key_are_appended(self) -> None: + base = {"items": [{"id": "x"}]} + incoming = {"items": [{"name": "no_id"}]} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(len(merged["items"]), 2) + self.assertEqual(merged["items"][1]["name"], "no_id") + + def test_base_items_without_key_do_not_block_append(self) -> None: + base = {"items": [{"name": "base-only"}]} + incoming = {"items": [{"id": "x", "value": 1}]} + + merged = merge_json(base, incoming, allow_overwrite=False) + + self.assertEqual(len(merged["items"]), 2) + self.assertEqual(merged["items"][0]["name"], "base-only") + self.assertEqual(merged["items"][1]["id"], "x") + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py index 14320dbf99..65296f71e1 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge.py +++ 
b/tools/agentic_import/sdmx/metadata_enricher_merge.py @@ -15,14 +15,15 @@ # limitations under the License. import json -from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict from absl import app from absl import flags from absl import logging +from tools.agentic_import.common.json_merge import merge_json + _FLAGS = flags.FLAGS @@ -43,104 +44,6 @@ def _define_flags(): pass -@dataclass(frozen=True) -class MergeTarget: - path: str - match_key: str = 'id' - - -_MERGE_TARGETS = [ - MergeTarget('dataflows'), - MergeTarget('dataflows.data_structure_definition.dimensions'), - MergeTarget('dataflows.data_structure_definition.attributes'), - MergeTarget('dataflows.data_structure_definition.measures'), - MergeTarget('dataflows.data_structure_definition.dimensions.concept'), - MergeTarget('dataflows.data_structure_definition.attributes.concept'), - MergeTarget('dataflows.data_structure_definition.measures.concept'), - MergeTarget( - 'dataflows.data_structure_definition.dimensions.representation.codelist.codes' - ), - MergeTarget( - 'dataflows.data_structure_definition.attributes.representation.codelist.codes' - ), - MergeTarget( - 'dataflows.data_structure_definition.measures.representation.codelist.codes' - ), - MergeTarget('dataflows.referenced_concept_schemes'), - MergeTarget('dataflows.referenced_concept_schemes.concepts'), -] - - -class EnrichmentMerger: - - def __init__(self, base_data: Dict[str, Any], enriched_data: Dict[str, - Any]): - self._base = base_data - self._enriched = enriched_data - self._targets = _MERGE_TARGETS - - def merge(self) -> Dict[str, Any]: - self._merge_targets() - return self._base - - def _merge_targets(self) -> None: - for target in self._targets: - base_nodes = list(self._find_nodes(self._base, target.path)) - enriched_nodes = list(self._find_nodes(self._enriched, target.path)) - if not base_nodes and enriched_nodes: - logging.warning( - "Enriched data has path '%s' not present in base JSON", - target.path) - continue - - base_by_key = { - node.get(target.match_key): node - for node in base_nodes - if isinstance(node, dict) and node.get(target.match_key) - } - for enriched_node in enriched_nodes: - if not isinstance(enriched_node, dict): - continue - match_value = enriched_node.get(target.match_key) - if not match_value: - continue - base_node = base_by_key.get(match_value) - if not base_node: - logging.warning("No base match for %s='%s' at path '%s'", - target.match_key, match_value, target.path) - continue - self._merge_node(base_node, enriched_node, target.path) - - def _merge_node(self, base_node: Dict[str, Any], - enriched_node: Dict[str, Any], path: str) -> None: - if 'enriched_description' in enriched_node: - if 'enriched_description' in base_node: - logging.warning("Overwriting enriched_description at %s id=%s", - path, base_node.get('id')) - base_node['enriched_description'] = enriched_node[ - 'enriched_description'] - - def _find_nodes(self, data: Dict[str, Any], - path: str) -> Iterable[Dict[str, Any]]: - parts = path.split('.') - current = [data] - for part in parts: - next_level = [] - for node in current: - if not isinstance(node, dict): - continue - value = node.get(part) - if isinstance(value, list): - next_level.extend( - [item for item in value if isinstance(item, dict)]) - elif isinstance(value, dict): - next_level.append(value) - current = next_level - if not current: - break - return current - - def _load_json(path: Path) -> Dict[str, Any]: with open(path, 
'r') as f: return json.load(f) @@ -155,7 +58,10 @@ def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, output_path: str) -> None: base_data = _load_json(Path(input_metadata_json)) enriched_data = _load_json(Path(input_enriched_items_json)) - merged = EnrichmentMerger(base_data, enriched_data).merge() + merged = merge_json(base_data, + enriched_data, + key_field='id', + allow_overwrite=False) _write_json(Path(output_path), merged) diff --git a/tools/agentic_import/sdmx/testdata/sample_enriched_items.json b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json index 6fad7d28ad..d566992a50 100644 --- a/tools/agentic_import/sdmx/testdata/sample_enriched_items.json +++ b/tools/agentic_import/sdmx/testdata/sample_enriched_items.json @@ -14,18 +14,43 @@ }, "representation": { "codelist": { + "id": "CL1", + "name": "Enriched Codelist One", "codes": [ { "id": "CODE1", + "name": "Enriched Code1 CL1", "enriched_description": "Code 1 enriched" }, { "id": "CODE2", + "name": "Enriched Code2 CL1", "enriched_description": "Code 2 enriched" } ] } } + }, + { + "id": "DIM2", + "enriched_description": "Dimension two enriched", + "concept": { + "id": "C4", + "enriched_description": "Concept C4 enriched" + }, + "representation": { + "codelist": { + "id": "CL2", + "name": "Enriched Codelist Two", + "codes": [ + { + "id": "CODE1", + "name": "Enriched Code1 CL2", + "enriched_description": "Code 1 enriched CL2" + } + ] + } + } } ], "attributes": [ diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata.json b/tools/agentic_import/sdmx/testdata/sample_metadata.json index e121b00df6..5770feaf3a 100644 --- a/tools/agentic_import/sdmx/testdata/sample_metadata.json +++ b/tools/agentic_import/sdmx/testdata/sample_metadata.json @@ -10,9 +10,24 @@ "concept": {"id": "C1"}, "representation": { "codelist": { + "id": "CL1", + "name": "Base Codelist One", "codes": [ - {"id": "CODE1"}, - {"id": "CODE2"} + {"id": "CODE1", "name": "Base Code1 CL1"}, + {"id": "CODE2", "name": "Base Code2 CL1"} + ] + } + } + }, + { + "id": "DIM2", + "concept": {"id": "C4"}, + "representation": { + "codelist": { + "id": "CL2", + "name": "Base Codelist Two", + "codes": [ + {"id": "CODE1", "name": "Base Code1 CL2"} ] } } diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json index 1828b0d023..76b33b8c2b 100644 --- a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json +++ b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json @@ -14,19 +14,44 @@ }, "representation": { "codelist": { + "id": "CL1", + "name": "Base Codelist One", "codes": [ { "id": "CODE1", + "name": "Base Code1 CL1", "enriched_description": "Code 1 enriched" }, { "id": "CODE2", + "name": "Base Code2 CL1", "enriched_description": "Code 2 enriched" } ] } }, "enriched_description": "Dimension enriched" + }, + { + "id": "DIM2", + "concept": { + "id": "C4", + "enriched_description": "Concept C4 enriched" + }, + "representation": { + "codelist": { + "id": "CL2", + "name": "Base Codelist Two", + "codes": [ + { + "id": "CODE1", + "name": "Base Code1 CL2", + "enriched_description": "Code 1 enriched CL2" + } + ] + } + }, + "enriched_description": "Dimension two enriched" } ], "attributes": [ @@ -80,6 +105,10 @@ { "id": "DF2", "name": "Flow Two" + }, + { + "id": "DF3", + "enriched_description": "No base match" } ] } From 9ffdfa697ef7b3fa21be4b31d8153b3fb4c8cb75 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: 
Wed, 21 Jan 2026 06:03:30 +0000 Subject: [PATCH 13/15] Add field-whitelist JSON merge helper Rename module/tests and update SDMX merge usage --- .../agentic_import/common/json_merge_test.py | 150 --------------- .../{json_merge.py => merge_json_fields.py} | 95 +++++---- .../common/merge_json_fields_test.py | 182 ++++++++++++++++++ .../sdmx/metadata_enricher_merge.py | 11 +- .../sample_metadata_enriched_expected.json | 4 - 5 files changed, 249 insertions(+), 193 deletions(-) delete mode 100644 tools/agentic_import/common/json_merge_test.py rename tools/agentic_import/common/{json_merge.py => merge_json_fields.py} (60%) create mode 100644 tools/agentic_import/common/merge_json_fields_test.py diff --git a/tools/agentic_import/common/json_merge_test.py b/tools/agentic_import/common/json_merge_test.py deleted file mode 100644 index e13452fa9b..0000000000 --- a/tools/agentic_import/common/json_merge_test.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from tools.agentic_import.common.json_merge import merge_json - - -class JsonMergeTest(unittest.TestCase): - - def test_insert_only_preserves_existing_leaf_values(self) -> None: - base = {"a": 1, "b": {"c": 2}} - incoming = {"a": 3, "b": {"d": 4}} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(merged["a"], 1) - self.assertEqual(merged["b"]["c"], 2) - self.assertEqual(merged["b"]["d"], 4) - - def test_allow_overwrite_updates_leaf_values(self) -> None: - base = {"a": 1} - incoming = {"a": 2} - - merged = merge_json(base, incoming, allow_overwrite=True) - - self.assertEqual(merged["a"], 2) - - def test_insert_only_keeps_existing_name_and_adds_new_fields(self) -> None: - base = {"item": {"name": "Base"}} - incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(merged["item"]["name"], "Base") - self.assertEqual(merged["item"]["enriched_description"], "New") - - def test_keyed_list_merge_respects_hierarchy(self) -> None: - base = { - "codelists": [ - { - "id": "CL1", - "codes": [{ - "id": "A" - },], - }, - { - "id": "CL2", - "codes": [{ - "id": "A" - },], - }, - ] - } - incoming = { - "codelists": [ - { - "id": - "CL1", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL1", - },], - }, - { - "id": - "CL2", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL2", - },], - }, - ] - } - - merged = merge_json(base, incoming, allow_overwrite=True) - - cl1_code = merged["codelists"][0]["codes"][0] - cl2_code = merged["codelists"][1]["codes"][0] - self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") - self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") - - def test_keyed_list_merge_with_custom_key(self) -> None: - base = {"items": [{"code": "X", "value": 1}]} - incoming = { - "items": [ - { - "code": "X", - "extra": 2 - }, - { - "code": "Y", - "value": 3 - }, - 
] - } - - merged = merge_json(base, incoming, key_field="code") - - self.assertEqual(merged["items"][0]["extra"], 2) - self.assertEqual(merged["items"][1]["code"], "Y") - - def test_type_mismatch_respects_overwrite_policy(self) -> None: - base_keep = {"a": {"b": 1}} - base_replace = {"a": {"b": 1}} - incoming = {"a": [1, 2]} - - merged_keep = merge_json(base_keep, incoming, allow_overwrite=False) - merged_replace = merge_json(base_replace, - incoming, - allow_overwrite=True) - - self.assertEqual(merged_keep["a"], {"b": 1}) - self.assertEqual(merged_replace["a"], [1, 2]) - - def test_list_items_without_key_are_appended(self) -> None: - base = {"items": [{"id": "x"}]} - incoming = {"items": [{"name": "no_id"}]} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(len(merged["items"]), 2) - self.assertEqual(merged["items"][1]["name"], "no_id") - - def test_base_items_without_key_do_not_block_append(self) -> None: - base = {"items": [{"name": "base-only"}]} - incoming = {"items": [{"id": "x", "value": 1}]} - - merged = merge_json(base, incoming, allow_overwrite=False) - - self.assertEqual(len(merged["items"]), 2) - self.assertEqual(merged["items"][0]["name"], "base-only") - self.assertEqual(merged["items"][1]["id"], "x") - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/agentic_import/common/json_merge.py b/tools/agentic_import/common/merge_json_fields.py similarity index 60% rename from tools/agentic_import/common/json_merge.py rename to tools/agentic_import/common/merge_json_fields.py index fb9e142bf8..f0634c2715 100644 --- a/tools/agentic_import/common/json_merge.py +++ b/tools/agentic_import/common/merge_json_fields.py @@ -13,63 +13,85 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Merge helper for JSON-like data. -from typing import Any, Dict, List +Only fields listed in fields_to_update can be added or updated. +Traversal only follows containers already present in base and never creates +new dicts or lists along the way. +List items are matched by key (default id); unmatched items are ignored. +Container type mismatches are skipped. +When allow_overwrite is False, existing values are preserved. +""" + +from typing import Any, Dict, List, Set from absl import logging -def merge_json(base: Any, - incoming: Any, - key_field: str = 'id', - allow_overwrite: bool = False) -> Any: - """Merges incoming JSON into base, mutating base where possible.""" +def merge_json_fields(base: Any, + incoming: Any, + fields_to_update: List[str], + key_field: str = 'id', + allow_overwrite: bool = False) -> Any: + """Merges selected fields from incoming JSON into base.""" return _merge_value(base, incoming, + fields_to_update=set(fields_to_update), key_field=key_field, allow_overwrite=allow_overwrite, path='') -def _merge_value(base: Any, incoming: Any, key_field: str, - allow_overwrite: bool, path: str) -> Any: - # Dispatch by type to preserve structure and scope merges correctly. +def _merge_value(base: Any, incoming: Any, fields_to_update: Set[str], + key_field: str, allow_overwrite: bool, path: str) -> Any: + # Only traverse matching container types; leave base untouched otherwise. 
if isinstance(base, dict) and isinstance(incoming, dict): return _merge_dict(base, incoming, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=path) if isinstance(base, list) and isinstance(incoming, list): return _merge_list(base, incoming, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=path) - return _merge_leaf(base, incoming, allow_overwrite, path) + if type(base) != type(incoming): + location = path or 'root' + logging.warning(f"Type mismatch at {location}; skipping.") + return base -def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], key_field: str, +def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], + fields_to_update: Set[str], key_field: str, allow_overwrite: bool, path: str) -> Dict[str, Any]: for key, incoming_value in incoming.items(): next_path = _join_path(path, key) + if key in fields_to_update: + _merge_field(base, + key, + incoming_value, + allow_overwrite=allow_overwrite, + path=next_path) + continue if key not in base: - base[key] = incoming_value continue - - base_value = base[key] - base[key] = _merge_value(base_value, + base[key] = _merge_value(base[key], incoming_value, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=next_path) return base -def _merge_list(base: List[Any], incoming: List[Any], key_field: str, +def _merge_list(base: List[Any], incoming: List[Any], + fields_to_update: Set[str], key_field: str, allow_overwrite: bool, path: str) -> List[Any]: - # Keep base ordering; append unmatched items to avoid data loss. - # Build a keyed index for scoped merges inside this list. + # Keep base ordering; only merge keyed items already present in base. base_by_key: Dict[Any, Dict[str, Any]] = {} for index, item in enumerate(base): if not isinstance(item, dict): @@ -91,20 +113,17 @@ def _merge_list(base: List[Any], incoming: List[Any], key_field: str, base_by_key[key_value] = item seen_incoming_keys = set() - # Merge incoming items by key; append when a match is not possible. for index, item in enumerate(incoming): if not isinstance(item, dict): logging.warning( - f"Incoming list item at {path}[index={index}] is not a dict; appending." + f"Incoming list item at {path}[index={index}] is not a dict; ignoring." ) - base.append(item) continue key_value = item.get(key_field) if key_value is None: logging.warning( - f"Incoming list item at {path}[index={index}] missing key '{key_field}'; appending." + f"Incoming list item at {path}[index={index}] missing key '{key_field}'; ignoring." ) - base.append(item) continue if key_value in seen_incoming_keys: logging.warning( @@ -114,30 +133,38 @@ def _merge_list(base: List[Any], incoming: List[Any], key_field: str, base_item = base_by_key.get(key_value) if base_item is None: - base.append(item) - base_by_key[key_value] = item + item_path = _list_item_path(path, key_field, key_value) + logging.warning( + f"No base match for {item_path}; ignoring incoming list item.") continue item_path = _list_item_path(path, key_field, key_value) _merge_dict(base_item, item, + fields_to_update=fields_to_update, key_field=key_field, allow_overwrite=allow_overwrite, path=item_path) return base -def _merge_leaf(base: Any, incoming: Any, allow_overwrite: bool, - path: str) -> Any: - # Leaf values follow the overwrite policy to avoid accidental data loss. 
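The net effect of the field whitelist, as a worked example (data is illustrative):

```
from tools.agentic_import.common.merge_json_fields import merge_json_fields

base = {"dims": [{"id": "D1", "name": "Base"}]}
incoming = {"dims": [{"id": "D1", "name": "New", "enriched_description": "d"},
                     {"id": "D2", "enriched_description": "e"}]}

# Only whitelisted fields change: "name" is preserved, and the
# unmatched "D2" item is ignored rather than appended.
merge_json_fields(base, incoming, fields_to_update=["enriched_description"])
assert base == {"dims": [{"id": "D1", "name": "Base",
                          "enriched_description": "d"}]}
```
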
+def _merge_field(base: Dict[str, Any], key: str, incoming_value: Any, + allow_overwrite: bool, path: str) -> Any: + if key not in base: + base[key] = incoming_value + return base + + base_value = base[key] if allow_overwrite: - if base != incoming: - logging.warning( - f"Overwriting value at {path} from {base!r} to {incoming!r}.") - return incoming + if base_value != incoming_value: + logging.info( + f"Overwriting value at {path} from {base_value!r} to {incoming_value!r}." + ) + base[key] = incoming_value + return base - if base != incoming: - logging.warning( + if base_value != incoming_value: + logging.info( f"Preserving base value at {path}; incoming value ignored.") return base diff --git a/tools/agentic_import/common/merge_json_fields_test.py b/tools/agentic_import/common/merge_json_fields_test.py new file mode 100644 index 0000000000..4c24bde5aa --- /dev/null +++ b/tools/agentic_import/common/merge_json_fields_test.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from tools.agentic_import.common.merge_json_fields import merge_json_fields + + +class JsonMergeTest(unittest.TestCase): + + def test_updates_only_listed_fields(self) -> None: + base = {"item": {"name": "Base"}} + incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + self.assertEqual(merged["item"]["name"], "Base") + self.assertEqual(merged["item"]["enriched_description"], "New") + + def test_overwrite_policy_for_listed_fields(self) -> None: + base_keep = {"item": {"enriched_description": "Old"}} + base_overwrite = {"item": {"enriched_description": "Old"}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = merge_json_fields( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = merge_json_fields( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], "Old") + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + def test_type_mismatch_on_listed_field_respects_overwrite(self) -> None: + base_keep = {"item": {"enriched_description": {"a": 1}}} + base_overwrite = {"item": {"enriched_description": {"a": 1}}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = merge_json_fields( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = merge_json_fields( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], {"a": 1}) + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + def test_traversal_type_mismatch_is_skipped(self) -> None: + base = {"item": {"details": {"a": 1}}} 
+ incoming = {"item": {"details": ["x"]}} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(merged["item"]["details"], {"a": 1}) + + def test_keyed_list_merge_respects_hierarchy(self) -> None: + base = { + "codelists": [ + { + "id": "CL1", + "codes": [{ + "id": "A" + },], + }, + { + "id": "CL2", + "codes": [{ + "id": "A" + },], + }, + ] + } + incoming = { + "codelists": [ + { + "id": + "CL1", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL1", + },], + }, + { + "id": + "CL2", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL2", + },], + }, + ] + } + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + cl1_code = merged["codelists"][0]["codes"][0] + cl2_code = merged["codelists"][1]["codes"][0] + self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") + self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") + + def test_keyed_list_merge_with_custom_key(self) -> None: + base = {"items": [{"code": "X", "value": 1}]} + incoming = { + "items": [ + { + "code": "X", + "enriched_description": "X desc" + }, + { + "code": "Y", + "enriched_description": "Y desc" + }, + ] + } + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"], + key_field="code") + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["enriched_description"], "X desc") + + def test_list_items_without_key_are_ignored(self) -> None: + base = {"items": [{"id": "x"}]} + incoming = {"items": [{"name": "no_id"}]} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0], {"id": "x"}) + + def test_base_items_without_key_are_ignored(self) -> None: + base = {"items": [{"name": "base-only"}]} + incoming = {"items": [{"id": "x", "enriched_description": "desc"}]} + + merged = merge_json_fields(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["name"], "base-only") + self.assertNotIn("enriched_description", merged["items"][0]) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py index 65296f71e1..c997f8c7b3 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge.py @@ -22,7 +22,7 @@ from absl import flags from absl import logging -from tools.agentic_import.common.json_merge import merge_json +from tools.agentic_import.common.merge_json_fields import merge_json_fields _FLAGS = flags.FLAGS @@ -58,10 +58,11 @@ def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, output_path: str) -> None: base_data = _load_json(Path(input_metadata_json)) enriched_data = _load_json(Path(input_enriched_items_json)) - merged = merge_json(base_data, - enriched_data, - key_field='id', - allow_overwrite=False) + merged = merge_json_fields(base_data, + enriched_data, + fields_to_update=['enriched_description'], + key_field='id', + allow_overwrite=False) _write_json(Path(output_path), merged) diff --git a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json index 76b33b8c2b..ce6b3978db 100644 --- 
a/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json +++ b/tools/agentic_import/sdmx/testdata/sample_metadata_enriched_expected.json @@ -105,10 +105,6 @@ { "id": "DF2", "name": "Flow Two" - }, - { - "id": "DF3", - "enriched_description": "No base match" } ] } From 565ae084e1bc0eee98d4c74a8c8686d3900f5cc5 Mon Sep 17 00:00:00 2001 From: rohit kumar Date: Wed, 21 Jan 2026 07:52:57 +0000 Subject: [PATCH 14/15] refactor: move collection merge into sdmx tool --- .../common/gemini_prompt_runner.py | 1 + .../common/merge_json_fields.py | 179 ----------------- .../common/merge_json_fields_test.py | 182 ----------------- .../sdmx/metadata_enricher_fetch.py | 1 + .../sdmx/metadata_enricher_find.py | 1 + .../sdmx/metadata_enricher_merge.py | 189 +++++++++++++++++- .../sdmx/metadata_enricher_merge_test.py | 163 +++++++++++++++ 7 files changed, 347 insertions(+), 369 deletions(-) delete mode 100644 tools/agentic_import/common/merge_json_fields.py delete mode 100644 tools/agentic_import/common/merge_json_fields_test.py diff --git a/tools/agentic_import/common/gemini_prompt_runner.py b/tools/agentic_import/common/gemini_prompt_runner.py index 19ae073d8f..29b696730a 100644 --- a/tools/agentic_import/common/gemini_prompt_runner.py +++ b/tools/agentic_import/common/gemini_prompt_runner.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Render prompts and run the Gemini CLI with tracked run outputs.""" import shutil import subprocess diff --git a/tools/agentic_import/common/merge_json_fields.py b/tools/agentic_import/common/merge_json_fields.py deleted file mode 100644 index f0634c2715..0000000000 --- a/tools/agentic_import/common/merge_json_fields.py +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Merge helper for JSON-like data. - -Only fields listed in fields_to_update can be added or updated. -Traversal only follows containers already present in base and never creates -new dicts or lists along the way. -List items are matched by key (default id); unmatched items are ignored. -Container type mismatches are skipped. -When allow_overwrite is False, existing values are preserved. -""" - -from typing import Any, Dict, List, Set - -from absl import logging - - -def merge_json_fields(base: Any, - incoming: Any, - fields_to_update: List[str], - key_field: str = 'id', - allow_overwrite: bool = False) -> Any: - """Merges selected fields from incoming JSON into base.""" - return _merge_value(base, - incoming, - fields_to_update=set(fields_to_update), - key_field=key_field, - allow_overwrite=allow_overwrite, - path='') - - -def _merge_value(base: Any, incoming: Any, fields_to_update: Set[str], - key_field: str, allow_overwrite: bool, path: str) -> Any: - # Only traverse matching container types; leave base untouched otherwise. 
- if isinstance(base, dict) and isinstance(incoming, dict): - return _merge_dict(base, - incoming, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=path) - if isinstance(base, list) and isinstance(incoming, list): - return _merge_list(base, - incoming, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=path) - if type(base) != type(incoming): - location = path or 'root' - logging.warning(f"Type mismatch at {location}; skipping.") - return base - - -def _merge_dict(base: Dict[str, Any], incoming: Dict[str, Any], - fields_to_update: Set[str], key_field: str, - allow_overwrite: bool, path: str) -> Dict[str, Any]: - for key, incoming_value in incoming.items(): - next_path = _join_path(path, key) - if key in fields_to_update: - _merge_field(base, - key, - incoming_value, - allow_overwrite=allow_overwrite, - path=next_path) - continue - if key not in base: - continue - base[key] = _merge_value(base[key], - incoming_value, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=next_path) - return base - - -def _merge_list(base: List[Any], incoming: List[Any], - fields_to_update: Set[str], key_field: str, - allow_overwrite: bool, path: str) -> List[Any]: - # Keep base ordering; only merge keyed items already present in base. - base_by_key: Dict[Any, Dict[str, Any]] = {} - for index, item in enumerate(base): - if not isinstance(item, dict): - logging.warning( - f"Base list item at {path}[index={index}] is not a dict; skipping keyed merge." - ) - continue - key_value = item.get(key_field) - if key_value is None: - logging.warning( - f"Base list item at {path}[index={index}] missing key '{key_field}'; skipping keyed merge." - ) - continue - if key_value in base_by_key: - logging.warning( - f"Duplicate key '{key_value}' in base list at {path}; using first occurrence." - ) - continue - base_by_key[key_value] = item - - seen_incoming_keys = set() - for index, item in enumerate(incoming): - if not isinstance(item, dict): - logging.warning( - f"Incoming list item at {path}[index={index}] is not a dict; ignoring." - ) - continue - key_value = item.get(key_field) - if key_value is None: - logging.warning( - f"Incoming list item at {path}[index={index}] missing key '{key_field}'; ignoring." - ) - continue - if key_value in seen_incoming_keys: - logging.warning( - f"Duplicate key '{key_value}' in incoming list at {path}; merging again." - ) - seen_incoming_keys.add(key_value) - - base_item = base_by_key.get(key_value) - if base_item is None: - item_path = _list_item_path(path, key_field, key_value) - logging.warning( - f"No base match for {item_path}; ignoring incoming list item.") - continue - - item_path = _list_item_path(path, key_field, key_value) - _merge_dict(base_item, - item, - fields_to_update=fields_to_update, - key_field=key_field, - allow_overwrite=allow_overwrite, - path=item_path) - return base - - -def _merge_field(base: Dict[str, Any], key: str, incoming_value: Any, - allow_overwrite: bool, path: str) -> Any: - if key not in base: - base[key] = incoming_value - return base - - base_value = base[key] - if allow_overwrite: - if base_value != incoming_value: - logging.info( - f"Overwriting value at {path} from {base_value!r} to {incoming_value!r}." 
- ) - base[key] = incoming_value - return base - - if base_value != incoming_value: - logging.info( - f"Preserving base value at {path}; incoming value ignored.") - return base - - -def _join_path(path: str, key: str) -> str: - if not path: - return key - return f"{path}.{key}" - - -def _list_item_path(path: str, key_field: str, key_value: Any) -> str: - return f"{path}[{key_field}={key_value}]" diff --git a/tools/agentic_import/common/merge_json_fields_test.py b/tools/agentic_import/common/merge_json_fields_test.py deleted file mode 100644 index 4c24bde5aa..0000000000 --- a/tools/agentic_import/common/merge_json_fields_test.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from tools.agentic_import.common.merge_json_fields import merge_json_fields - - -class JsonMergeTest(unittest.TestCase): - - def test_updates_only_listed_fields(self) -> None: - base = {"item": {"name": "Base"}} - incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - - self.assertEqual(merged["item"]["name"], "Base") - self.assertEqual(merged["item"]["enriched_description"], "New") - - def test_overwrite_policy_for_listed_fields(self) -> None: - base_keep = {"item": {"enriched_description": "Old"}} - base_overwrite = {"item": {"enriched_description": "Old"}} - incoming = {"item": {"enriched_description": "New"}} - - merged_keep = merge_json_fields( - base_keep, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - merged_overwrite = merge_json_fields( - base_overwrite, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=True) - - self.assertEqual(merged_keep["item"]["enriched_description"], "Old") - self.assertEqual(merged_overwrite["item"]["enriched_description"], - "New") - - def test_type_mismatch_on_listed_field_respects_overwrite(self) -> None: - base_keep = {"item": {"enriched_description": {"a": 1}}} - base_overwrite = {"item": {"enriched_description": {"a": 1}}} - incoming = {"item": {"enriched_description": "New"}} - - merged_keep = merge_json_fields( - base_keep, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - merged_overwrite = merge_json_fields( - base_overwrite, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=True) - - self.assertEqual(merged_keep["item"]["enriched_description"], {"a": 1}) - self.assertEqual(merged_overwrite["item"]["enriched_description"], - "New") - - def test_traversal_type_mismatch_is_skipped(self) -> None: - base = {"item": {"details": {"a": 1}}} - incoming = {"item": {"details": ["x"]}} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"]) - - self.assertEqual(merged["item"]["details"], {"a": 1}) - - def test_keyed_list_merge_respects_hierarchy(self) -> None: - base = { - "codelists": [ - { 
- "id": "CL1", - "codes": [{ - "id": "A" - },], - }, - { - "id": "CL2", - "codes": [{ - "id": "A" - },], - }, - ] - } - incoming = { - "codelists": [ - { - "id": - "CL1", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL1", - },], - }, - { - "id": - "CL2", - "codes": [{ - "id": "A", - "enriched_description": "Code A in CL2", - },], - }, - ] - } - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"], - allow_overwrite=False) - - cl1_code = merged["codelists"][0]["codes"][0] - cl2_code = merged["codelists"][1]["codes"][0] - self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") - self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") - - def test_keyed_list_merge_with_custom_key(self) -> None: - base = {"items": [{"code": "X", "value": 1}]} - incoming = { - "items": [ - { - "code": "X", - "enriched_description": "X desc" - }, - { - "code": "Y", - "enriched_description": "Y desc" - }, - ] - } - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"], - key_field="code") - - self.assertEqual(len(merged["items"]), 1) - self.assertEqual(merged["items"][0]["enriched_description"], "X desc") - - def test_list_items_without_key_are_ignored(self) -> None: - base = {"items": [{"id": "x"}]} - incoming = {"items": [{"name": "no_id"}]} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"]) - - self.assertEqual(len(merged["items"]), 1) - self.assertEqual(merged["items"][0], {"id": "x"}) - - def test_base_items_without_key_are_ignored(self) -> None: - base = {"items": [{"name": "base-only"}]} - incoming = {"items": [{"id": "x", "enriched_description": "desc"}]} - - merged = merge_json_fields(base, - incoming, - fields_to_update=["enriched_description"]) - - self.assertEqual(len(merged["items"]), 1) - self.assertEqual(merged["items"][0]["name"], "base-only") - self.assertNotIn("enriched_description", merged["items"][0]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tools/agentic_import/sdmx/metadata_enricher_fetch.py b/tools/agentic_import/sdmx/metadata_enricher_fetch.py index 505184b6d8..eed3da1568 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_fetch.py +++ b/tools/agentic_import/sdmx/metadata_enricher_fetch.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Fetch enriched descriptions for selected SDMX items with Gemini CLI.""" import os import platform diff --git a/tools/agentic_import/sdmx/metadata_enricher_find.py b/tools/agentic_import/sdmx/metadata_enricher_find.py index 222c7eda13..eabaff845b 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_find.py +++ b/tools/agentic_import/sdmx/metadata_enricher_find.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Select SDMX items to enrich and generate enrichment queries.""" import os import platform diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py index c997f8c7b3..aba343aff0 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge.py @@ -13,19 +13,191 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Merge enriched SDMX descriptions into base metadata files."""
 
 import json
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Dict, List, Set, Union
 
 from absl import app
 from absl import flags
 from absl import logging
 
-from tools.agentic_import.common.merge_json_fields import merge_json_fields
-
 _FLAGS = flags.FLAGS
 
+DictOrList = Union[Dict[str, Any], List[Any]]
+
+
+class CollectionMerger:
+    """Merges selected fields from an incoming nested collection into a base one."""
+
+    def merge(self,
+              base: DictOrList,
+              incoming: DictOrList,
+              fields_to_update: List[str],
+              key_field: str = 'id',
+              allow_overwrite: bool = False) -> DictOrList:
+        """Merges values from `incoming` into `base` for a controlled set of fields.
+
+        The merge walks `base` and only descends into dicts/lists that already
+        exist in `base` (it does not create new containers). For dicts, only keys
+        listed in `fields_to_update` may be added or updated; other keys are
+        merged only when they already exist in `base`. For lists, items are
+        matched by `key_field` (default: "id"); incoming items with no match in
+        `base` are ignored. Container type mismatches are skipped.
+
+        Args:
+            base: Nested dict/list structure to update. Modified in place.
+            incoming: Nested dict/list structure providing candidate updates.
+            fields_to_update: Dict keys that are allowed to be added/updated.
+            key_field: Dict key used to match list items when merging lists.
+            allow_overwrite: If True, overwrite existing values for
+                `fields_to_update`. If False, existing `base` values are preserved
+                and incoming values are ignored.
+
+        Returns:
+            The updated `base` object.
+        """
+        return self._merge_value(base,
+                                 incoming,
+                                 fields_to_update=set(fields_to_update),
+                                 key_field=key_field,
+                                 allow_overwrite=allow_overwrite,
+                                 path='')
+
+    def _merge_value(self, base: Any, incoming: Any,
+                     fields_to_update: Set[str], key_field: str,
+                     allow_overwrite: bool, path: str) -> Any:
+        # Only traverse matching container types; leave base untouched otherwise.
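+        # Dicts only merge with dicts and lists only with lists; any other
+        # pairing returns the base value unchanged, with a warning logged
+        # when the two types differ.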
+ if isinstance(base, dict) and isinstance(incoming, dict): + return self._merge_dict(base, + incoming, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + if isinstance(base, list) and isinstance(incoming, list): + return self._merge_list(base, + incoming, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=path) + if type(base) != type(incoming): + location = path or 'root' + logging.warning(f"Type mismatch at {location}; skipping.") + return base + + def _merge_dict(self, base: Dict[str, Any], incoming: Dict[str, Any], + fields_to_update: Set[str], key_field: str, + allow_overwrite: bool, path: str) -> Dict[str, Any]: + for key, incoming_value in incoming.items(): + next_path = self._join_path(path, key) + if key in fields_to_update: + self._merge_field(base, + key, + incoming_value, + allow_overwrite=allow_overwrite, + path=next_path) + continue + if key not in base: + continue + base[key] = self._merge_value(base[key], + incoming_value, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=next_path) + return base + + def _merge_list(self, base: List[Any], incoming: List[Any], + fields_to_update: Set[str], key_field: str, + allow_overwrite: bool, path: str) -> List[Any]: + # Keep base ordering; only merge keyed items already present in base. + base_by_key: Dict[Any, Dict[str, Any]] = {} + for index, item in enumerate(base): + if not isinstance(item, dict): + logging.warning( + f"Base list item at {path}[index={index}] is not a dict; skipping keyed merge." + ) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Base list item at {path}[index={index}] missing key '{key_field}'; skipping keyed merge." + ) + continue + if key_value in base_by_key: + logging.warning( + f"Duplicate key '{key_value}' in base list at {path}; using first occurrence." + ) + continue + base_by_key[key_value] = item + + seen_incoming_keys = set() + for index, item in enumerate(incoming): + if not isinstance(item, dict): + logging.warning( + f"Incoming list item at {path}[index={index}] is not a dict; ignoring." + ) + continue + key_value = item.get(key_field) + if key_value is None: + logging.warning( + f"Incoming list item at {path}[index={index}] missing key '{key_field}'; ignoring." + ) + continue + if key_value in seen_incoming_keys: + logging.warning( + f"Duplicate key '{key_value}' in incoming list at {path}; merging again." + ) + seen_incoming_keys.add(key_value) + + base_item = base_by_key.get(key_value) + if base_item is None: + item_path = self._list_item_path(path, key_field, key_value) + logging.warning( + f"No base match for {item_path}; ignoring incoming list item." + ) + continue + + item_path = self._list_item_path(path, key_field, key_value) + self._merge_dict(base_item, + item, + fields_to_update=fields_to_update, + key_field=key_field, + allow_overwrite=allow_overwrite, + path=item_path) + return base + + def _merge_field(self, base: Dict[str, Any], key: str, incoming_value: Any, + allow_overwrite: bool, path: str) -> Any: + if key not in base: + base[key] = incoming_value + return base + + base_value = base[key] + if allow_overwrite: + if base_value != incoming_value: + logging.info( + f"Overwriting value at {path} from {base_value!r} to {incoming_value!r}." 
+ ) + base[key] = incoming_value + return base + + if base_value != incoming_value: + logging.info( + f"Preserving base value at {path}; incoming value ignored.") + return base + + def _join_path(self, path: str, key: str) -> str: + if not path: + return key + return f"{path}.{key}" + + def _list_item_path(self, path: str, key_field: str, key_value: Any) -> str: + return f"{path}[{key_field}={key_value}]" + def _define_flags(): try: @@ -58,11 +230,12 @@ def merge_enrichment(input_metadata_json: str, input_enriched_items_json: str, output_path: str) -> None: base_data = _load_json(Path(input_metadata_json)) enriched_data = _load_json(Path(input_enriched_items_json)) - merged = merge_json_fields(base_data, - enriched_data, - fields_to_update=['enriched_description'], - key_field='id', - allow_overwrite=False) + merger = CollectionMerger() + merged = merger.merge(base_data, + enriched_data, + fields_to_update=['enriched_description'], + key_field='id', + allow_overwrite=False) _write_json(Path(output_path), merged) diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge_test.py b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py index d9670efbd5..1de45a7fff 100644 --- a/tools/agentic_import/sdmx/metadata_enricher_merge_test.py +++ b/tools/agentic_import/sdmx/metadata_enricher_merge_test.py @@ -22,6 +22,7 @@ from deepdiff.diff import DeepDiff +from tools.agentic_import.sdmx.metadata_enricher_merge import CollectionMerger from tools.agentic_import.sdmx.metadata_enricher_merge import merge_enrichment _TESTDATA_DIR = Path(os.path.dirname(__file__)) / 'testdata' @@ -30,6 +31,168 @@ _EXPECTED_JSON = _TESTDATA_DIR / 'sample_metadata_enriched_expected.json' +class CollectionMergerTest(unittest.TestCase): + + def setUp(self) -> None: + self._merger = CollectionMerger() + + def test_updates_only_listed_fields(self) -> None: + base = {"item": {"name": "Base"}} + incoming = {"item": {"name": "Incoming", "enriched_description": "New"}} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + self.assertEqual(merged["item"]["name"], "Base") + self.assertEqual(merged["item"]["enriched_description"], "New") + + def test_overwrite_policy_for_listed_fields(self) -> None: + base_keep = {"item": {"enriched_description": "Old"}} + base_overwrite = {"item": {"enriched_description": "Old"}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = self._merger.merge( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = self._merger.merge( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], "Old") + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + def test_type_mismatch_on_listed_field_respects_overwrite(self) -> None: + base_keep = {"item": {"enriched_description": {"a": 1}}} + base_overwrite = {"item": {"enriched_description": {"a": 1}}} + incoming = {"item": {"enriched_description": "New"}} + + merged_keep = self._merger.merge( + base_keep, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + merged_overwrite = self._merger.merge( + base_overwrite, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=True) + + self.assertEqual(merged_keep["item"]["enriched_description"], {"a": 1}) + self.assertEqual(merged_overwrite["item"]["enriched_description"], + "New") + + 
def test_traversal_type_mismatch_is_skipped(self) -> None: + base = {"item": {"details": {"a": 1}}} + incoming = {"item": {"details": ["x"]}} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(merged["item"]["details"], {"a": 1}) + + def test_keyed_list_merge_respects_hierarchy(self) -> None: + base = { + "codelists": [ + { + "id": "CL1", + "codes": [{ + "id": "A" + },], + }, + { + "id": "CL2", + "codes": [{ + "id": "A" + },], + }, + ] + } + incoming = { + "codelists": [ + { + "id": + "CL1", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL1", + },], + }, + { + "id": + "CL2", + "codes": [{ + "id": "A", + "enriched_description": "Code A in CL2", + },], + }, + ] + } + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"], + allow_overwrite=False) + + cl1_code = merged["codelists"][0]["codes"][0] + cl2_code = merged["codelists"][1]["codes"][0] + self.assertEqual(cl1_code["enriched_description"], "Code A in CL1") + self.assertEqual(cl2_code["enriched_description"], "Code A in CL2") + + def test_keyed_list_merge_with_custom_key(self) -> None: + base = {"items": [{"code": "X", "value": 1}]} + incoming = { + "items": [ + { + "code": "X", + "enriched_description": "X desc" + }, + { + "code": "Y", + "enriched_description": "Y desc" + }, + ] + } + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"], + key_field="code") + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["enriched_description"], "X desc") + + def test_list_items_without_key_are_ignored(self) -> None: + base = {"items": [{"id": "x"}]} + incoming = {"items": [{"name": "no_id"}]} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0], {"id": "x"}) + + def test_base_items_without_key_are_ignored(self) -> None: + base = {"items": [{"name": "base-only"}]} + incoming = {"items": [{"id": "x", "enriched_description": "desc"}]} + + merged = self._merger.merge(base, + incoming, + fields_to_update=["enriched_description"]) + + self.assertEqual(len(merged["items"]), 1) + self.assertEqual(merged["items"][0]["name"], "base-only") + self.assertNotIn("enriched_description", merged["items"][0]) + + class EnrichmentMergeTest(unittest.TestCase): def test_merge_enriched_description_across_lists(self) -> None: From d9149dadded6d663498b5e6be53b5c70e6bfe612 Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Wed, 21 Jan 2026 08:28:01 +0000 Subject: [PATCH 15/15] docs: Refactor the SDMX enrichment README to detail the metadata enrichment pipeline steps and tool usage. --- tools/agentic_import/sdmx/README.md | 91 +++++++++++++------ .../sdmx/metadata_enricher_merge.py | 24 ++--- 2 files changed, 75 insertions(+), 40 deletions(-) diff --git a/tools/agentic_import/sdmx/README.md b/tools/agentic_import/sdmx/README.md index 8fdb06b21d..a66c828331 100644 --- a/tools/agentic_import/sdmx/README.md +++ b/tools/agentic_import/sdmx/README.md @@ -1,14 +1,15 @@ -# SDMX Enrichment Tools +# SDMX Metadata Enrichment Pipeline -This folder contains three standalone tools for SDMX metadata enrichment. -Each tool supports CLI usage and can be called programmatically. +The enrichment process is organized into three distinct steps: Discovery, Fetching, and Integration. 
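+Each step is a standalone tool that supports CLI usage and can be called
+programmatically; the output of one step feeds into the next.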
-## 1) metadata_enricher_find.py -Selects which SDMX codes/concepts need enrichment and generates -`enrichment_query` values using full dataset context. +--- -CLI usage: -``` +## Step 1: Discovery (`metadata_enricher_find.py`) + +**Role**: Analyzes the base SDMX metadata using Gemini CLI to identify codes and concepts that require enrichment. It generates context-aware search queries (`enrichment_query`) for these items while preserving the original dataset structure. + +**Command**: +```bash python tools/agentic_import/sdmx/metadata_enricher_find.py \ --input_metadata_json="/path/to/metadata.json" \ --dataset_prefix="oecd_prices" \ @@ -17,16 +18,20 @@ python tools/agentic_import/sdmx/metadata_enricher_find.py \ --enable_sandboxing ``` -Output: -- A pruned JSON that preserves the original structure but keeps only selected - items with `enrichment_query`. Name/description fields are omitted. +**Input**: +- Base SDMX `metadata.json` file. -## 2) metadata_enricher_fetch.py -Uses Gemini CLI web search to populate `enriched_description` for each selected -item. +**Output**: +- `items_to_enrich.json`: A pruned JSON structure containing only selected items with their generated `enrichment_query`. -CLI usage: -``` +--- + +## Step 2: Fetching (`metadata_enricher_fetch.py`) + +**Role**: Orchestrates external web searches (via Gemini CLI) to populate detailed descriptions (`enriched_description`) for the items identified in the previous step. + +**Command**: +```bash python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ --input_items_json="/path/to/items_to_enrich.json" \ --dataset_prefix="oecd_prices" \ @@ -35,21 +40,55 @@ python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ --enable_sandboxing ``` -Output: -- A pruned JSON in the same structure as the input, with `enriched_description` - added and `enrichment_query` removed. +**Input**: +- `items_to_enrich.json` (from Step 1). -## 3) metadata_enricher_merge.py -Merges `enriched_description` into the base metadata JSON. +**Output**: +- `enriched_items.json`: A pruned JSON structure with `enriched_description` added for each item. -CLI usage: -``` +--- + +## Step 3: Integration (`metadata_enricher_merge.py`) + +**Role**: Merges the fetched descriptions back into the original SDMX metadata JSON, resulting in a complete, enriched metadata file. + +**Command**: +```bash python tools/agentic_import/sdmx/metadata_enricher_merge.py \ --input_metadata_json="/path/to/metadata.json" \ --input_enriched_items_json="/path/to/enriched_items.json" \ --output_path="/path/to/metadata_enriched.json" ``` -Output: -- A full metadata JSON with `enriched_description` merged into the matching - codes and concepts. +**Input**: +- Base SDMX `metadata.json`. +- `enriched_items.json` (from Step 2). + +**Output**: +- `metadata_enriched.json`: The final, full metadata JSON with `enriched_description` fields merged into the matching codes and concepts. + +--- + +## Full Pipeline Example + +To run the entire enrichment pipeline for a dataset: + +```bash +# 1. Discover items to enrich +python tools/agentic_import/sdmx/metadata_enricher_find.py \ + --input_metadata_json="metadata.json" \ + --dataset_prefix="my_dataset" \ + --output_path="items_to_enrich.json" + +# 2. Fetch enriched descriptions +python tools/agentic_import/sdmx/metadata_enricher_fetch.py \ + --input_items_json="items_to_enrich.json" \ + --dataset_prefix="my_dataset" \ + --output_path="enriched_items.json" + +# 3. 
Merge results into the original metadata
+python tools/agentic_import/sdmx/metadata_enricher_merge.py \
+  --input_metadata_json="metadata.json" \
+  --input_enriched_items_json="enriched_items.json" \
+  --output_path="metadata_enriched.json"
+```
diff --git a/tools/agentic_import/sdmx/metadata_enricher_merge.py b/tools/agentic_import/sdmx/metadata_enricher_merge.py
index aba343aff0..4ae702ee16 100644
--- a/tools/agentic_import/sdmx/metadata_enricher_merge.py
+++ b/tools/agentic_import/sdmx/metadata_enricher_merge.py
@@ -37,23 +37,19 @@ def merge(self,
               fields_to_update: List[str],
               key_field: str = 'id',
               allow_overwrite: bool = False) -> DictOrList:
-        """Merges values from `incoming` into `base` for a controlled set of fields.
+        """Merges specific fields from `incoming` into `base` in-place.
 
-        The merge walks `base` and only descends into dicts/lists that already
-        exist in `base` (it does not create new containers). For dicts, only keys
-        listed in `fields_to_update` may be added or updated; other keys are
-        merged only when they already exist in `base`. For lists, items are
-        matched by `key_field` (default: "id"); incoming items with no match in
-        `base` are ignored. Container type mismatches are skipped.
+        This performs a structure-aware merge:
+        1. Only keys in `fields_to_update` are added or updated.
+        2. Lists are merged by matching items on `key_field` (e.g., "id").
+        3. `base` structure is preserved; unmatched incoming items are ignored.
 
         Args:
-            base: Nested dict/list structure to update. Modified in place.
-            incoming: Nested dict/list structure providing candidate updates.
-            fields_to_update: Dict keys that are allowed to be added/updated.
-            key_field: Dict key used to match list items when merging lists.
-            allow_overwrite: If True, overwrite existing values for
-                `fields_to_update`. If False, existing `base` values are preserved
-                and incoming values are ignored.
+            base: The target dictionary or list to update.
+            incoming: The source dictionary or list with updates.
+            fields_to_update: Keys allowed to be changed or added.
+            key_field: Key used to match list items.
+            allow_overwrite: Whether to overwrite existing values.
 
         Returns:
             The updated `base` object.
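For readers wiring the merge step into their own Python code rather than the
CLI, a minimal sketch follows. This is an illustration only: it assumes the
module path introduced in this patch series, and the file names
(`metadata.json`, `enriched_items.json`, `metadata_enriched.json`) are
placeholders.

```python
import json

# Module path as added by this patch series.
from tools.agentic_import.sdmx.metadata_enricher_merge import CollectionMerger

# Placeholder paths; substitute the real metadata and enriched-items files.
with open('metadata.json') as f:
    base = json.load(f)
with open('enriched_items.json') as f:
    incoming = json.load(f)

# Mirrors merge_enrichment(): only 'enriched_description' may be added, list
# items are matched on 'id', and existing base values are preserved.
merger = CollectionMerger()
merged = merger.merge(base,
                      incoming,
                      fields_to_update=['enriched_description'],
                      key_field='id',
                      allow_overwrite=False)

with open('metadata_enriched.json', 'w') as f:
    json.dump(merged, f, indent=2)
```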