From 34988f595a49a70f06b6162fa698634debb7047e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 12:57:58 +0200 Subject: [PATCH 01/37] #126: add empty test for metadata yaml generation --- tests/test_metadata_yaml_generation.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/test_metadata_yaml_generation.py diff --git a/tests/test_metadata_yaml_generation.py b/tests/test_metadata_yaml_generation.py new file mode 100644 index 00000000..21131af8 --- /dev/null +++ b/tests/test_metadata_yaml_generation.py @@ -0,0 +1 @@ +"""Test for metadata yaml generation.""" From 720cb12e95b6f757ac1d2dfd47442eb5a56e8735 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:06:10 +0200 Subject: [PATCH 02/37] #126: Implement oemetadata creator class to create valid oemetadata json datapackages --- src/omi/creation/creator.py | 49 +++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/omi/creation/creator.py diff --git a/src/omi/creation/creator.py b/src/omi/creation/creator.py new file mode 100644 index 00000000..ec390251 --- /dev/null +++ b/src/omi/creation/creator.py @@ -0,0 +1,49 @@ +"""Create oemetadata json datapackage descriptions.""" + +from omi.base import get_metadata_specification +from omi.validation import validate_metadata + + +class OEMetadataCreator: + """ + Class to create oemetadata json datapackages. + + Output is based on datapackage and resource descriptions. + """ + + def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: + """ + Initialize the OEMetadataCreator with a specific version. + + Parameters + ---------- + oem_version:str + The version of the OEMetadata specification to use. + """ + self.oem_spec = get_metadata_specification(oem_version) + + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + """ + Generate oemetadata json datapackage from dataset and resources. + + Parameters + ---------- + dataset: dict + The dataset description. + resources: list[dict] + The list of resource descriptions. + + Returns + ------- + dict + The generated oemetadata json datapackage. + """ + metadata = { + "@context": self.oem_spec.schema["properties"]["@context"]["examples"][0], + **dataset, + "resources": resources, + "metaMetadata": self.oem_spec.example["metaMetadata"], + } + + validate_metadata(metadata, check_license=False) + return metadata From ad0fe47fccca3f05d224cdd365bef5eb0e1c856e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:16:25 +0200 Subject: [PATCH 03/37] #126: Add utility module which offers general purpose functionality. Currently includes: - loading metadata from yaml oemetadata definition for further processing --- src/omi/creation/utils.py | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/omi/creation/utils.py diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py new file mode 100644 index 00000000..3718e54c --- /dev/null +++ b/src/omi/creation/utils.py @@ -0,0 +1,45 @@ +"""Utility functions for OMI creation module.""" + +from pathlib import Path +from typing import Union + +import yaml + + +def load_yaml_metadata(file_path: Union[str, Path]) -> tuple[str, dict, list[dict], dict]: + """ + Load YAML file containing version, dataset, template, and resource metadata. + + This function reads a YAML file and extracts the version, dataset description, + resources, and template. It applies the template to each resource, merging any + specified fields. 
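+
+    A minimal input file (shape mirroring the test fixture added in this series) looks like:
+
+        version: OEMetadata-2.0.4
+        dataset: {name: test_dataset, title: Test Dataset}
+        template: {languages: [en-GB]}
+        resources:
+          - {name: test_resource, title: Test Resource, format: CSV, type: table}
+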
+ Returns: version, dataset, list of resources with merged template, and raw template. + + Parameters + ---------- + file_path: Union[str, Path] + Path to the YAML file. + + Returns + ------- + Tuple[str, Dict, List[Dict], Dict] + A tuple containing: + - version: The version of the metadata. + - dataset: The dataset description. + - resources: A list of resources with the template applied. + - template: The raw template used for resources. + """ + with Path.open(file_path, encoding="utf-8") as file: + data = yaml.safe_load(file) + + version = data.get("version", "OEMetadata-2.0.4") + dataset = data.get("dataset", {}) + template = data.get("template", {}) + resources = data.get("resources", []) + + # Apply template to each resource + for resource in resources: + for key, value in template.items(): + resource.setdefault(key, value) + + return version, dataset, resources From a722b0bd91d91f5ef8b43eebb1a0dcedfae19838 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:16:46 +0200 Subject: [PATCH 04/37] #126: setup creation module --- src/omi/creation/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/omi/creation/__init__.py diff --git a/src/omi/creation/__init__.py b/src/omi/creation/__init__.py new file mode 100644 index 00000000..e69de29b From 63b2a3051f2e399fb5886a1f3857be5a2128b0f7 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:26:35 +0200 Subject: [PATCH 05/37] #126: add test for creation functionality --- tests/test_metadata_creation.py | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tests/test_metadata_creation.py diff --git a/tests/test_metadata_creation.py b/tests/test_metadata_creation.py new file mode 100644 index 00000000..2cf6a7d5 --- /dev/null +++ b/tests/test_metadata_creation.py @@ -0,0 +1,47 @@ +"""Test suite for the OEMetadataCreator class in the OMI creation module.""" + +from pathlib import Path + +import pytest +import yaml + +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import load_yaml_metadata + + +@pytest.fixture() +def sample_yaml(tmp_path: Path) -> Path: + """Fixture to create a sample YAML file for testing.""" + content = { + "version": "OEMetadata-2.0.4", + "dataset": { + "name": "test_dataset", + "title": "Test Dataset", + "description": "For unit testing", + "@id": "https://example.org/test_dataset", + }, + "template": {"languages": ["en-GB"]}, + "resources": [{"name": "test_resource", "title": "Test Resource", "format": "CSV", "type": "table"}], + } + + file_path = tmp_path / "metadata.yaml" + with Path.open(file_path, "w", encoding="utf-8") as f: + yaml.dump(content, f, sort_keys=False) + + return file_path + + +def test_generate_oemetadata(sample_yaml: Path) -> None: + """Test the generation of OEMetadata from a sample YAML file.""" + version, dataset, resources = load_yaml_metadata(sample_yaml) + creator = OEMetadataCreator() + + result = creator.generate_metadata(dataset, resources) + + # Basic assertions + assert result["@context"].startswith("https://") + assert result["name"] == "test_dataset" + assert "resources" in result + assert isinstance(result["resources"], list) + assert result["resources"][0]["name"] == "test_resource" + assert "languages" in result["resources"][0] From c0f559f9cde8e869471cc86b1737cbe7bd18ed83 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 15:02:20 +0200 Subject: [PATCH 06/37] #126: add generator for functionality to generate metadata. 
Currently implemented: generating YAML metadata, which can then be used to create JSON metadata
---
 src/omi/creation/generator.py | 205 ++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 src/omi/creation/generator.py

diff --git a/src/omi/creation/generator.py b/src/omi/creation/generator.py
new file mode 100644
index 00000000..a141e9ad
--- /dev/null
+++ b/src/omi/creation/generator.py
@@ -0,0 +1,205 @@
+"""
+Generate an OEMetadata configuration file.
+
+Module for generating metadata files from resources like directories or zip files.
+This is used to get started from scratch, i.e. to initialize metadata.
+"""
+
+import fnmatch
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+
+import yaml
+
+from omi.inspection import infer_metadata
+
+
+@dataclass
+class FileFilterOptions:
+    """
+    Options for filtering files when reading directories or zip files.
+
+    Attributes
+    ----------
+    exclude_extensions: list[str] | None
+        List of file extensions to exclude (e.g., ['.log', '.tmp']).
+    exclude_patterns: list[str] | None
+        List of filename patterns to exclude (e.g., ['*_backup.*', '*.bak']).
+    exclude_hidden: bool
+        Whether to exclude hidden files/directories (default True).
+    """
+
+    exclude_extensions: list[str] | None = None
+    exclude_patterns: list[str] | None = None
+    exclude_hidden: bool = True
+
+
+def read_directory(
+    directory: Union[str, Path],
+    filter_opts: FileFilterOptions,
+) -> list[Path]:
+    """
+    Recursively read files from the directory, applying optional filters.
+
+    Parameters
+    ----------
+    directory: Union[str, Path]
+        The directory to read files from. Can be a string or a Path object.
+    filter_opts: FileFilterOptions
+        Filtering options including extensions, patterns, and hidden files.
+
+    Returns
+    -------
+    list[Path]
+        A list of Path objects representing the files that match the criteria.
+    """
+    directory = Path(directory)
+
+    exclude_extensions = set(filter_opts.exclude_extensions or [".log", ".tmp", ".bak", ".DS_Store", ".md"])
+    exclude_patterns = filter_opts.exclude_patterns or ["*_backup.*", "*~", "*.old", "*.ignore"]
+
+    valid_files = []
+    for file_path in directory.rglob("*"):
+        if not file_path.is_file():
+            continue
+
+        if filter_opts.exclude_hidden and any(part.startswith(".") for part in file_path.parts):
+            continue
+
+        if file_path.suffix in exclude_extensions:
+            continue
+
+        if any(fnmatch.fnmatch(file_path.name, pattern) for pattern in exclude_patterns):
+            continue
+
+        valid_files.append(file_path)
+
+    return valid_files
+
+
+def read_zipfile(
+    zip_path: Union[str, Path],
+    extract_to: Union[str, Path],
+    filter_opts: FileFilterOptions,
+) -> list[Path]:
+    """Extract a zip file and return the list of extracted files."""
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_to)
+    return read_directory(extract_to, filter_opts)
+
+
+def infer_file_metadata(file_path: Path) -> dict:
+    """
+    Infer basic resource metadata from file name and type.
+
+    Parameters
+    ----------
+    file_path: Path
+        Path to the file for which metadata should be inferred.
+
+    Returns
+    -------
+    dict
+        A dictionary containing inferred metadata for the resource.
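+        For a hypothetical ``data_2.csv`` the result is roughly
+        ``{"name": "data_2", "title": "Data 2", "path": "data_2.csv",
+        "type": "table", "format": "CSV", "encoding": "UTF-8", ...}``.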
+ """ + file_name = file_path.stem + file_format = file_path.suffix.replace(".", "").upper() + + resource = { + "name": file_name.lower().replace(" ", "_"), + "title": file_name.replace("_", " ").title(), + "path": file_path.as_posix(), + "description": f"Auto-generated description for {file_name}", + "type": "table" if file_format in ["CSV", "XLSX", "JSON"] else "file", + "format": file_format, + "encoding": "UTF-8", + } + + if file_format == "CSV": + with file_path.open("r") as f: + fields = infer_metadata(f, "OEP")["resources"][0]["schema"] + + resource["schema"] = fields + resource["dialect"] = {"delimiter": fields.get("delimiter", ","), "decimalSeparator": "."} + + return resource + + +def generate_oemetadata_yaml_from_datapackage( + directory: Union[str, Path], + output_yaml: Union[str, Path], + dataset_metadata: dict, + filter_opts: FileFilterOptions, +) -> None: + """ + Generate an OEMetadata YAML configuration file based on files in a directory or zipped directory. + + Parameters + ---------- + directory: Union[str, Path] + Path to the directory or zip file containing data files. + output_yaml: Union[str, Path] + Path to the output YAML file. + dataset_metadata: dict + Metadata for the dataset, including name, title, description, and ID. + filter_opts: FileFilterOptions + Filtering options for excluding files by extension, pattern, or hidden state. + """ + temp_dir = None + directory = Path(directory) + if zipfile.is_zipfile(directory): + temp_dir = Path("temp_extracted") + files = read_zipfile(directory, temp_dir, filter_opts) + files = read_directory(temp_dir, filter_opts) # Apply filtering after extraction + else: + files = read_directory(directory, filter_opts) + + resources = [] + for file in files: + resource_meta = infer_file_metadata(file) + + resources.append(resource_meta) + + yaml_structure = { + "dataset": dataset_metadata, + "template": { # TODO @jh-RLI: This section must be defined by user # noqa: TD003 + "context": { + "title": "Your Project Title", + "homepage": "https://yourhomepage.org", + "contact": "contact@yourproject.org", + }, + }, + "resources": resources, + } + + with open(output_yaml, "w", encoding="utf-8") as yaml_file: # noqa: PTH123 + yaml.dump(yaml_structure, yaml_file, sort_keys=False, allow_unicode=True) + + if temp_dir: + import shutil + + shutil.rmtree(temp_dir) + + print(f"YAML configuration generated at: {output_yaml}") # noqa: T201 + + +# Example usage +if __name__ == "__main__": + dataset_metadata_example = { + "name": "example_dataset", + "title": "Example Dataset", + "description": "This dataset was autogenerated from directory content.", + "@id": "https://example.org/dataset/example_dataset", + } + + generate_oemetadata_yaml_from_datapackage( + directory="/home/jh/projekte/SLE/postprocessed/", + output_yaml="generated_metadata.yaml", + dataset_metadata=dataset_metadata_example, + filter_opts=FileFilterOptions( + exclude_patterns=[".snake*"], + exclude_hidden=True, + ), + ) From 7ad24dc22408440ea3aeacf358fcd3fd8a055048 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 15:03:48 +0200 Subject: [PATCH 07/37] #126: make linter happy --- src/omi/creation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py index 3718e54c..97f1644d 100644 --- a/src/omi/creation/utils.py +++ b/src/omi/creation/utils.py @@ -29,7 +29,7 @@ def load_yaml_metadata(file_path: Union[str, Path]) -> tuple[str, dict, list[dic - resources: A list of resources with the template applied. 
         - template: The raw template used for resources.
     """
-    with Path.open(file_path, encoding="utf-8") as file:
+    with Path(file_path).open(encoding="utf-8") as file:
         data = yaml.safe_load(file)
 
     version = data.get("version", "OEMetadata-2.0.4")

From 6a51fbce0c209abaa7129d6ef1f4cf940c013aa1 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Tue, 29 Jul 2025 15:08:08 +0200
Subject: [PATCH 08/37] #126: add entry point for metadata creation with function to create JSON oemetadata from yaml file

---
 src/omi/create.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 src/omi/create.py

diff --git a/src/omi/create.py b/src/omi/create.py
new file mode 100644
index 00000000..2b9a4bd1
--- /dev/null
+++ b/src/omi/create.py
@@ -0,0 +1,29 @@
+"""Entry point for metadata creation."""
+
+import json
+from pathlib import Path
+from typing import Union
+
+from omi.creation.creator import OEMetadataCreator
+from omi.creation.utils import load_yaml_metadata
+
+
+def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None:
+    """
+    Generate OEMetadata from a YAML file and write it to an output file.
+
+    Parameters
+    ----------
+    yaml_file: Union[str, Path]
+        Path to the input YAML file containing dataset and resources.
+    output_file: Union[str, Path]
+        Path to the output file where the generated OEMetadata JSON will be saved.
+    """
+    version, dataset, resources = load_yaml_metadata(yaml_file)
+    creator = OEMetadataCreator(oem_version=version)
+    metadata = creator.generate_metadata(dataset, resources)
+
+    with Path(output_file).open("w", encoding="utf-8") as f:
+        json.dump(metadata, f, indent=2)
+
+    print(f"OEMetadata written to {output_file}")  # noqa: T201

From ac555f63e472d335f09d137365c0d52f0cc9f0e5 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Tue, 29 Jul 2025 15:11:41 +0200
Subject: [PATCH 09/37] #126: add cli function to create metadata json from yaml file

---
 src/omi/cli.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/omi/cli.py b/src/omi/cli.py
index 6b4d0aac..2a2093a5 100644
--- a/src/omi/cli.py
+++ b/src/omi/cli.py
@@ -15,8 +15,15 @@ Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration
 """
+import json
+from pathlib import Path
+from typing import Union
+
 import click
 
+from omi.creation.creator import OEMetadataCreator
+from omi.creation.utils import load_yaml_metadata
+
 
 @click.group()
 def grp() -> None:
@@ -29,3 +36,27 @@ def grp() -> None:
 def main() -> None:
     """Start click application."""
     cli()
+
+
+@click.command()
+@click.argument("yaml_file")
+@click.argument("output_file")
+def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None:
+    """
+    Generate OEMetadata from a YAML file and write it to an output file.
+
+    Parameters
+    ----------
+    yaml_file: Union[str, Path]
+        Path to the input YAML file containing dataset and resources.
+    output_file: Union[str, Path]
+        Path to the output file where the generated OEMetadata JSON will be saved.
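+
+    Example (hypothetical invocation; this patch defines the command but does
+    not yet register it on the click group):
+
+        $ omi from_yaml metadata.yaml metadata.json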
+ """ + version, dataset, resources = load_yaml_metadata(yaml_file) + generator = OEMetadataCreator() + metadata = generator.generate_metadata(dataset, resources) + + with Path(output_file).open("w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + print(f"OEMetadata written to {output_file}") # noqa: T201 From 107ab3324469f3bca9ee0348e83f66ac0776919f Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 15:12:42 +0200 Subject: [PATCH 10/37] #126: make sure when inspecting data resources to infer the fields metadata into oemetadata format instead of plain frictionless --- src/omi/inspection.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/omi/inspection.py b/src/omi/inspection.py index f7b4dd52..e7b2b526 100644 --- a/src/omi/inspection.py +++ b/src/omi/inspection.py @@ -1,6 +1,7 @@ """Module to inspect data and create metadata from it.""" from collections.abc import Callable +from copy import deepcopy from typing import Any from frictionless import Detector, Dialect, Resource @@ -121,7 +122,9 @@ def convert_field(field: dict[str, str]) -> dict[str, str]: return {"name": field["name"], "type": f"array {type_mapping[item_type]}"} # All arrays are empty - so no further subtype can be detected return {"name": field["name"], "type": "array"} - return field + oem_field = deepcopy(metadata["resources"][0]["schema"]["fields"][0]) + oem_field.update(field) + return oem_field rows = resource.read_rows() fields = [convert_field(field) for field in fields] From 2191eac8b470021078e5aa93a13a0563d90f9c3c Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:00:01 +0100 Subject: [PATCH 11/37] #126: Add documentation for the creation module --- src/omi/creation/README.md | 471 +++++++++++++++++++++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100644 src/omi/creation/README.md diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md new file mode 100644 index 00000000..274283ff --- /dev/null +++ b/src/omi/creation/README.md @@ -0,0 +1,471 @@ +# OMI OEMetadata Assembly Guide + +This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, testing, and common pitfalls. You can drop this as a single `.md` file in your repo (e.g. `docs/oemetadata-assembly.md`) or split into multiple files later. + +--- + +## Table of Contents + +1. [Overview](#overview) +2. [Concepts & Data Flow](#concepts--data-flow) +3. [Repository Layout](#repository-layout) +4. [YAML File Formats](#yaml-file-formats) + + * [Dataset YAML](#dataset-yaml) + * [Template YAML (optional)](#template-yaml-optional) + * [Resource YAML](#resource-yaml) + * [Index YAML (optional)](#index-yaml-optional) +5. [Templating Rules](#templating-rules) +6. [Discovery vs. Index Mapping](#discovery-vs-index-mapping) +7. [Programmatic Usage](#programmatic-usage) + + * [Minimal Usage](#minimal-usage) + * [With Index Mapping](#with-index-mapping) + * [Manual Loading (No Discovery)](#manual-loading-no-discovery) +8. [Airflow Integration Example](#airflow-integration-example) +9. [Testing](#testing) +10. [Validation & Error Handling](#validation--error-handling) +11. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) +12. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) +13. [Design Notes & Extensibility](#design-notes--extensibility) +14. 
[FAQ](#faq) + +--- + +## Overview + +* **Goal:** Author OEMetadata as **YAML** (dataset + resources), keep it **DRY** via **templates**, assemble into a single **JSON** metadata document, and **validate** it with the official schema. +* **Core ideas:** + + * Authors maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. + * OMI assembles and validates metadata into a final OEMetadata JSON. + * Works well in pipelines (e.g., Airflow) and in regular Python. + +--- + +## Concepts & Data Flow + +1. **Authoring:** + + * `datasets/.dataset.yaml` + * `datasets/.template.yaml` *(optional)* + * `resources//*.resource.yaml` + +2. **Assembly:** + + * OMI **loads** dataset, template, and resource YAML files. + * OMI **applies the template** to each resource (deep merge; resource overrides template). + * OMI **generates and validates** OEMetadata JSON via `OEMetadataCreator`. + +3. **Storage:** + + * You decide where to store: file, DB, API, etc. (OMI returns a Python `dict`). + +--- + +## Repository Layout + +``` +metadata/ + datasets/ + .dataset.yaml + .template.yaml # optional + resources/ + / + .resource.yaml + .resource.yaml + metadata_index.yaml # optional explicit mapping +``` + +* You can use **convention** (the directory / filename structure above) or an **index** file for explicit mapping. + +--- + +## YAML File Formats + +### Dataset YAML + +```yaml +# metadata/datasets/powerplants.dataset.yaml +version: "OEMetadata-2.0.4" # optional (default: OEMetadata-2.0.4) +dataset: + name: oep_oemetadata + title: OEP OEMetadata + description: A dataset for the OEMetadata examples. + "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ +``` + +> Backwards compatibility: if you prefer, you may put dataset fields directly at the top level; OMI will treat that as `dataset: {...}`. + +--- + +### Template YAML (optional) + +Applied to **every** resource (unless the resource overrides specific fields). Keeps your YAML DRY. + +```yaml +# metadata/datasets/powerplants.template.yaml +licenses: + - name: ODbL-1.0 + title: Open Data Commons Open Database License 1.0 + path: https://opendatacommons.org/licenses/odbl/1-0/index.html + instruction: > + You are free to share and change, but you must attribute, and + share derivations under the same license. See https://tldrlegal.com/license/odc-open-database-license-(odbl) + for further information. + attribution: © Reiner Lemoine Institut + copyrightStatement: https://github.com/OpenEnergyPlatform/oemetadata/blob/production/LICENSE.txt + +context: + title: NFDI4Energy + homepage: https://nfdi4energy.uol.de/ + documentation: https://nfdi4energy.uol.de/sites/about_us/ + sourceCode: https://github.com/NFDI4Energy + publisher: Open Energy Platform (OEP) + publisherLogo: https://github.com/OpenEnergyPlatform/organisation/blob/production/logo/OpenEnergyFamily_Logo_OpenEnergyPlatform.svg + contact: contact@example.com + fundingAgency: " Deutsche Forschungsgemeinschaft (DFG)" + fundingAgencyLogo: https://upload.wikimedia.org/wikipedia/commons/8/86/DFG-logo-blau.svg + grantNo: "501865131" + +topics: [model_draft] +languages: [en-GB, de-DE] +keywords: [example, ODbL-1.0, NFDI4Energy] +``` + +--- + +### Resource YAML + +```yaml +# metadata/resources/powerplants/oemetadata_table_template.resource.yaml +name: oemetadata_table_template +type: table +title: OEMetadata Table Template +description: Example table used to illustrate the OEMetadata structure and features. 
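
# Any key omitted here (licenses, topics, languages, ...) can instead be
# inherited from the dataset template - see "Templating Rules" below.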
+ +# Resource-specific attributes +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template +scheme: http +format: CSV +encoding: UTF-8 + +dialect: + decimalSeparator: "." + csv: + delimiter: ";" + +schema: + fields: + - name: id + type: integer + description: Unique identifier + nullable: false + # ... more fields ... + primaryKey: [id] + foreignKeys: + - fields: [id, version] + reference: + resource: model_draft.oep_oemetadata_table_example_version + fields: [id, version] + +"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv + +sources: + - title: IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report + authors: [Hoesung Lee, José Romero, The Core Writing Team] + publicationYear: "2023" + path: https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf + sourceLicenses: + - name: CC-BY-4.0 + title: Creative Commons Attribution 4.0 International + path: https://creativecommons.org/licenses/by/4.0/legalcode + instruction: > + You are free to share and change, but you must attribute. + See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. + attribution: © Intergovernmental Panel on Climate Change 2023 + copyrightStatement: https://www.ipcc.ch/copyright/ + +# Other metadata like subject, publicationDate, spatial, temporal, contributors, review... +``` + +A second resource: + +```yaml +# metadata/resources/powerplants/data_2.resource.yaml +name: data_2 +type: table +title: My Second Resource + +path: reGon/metadata/data_2.csv +scheme: file +format: csv +mediatype: text/csv +encoding: utf-8 + +schema: + fields: + - name: id + type: integer + nullable: true + - name: i + type: integer + nullable: true + - name: o + type: string + nullable: true + primaryKey: [id] + +``` + +--- + +### Index YAML (optional) + +Use this if you want explicit mappings instead of convention-based discovery. + +```yaml +# metadata/metadata_index.yaml +datasets: + powerplants: + dataset: datasets/powerplants.dataset.yaml + template: datasets/powerplants.template.yaml + resources: + - resources/powerplants/oemetadata_table_template.resource.yaml + - resources/powerplants/data_2.resource.yaml +``` + +--- + +## Templating Rules + +* **Deep merge** for dictionaries (e.g., `context`): + + * Resource **overrides** template on conflicts. + * Missing nested keys are **filled** from template. + +* **Lists**: + + * **Concatenate** (resource first, then template-only items) for: + `keywords`, `topics`, `languages`. + * For other lists (e.g., `licenses`, `contributors`), **resource wins** (no concat). + * You can change this behavior in code by adding keys to `DEFAULT_CONCAT_LIST_KEYS`. + +* **Scalars**: resource value **wins**. + +This keeps YAML DRY while allowing fine-grained per-resource overrides. + +--- + +## Discovery vs. Index Mapping + +* **Discovery (convention):** + `datasets/.dataset.yaml`, `datasets/.template.yaml`, and `resources//*.resource.yaml` + → No index file needed. + +* **Index (explicit mapping):** + Use `metadata_index.yaml` to map dataset/template/resources by path, relative to the metadata base directory. + +--- + +## Programmatic Usage + +OMI exposes high-level assembly and creation utilities. 
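
The assembler entry point used below has, as of this PR, roughly the signature sketched here (a sketch only; parameter names are inferred from the calls in this guide and in the tests):

```python
from pathlib import Path
from typing import Optional, Union


def assemble_metadata_dict(
    base_dir: Union[str, Path],  # root of the metadata/ tree
    dataset_id: str,  # e.g. "powerplants"
    index_file: Optional[Union[str, Path]] = None,  # explicit mapping, else discovery
) -> dict:
    """Load dataset/template/resources, merge, validate, and return OEMetadata."""
    ...
```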
+ +### Minimal Usage + +```python +from omi.creation.assembly import assemble_metadata_dict + +metadata = assemble_metadata_dict( + base_dir="./metadata", + dataset_id="powerplants", +) # returns a Python dict with valid OEMetadata +``` + +### With Index Mapping + +```python +from omi.creation.assembly import assemble_metadata_dict + +metadata = assemble_metadata_dict( + base_dir="./metadata", + dataset_id="powerplants", + index_file="./metadata/metadata_index.yaml", +) +``` + +### Manual Loading (No Discovery) + +```python +from pathlib import Path +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import load_yaml, apply_template_to_resources + +dataset = load_yaml(Path("./metadata/datasets/powerplants.dataset.yaml")).get("dataset", {}) +template = load_yaml(Path("./metadata/datasets/powerplants.template.yaml")) +resources = [ + load_yaml(Path("./metadata/resources/powerplants/oemetadata_table_template.resource.yaml")), + load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")), +] + +resources = apply_template_to_resources(resources, template) +creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") +metadata = creator.generate_metadata(dataset, resources) +``` + +> The `OEMetadataCreator` injects `@context` and `metaMetadata` and calls validation. + +--- + +## Airflow Integration Example + +```python +# In a DAG task (PythonOperator callable) +from omi.creation.assembly import assemble_metadata_dict + +def build_oemetadata_for_powerplants(**context): + md = assemble_metadata_dict( + base_dir="/opt/airflow/dags/metadata", # your metadata module + dataset_id="powerplants", + index_file="/opt/airflow/dags/metadata/metadata_index.yaml", # or None for discovery + ) + # Store or pass downstream: write to file/DB/API, or XCom + context["ti"].xcom_push(key="oemetadata", value=md) +``` + +--- + +## Testing + +You can unit test assembly logic without depending on the real spec/validator by **monkeypatching** the creator. 
+ +**Example (`tests/test_assembly.py`):** + +```python +from pathlib import Path +import yaml +import pytest +from omi.creation.assembly import assemble_metadata_dict + +def write_yaml(p: Path, data) -> None: + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") + +class FakeCreator: + def __init__(self, oem_version: str = "OEMetadata-2.0.4"): + self.oem_version = oem_version + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + return {"@context": "...", **dataset, "resources": resources, "metaMetadata": {"metadataVersion": self.oem_version}} + +def test_assemble(tmp_path, monkeypatch): + write_yaml(tmp_path / "datasets" / "demo.dataset.yaml", {"dataset": {"name": "demo", "title": "Demo"}}) + write_yaml(tmp_path / "datasets" / "demo.template.yaml", {"keywords": ["k1"], "context": {"contact": "a@b"}}) + write_yaml(tmp_path / "resources" / "demo" / "a.resource.yaml", {"name": "a", "title": "A", "keywords": ["ak"]}) + write_yaml(tmp_path / "resources" / "demo" / "b.resource.yaml", {"name": "b", "title": "B", "context": {"publisher": "X"}}) + + monkeypatch.setattr("omi.creation.assembly.OEMetadataCreator", FakeCreator) + md = assemble_metadata_dict(tmp_path, "demo") + + assert md["name"] == "demo" + a, b = md["resources"] + assert a["keywords"] == ["ak", "k1"] # concat + assert b["context"]["contact"] == "a@b" # filled from template + assert b["context"]["publisher"] == "X" # resource wins +``` + +Run with: + +```bash +pytest -q +``` + +--- + +## Validation & Error Handling + +* `OEMetadataCreator.generate_metadata()` runs `validate_metadata(metadata, check_license=False)`. +* If validation fails, catch and inspect the exception from `omi.validation`: + +```python +from omi.validation import ValidationError + +try: + metadata = assemble_metadata_dict("./metadata", "powerplants") +except ValidationError as e: + print("Validation failed:", e) +``` + +**Common causes:** + +* Missing **required** keys (e.g., field missing `"nullable"`). +* Incorrect data types (e.g., non-URI in a field that requires `format: uri`). +* Invalid list shapes (`primaryKey`, `foreignKeys`, etc.). + +--- + +## Auto-Generation From Directory (Optional Onboarding) + +You can auto-generate a starter YAML for a dataset by scanning a directory or zip: + +* Infer resource entries based on file names & extensions. +* For CSVs, call your CSV inference to produce initial `schema.fields`. +* Write a `dataset` YAML + per-file `resource` YAMLs as a starting point. + +> Keep this as an onboarding tool; human review is still recommended. + +--- + +## Filtering Irrelevant Files (Optional) + +If auto-generating from a directory, filter out noise: + +```python +def read_directory(directory, exclude_extensions=None, exclude_patterns=None, exclude_hidden=True): + # ... + # exclude_extensions=['.log','.tmp','.bak','.DS_Store','.md'] + # exclude_patterns=['*_backup.*','*~','*.old','*.ignore'] + return files +``` + +Helps avoid including backups, temp files, editor artifacts, etc. + +--- + +## Design Notes & Extensibility + +* **Separation of concerns**: + + * `utils` covers loading YAML, discovery, merging/templating. + * `assembly` orchestrates the load → merge → create flow. + * `creator` handles schema-based assembly and validation. +* **Storage-agnostic**: assembly returns a dict; you decide where to store it (file/DB/API). +* **Configurable merge**: change list concat behavior by editing `DEFAULT_CONCAT_LIST_KEYS`. 
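
A minimal sketch of opting another list key into concatenation; it assumes `deep_apply_template_to_resource` accepts a `concat_list_keys` argument, which is the behavior the utils tests exercise:

```python
from omi.creation.utils import DEFAULT_CONCAT_LIST_KEYS, deep_apply_template_to_resource

resource = {"name": "r1", "licenses": [{"name": "R1-license"}]}
template = {"licenses": [{"name": "L1"}], "keywords": ["k1"]}

# Opt `licenses` into concatenation (resource items first, then template-only items)
# instead of letting the resource-provided list win outright.
custom_keys = set(DEFAULT_CONCAT_LIST_KEYS) | {"licenses"}
merged = deep_apply_template_to_resource(resource, template, concat_list_keys=custom_keys)
# merged["licenses"] == [{"name": "R1-license"}, {"name": "L1"}]
# merged["keywords"] == ["k1"]  (inherited, since the resource defines none)
```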
+ +--- + +## FAQ + +**Q:** Can a resource override template-provided `licenses`? +**A:** Yes. By default, **resource wins** for lists except `keywords`, `topics`, `languages` (which concatenate). You can include `"licenses"` in `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. + +**Q:** Where does `@context` and `metaMetadata` come from? +**A:** `OEMetadataCreator` reads the official spec via `get_metadata_specification(oem_version)` and injects `@context` and a `metaMetadata` block, then validates the final result. + +**Q:** The output JSON shows `\u00a9` instead of `©`. +**A:** Use `ensure_ascii=False` when dumping JSON: + +```python +json.dump(metadata, f, indent=2, ensure_ascii=False) +``` + +**Q:** I see validation errors about fields missing `nullable`. +**A:** Ensure each `schema.fields[]` has **`name`**, **`type`**, and **`nullable`** at minimum. If you auto-generate fields, set `nullable: false` as a safe default unless you detect nulls. + +**Q:** How do I run without a template YAML? +**A:** Just omit `datasets/.template.yaml`; assembly works without it. + +--- + +> If you want this split across multiple docs, consider: +> `docs/assembly-overview.md`, `docs/yaml-formats.md`, `docs/templating.md`, `docs/integration-airflow.md`, `docs/testing.md`, and `docs/troubleshooting.md`. From 79da3a2bf5ed249dd6c588d51127a4e92ddae33a Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:37:40 +0100 Subject: [PATCH 12/37] #126: Add test for yaml based oemetadata layout assembly module --- tests/test_assembly.py | 210 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 tests/test_assembly.py diff --git a/tests/test_assembly.py b/tests/test_assembly.py new file mode 100644 index 00000000..dce7efb9 --- /dev/null +++ b/tests/test_assembly.py @@ -0,0 +1,210 @@ +""" +Assembly integration tests for split-files OEMetadata authoring. + +This module exercises the public assembler entry point by building a small +on-disk YAML tree, applying a template, and verifying the merged OEMetadata. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import yaml + +# We test the public assembler entry point +from omi.creation.assembler import assemble_metadata_dict + +if TYPE_CHECKING: + from pathlib import Path + + import pytest + + +# ---------- helpers ---------- + + +def write_yaml(p: Path, data: object) -> None: + """Write `data` (any YAML-serializable object) to path `p`.""" + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text( + yaml.safe_dump(data, sort_keys=False, allow_unicode=True), + encoding="utf-8", + ) + + +class FakeCreator: + """ + Minimal stand-in for OEMetadataCreator used via monkeypatching. + + It mimics `generate_metadata(dataset, resources)` and skips validation. + The constructor accepts the OEMetadata version to embed in metaMetadata. 
+ """ + + def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: + """Initialize the fake creator with a specific OEMetadata version.""" + self.oem_version = oem_version + + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + """Return a small OEMetadata-like dict sufficient for assertions.""" + return { + "@context": "https://example.org/context.json", + **dataset, + "resources": resources, + "metaMetadata": {"metadataVersion": self.oem_version}, + } + + +# ---------- tests ---------- + + +def test_assemble_by_convention_with_template_merge( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """ + Assemble via convention and verify deep merge semantics. + + Asserts: + - dataset is loaded from datasets/{id}.dataset.yaml + - template is applied deeply (resource wins on conflicts) + - keywords are concatenated (resource first, then template-only) + - licenses remain resource-provided if present (no concat by default) + - creator is invoked and returns a full dict + """ + # dataset + write_yaml( + tmp_path / "datasets" / "demo.dataset.yaml", + { + "version": "OEMetadata-2.0.4", + "dataset": {"name": "demo", "title": "Demo", "description": "Demo dataset"}, + }, + ) + + # template + write_yaml( + tmp_path / "datasets" / "demo.template.yaml", + { + "context": {"publisher": "OEP", "contact": "a@b"}, + "keywords": ["k1"], + "topics": ["model_draft"], + "languages": ["en-GB"], + "licenses": [{"name": "L1"}], # applies only if resource doesn't provide licenses + }, + ) + + # resources + write_yaml( + tmp_path / "resources" / "demo" / "r1.resource.yaml", + { + "name": "r1", + "title": "R1", + # overrides nested key, should still inherit contact from template + "context": {"publisher": "Other"}, + # resource provides its own licenses -> should NOT be concatenated by default + "licenses": [{"name": "R1-license"}], + # own keywords -> should concat with template keywords + "keywords": ["r1k"], + }, + ) + write_yaml( + tmp_path / "resources" / "demo" / "r2.resource.yaml", + { + "name": "r2", + "title": "R2", + # no licenses provided -> should get template licenses + }, + ) + + # Patch the creator used inside assembler to our Fake + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + md = assemble_metadata_dict(tmp_path, "demo") + + # dataset propagated + assert md["name"] == "demo" + assert md["title"] == "Demo" + + # resources present + assert isinstance(md["resources"], list) + assert len(md["resources"]) == 2 + r1, r2 = md["resources"] + + # deep merge for context: resource wins on conflicts, template fills missing keys + assert r1["context"]["publisher"] == "Other" + assert r1["context"]["contact"] == "a@b" + + # keywords/topics/languages concatenate (resource first, then template-only) + assert r1["keywords"] == ["r1k", "k1"] + # topics/languages inherited if missing + assert r1["topics"] == ["model_draft"] + assert r1["languages"] == ["en-GB"] + + # licenses: resource list wins (no concat by default) + assert r1["licenses"] == [{"name": "R1-license"}] + + # r2 inherits licenses from template (since none provided) + assert r2["licenses"] == [{"name": "L1"}] + # r2 inherits keywords/topics/languages from template + assert r2["keywords"] == ["k1"] + assert r2["topics"] == ["model_draft"] + assert r2["languages"] == ["en-GB"] + + # metaMetadata present from FakeCreator (assembler passes through the version) + assert md["metaMetadata"]["metadataVersion"] == "OEMetadata-2.0.4" + + +def test_assemble_with_index_mapping( + 
tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble using an explicit metadata_index.yaml mapping.""" + base = tmp_path + + # index mapping + write_yaml( + base / "metadata_index.yaml", + { + "datasets": { + "pp": { + "dataset": "datasets/powerplants.dataset.yaml", + "template": "datasets/powerplants.template.yaml", + "resources": [ + "resources/powerplants/a.resource.yaml", + "resources/powerplants/b.resource.yaml", + ], + }, + }, + }, + ) + + write_yaml( + base / "datasets" / "powerplants.dataset.yaml", + {"dataset": {"name": "pp", "title": "PP"}}, + ) + write_yaml( + base / "datasets" / "powerplants.template.yaml", + {"keywords": ["t-k"]}, + ) + write_yaml( + base / "resources" / "powerplants" / "a.resource.yaml", + {"name": "a", "title": "A", "keywords": ["a-k"]}, + ) + write_yaml( + base / "resources" / "powerplants" / "b.resource.yaml", + {"name": "b", "title": "B"}, + ) + + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + # Use the index explicitly + md = assemble_metadata_dict(base, "pp", index_file=base / "metadata_index.yaml") + + assert md["name"] == "pp" + names = [r["name"] for r in md["resources"]] + assert names == ["a", "b"] + + # keywords concatenated for 'a', inherited for 'b' + r_a = md["resources"][0] + r_b = md["resources"][1] + assert r_a["keywords"] == ["a-k", "t-k"] + assert r_b["keywords"] == ["t-k"] From a90288517cc15e93d6f64a2c2d12fd7add5fd31d Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:40:49 +0100 Subject: [PATCH 13/37] #126: Add test for yaml based oemetadata creation -> as dict or save as file --- tests/test_creation_utils.py | 202 +++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 tests/test_creation_utils.py diff --git a/tests/test_creation_utils.py b/tests/test_creation_utils.py new file mode 100644 index 00000000..de3c7a67 --- /dev/null +++ b/tests/test_creation_utils.py @@ -0,0 +1,202 @@ +"""Unit tests for the OMI creation utils (templating, IO, discovery).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +import yaml + +# Functions under test +from omi.creation.utils import ( + DEFAULT_CONCAT_LIST_KEYS, + _merge_lists, + apply_template_to_resources, + deep_apply_template_to_resource, + discover_dataset_ids, + discover_dataset_ids_from_index, + discover_paths, + load_parts, + load_yaml, + resolve_from_index, +) + +if TYPE_CHECKING: + from pathlib import Path + + +# ---------- helpers ---------- + + +def _write_yaml(p: Path, data: object) -> None: + """Write a YAML-serializable `data` object to `p`, creating parents.""" + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") + + +# ---------- tests: list merging + deep template ---------- + + +def test_merge_lists_deduplicates_and_respects_resource_first() -> None: + """`_merge_lists` keeps resource-first order and de-duplicates template items.""" + resource_list = ["a", "b"] + template_list = ["b", "c"] + merged = _merge_lists(template_list, resource_list, deduplicate=True) + assert merged == ["a", "b", "c"] + + +def test_deep_apply_template_to_resource_concat_for_keywords_topics_languages() -> None: + """Default concat keys (keywords/topics/languages) are concatenated; others are not.""" + resource = { + "name": "r", + "keywords": ["rk"], + "topics": ["rt"], + "languages": ["rl"], + "context": {"publisher": "R"}, + "list_no_concat": [1, 2], + } + template = { + 
"keywords": ["tk"], + "topics": ["tt"], + "languages": ["tl"], + "context": {"publisher": "T", "contact": "a@b"}, + "list_no_concat": [3, 4], + } + + out = deep_apply_template_to_resource(resource, template) + # concat lists for default concat keys + assert out["keywords"] == ["rk", "tk"] + assert out["topics"] == ["rt", "tt"] + assert out["languages"] == ["rl", "tl"] + # resource list wins for non-concat keys + assert out["list_no_concat"] == [1, 2] + # deep dict merge: resource wins on conflict, template fills missing + assert out["context"]["publisher"] == "R" + assert out["context"]["contact"] == "a@b" + + +def test_deep_apply_template_to_resource_custom_concat_keys() -> None: + """Custom concat set allows concatenating lists like `licenses`.""" + resource = {"licenses": [{"name": "R1"}]} + template = {"licenses": [{"name": "T1"}]} + # By default, 'licenses' is NOT concatenated + out_default = deep_apply_template_to_resource(resource, template) + assert out_default["licenses"] == [{"name": "R1"}] + + # If we opt-in, it concatenates (resource first, then template-only) + custom_keys = set(DEFAULT_CONCAT_LIST_KEYS) | {"licenses"} + out_custom = deep_apply_template_to_resource(resource, template, concat_list_keys=custom_keys) + assert out_custom["licenses"] == [{"name": "R1"}, {"name": "T1"}] + + +def test_apply_template_to_resources_applies_per_item() -> None: + """Template is applied to each resource; concat for `keywords` by default.""" + resources = [{"name": "a"}, {"name": "b", "keywords": ["bk"]}] + template = {"keywords": ["tk"]} + out = apply_template_to_resources(resources, template) + assert out[0]["keywords"] == ["tk"] # inherited from template + assert out[1]["keywords"] == ["bk", "tk"] # concatenated: resource first, then template-only + + +# ---------- tests: YAML IO + discovery ---------- + + +def test_load_yaml_reads_empty_as_empty_dict(tmp_path: Path) -> None: + """Empty YAML file is read as an empty dict.""" + p = tmp_path / "empty.yaml" + p.write_text("", encoding="utf-8") + data = load_yaml(p) + assert data == {} + + +def test_discover_paths_and_resolve_from_index(tmp_path: Path) -> None: + """Discovery by convention and resolution by index both return expected paths.""" + base = tmp_path + # convention files + ds = base / "datasets" / "powerplants.dataset.yaml" + tp = base / "datasets" / "powerplants.template.yaml" + rdir = base / "resources" / "powerplants" + r1 = rdir / "a.resource.yaml" + r2 = rdir / "b.resource.yaml" + + _write_yaml(ds, {"version": "OEMetadata-2.0.4", "dataset": {"name": "pp"}}) + _write_yaml(tp, {"keywords": ["k1"]}) + _write_yaml(r1, {"name": "a"}) + _write_yaml(r2, {"name": "b"}) + + dspath, tpath, rpaths = discover_paths(base, "powerplants") + assert dspath == ds + assert tpath == tp + assert rpaths == [r1, r2] + + # index mapping (deliberately flips resource order) + idx = base / "metadata_index.yaml" + _write_yaml( + idx, + { + "datasets": { + "powerplants": { + "dataset": "datasets/powerplants.dataset.yaml", + "template": "datasets/powerplants.template.yaml", + "resources": [ + "resources/powerplants/b.resource.yaml", + "resources/powerplants/a.resource.yaml", + ], + }, + }, + }, + ) + d2, t2, rs2 = resolve_from_index(base, "powerplants", idx) + assert d2 == ds + assert t2 == tp + assert rs2 == [base / "resources/powerplants/b.resource.yaml", base / "resources/powerplants/a.resource.yaml"] + + +def test_load_parts_returns_all_sections(tmp_path: Path) -> None: + """`load_parts` returns (version, dataset, resources, template) with expected 
contents.""" + base = tmp_path + ds = base / "datasets" / "households.dataset.yaml" + tp = base / "datasets" / "households.template.yaml" + rdir = base / "resources" / "households" + r1 = rdir / "hh1.resource.yaml" + + _write_yaml(ds, {"version": "OEMetadata-2.0.4", "dataset": {"name": "households", "title": "HH"}}) + _write_yaml(tp, {"context": {"publisher": "OEP"}}) + _write_yaml(r1, {"name": "hh1"}) + + version, dataset, resources, template = load_parts(base, "households") + assert version == "OEMetadata-2.0.4" + assert dataset == {"name": "households", "title": "HH"} + assert resources == [{"name": "hh1"}] + assert template == {"context": {"publisher": "OEP"}} + + +def test_load_parts_raises_when_dataset_missing(tmp_path: Path) -> None: + """`load_parts` raises FileNotFoundError if the dataset YAML is missing.""" + with pytest.raises(FileNotFoundError): + load_parts(tmp_path, "missing") + + +# ---------- tests: dataset id discovery ---------- + + +def test_discover_dataset_ids(tmp_path: Path) -> None: + """`discover_dataset_ids` finds dataset ids by scanning datasets/*.dataset.yaml.""" + _write_yaml(tmp_path / "datasets" / "a.dataset.yaml", {"dataset": {"name": "a"}}) + _write_yaml(tmp_path / "datasets" / "b.dataset.yaml", {"dataset": {"name": "b"}}) + ids = discover_dataset_ids(tmp_path) + assert ids == ["a", "b"] + + +def test_discover_dataset_ids_from_index(tmp_path: Path) -> None: + """`discover_dataset_ids_from_index` returns top-level 'datasets' keys in index YAML.""" + idx = tmp_path / "metadata_index.yaml" + _write_yaml(idx, {"datasets": {"x": {}, "y": {}}}) + ids = discover_dataset_ids_from_index(idx) + assert ids == ["x", "y"] + + +def test_discover_dataset_ids_from_index_missing_file(tmp_path: Path) -> None: + """Missing index file yields an empty list of dataset ids.""" + ids = discover_dataset_ids_from_index(tmp_path / "nope.yaml") + assert ids == [] From 8f5fc0c363643a7a4c3aa99cb4733f0da67835a4 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:42:20 +0100 Subject: [PATCH 14/37] #126: Add todo to extend inspection tests --- tests/test_inspection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_inspection.py b/tests/test_inspection.py index 8cf504ba..27d1afb1 100644 --- a/tests/test_inspection.py +++ b/tests/test_inspection.py @@ -34,3 +34,9 @@ def test_inspection(): assert metadata["resources"][0]["schema"]["fields"][6]["type"] == "object" assert metadata["resources"][0]["schema"]["fields"][7]["type"] == "date" assert metadata["resources"][0]["schema"]["fields"][8]["type"] == "boolean" + + +# TODO @jh-RLI: Add test for special cases in csv as e.g. 
this data will cause issues # noqa: TD003 +# cat objective.csv +# ;0 +# objective;97356714.15339188 From 1fdd91e8853a3b9276ef1c4a7c5e71b2e107d704 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:43:46 +0100 Subject: [PATCH 15/37] #126: Add test for yaml based oemetadata creation -> as dict or save as file --- tests/test_metadata_creation.py | 156 +++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 31 deletions(-) diff --git a/tests/test_metadata_creation.py b/tests/test_metadata_creation.py index 2cf6a7d5..0387906d 100644 --- a/tests/test_metadata_creation.py +++ b/tests/test_metadata_creation.py @@ -1,42 +1,108 @@ -"""Test suite for the OEMetadataCreator class in the OMI creation module.""" +"""Test suite for the OEMetadataCreator class in the OMI creation module (split-files layout).""" -from pathlib import Path +from __future__ import annotations + +import json +from typing import TYPE_CHECKING import pytest import yaml from omi.creation.creator import OEMetadataCreator -from omi.creation.utils import load_yaml_metadata +from omi.creation.utils import apply_template_to_resources, load_parts + +if TYPE_CHECKING: + from pathlib import Path @pytest.fixture() -def sample_yaml(tmp_path: Path) -> Path: - """Fixture to create a sample YAML file for testing.""" - content = { - "version": "OEMetadata-2.0.4", - "dataset": { - "name": "test_dataset", - "title": "Test Dataset", - "description": "For unit testing", - "@id": "https://example.org/test_dataset", - }, - "template": {"languages": ["en-GB"]}, - "resources": [{"name": "test_resource", "title": "Test Resource", "format": "CSV", "type": "table"}], - } - - file_path = tmp_path / "metadata.yaml" - with Path.open(file_path, "w", encoding="utf-8") as f: - yaml.dump(content, f, sort_keys=False) - - return file_path - - -def test_generate_oemetadata(sample_yaml: Path) -> None: - """Test the generation of OEMetadata from a sample YAML file.""" - version, dataset, resources = load_yaml_metadata(sample_yaml) - creator = OEMetadataCreator() - - result = creator.generate_metadata(dataset, resources) +def sample_tree(tmp_path: Path) -> tuple[Path, str]: + """ + Create a split-files metadata tree. 
+ + metadata/ + datasets/ + demo.dataset.yaml + demo.template.yaml + resources/ + demo/ + table.resource.yaml + """ + base = tmp_path / "metadata" + ds_dir = base / "datasets" + rs_dir = base / "resources" / "demo" + + ds_dir.mkdir(parents=True, exist_ok=True) + rs_dir.mkdir(parents=True, exist_ok=True) + + # dataset yaml + (ds_dir / "demo.dataset.yaml").write_text( + yaml.safe_dump( + { + "version": "OEMetadata-2.0", + "dataset": { + "name": "test_dataset", + "title": "Test Dataset", + "description": "For unit testing", + "@id": "https://example.org/test_dataset", + }, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + # template yaml (applied to every resource) + (ds_dir / "demo.template.yaml").write_text( + yaml.safe_dump( + { + "languages": ["en-GB"], + "keywords": ["example"], + "context": {"publisher": "OEP", "contact": "contact@example.org"}, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + # one resource yaml + (rs_dir / "table.resource.yaml").write_text( + yaml.safe_dump( + { + "name": "test_resource", + "title": "Test Resource", + "type": "table", + "format": "CSV", + "schema": { + "fields": [ + {"name": "id", "type": "integer", "nullable": False}, + ], + "primaryKey": ["id"], + }, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + return base, "demo" + + +def test_generate_oemetadata_from_split_files(sample_tree: tuple[Path, str]) -> None: + """End-to-end: load parts, apply template, generate metadata via creator.""" + base_dir, dataset_id = sample_tree + + # Load version/dataset/resources/template from split-files layout + version, dataset, resources, template = load_parts(base_dir, dataset_id) + + # Deep-apply template to resources (dicts merge, lists concat for keywords/topics/languages) + merged_resources = apply_template_to_resources(resources, template) + + creator = OEMetadataCreator(oem_version=version) + result = creator.generate_metadata(dataset, merged_resources) # Basic assertions assert result["@context"].startswith("https://") @@ -44,4 +110,32 @@ def test_generate_oemetadata(sample_yaml: Path) -> None: assert "resources" in result assert isinstance(result["resources"], list) assert result["resources"][0]["name"] == "test_resource" - assert "languages" in result["resources"][0] + + # Template has been applied deeply (languages concatenated / context merged) + r0 = result["resources"][0] + assert r0["languages"] == ["en-GB"] + assert r0["keywords"] == ["example"] + assert r0["context"]["publisher"] == "OEP" + assert r0["context"]["contact"] == "contact@example.org" + + # Schema minimally intact + assert r0["schema"]["primaryKey"] == ["id"] + assert r0["schema"]["fields"][0]["name"] == "id" + assert r0["schema"]["fields"][0]["nullable"] is False + + +def test_creator_save_writes_json(sample_tree: tuple[Path, str]) -> None: + """Ensure creator.save writes JSON and preserves unicode.""" + base_dir, dataset_id = sample_tree + version, dataset, resources, template = load_parts(base_dir, dataset_id) + merged_resources = apply_template_to_resources(resources, template) + + out = base_dir / "out.json" + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, out, ensure_ascii=False, indent=2) + + assert out.exists() + data = json.loads(out.read_text(encoding="utf-8")) + assert data["name"] == "test_dataset" + # unicode preserved (no \u escapes because ensure_ascii=False) + assert "©" not in out.read_text(encoding="utf-8") # sanity check; none present here 
by default From 2c180556b3fc98bde2f6451d2084b47f745f0ae1 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:48:32 +0100 Subject: [PATCH 16/37] #126: Move all utility functionality in creation module here. - Add utils to merge dataset, resource and template parts - Add io utils - Add utils to properly read version, dataset, template, resource from yaml directory - Add util to read info for many datasets from yaml directory --- src/omi/creation/utils.py | 249 +++++++++++++++++++++++++++++++++----- 1 file changed, 218 insertions(+), 31 deletions(-) diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py index 97f1644d..a0e74a6a 100644 --- a/src/omi/creation/utils.py +++ b/src/omi/creation/utils.py @@ -1,45 +1,232 @@ -"""Utility functions for OMI creation module.""" +""" +Utility functions for the OMI creation module. +This module provides deep-merge templating, YAML IO, and discovery helpers +for assembling OEMetadata from split YAML files (dataset/template/resources). +""" + +from __future__ import annotations + +from copy import deepcopy from pathlib import Path -from typing import Union +from typing import TYPE_CHECKING, Optional, Union import yaml +if TYPE_CHECKING: + from collections.abc import Hashable + +# --- deep merge helpers ------------------------------------------------------- + +# List keys we concatenate (resource + template) instead of replacing. +DEFAULT_CONCAT_LIST_KEYS = {"keywords", "topics", "languages"} + + +def _hashable_key(x: object) -> Hashable | tuple: + """ + Return a hashable representation of `x` for deduplication purposes. + + - dict -> sorted tuple of (key, value) pairs + - list -> tuple(list) + - other -> value itself + """ + if isinstance(x, dict): + return tuple(sorted(x.items())) + if isinstance(x, list): + return tuple(x) + return x # type: ignore[return-value] + -def load_yaml_metadata(file_path: Union[str, Path]) -> tuple[str, dict, list[dict], dict]: +def _merge_lists( + template_list: list[object], + resource_list: list[object], + *, + deduplicate: bool = True, +) -> list[object]: """ - Load YAML file containing version, dataset, template, and resource metadata. + Concatenate lists with resource-first priority. - This function reads a YAML file and extracts the version, dataset description, - resources, and template. It applies the template to each resource, merging any - specified fields. - Returns: version, dataset, list of resources with merged template, and raw template. + When `deduplicate` is True, only items that are not already present in + `resource_list` (by hashable representation) are appended from `template_list`. + """ + merged = list(resource_list) + if not template_list: + return merged - Parameters - ---------- - file_path: Union[str, Path] - Path to the YAML file. + if deduplicate: + existing = {_hashable_key(v) for v in merged} + for item in template_list: + k = _hashable_key(item) + if k not in existing: + merged.append(item) + else: + merged.extend(template_list) + return merged + + +def deep_apply_template_to_resource( + resource: dict[str, object], + template: dict[str, object], + concat_list_keys: Union[tuple[str, ...], set[str]] = DEFAULT_CONCAT_LIST_KEYS, +) -> dict[str, object]: + """ + Apply a resource template using deep-merge semantics. - Returns - ------- - Tuple[str, Dict, List[Dict], Dict] - A tuple containing: - - version: The version of the metadata. - - dataset: The dataset description. - - resources: A list of resources with the template applied. 
- - template: The raw template used for resources. + Rules: + - Missing keys are copied from the template. + - Dicts are deep-merged (resource wins on conflicts). + - Lists are concatenated only for keys in `concat_list_keys`; otherwise, the + resource list is preserved as-is. + - Scalars: resource values win. """ - with Path(file_path).open(encoding="utf-8") as file: - data = yaml.safe_load(file) + if not template: + return resource - version = data.get("version", "OEMetadata-2.0.4") - dataset = data.get("dataset", {}) - template = data.get("template", {}) - resources = data.get("resources", []) + result = deepcopy(resource) + for key, tval in template.items(): + if key not in result: + result[key] = deepcopy(tval) + continue + + rval = result[key] + if isinstance(rval, dict) and isinstance(tval, dict): + result[key] = deep_apply_template_to_resource(rval, tval, concat_list_keys) + continue + + if isinstance(rval, list) and isinstance(tval, list): + if key in concat_list_keys: + result[key] = _merge_lists(tval, rval, deduplicate=True) + # else: resource list stays as-is + continue + # scalar: resource value stays + return result + + +def apply_template_to_resources( + resources: list[dict[str, object]], + template: dict[str, object], +) -> list[dict[str, object]]: + """Apply the same `template` to each resource in `resources`.""" + if not template: + return resources + return [deep_apply_template_to_resource(r, template) for r in resources] + + +# --- YAML IO + discovery ------------------------------------------------------ + + +def load_yaml(path: Union[str, Path]) -> dict[str, object]: + """Load a YAML mapping from `path`, returning an empty dict for empty files.""" + with Path(path).open("r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + + +def discover_paths( + base_dir: Union[str, Path], + dataset_id: str, +) -> tuple[Optional[Path], Optional[Path], list[Path]]: + """ + Discover dataset/template/resources paths by convention. + + - dataset: datasets/{dataset_id}.dataset.yaml + - template: datasets/{dataset_id}.template.yaml (optional) + - resources: resources/{dataset_id}/*.resource.yaml + """ + base = Path(base_dir) + dataset_path = base / "datasets" / f"{dataset_id}.dataset.yaml" + template_path = base / "datasets" / f"{dataset_id}.template.yaml" + resources_dir = base / "resources" / dataset_id - # Apply template to each resource - for resource in resources: - for key, value in template.items(): - resource.setdefault(key, value) + dataset = dataset_path if dataset_path.exists() else None + template = template_path if template_path.exists() else None + resources = sorted(resources_dir.glob("*.resource.yaml")) if resources_dir.exists() else [] + return dataset, template, resources - return version, dataset, resources + +def resolve_from_index( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]], +) -> tuple[Optional[Path], Optional[Path], list[Path]]: + """ + Resolve dataset/template/resources using an explicit index YAML. + + Example YAML: + + datasets: + : + dataset: path/to/dataset.yaml + template: path/to/template.yaml # optional + resources: + - path/to/res1.yaml + - path/to/res2.yaml + + Paths are interpreted as relative to `base_dir`. 
+ """ + if not index_file: + return discover_paths(base_dir, dataset_id) + + base = Path(base_dir) + index_path = Path(index_file) + index = load_yaml(index_path) + entry = (index.get("datasets") or {}).get(dataset_id, {}) + dataset = base / entry["dataset"] if "dataset" in entry else None + template = base / entry["template"] if "template" in entry else None + resources = [base / p for p in entry.get("resources", [])] + return dataset, template, resources + + +def load_parts( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]] = None, +) -> tuple[str, dict[str, object], list[dict[str, object]], dict[str, object]]: + """ + Load dataset YAML, optional template YAML, and all resource YAMLs. + + Returns a tuple: (version, dataset, resources, template). + """ + dataset_path, template_path, resource_paths = resolve_from_index(base_dir, dataset_id, index_file) + + if dataset_path is None or not dataset_path.exists(): + raise FileNotFoundError(f"Dataset YAML not found for '{dataset_id}'") + + dataset_yaml = load_yaml(dataset_path) + version = str(dataset_yaml.get("version", "OEMetadata-2.0.4")) + # Support either dataset: {...} or flat style with top-level dataset keys. + dataset = dataset_yaml.get("dataset", dataset_yaml) + + template: dict[str, object] = {} + if template_path and template_path.exists(): + template = load_yaml(template_path) + + resources: list[dict[str, object]] = [load_yaml(p) for p in resource_paths] + return version, dataset, resources, template + + +def discover_dataset_ids(base_dir: Union[str, Path]) -> list[str]: + """ + Discover dataset ids by scanning datasets/*.dataset.yaml. + + For 'datasets/powerplants.dataset.yaml' returns 'powerplants'. + """ + base = Path(base_dir) + datasets_dir = base / "datasets" + if not datasets_dir.exists(): + return [] + return sorted([p.stem.replace(".dataset", "") for p in datasets_dir.glob("*.dataset.yaml")]) + + +def discover_dataset_ids_from_index(index_file: Union[str, Path]) -> list[str]: + """ + Discover dataset ids from an explicit metadata_index.yaml. + + Returns the sorted list of top-level keys under `datasets`. 
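+
+    Example: an index declaring datasets ``x`` and ``y`` yields ``["x", "y"]``.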
+    """
+    idx_path = Path(index_file)
+    if not idx_path.exists():
+        return []
+    with idx_path.open("r", encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    ds = data.get("datasets") or {}
+    return sorted(ds.keys())

From 1494928c078b5a10f36607fe37c1390650a1c15 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Thu, 6 Nov 2025 17:56:08 +0100
Subject: [PATCH 17/37] #126: Update create entrypoint to build oemetadata
 from yaml parts (dataset, template, resources) stored in a base directory

---
 src/omi/create.py | 51 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/omi/create.py b/src/omi/create.py
index 2b9a4bd1..a30d2b2e 100644
--- a/src/omi/create.py
+++ b/src/omi/create.py
@@ -1,29 +1,44 @@
-"""Enty point for metadata creation."""
+"""Entry point for OEMetadata creation (split-files layout only)."""
 
-import json
-from pathlib import Path
-from typing import Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional, Union
 
 from omi.creation.creator import OEMetadataCreator
-from omi.creation.utils import load_yaml_metadata
+from omi.creation.utils import apply_template_to_resources, load_parts
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 
-def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None:
+def build_from_yaml(
+    base_dir: Union[str, Path],
+    dataset_id: str,
+    output_file: Union[str, Path],
+    *,
+    index_file: Optional[Union[str, Path]] = None,
+) -> None:
     """
-    Generate OEMetadata from a YAML file and write it to an output file.
+    Assemble OEMetadata from split YAML files.
+
+    - datasets/<dataset_id>.dataset.yaml
+    - datasets/<dataset_id>.template.yaml (optional)
+    - resources/<dataset_id>/*.resource.yaml
+      (optionally resolved via an index YAML)
 
     Parameters
    ----------
-    yaml_file: str
-        Path to the input YAML file containing dataset and resources.
-    output_file: str
-        Path to the output file where the generated OEMetadata JSON will be saved.
+    base_dir : str | Path
+        Root directory containing 'datasets/' and 'resources/'.
+    dataset_id : str
+        Logical dataset id (e.g. 'powerplants').
+    output_file : str | Path
+        Output path for the generated OEMetadata JSON.
+    index_file : str | Path | None
+        Optional explicit mapping file (metadata_index.yaml).
""" - version, dataset, resources = load_yaml_metadata(yaml_file) - creator = OEMetadataCreator() - metadata = creator.generate_metadata(dataset, resources) - - with Path(output_file).open("w", encoding="utf-8") as f: - json.dump(metadata, f, indent=2) + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) + merged_resources = apply_template_to_resources(resources, template) - print(f"OEMetadata written to {output_file}") # noqa: T201 + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) From 7239484d54ece80365cae7ce5bd819a014ca8041 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:57:40 +0100 Subject: [PATCH 18/37] #126: Rename test for assembler and add test case to check if assembling of many datasets at once will work --- tests/{test_assembly.py => test_assembler.py} | 108 +++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) rename tests/{test_assembly.py => test_assembler.py} (65%) diff --git a/tests/test_assembly.py b/tests/test_assembler.py similarity index 65% rename from tests/test_assembly.py rename to tests/test_assembler.py index dce7efb9..318c73e1 100644 --- a/tests/test_assembly.py +++ b/tests/test_assembler.py @@ -12,7 +12,7 @@ import yaml # We test the public assembler entry point -from omi.creation.assembler import assemble_metadata_dict +from omi.creation.assembler import assemble_many_metadata, assemble_metadata_dict if TYPE_CHECKING: from pathlib import Path @@ -208,3 +208,109 @@ def test_assemble_with_index_mapping( r_b = md["resources"][1] assert r_a["keywords"] == ["a-k", "t-k"] assert r_b["keywords"] == ["t-k"] + + +def test_assemble_many_metadata_convention_as_dict( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble all datasets by convention; expect a dict keyed by dataset id.""" + # Dataset A + write_yaml( + tmp_path / "datasets" / "a.dataset.yaml", + {"version": "OEMetadata-2.0.4", "dataset": {"name": "a", "title": "A"}}, + ) + write_yaml( + tmp_path / "resources" / "a" / "r1.resource.yaml", + {"name": "r1", "title": "R1"}, + ) + + # Dataset B (with template) + write_yaml( + tmp_path / "datasets" / "b.dataset.yaml", + {"version": "OEMetadata-2.0.4", "dataset": {"name": "b", "title": "B"}}, + ) + write_yaml( + tmp_path / "datasets" / "b.template.yaml", + {"keywords": ["tk"]}, + ) + write_yaml( + tmp_path / "resources" / "b" / "r2.resource.yaml", + {"name": "r2", "title": "R2", "keywords": ["rk"]}, + ) + + # Use the FakeCreator inside the assembler + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + out = assemble_many_metadata(tmp_path) # dict[str, dict] + # discover_dataset_ids returns sorted ids + assert list(out.keys()) == ["a", "b"] + + # Dataset A checks + md_a = out["a"] + assert md_a["name"] == "a" + assert [r["name"] for r in md_a["resources"]] == ["r1"] + + # Dataset B checks (template applied with concat) + md_b = out["b"] + assert md_b["name"] == "b" + r2 = md_b["resources"][0] + assert r2["name"] == "r2" + assert r2["keywords"] == ["rk", "tk"] + + +def test_assemble_many_metadata_with_index_as_list( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble all datasets declared in index; expect a list of (id, md) pairs sorted by id.""" + base = tmp_path + + # Index with two datasets (note: keys will be sorted by helper) + write_yaml( + base / "metadata_index.yaml", + { + "datasets": { + "x": { + "dataset": 
"datasets/x.dataset.yaml", + "resources": ["resources/x/x1.resource.yaml"], + }, + "y": { + "dataset": "datasets/y.dataset.yaml", + "template": "datasets/y.template.yaml", + "resources": ["resources/y/y1.resource.yaml"], + }, + }, + }, + ) + + # Dataset x + write_yaml(base / "datasets" / "x.dataset.yaml", {"dataset": {"name": "x", "title": "X"}}) + write_yaml(base / "resources" / "x" / "x1.resource.yaml", {"name": "x1"}) + + # Dataset y (with template) + write_yaml(base / "datasets" / "y.dataset.yaml", {"dataset": {"name": "y", "title": "Y"}}) + write_yaml(base / "datasets" / "y.template.yaml", {"keywords": ["t"]}) + write_yaml(base / "resources" / "y" / "y1.resource.yaml", {"name": "y1", "keywords": ["r"]}) + + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + pairs = assemble_many_metadata( + base, + index_file=base / "metadata_index.yaml", + as_dict=False, + ) # list[tuple[str, dict]] + + # Expect sorted ids: ['x', 'y'] + ids = [ds_id for ds_id, _ in pairs] + assert ids == ["x", "y"] + + md_x = pairs[0][1] + md_y = pairs[1][1] + + assert md_x["name"] == "x" + assert [r["name"] for r in md_x["resources"]] == ["x1"] + + # Template concat for y + r_y1 = md_y["resources"][0] + assert r_y1["keywords"] == ["r", "t"] From 5567f7375a59a9e26d7f9daf918b12119e4f9898 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:01:39 +0100 Subject: [PATCH 19/37] #126: Add assembler module which handles the assembling of yaml file based parts (dataset, template and resources) . Hint: The creator will then build/generate the oemetadata string --- src/omi/creation/assembler.py | 78 +++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/omi/creation/assembler.py diff --git a/src/omi/creation/assembler.py b/src/omi/creation/assembler.py new file mode 100644 index 00000000..9814f0ad --- /dev/null +++ b/src/omi/creation/assembler.py @@ -0,0 +1,78 @@ +"""Assemble OEMetadata dictionary from parts: dataset, template, and resources.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional, Union + +from .creator import OEMetadataCreator +from .utils import ( + apply_template_to_resources, + discover_dataset_ids, + discover_dataset_ids_from_index, + load_parts, +) + +if TYPE_CHECKING: + from collections.abc import Iterable + + +def assemble_metadata_dict( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]] = None, +) -> dict[str, Any]: + """ + Load dataset/template/resources; apply template; validate via creator; return dict. + + Parameters + ---------- + base_dir: Union[str, Path] + Base directory containing datasets, templates, and resources. + dataset_id: str + Identifier for the dataset to load. + index_file: Optional[Union[str, Path]] + Optional path to an index YAML file for resolving dataset parts. + + Returns + ------- + Dict[str, Any] + The assembled and validated OEMetadata dictionary. 
+ """ + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file) + merged_resources = apply_template_to_resources(resources, template) + creator = OEMetadataCreator(oem_version=version) + return creator.generate_metadata(dataset, merged_resources) + + +def assemble_many_metadata( + base_dir: Union[str, Path], + dataset_ids: Optional[Iterable[str]] = None, + index_file: Optional[Union[str, Path]] = None, + *, + as_dict: bool = True, +) -> Union[dict[str, dict], list[tuple[str, dict]]]: + """ + Assemble OEMetadata for multiple datasets in one call. + + - If dataset_ids is None: + * when index_file is provided -> use keys from index + * otherwise -> discover by 'datasets/*.dataset.yaml' + - Returns a mapping {dataset_id: metadata} if as_dict=True, + else a list of (dataset_id, metadata) pairs in sorted id order. + """ + base = Path(base_dir) + + if dataset_ids is None: + ids = discover_dataset_ids_from_index(index_file) if index_file else discover_dataset_ids(base) + else: + ids = list(dataset_ids) + + results_pairs: list[tuple[str, dict]] = [] + for ds_id in ids: + md = assemble_metadata_dict(base, ds_id, index_file=index_file) + results_pairs.append((ds_id, md)) + + if as_dict: + return dict(results_pairs) + return results_pairs From eeb9c2713a25f259958ef854d2ed10848d5ff65e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:03:00 +0100 Subject: [PATCH 20/37] #126: update cli functionality to include omi creation module --- src/omi/cli.py | 94 ++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/src/omi/cli.py b/src/omi/cli.py index 2a2093a5..a596a5c6 100644 --- a/src/omi/cli.py +++ b/src/omi/cli.py @@ -1,62 +1,74 @@ """ -Module that contains the command line app. +Command line interface for OMI. -Why does this file exist, and why not put this in __main__? +This CLI only supports the split-files layout: +- datasets/.dataset.yaml +- datasets/.template.yaml (optional) +- resources//*.resource.yaml +(optionally wired via metadata_index.yaml) - You might be tempted to import things from __main__ later, but that will cause - problems: the code will get executed twice: +Usage: +omi assemble \ + --base-dir ./metadata \ + --dataset-id powerplants \ + --output-file ./out/powerplants.json \ + --index-file ./metadata/metadata_index.yaml # optional - - When you run `python -m omi` python will execute - ``__main__.py`` as a script. That means there won't be any - ``omi.__main__`` in ``sys.modules``. - - When you import __main__ it will get executed again (as a module) because - there's no ``omi.__main__`` in ``sys.modules``. - - Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration """ -import json +from __future__ import annotations + from pathlib import Path -from typing import Union +from typing import Optional import click from omi.creation.creator import OEMetadataCreator -from omi.creation.utils import load_yaml_metadata +from omi.creation.utils import apply_template_to_resources, load_parts @click.group() def grp() -> None: - """Init click group.""" - - + """OMI CLI.""" + + +@grp.command("assemble") +@click.option( + "--base-dir", + required=True, + type=click.Path(file_okay=False, path_type=Path), + help="Root directory containing 'datasets/' and 'resources/'.", +) +@click.option("--dataset-id", required=True, help="Logical dataset id (e.g. 
'powerplants').") +@click.option( + "--output-file", + required=True, + type=click.Path(dir_okay=False, path_type=Path), + help="Path to write the generated OEMetadata JSON.", +) +@click.option( + "--index-file", + default=None, + type=click.Path(dir_okay=False, path_type=Path), + help="Optional metadata index YAML for explicit mapping.", +) +def assemble_cmd(base_dir: Path, dataset_id: str, output_file: Path, index_file: Optional[Path]) -> None: + """Assemble OEMetadata from split YAML files and write JSON to OUTPUT_FILE.""" + # Load pieces + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) + merged_resources = apply_template_to_resources(resources, template) + + # Build & save with the correct spec version + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) + + click.echo(f"OEMetadata written to {output_file}") + + +# Keep CommandCollection for backwards compatibility with your entry point cli = click.CommandCollection(sources=[grp]) def main() -> None: """Start click application.""" cli() - - -@click.command() -@click.argument("yaml_file") -@click.argument("output_file") -def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None: - """ - Generate OEMetadata from a YAML file and write it to an output file. - - Parameters - ---------- - yaml_file: Union[str, Path] - Path to the input YAML file containing dataset and resources. - output_file: Union[str, Path] - Path to the output file where the generated OEMetadata JSON will be saved. - """ - version, dataset, resources = load_yaml_metadata(yaml_file) - generator = OEMetadataCreator() - metadata = generator.generate_metadata(dataset, resources) - - with Path(output_file).open("w", encoding="utf-8") as f: - json.dump(metadata, f, indent=2) - - print(f"OEMetadata written to {output_file}") # noqa: T201 From 5f4d3fc82b444b025b360a78142a053714d57e4a Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:08:05 +0100 Subject: [PATCH 21/37] #126: add method to save generated metadata to file --- src/omi/creation/creator.py | 71 +++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/src/omi/creation/creator.py b/src/omi/creation/creator.py index ec390251..9d93b2b3 100644 --- a/src/omi/creation/creator.py +++ b/src/omi/creation/creator.py @@ -1,4 +1,9 @@ -"""Create oemetadata json datapackage descriptions.""" +"""Create OEMetadata JSON datapackage structure and return or store it.""" + +from __future__ import annotations + +import json +from pathlib import Path from omi.base import get_metadata_specification from omi.validation import validate_metadata @@ -6,38 +11,18 @@ class OEMetadataCreator: """ - Class to create oemetadata json datapackages. + Create OEMetadata JSON datapackages. - Output is based on datapackage and resource descriptions. + Output is based on dataset and resource descriptions and validated against + the official schema. """ def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: - """ - Initialize the OEMetadataCreator with a specific version. - - Parameters - ---------- - oem_version:str - The version of the OEMetadata specification to use. - """ + """Initialize the creator with a specific OEMetadata version.""" self.oem_spec = get_metadata_specification(oem_version) def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: - """ - Generate oemetadata json datapackage from dataset and resources. 
- - Parameters - ---------- - dataset: dict - The dataset description. - resources: list[dict] - The list of resource descriptions. - - Returns - ------- - dict - The generated oemetadata json datapackage. - """ + """Generate OEMetadata JSON datapackage from dataset and resources.""" metadata = { "@context": self.oem_spec.schema["properties"]["@context"]["examples"][0], **dataset, @@ -47,3 +32,37 @@ def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: validate_metadata(metadata, check_license=False) return metadata + + def save( + self, + dataset: dict, + resources: list[dict], + output_file: Path | str, + **dump_kwargs, + ) -> None: + """ + Generate OEMetadata and save it to a JSON file. + + Parameters + ---------- + dataset : dict + Dataset metadata. + resources : list[dict] + List of resource metadata entries. + output_file : Path | str + Path to the output JSON file. + **dump_kwargs : + Extra kwargs forwarded to `json.dump`. Defaults applied here: + - indent: 2 + - ensure_ascii: False + """ + metadata = self.generate_metadata(dataset, resources) + + # Defaults, can be overridden by caller via **dump_kwargs + indent = dump_kwargs.pop("indent", 2) + ensure_ascii = dump_kwargs.pop("ensure_ascii", False) + + with Path(output_file).open("w", encoding="utf-8") as f: + json.dump(metadata, f, indent=indent, ensure_ascii=ensure_ascii, **dump_kwargs) + + print(f"OEMetadata written to {output_file}") # noqa: T201 From d4e285f48e8ad521b18306fb9816445c4c5f0e33 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:08:43 +0100 Subject: [PATCH 22/37] #126: Update docs --- src/omi/creation/README.md | 297 ++++++++++++++++++++----------------- 1 file changed, 161 insertions(+), 136 deletions(-) diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md index 274283ff..cdcd0135 100644 --- a/src/omi/creation/README.md +++ b/src/omi/creation/README.md @@ -1,6 +1,6 @@ # OMI OEMetadata Assembly Guide -This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, testing, and common pitfalls. You can drop this as a single `.md` file in your repo (e.g. `docs/oemetadata-assembly.md`) or split into multiple files later. +This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, multi-dataset usage, initialization scaffolding, testing, and common pitfalls. --- @@ -22,13 +22,16 @@ This guide explains how to author, assemble, and validate **OEMetadata** using * * [Minimal Usage](#minimal-usage) * [With Index Mapping](#with-index-mapping) * [Manual Loading (No Discovery)](#manual-loading-no-discovery) -8. [Airflow Integration Example](#airflow-integration-example) -9. [Testing](#testing) -10. [Validation & Error Handling](#validation--error-handling) -11. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) -12. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) -13. [Design Notes & Extensibility](#design-notes--extensibility) -14. [FAQ](#faq) +8. [Multi-dataset Assembly](#multi-dataset-assembly) +9. [Spec-Driven Output Ordering](#spec-driven-output-ordering) +10. [Project Initialization (Scaffolding)](#project-initialization-scaffolding) +11. [Airflow Integration Example](#airflow-integration-example) +12. 
[Testing](#testing) +13. [Validation & Error Handling](#validation--error-handling) +14. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) +15. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) +16. [Design Notes & Extensibility](#design-notes--extensibility) +17. [FAQ](#faq) --- @@ -37,9 +40,9 @@ This guide explains how to author, assemble, and validate **OEMetadata** using * * **Goal:** Author OEMetadata as **YAML** (dataset + resources), keep it **DRY** via **templates**, assemble into a single **JSON** metadata document, and **validate** it with the official schema. * **Core ideas:** - * Authors maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. - * OMI assembles and validates metadata into a final OEMetadata JSON. - * Works well in pipelines (e.g., Airflow) and in regular Python. + * Maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. + * OMI assembles + validates metadata into final OEMetadata JSON. + * Works in pipelines (e.g., Airflow) and plain Python. --- @@ -50,22 +53,20 @@ This guide explains how to author, assemble, and validate **OEMetadata** using * * `datasets/.dataset.yaml` * `datasets/.template.yaml` *(optional)* * `resources//*.resource.yaml` - 2. **Assembly:** - * OMI **loads** dataset, template, and resource YAML files. - * OMI **applies the template** to each resource (deep merge; resource overrides template). - * OMI **generates and validates** OEMetadata JSON via `OEMetadataCreator`. - + * Load dataset, template, and resource YAML files. + * Apply template → deep merge; resource overrides. + * Create OEMetadata JSON via `OEMetadataCreator` and validate. 3. **Storage:** - * You decide where to store: file, DB, API, etc. (OMI returns a Python `dict`). + * Assembly returns a Python `dict`. Store wherever you like (file/DB/API). --- ## Repository Layout -``` +```bash metadata/ datasets/ .dataset.yaml @@ -77,7 +78,7 @@ metadata/ metadata_index.yaml # optional explicit mapping ``` -* You can use **convention** (the directory / filename structure above) or an **index** file for explicit mapping. +Use the **convention** above or an **index** file for explicit mapping. --- @@ -95,13 +96,13 @@ dataset: "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ ``` -> Backwards compatibility: if you prefer, you may put dataset fields directly at the top level; OMI will treat that as `dataset: {...}`. +> Backwards compatibility: dataset fields can also be at top-level; OMI treats that as `dataset: {...}`. --- ### Template YAML (optional) -Applied to **every** resource (unless the resource overrides specific fields). Keeps your YAML DRY. +Applied to **every** resource (unless overridden). Keeps YAML DRY. ```yaml # metadata/datasets/powerplants.template.yaml @@ -185,24 +186,20 @@ sources: See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. attribution: © Intergovernmental Panel on Climate Change 2023 copyrightStatement: https://www.ipcc.ch/copyright/ - -# Other metadata like subject, publicationDate, spatial, temporal, contributors, review... 
``` -A second resource: +Second resource: ```yaml # metadata/resources/powerplants/data_2.resource.yaml name: data_2 type: table title: My Second Resource - path: reGon/metadata/data_2.csv scheme: file format: csv mediatype: text/csv encoding: utf-8 - schema: fields: - name: id @@ -215,14 +212,13 @@ schema: type: string nullable: true primaryKey: [id] - ``` --- ### Index YAML (optional) -Use this if you want explicit mappings instead of convention-based discovery. +Explicit mappings instead of convention: ```yaml # metadata/metadata_index.yaml @@ -240,54 +236,38 @@ datasets: ## Templating Rules * **Deep merge** for dictionaries (e.g., `context`): - - * Resource **overrides** template on conflicts. - * Missing nested keys are **filled** from template. - + Resource **overrides**; missing nested keys are **filled** from template. * **Lists**: - - * **Concatenate** (resource first, then template-only items) for: - `keywords`, `topics`, `languages`. - * For other lists (e.g., `licenses`, `contributors`), **resource wins** (no concat). - * You can change this behavior in code by adding keys to `DEFAULT_CONCAT_LIST_KEYS`. - + **Concatenate** for `keywords`, `topics`, `languages` (resource first, then template-only items). + For other lists (e.g., `licenses`, `contributors`): **resource wins** (no concat). + *(Modify via `DEFAULT_CONCAT_LIST_KEYS` if you want different behavior.)* * **Scalars**: resource value **wins**. -This keeps YAML DRY while allowing fine-grained per-resource overrides. - --- ## Discovery vs. Index Mapping * **Discovery (convention):** - `datasets/.dataset.yaml`, `datasets/.template.yaml`, and `resources//*.resource.yaml` - → No index file needed. - -* **Index (explicit mapping):** - Use `metadata_index.yaml` to map dataset/template/resources by path, relative to the metadata base directory. + `datasets/.dataset.yaml`, `datasets/.template.yaml`, `resources//*.resource.yaml` + → No index needed. +* **Index (explicit):** + Provide `metadata_index.yaml` with explicit paths relative to your base directory. --- ## Programmatic Usage -OMI exposes high-level assembly and creation utilities. - ### Minimal Usage ```python from omi.creation.assembly import assemble_metadata_dict -metadata = assemble_metadata_dict( - base_dir="./metadata", - dataset_id="powerplants", -) # returns a Python dict with valid OEMetadata +metadata = assemble_metadata_dict(base_dir="./metadata", dataset_id="powerplants") ``` ### With Index Mapping ```python -from omi.creation.assembly import assemble_metadata_dict - metadata = assemble_metadata_dict( base_dir="./metadata", dataset_id="powerplants", @@ -308,29 +288,110 @@ resources = [ load_yaml(Path("./metadata/resources/powerplants/oemetadata_table_template.resource.yaml")), load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")), ] - resources = apply_template_to_resources(resources, template) + creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") metadata = creator.generate_metadata(dataset, resources) ``` -> The `OEMetadataCreator` injects `@context` and `metaMetadata` and calls validation. +> `OEMetadataCreator` injects `@context` and `metaMetadata` from the spec and validates the result. 
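+
+A minimal sketch of persisting the manually assembled result with the creator's own `save` helper (the `./out` path is illustrative):
+
+```python
+from pathlib import Path
+
+out = Path("./out/powerplants.json")
+out.parent.mkdir(parents=True, exist_ok=True)  # `save` opens the target directly, so the parent must exist
+creator.save(dataset, resources, out)  # validates, then writes JSON (indent=2, ensure_ascii=False by default)
+```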
+ +--- + +## Multi-dataset Assembly + +Assemble **N datasets** in one call: + +```python +from omi.creation.assembly import assemble_many_metadata + +# Discover by convention (datasets/*.dataset.yaml) +all_metadata = assemble_many_metadata(base_dir="./metadata") + +# From explicit index +all_metadata = assemble_many_metadata( + base_dir="./metadata", index_file="./metadata/metadata_index.yaml" +) + +# Subset +some = assemble_many_metadata(base_dir="./metadata", dataset_ids=["powerplants", "households"]) +``` + +Result is a dict `{dataset_id: metadata}` by default. + +--- + +## Spec-Driven Output Ordering + +For human-friendly JSON key order without hard-coded lists, order by the **official example** (fallback: schema `properties`): + +```python +from omi.creation.assembly import assemble_metadata_dict +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import order_with_spec + +creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") +metadata = assemble_metadata_dict("./metadata", "powerplants") + +ordered = order_with_spec(metadata, creator.oem_spec) # uses spec.example and schema +``` + +Write with preserved unicode: + +```python +import json, pathlib +out = pathlib.Path("./out/powerplants.json") +out.parent.mkdir(parents=True, exist_ok=True) +out.write_text(json.dumps(ordered, indent=2, ensure_ascii=False), encoding="utf-8") +``` + +--- + +## Project Initialization (Scaffolding) + +Create a metadata skeleton **from the spec** (no inline templates): + +```python +from omi.creation.scaffold import init_skeleton_from_spec + +paths = init_skeleton_from_spec( + base_dir="./metadata", + dataset_id="powerplants", + oem_version="OEMetadata-2.0.4", + resource_name="oemetadata_table_template", + with_index=True, # creates metadata_index.yaml + force=False, # do not overwrite +) +``` + +This imports the spec via: + +```python +from omi.base import get_metadata_specification +``` + +…and derives: + +* `datasets/.dataset.yaml` (with version from spec) +* `datasets/.template.yaml` (from `oem_spec.template` or pruned example resource) +* `resources//sample.resource.yaml` (sanitized from example) +* optional `metadata_index.yaml` + +You can expose a CLI command `omi init` that wraps `init_skeleton_from_spec`. --- ## Airflow Integration Example ```python -# In a DAG task (PythonOperator callable) from omi.creation.assembly import assemble_metadata_dict def build_oemetadata_for_powerplants(**context): md = assemble_metadata_dict( - base_dir="/opt/airflow/dags/metadata", # your metadata module + base_dir="/opt/airflow/dags/metadata", dataset_id="powerplants", - index_file="/opt/airflow/dags/metadata/metadata_index.yaml", # or None for discovery + index_file="/opt/airflow/dags/metadata/metadata_index.yaml", ) - # Store or pass downstream: write to file/DB/API, or XCom context["ti"].xcom_push(key="oemetadata", value=md) ``` @@ -338,43 +399,17 @@ def build_oemetadata_for_powerplants(**context): ## Testing -You can unit test assembly logic without depending on the real spec/validator by **monkeypatching** the creator. - -**Example (`tests/test_assembly.py`):** - -```python -from pathlib import Path -import yaml -import pytest -from omi.creation.assembly import assemble_metadata_dict +* **Assembly test** (uses a fake creator): see `tests/test_assembly.py` example in this doc. +* **Utils tests** (I/O, discovery, merging): see `tests/test_creation_utils.py`. 
+ It covers: -def write_yaml(p: Path, data) -> None: - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") - -class FakeCreator: - def __init__(self, oem_version: str = "OEMetadata-2.0.4"): - self.oem_version = oem_version - def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: - return {"@context": "...", **dataset, "resources": resources, "metaMetadata": {"metadataVersion": self.oem_version}} - -def test_assemble(tmp_path, monkeypatch): - write_yaml(tmp_path / "datasets" / "demo.dataset.yaml", {"dataset": {"name": "demo", "title": "Demo"}}) - write_yaml(tmp_path / "datasets" / "demo.template.yaml", {"keywords": ["k1"], "context": {"contact": "a@b"}}) - write_yaml(tmp_path / "resources" / "demo" / "a.resource.yaml", {"name": "a", "title": "A", "keywords": ["ak"]}) - write_yaml(tmp_path / "resources" / "demo" / "b.resource.yaml", {"name": "b", "title": "B", "context": {"publisher": "X"}}) - - monkeypatch.setattr("omi.creation.assembly.OEMetadataCreator", FakeCreator) - md = assemble_metadata_dict(tmp_path, "demo") - - assert md["name"] == "demo" - a, b = md["resources"] - assert a["keywords"] == ["ak", "k1"] # concat - assert b["context"]["contact"] == "a@b" # filled from template - assert b["context"]["publisher"] == "X" # resource wins -``` + * `load_parts` (template application) + * `_merge_lists`, `deep_apply_template_to_resource`, `apply_template_to_resources` + * `load_yaml` + * `discover_paths`, `resolve_from_index`, `load_parts` + * `discover_dataset_ids`, `discover_dataset_ids_from_index` -Run with: +Run: ```bash pytest -q @@ -384,8 +419,7 @@ pytest -q ## Validation & Error Handling -* `OEMetadataCreator.generate_metadata()` runs `validate_metadata(metadata, check_license=False)`. -* If validation fails, catch and inspect the exception from `omi.validation`: +`OEMetadataCreator.generate_metadata()` validates with the official schema: ```python from omi.validation import ValidationError @@ -396,76 +430,67 @@ except ValidationError as e: print("Validation failed:", e) ``` -**Common causes:** +**Common causes**: -* Missing **required** keys (e.g., field missing `"nullable"`). -* Incorrect data types (e.g., non-URI in a field that requires `format: uri`). -* Invalid list shapes (`primaryKey`, `foreignKeys`, etc.). +* Missing required field keys (e.g., a schema field without `"nullable"`). +* Wrong types (e.g., non-URI where `format: uri` is required). +* Invalid list shapes (e.g., `primaryKey`, `foreignKeys`). --- ## Auto-Generation From Directory (Optional Onboarding) -You can auto-generate a starter YAML for a dataset by scanning a directory or zip: +You can bootstrap YAMLs from a directory or zip: -* Infer resource entries based on file names & extensions. -* For CSVs, call your CSV inference to produce initial `schema.fields`. -* Write a `dataset` YAML + per-file `resource` YAMLs as a starting point. +* infer resources from file names/extensions +* for CSV, infer a table schema +* emit dataset YAML + one resource YAML per file -> Keep this as an onboarding tool; human review is still recommended. +Use filters to skip temp/log/backup files (see next section). --- ## Filtering Irrelevant Files (Optional) -If auto-generating from a directory, filter out noise: +When scanning directories, exclude noise such as backup and editor artifacts: ```python -def read_directory(directory, exclude_extensions=None, exclude_patterns=None, exclude_hidden=True): - # ... 
- # exclude_extensions=['.log','.tmp','.bak','.DS_Store','.md'] - # exclude_patterns=['*_backup.*','*~','*.old','*.ignore'] - return files +exclude_extensions = {".log", ".tmp", ".bak", ".DS_Store", ".md"} +exclude_patterns = {"*_backup.*", "*~", "*.old", "*.ignore"} +exclude_hidden = True ``` -Helps avoid including backups, temp files, editor artifacts, etc. - --- ## Design Notes & Extensibility * **Separation of concerns**: - * `utils` covers loading YAML, discovery, merging/templating. - * `assembly` orchestrates the load → merge → create flow. - * `creator` handles schema-based assembly and validation. -* **Storage-agnostic**: assembly returns a dict; you decide where to store it (file/DB/API). -* **Configurable merge**: change list concat behavior by editing `DEFAULT_CONCAT_LIST_KEYS`. + * `utils`: YAML loading, discovery, deep merge, ordering by spec. + * `assembly`: Orchestrates load → merge → create → (optionally) order. + * `creator`: Pulls spec via `get_metadata_specification`, injects `@context` and `metaMetadata`, validates. + * `scaffold`: Initializes a project from the **spec/example** (no inline strings). +* **Storage-agnostic**: assembly returns a dict; saving is up to you. +* **Configurable merging**: tweak `DEFAULT_CONCAT_LIST_KEYS` to change list concat behavior. --- ## FAQ -**Q:** Can a resource override template-provided `licenses`? -**A:** Yes. By default, **resource wins** for lists except `keywords`, `topics`, `languages` (which concatenate). You can include `"licenses"` in `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. +**Q: Can resource YAML override template-provided `licenses`?** +A: Yes. By default, resource lists override template lists except for `keywords`, `topics`, `languages` (which concatenate). Add `"licenses"` to `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. -**Q:** Where does `@context` and `metaMetadata` come from? -**A:** `OEMetadataCreator` reads the official spec via `get_metadata_specification(oem_version)` and injects `@context` and a `metaMetadata` block, then validates the final result. +**Q: Where do `@context` and `metaMetadata` come from?** +A: `OEMetadataCreator` loads the spec (`get_metadata_specification(oem_version)`) and injects both before validation. -**Q:** The output JSON shows `\u00a9` instead of `©`. -**A:** Use `ensure_ascii=False` when dumping JSON: +**Q: Why does JSON show `\u00a9` instead of `©`?** +A: Use `ensure_ascii=False` in `json.dump` to preserve unicode characters. -```python -json.dump(metadata, f, indent=2, ensure_ascii=False) -``` - -**Q:** I see validation errors about fields missing `nullable`. -**A:** Ensure each `schema.fields[]` has **`name`**, **`type`**, and **`nullable`** at minimum. If you auto-generate fields, set `nullable: false` as a safe default unless you detect nulls. +**Q: I got a validation error: `'nullable' is a required property`.** +A: Ensure each `schema.fields[]` has **`name`**, **`type`**, **`nullable`**. If you auto-generate, set `nullable: false` unless you detect nulls. -**Q:** How do I run without a template YAML? -**A:** Just omit `datasets/.template.yaml`; assembly works without it. - ---- +**Q: Can I reorder output keys to match the official example?** +A: Yes. Use `order_with_spec(metadata, creator.oem_spec)` for spec-driven ordering (no hard-coded key lists). 
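+
+A compact sketch, reusing the `creator` and `metadata` objects from the ordering example above:
+
+```python
+ordered = order_with_spec(metadata, creator.oem_spec)  # key order follows the spec example
+```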
-> If you want this split across multiple docs, consider: -> `docs/assembly-overview.md`, `docs/yaml-formats.md`, `docs/templating.md`, `docs/integration-airflow.md`, `docs/testing.md`, and `docs/troubleshooting.md`. +**Q: Can I manage multiple datasets in one metadata module?** +A: Yes. Use `assemble_many_metadata(...)` to discover/assemble **N datasets** at once (by convention or index). From 2477b1b29b8455c7f15ae50bce6299d22434902e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:17:27 +0100 Subject: [PATCH 23/37] #126: Update the create module as entry point for the oemetadata creation. It now uses the new assembler. --- src/omi/create.py | 77 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/src/omi/create.py b/src/omi/create.py index a30d2b2e..2f8faaaf 100644 --- a/src/omi/create.py +++ b/src/omi/create.py @@ -2,13 +2,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union +import json +from pathlib import Path +from typing import Optional, Union -from omi.creation.creator import OEMetadataCreator -from omi.creation.utils import apply_template_to_resources, load_parts - -if TYPE_CHECKING: - from pathlib import Path +from omi.creation.assembler import assemble_many_metadata, assemble_metadata_dict def build_from_yaml( @@ -19,26 +17,59 @@ def build_from_yaml( index_file: Optional[Union[str, Path]] = None, ) -> None: """ - Assemble OEMetadata from split YAML files. - - - datasets/.dataset.yaml - - datasets/.template.yaml (optional) - - resources//*.resource.yaml - (optionally resolved via an index YAML) + Assemble one dataset and write the resulting OEMetadata JSON to a file. Parameters ---------- - base_dir : str | Path - Root directory containing 'datasets/' and 'resources/'. + base_dir : Union[str, Path] + Base directory containing the split-files dataset structure. dataset_id : str - Logical dataset id (e.g. 'powerplants'). - output_file : str | Path - Output path for the generated OEMetadata JSON. - index_file : str | Path | None - Optional explicit mapping file (metadata_index.yaml). + The dataset ID to assemble. + output_file : Union[str, Path] + Path to write the resulting OEMetadata JSON file. + index_file : Optional[Union[str, Path]], optional + Optional path to an index file for resolving cross-dataset references, + by default None. + """ + md = assemble_metadata_dict(base_dir, dataset_id, index_file=index_file) + Path(output_file).parent.mkdir(parents=True, exist_ok=True) + Path(output_file).write_text(json.dumps(md, indent=2, ensure_ascii=False), encoding="utf-8") + + +def build_many_from_yaml( + base_dir: Union[str, Path], + output_dir: Union[str, Path], + *, + dataset_ids: Optional[list[str]] = None, + index_file: Optional[Union[str, Path]] = None, +) -> None: + """ + Assemble multiple datasets and write each as .json to output_dir. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing the split-files dataset structure. + output_dir : Union[str, Path] + Directory to write the resulting OEMetadata JSON files. + dataset_ids : Optional[list[str]], optional + Optional list of dataset IDs to assemble. If None, all datasets found + in base_dir will be assembled, by default None. + index_file : Optional[Union[str, Path]], optional + Optional path to an index file for resolving cross-dataset references, + by default None. 
""" - version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) - merged_resources = apply_template_to_resources(resources, template) + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) - creator = OEMetadataCreator(oem_version=version) - creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) + results = assemble_many_metadata( + base_dir, + dataset_ids=dataset_ids, + index_file=index_file, + as_dict=True, # keep it as a mapping id -> metadata + ) + for ds_id, md in results.items(): + (out_dir / f"{ds_id}.json").write_text( + json.dumps(md, indent=2, ensure_ascii=False), + encoding="utf-8", + ) From 7070d3b620ea3ae0d899b054c9185fafdd5defb8 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:22:35 +0100 Subject: [PATCH 24/37] #126: Add test data for "create" integration test --- .../datasets/powerplants.dataset.yaml | 6 + .../datasets/powerplants.template.yaml | 26 +++ .../powerplants/data_2.resource.yaml | 22 ++ .../oemetadata_table_template.resource.yaml | 191 ++++++++++++++++++ 4 files changed, 245 insertions(+) create mode 100644 tests/test_data/create/metadata/datasets/powerplants.dataset.yaml create mode 100644 tests/test_data/create/metadata/datasets/powerplants.template.yaml create mode 100644 tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml create mode 100644 tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml diff --git a/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml b/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml new file mode 100644 index 00000000..38bb43a2 --- /dev/null +++ b/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml @@ -0,0 +1,6 @@ +version: "OEMetadata-2.0" +dataset: + name: oep_oemetadata + title: OEP OEMetadata + description: A dataset for the OEMetadata examples. + "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ diff --git a/tests/test_data/create/metadata/datasets/powerplants.template.yaml b/tests/test_data/create/metadata/datasets/powerplants.template.yaml new file mode 100644 index 00000000..1b60853a --- /dev/null +++ b/tests/test_data/create/metadata/datasets/powerplants.template.yaml @@ -0,0 +1,26 @@ +licenses: + - name: ODbL-1.0 + title: Open Data Commons Open Database License 1.0 + path: https://opendatacommons.org/licenses/odbl/1-0/index.html + instruction: > + You are free to share and change, but you must attribute, and + share derivations under the same license. See https://tldrlegal.com/license/odc-open-database-license-(odbl) + for further information. 
+ attribution: © Reiner Lemoine Institut + copyrightStatement: https://github.com/OpenEnergyPlatform/oemetadata/blob/production/LICENSE.txt + +context: + title: NFDI4Energy + homepage: https://nfdi4energy.uol.de/ + documentation: https://nfdi4energy.uol.de/sites/about_us/ + sourceCode: https://github.com/NFDI4Energy + publisher: Open Energy Platform (OEP) + publisherLogo: https://github.com/OpenEnergyPlatform/organisation/blob/production/logo/OpenEnergyFamily_Logo_OpenEnergyPlatform.svg + contact: contact@example.com + fundingAgency: " Deutsche Forschungsgemeinschaft (DFG)" + fundingAgencyLogo: https://upload.wikimedia.org/wikipedia/commons/8/86/DFG-logo-blau.svg + grantNo: "501865131" + +topics: [model_draft] +languages: [en-GB, de-DE] +keywords: [example, ODbL-1.0, NFDI4Energy] diff --git a/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml new file mode 100644 index 00000000..a03ee242 --- /dev/null +++ b/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml @@ -0,0 +1,22 @@ +name: data_2 +type: table +title: My Second Resource + +path: reGon/metadata/data_2.csv +scheme: file +format: csv +mediatype: text/csv +encoding: utf-8 + +schema: + fields: + - name: h + type: integer + nullable: true + - name: i + type: integer + nullable: true + - name: o + type: string + nullable: true + primaryKey: [id] diff --git a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml new file mode 100644 index 00000000..1a030e54 --- /dev/null +++ b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml @@ -0,0 +1,191 @@ +name: oemetadata_table_template +type: table +title: OEMetadata Table Template +description: Example table used to illustrate the OEMetadata structure and features. +"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv + +# Resource-specific attributes (template will add licenses/context/topics/languages/keywords) +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template +scheme: http +format: CSV +encoding: UTF-8 + +dialect: + decimalSeparator: "." 
+ delimiter: ";" + +schema: + fields: + - name: id + type: integer + description: Unique identifier + nullable: false + unit: null + isAbout: + - name: identifier + "@id": http://purl.obolibrary.org/obo/IAO_0020000 + valueReference: + - value: null + name: null + "@id": null + - name: name + type: string + description: Technology Name + nullable: true + unit: null + isAbout: + - name: power generation technology + "@id": http://openenergy-platform.org/ontology/oeo/OEO_00010423 + valueReference: + - value: wind + name: wind power technology + "@id": http://openenergyplatform.org/ontology/oeo/OEO_00010424 + - name: type + type: string + description: Type of wind farm + nullable: true + unit: null + isAbout: + - name: wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000447/ + valueReference: + - value: onshore + name: onshore wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000311/ + - value: offshore + name: offshore wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000308/ + - name: year + type: integer + description: Reference year + nullable: true + unit: null + isAbout: + - name: year + "@id": https://openenergyplatform.org/ontology/oeo/UO_0000036/ + valueReference: + - value: null + name: null + "@id": null + - name: value + type: number + description: Bruttoleistung + nullable: true + unit: MW + isAbout: + - name: nameplate capacity + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00230003/ + valueReference: + - value: null + name: null + "@id": null + - name: is_active + type: boolean + description: Betriebsstatus + nullable: false + unit: null + isAbout: + - name: Operating Mode Status + "@id": https://ontology.brickschema.org/brick/Operating_Mode_Status + valueReference: + - value: null + name: null + "@id": null + - name: version + type: integer + description: Version + nullable: true + unit: null + isAbout: + - name: version number + "@id": http://purl.obolibrary.org/obo/IAO_0000129 + valueReference: + - value: null + name: null + "@id": null + - name: comment + type: string + description: "" + nullable: true + unit: null + isAbout: + - name: comment + "@id": http://semanticscience.org/resource/SIO_001167 + valueReference: + - value: null + name: null + "@id": null + primaryKey: [id] + foreignKeys: + - fields: [id, version] + reference: + resource: model_draft.oep_oemetadata_table_example_version + fields: [id, version] + + +sources: + - title: IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report + authors: [Hoesung Lee, José Romero, The Core Writing Team] + description: A Report of the Intergovernmental Panel on Climate Change. + publicationYear: "2023" + path: https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf + sourceLicenses: + - name: CC-BY-4.0 + title: Creative Commons Attribution 4.0 International + path: https://creativecommons.org/licenses/by/4.0/legalcode + instruction: > + You are free to share and change, but you must attribute. + See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. 
+ attribution: © Intergovernmental Panel on Climate Change 2023 + copyrightStatement: https://www.ipcc.ch/copyright/ + +subject: + - name: energy + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000150 + +publicationDate: "2024-10-15" + +# embargoPeriod: +# start: "2024-10-11" +# end: "2025-01-01" +# isActive: true + +spatial: + location: + address: Rudower Chaussee 12, 12489 Berlin + "@id": https://www.wikidata.org/wiki/Q77077223 + latitude: "52.432822" + longitude: "13.5351004" + extent: + name: Berlin + "@id": https://www.wikidata.org/wiki/Q64 + resolutionValue: "100" + resolutionUnit: m + boundingBox: [13.08825, 52.33859, 13.76104, 52.6754] + crs: EPSG:4326 + +temporal: + referenceDate: "2020-01-01" + timeseries: + - start: "2020-01-01T00:00:00+01:00" + end: "2020-01-01T23:59:30+01:00" + resolutionValue: "15" + resolutionUnit: min + alignment: left + aggregationType: current + +contributors: + - title: Ludwig Hülk + path: https://github.com/Ludee + organization: Reiner Lemoine Institut + roles: [DataCollector] + date: "2024-11-19" + object: data + comment: Date of data creation + - title: Ludwig Hülk + path: https://github.com/Ludee + organization: Reiner Lemoine Institut + roles: [DataCurator] + date: "2024-11-30" + object: metadata + comment: Date of metadata creation From 666242b0d29da795cebe27e1b49d9fbec8a4d8a1 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:23:46 +0100 Subject: [PATCH 25/37] #126: Add test for creation module entry point "create" as integration test using test data --- tests/test_create.py | 76 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/test_create.py diff --git a/tests/test_create.py b/tests/test_create.py new file mode 100644 index 00000000..708f89c2 --- /dev/null +++ b/tests/test_create.py @@ -0,0 +1,76 @@ +""" +Integration tests for OEMetadata assembly and entry point using real YAML. + +This test suite consumes the example YAML tree located at: +tests/test_data/create/metadata/ +and verifies that OMI assembles and writes a valid OEMetadata document. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +from omi.create import build_from_yaml +from omi.creation.assembler import assemble_metadata_dict + + +def _fixture_metadata_root() -> Path: + """Return the absolute path to tests/test_data/create/metadata.""" + here = Path(__file__).resolve().parent + return here / "test_data" / "create" / "metadata" + + +def test_assemble_metadata_dict_with_fixture() -> None: + """Assemble OEMetadata dict from the real fixture and assert key content.""" + base = _fixture_metadata_root() + dataset_id = "powerplants" + + md = assemble_metadata_dict(base, dataset_id) + + # dataset-level checks (from powerplants.dataset.yaml) + assert md["name"] == "oep_oemetadata" + assert md["title"] == "OEP OEMetadata" + assert md["@id"].startswith("https://databus.openenergyplatform.org/") + + # context injected from template if not overridden in resource + assert "resources" in md + assert isinstance(md["resources"], list) + assert md["resources"] + r_names = {r["name"] for r in md["resources"]} + # Both resources from your example exist + assert {"oemetadata_table_template", "data_2"}.issubset(r_names) + + # Check one resource that should have inherited from template + r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table_template") + assert r1["context"]["title"] == "NFDI4Energy" # from template + assert "licenses" in r1 + assert isinstance(r1["licenses"], list) + assert r1["licenses"] + assert r1["licenses"][0]["name"] in {"ODbL-1.0", "ODbL-1.0".upper(), "ODBL-1.0"} + + # Meta metadata is present + assert "metaMetadata" in md + assert md["metaMetadata"]["metadataVersion"].startswith("OEMetadata-2.0") + + +def test_entrypoint_build_from_yaml_writes_file(tmp_path: Path) -> None: + """Use the real entry point to write JSON and compare basic structure.""" + base = _fixture_metadata_root() + out = tmp_path / "out" / "powerplants.json" + + build_from_yaml(base, "powerplants", out) + + assert out.exists(), "Entry point did not write the output file." + written = json.loads(out.read_text(encoding="utf-8")) + + # Sanity checks on written JSON + assert written["name"] == "oep_oemetadata" + assert isinstance(written["resources"], list) + assert written["resources"] + # Ensure unicode is preserved (© should not be escaped) + licenses = written["resources"][0].get("licenses", []) + if licenses: + # stringify to inspect the character; ensure_ascii=False in writer preserves it + text = json.dumps(licenses[0], ensure_ascii=False) + assert "©" in text From 41aafda1a7511e78388d1c9b5168253b1be5c799 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:30:29 +0100 Subject: [PATCH 26/37] #126: Add docs on how to use the create module (entry point for creation module) --- docs/create.md | 159 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 docs/create.md diff --git a/docs/create.md b/docs/create.md new file mode 100644 index 00000000..09d6481a --- /dev/null +++ b/docs/create.md @@ -0,0 +1,159 @@ +# OMI “Create” Entry Point + +This mini-guide explains how to use the **programmatic entry points** that turn your split YAML metadata (dataset + template + resources) into a single OEMetadata JSON document. + +> If you’re looking for how to author the YAML files and how templating works, see the main **Assembly Guide** in the `creation` module directory. This page just shows how to *call* the entry points. 
+
+---
+
+## What it does
+
+The functions in `omi.create` wrap the full assembly pipeline:
+
+1. **Discover / load** your YAML parts (dataset, optional template, resources).
+2. **Apply the template** to each resource (deep merge; resource wins; keywords/topics/languages concatenate).
+3. **Generate & validate** the final OEMetadata JSON using the official schema (via `OEMetadataCreator`).
+4. **Write** the result to disk (`build_from_yaml`) or many results to a directory (`build_many_from_yaml`).
+
+---
+
+## API
+
+```python
+from omi.create import build_from_yaml, build_many_from_yaml
+```
+
+### `build_from_yaml(base_dir, dataset_id, output_file, *, index_file=None) -> None`
+
+Assemble **one** dataset and write `<output_file>` (JSON).
+
+* `base_dir` (`str | Path`): Root that contains:
+
+  * `datasets/<dataset_id>.dataset.yaml`
+  * `datasets/<dataset_id>.template.yaml` *(optional)*
+  * `resources/<dataset_id>/*.resource.yaml`
+* `dataset_id` (`str`): Logical dataset name (e.g. `"powerplants"`).
+* `output_file` (`str | Path`): Path to write the generated OEMetadata JSON.
+* `index_file` (`str | Path | None`): Optional explicit mapping file (`metadata_index.yaml`). If provided, paths are taken from the index instead of the convention-based layout.
+
+### `build_many_from_yaml(base_dir, output_dir, *, dataset_ids=None, index_file=None) -> None`
+
+Assemble **multiple** datasets and write each as `<output_dir>/<dataset_id>.json`.
+
+* `base_dir` (`str | Path`): Same as above.
+* `output_dir` (`str | Path`): Destination directory for one JSON file per dataset.
+* `dataset_ids` (`list[str] | None`): Limit to specific datasets. If `None`, we:
+
+  * Use keys from `index_file` when provided, **else**
+  * Discover all `datasets/*.dataset.yaml` in `base_dir`.
+* `index_file` (`str | Path | None`): Optional `metadata_index.yaml`.
+
+---
+
+## Quick examples
+
+### One dataset (convention-based discovery)
+
+```python
+from omi.create import build_from_yaml
+
+build_from_yaml(
+    base_dir="./metadata",
+    dataset_id="powerplants",
+    output_file="./out/powerplants.json",
+)
+```
+
+Directory layout:
+
+```bash
+metadata/
+  datasets/
+    powerplants.dataset.yaml
+    powerplants.template.yaml   # optional
+  resources/
+    powerplants/
+      *.resource.yaml
+```
+
+### One dataset (explicit index)
+
+```python
+from omi.create import build_from_yaml
+
+build_from_yaml(
+    base_dir="./metadata",
+    dataset_id="powerplants",
+    output_file="./out/powerplants.json",
+    index_file="./metadata/metadata_index.yaml",
+)
+```
+
+### Many datasets (discover all)
+
+```python
+from omi.create import build_many_from_yaml
+
+build_many_from_yaml(
+    base_dir="./metadata",
+    output_dir="./out",
+)
+# writes ./out/<dataset_id>.json for each dataset found
+```
+
+### Many datasets (index + subset)
+
+```python
+from omi.create import build_many_from_yaml
+
+build_many_from_yaml(
+    base_dir="./metadata",
+    output_dir="./out",
+    dataset_ids=["powerplants", "households"],
+    index_file="./metadata/metadata_index.yaml",
+)
+```
+
+---
+
+## Notes & behavior
+
+* Output JSON is written with `indent=2` and **`ensure_ascii=False`** to preserve characters like `©`.
+* Validation happens via `OEMetadataCreator` using the official schema provided by `oemetadata` (imported through `omi.base.get_metadata_specification`).
+* If a dataset YAML is missing, `FileNotFoundError` is raised.
+* If schema validation fails, you’ll get an exception from `omi.validation`. Catch it where you call the entry point if you want to handle/report errors; see the sketch below.
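+
+A minimal sketch of that error handling (`ValidationError` is the exception class exposed by `omi.validation`; the paths below are placeholders):
+
+```python
+from omi.create import build_from_yaml
+from omi.validation import ValidationError
+
+try:
+    build_from_yaml("./metadata", "powerplants", "./out/powerplants.json")
+except FileNotFoundError as err:
+    # a dataset/template/resource YAML part is missing
+    print(f"Missing YAML part: {err}")
+except ValidationError as err:
+    # the assembled document does not satisfy the OEMetadata schema
+    print(f"Schema validation failed: {err}")
+```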
+
+---
+
+## Using in third-party code (e.g. data pipelines)
+
+```python
+from pathlib import Path
+from omi.create import build_from_yaml
+
+def build_oemetadata_callable(**context):
+    base = Path("/opt/airflow/dags/metadata")
+    out = Path("/opt/airflow/out/powerplants.json")
+    build_from_yaml(base, "powerplants", out)
+    # optionally push to XCom, publish, upload, etc.
+```
+
+---
+
+## Testing tips
+
+* For **unit tests** of `omi.create`, patch `omi.create.assemble_metadata_dict` / `assemble_many_metadata` and verify files are written.
+* For **integration tests**, put real example YAMLs under `tests/test_data/create/metadata/` and call `build_from_yaml` end-to-end.
+
+---
+
+## Troubleshooting
+
+* **“Dataset YAML not found”**
+  Check that `base_dir/datasets/<dataset_id>.dataset.yaml` exists, or supply the correct `index_file`.
+
+* **Unicode characters appear escaped (`\u00a9`)**
+  Ensure you’re not re-writing the JSON elsewhere with `ensure_ascii=True`.
+
+* **Template not applied**
+  Confirm your template file name matches `<dataset_id>.template.yaml` (or is correctly referenced from the index), and that the keys you expect to inherit aren’t already set in the resource (resource values win).

From 70435e98e870dfdcf0f298491453f4988d443391 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Thu, 6 Nov 2025 18:36:15 +0100
Subject: [PATCH 27/37] deactivate test

---
 tests/test_metadata_validation.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_metadata_validation.py b/tests/test_metadata_validation.py
index 2a0de492..278f251d 100644
--- a/tests/test_metadata_validation.py
+++ b/tests/test_metadata_validation.py
@@ -110,11 +110,10 @@ def deactivate__test_metadata_against_oep_table():
     validation.validate_oep_table_against_metadata(oep_table=table, oep_schema="model_draft", metadata=metadata)
 
 
-def test_metadata_against_oep_table_using_metadata_from_oep():
-    """Test OEP table definition against OEP metadata, where metadata is taken from OEP."""
-    table = "x2x_p2gas_soec_1"
-    with pytest.raises(validation.ValidationError, match="None is not of type 'object'"):
-        validation.validate_oep_table_against_metadata(oep_table=table, oep_schema="model_draft")
+# Test always fails because the table no longer exists on the OEP
+# def test_metadata_against_oep_table_using_metadata_from_oep():
+#     """Test OEP table definition against OEP metadata, where metadata is taken from OEP."""
+#     with pytest.raises(validation.ValidationError, match="None is not of type 'object'"):
 
 
 def test_metadata_against_oep_table_invalid_name():

From 59f4263f9ef9c8d8e703e6406e3522bb5bf6c21d Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Thu, 6 Nov 2025 22:31:14 +0100
Subject: [PATCH 28/37] remove irritating info from example resource name

---
 src/omi/creation/README.md                           | 12 ++++++------
 tests/test_create.py                                 |  4 ++--
 ....resource.yaml => oemetadata_table.resource.yaml} |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)
 rename tests/test_data/create/metadata/resources/powerplants/{oemetadata_table_template.resource.yaml => oemetadata_table.resource.yaml} (99%)

diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md
index cdcd0135..7fedbfef 100644
--- a/src/omi/creation/README.md
+++ b/src/omi/creation/README.md
@@ -139,14 +139,14 @@ keywords: [example, ODbL-1.0, NFDI4Energy]
 ### Resource YAML
 
 ```yaml
-# metadata/resources/powerplants/oemetadata_table_template.resource.yaml
-name: oemetadata_table_template
+# metadata/resources/powerplants/oemetadata_table.resource.yaml
+name: oemetadata_table
 type: table
 title: OEMetadata Table Template
 description: Example table used to illustrate the OEMetadata structure and features.
 
 # Resource-specific attributes
-path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template
+path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table
 scheme: http
 format: CSV
 encoding: UTF-8
@@ -227,7 +227,7 @@ datasets:
     dataset: datasets/powerplants.dataset.yaml
     template: datasets/powerplants.template.yaml
     resources:
-      - resources/powerplants/oemetadata_table_template.resource.yaml
+      - resources/powerplants/oemetadata_table.resource.yaml
      - resources/powerplants/data_2.resource.yaml
 ```
 
@@ -285,7 +285,7 @@ from omi.creation.utils import load_yaml, apply_template_to_resources
 dataset = load_yaml(Path("./metadata/datasets/powerplants.dataset.yaml")).get("dataset", {})
 template = load_yaml(Path("./metadata/datasets/powerplants.template.yaml"))
 resources = [
-    load_yaml(Path("./metadata/resources/powerplants/oemetadata_table_template.resource.yaml")),
+    load_yaml(Path("./metadata/resources/powerplants/oemetadata_table.resource.yaml")),
     load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")),
 ]
 resources = apply_template_to_resources(resources, template)
@@ -358,7 +358,7 @@ paths = init_skeleton_from_spec(
     base_dir="./metadata",
     dataset_id="powerplants",
     oem_version="OEMetadata-2.0.4",
-    resource_name="oemetadata_table_template",
+    resource_name="oemetadata_table",
     with_index=True,  # creates metadata_index.yaml
     force=False,      # do not overwrite
 )
diff --git a/tests/test_create.py b/tests/test_create.py
index 708f89c2..7cadf2d8 100644
--- a/tests/test_create.py
+++ b/tests/test_create.py
@@ -39,10 +39,10 @@ def test_assemble_metadata_dict_with_fixture() -> None:
     assert md["resources"]
     r_names = {r["name"] for r in md["resources"]}
     # Both resources from the example fixture exist
-    assert {"oemetadata_table_template", "data_2"}.issubset(r_names)
+    assert {"oemetadata_table", "data_2"}.issubset(r_names)
 
     # Check one resource that should have inherited from template
-    r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table_template")
+    r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table")
     assert r1["context"]["title"] == "NFDI4Energy"  # from template
     assert "licenses" in r1
     assert isinstance(r1["licenses"], list)
diff --git a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml
similarity index 99%
rename from tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml
rename to tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml
index 1a030e54..f28b8392 100644
--- a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml
+++ b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml
@@ -1,11 +1,11 @@
-name: oemetadata_table_template
+name: oemetadata_table
 type: table
 title: OEMetadata Table Template
 description: Example table used to illustrate the OEMetadata structure and features.
"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv # Resource-specific attributes (template will add licenses/context/topics/languages/keywords) -path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table scheme: http format: CSV encoding: UTF-8 From b37ecf07d4a6b2bdf2d8c2b4bc832bb49240a52c Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 19 Nov 2025 10:40:39 +0100 Subject: [PATCH 29/37] #126: Update create docs --- docs/create.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/create.md b/docs/create.md index 09d6481a..20b816e7 100644 --- a/docs/create.md +++ b/docs/create.md @@ -132,10 +132,10 @@ from pathlib import Path from omi.create import build_from_yaml def build_oemetadata_callable(**context): - base = Path("/opt/airflow/dags/metadata") - out = Path("/opt/airflow/out/powerplants.json") + base = Path("/project/metadata") + out = Path("/project/metadata/out/powerplants.json") build_from_yaml(base, "powerplants", out) - # optionally push to XCom, publish, upload, etc. + # optionally push to airflow XCom, publish, upload, etc. ``` --- From c269469c22354d08bb1aae7b289c671dfcb00a23 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 19 Nov 2025 10:45:46 +0100 Subject: [PATCH 30/37] #126: Add CLI command to initialize a new metadata workspace with template contents --- src/omi/cli.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/omi/cli.py b/src/omi/cli.py index a596a5c6..fbfb722e 100644 --- a/src/omi/cli.py +++ b/src/omi/cli.py @@ -24,6 +24,7 @@ import click from omi.creation.creator import OEMetadataCreator +from omi.creation.init import init_dataset, init_resources_from_files from omi.creation.utils import apply_template_to_resources, load_parts @@ -62,11 +63,56 @@ def assemble_cmd(base_dir: Path, dataset_id: str, output_file: Path, index_file: creator = OEMetadataCreator(oem_version=version) creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) - click.echo(f"OEMetadata written to {output_file}") + +@click.group() +def init() -> None: + """Scaffold OEMetadata split-files layout.""" + + +@init.command("dataset") +@click.argument("base_dir", type=click.Path(file_okay=False, path_type=Path)) +@click.argument("dataset_id") +@click.option("--oem-version", default="OEMetadata-2.0", show_default=True) +@click.option("--resource", "resources", multiple=True, help="Initial resource names (repeatable).") +@click.option("--overwrite", is_flag=True, help="Overwrite existing files.") +def init_dataset_cmd( + base_dir: Path, + dataset_id: str, + oem_version: str, + resources: tuple[str, ...], + *, + overwrite: bool, +) -> None: + """Initialize a split-files OEMetadata dataset layout under BASE_DIR.""" + res = init_dataset(base_dir, dataset_id, oem_version=oem_version, resources=resources, overwrite=overwrite) + click.echo(f"dataset: {res.dataset_yaml}") + click.echo(f"template: {res.template_yaml}") + for p in res.resource_yamls: + click.echo(f"resource: {p}") + + +@init.command("resources") +@click.argument("base_dir", type=click.Path(file_okay=False, path_type=Path)) +@click.argument("dataset_id") +@click.argument("files", nargs=-1, type=click.Path(exists=True, dir_okay=False, path_type=Path)) +@click.option("--oem-version", default="OEMetadata-2.0", 
+@click.option("--overwrite", is_flag=True, help="Overwrite existing files.")
+def init_resources_cmd(
+    base_dir: Path,
+    dataset_id: str,
+    files: tuple[Path, ...],
+    oem_version: str,
+    *,
+    overwrite: bool,
+) -> None:
+    """Create resource YAML files for DATASET_ID from the given FILES."""
+    outs = init_resources_from_files(base_dir, dataset_id, files, oem_version=oem_version, overwrite=overwrite)
+    for p in outs:
+        click.echo(p)
 
 
 # Keep CommandCollection for backwards compatibility with your entry point
-cli = click.CommandCollection(sources=[grp])
+cli = click.CommandCollection(sources=[grp, init])
 
 
 def main() -> None:

From a4bedf2d18a6c5255baf09bf87ba8fdc6612a669 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 11:57:33 +0100
Subject: [PATCH 31/37] #126: add omi scripts to project

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index cd3fadfd..585b49fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,3 +78,6 @@ unfixable = ["UP007", "I001"]
 "*/__init__.py" = [
   "D104", # Missing docstring in public package
 ]
+
+[project.scripts]
+omi = "omi.cli:main"

From b1dbbf8e1143f048375c39a4afd57d06685a9687 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 11:57:53 +0100
Subject: [PATCH 32/37] #126: enhance docstring

---
 src/omi/creation/assembler.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/omi/creation/assembler.py b/src/omi/creation/assembler.py
index 9814f0ad..edaa4318 100644
--- a/src/omi/creation/assembler.py
+++ b/src/omi/creation/assembler.py
@@ -60,6 +60,24 @@ def assemble_many_metadata(
       * otherwise -> discover by 'datasets/*.dataset.yaml'
     - Returns a mapping {dataset_id: metadata} if as_dict=True,
       else a list of (dataset_id, metadata) pairs in sorted id order.
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+    dataset_ids : Optional[Iterable[str]], optional
+        Optional iterable of dataset IDs to assemble. If None, all datasets found
+        in base_dir will be assembled, by default None.
+    index_file : Optional[Union[str, Path]], optional
+        Optional path to an index YAML file for resolving dataset parts.
+    as_dict : bool, optional
+        Whether to return results as a dict mapping dataset_id to metadata. If False,
+        returns a list of (dataset_id, metadata) tuples, by default True.
+
+    Returns
+    -------
+    Union[dict[str, dict], list[tuple[str, dict]]]
+        Assembled OEMetadata for each dataset.
     """
     base = Path(base_dir)

From 47117b0f918ede01a48750f6d4d6a39cdd347ef7 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 12:06:58 +0100
Subject: [PATCH 33/37] #126: Add creation init module to provide the backend
 for the CLI functionality:

- Initializing a new dataset from yaml files
- Adding resources to the dataset, either empty or from files with inferred
  metadata
---
 src/omi/creation/init.py | 229 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 src/omi/creation/init.py

diff --git a/src/omi/creation/init.py b/src/omi/creation/init.py
new file mode 100644
index 00000000..8be170ff
--- /dev/null
+++ b/src/omi/creation/init.py
@@ -0,0 +1,229 @@
+"""
+Initialization helpers for OEMetadata split-files layout.
+
+Provides functions to scaffold dataset and resource YAML files and to
+infer resource information from existing data files.
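+
+Example of scaffolding a dataset and then adding resources from data files
+(the paths and dataset id below are hypothetical)::
+
+    from pathlib import Path
+
+    from omi.creation.init import init_dataset, init_resources_from_files
+
+    # create dataset, template, and one empty resource stub
+    init_dataset(Path("./metadata"), "powerplants", resources=["table_1"])
+    # add a resource stub inferred from an existing CSV file
+    init_resources_from_files(Path("./metadata"), "powerplants", [Path("data.csv")])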
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import yaml + +from omi.base import get_metadata_specification +from omi.inspection import InspectionError, infer_metadata + +if TYPE_CHECKING: + from collections.abc import Iterable + from pathlib import Path + + +@dataclass +class InitResult: + """Paths to created or reused YAML files for a single dataset.""" + + dataset_yaml: Path + template_yaml: Path + resource_yamls: list[Path] + + +# ----------------------------- +# helpers +# ----------------------------- + + +def _blankify(obj: object) -> object: + """ + Return a copy of `obj` with the same structure but 'empty' leaf values. + + Rules: + - dict -> recursively blankify values + - list -> [] if scalar list; if list of dicts and non-empty, keep one blankified element; else [] + - str -> "" + - bool -> False + - int/float -> "" (prefer empty so users must choose proper types) + - None -> None + - everything else -> "" + """ + if isinstance(obj, dict): + blank: object = {k: _blankify(v) for k, v in obj.items()} + elif isinstance(obj, list): + if not obj: + blank = [] + else: + first = obj[0] + # show one skeleton item so users see the structure for list-of-dicts; + # scalar lists -> show empty by default + blank = [_blankify(first)] if isinstance(first, dict) else [] + elif isinstance(obj, str): + blank = "" + elif isinstance(obj, bool): + blank = False + elif obj is None: + blank = None + else: + # numbers / other scalars -> empty + blank = "" + return blank + + +def _load_spec_template(oem_version: str) -> dict: + """Return the raw OEMetadata template document for the given version.""" + spec = get_metadata_specification(oem_version) + return spec.template or {} + + +def _dataset_stub_from_spec_template(oem_version: str, dataset_id: str) -> dict: + """ + Build datasets/.dataset.yaml from top-level template (not from resources). + + Remove @context/resources/metaMetadata and blankify the rest. 
+    """
+    t = _load_spec_template(oem_version).copy()
+    t.pop("@context", None)
+    t.pop("resources", None)  # <-- filter out resource-level keys
+    t.pop("metaMetadata", None)
+
+    blank = _blankify(t)
+    blank.setdefault("name", dataset_id)
+    blank.setdefault("title", "")
+    blank.setdefault("description", "")
+    blank.setdefault("@id", "")
+
+    return {"version": oem_version, "dataset": blank}
+
+
+def _resource_template_from_spec(oem_version: str) -> dict:
+    """Build datasets/<dataset_id>.template.yaml from the *first* resource template only."""
+    tmpl = _load_spec_template(oem_version)
+    resources = tmpl.get("resources") or []
+    base = resources[0] if resources else {}
+    return _blankify(base)
+
+
+def _resource_stub_from_spec(oem_version: str, resource_name: str) -> dict:
+    """Build resources/<dataset_id>/<resource_name>.resource.yaml from the resource template."""
+    res = _resource_template_from_spec(oem_version)
+    res["name"] = resource_name
+    return res
+
+
+def _dump_yaml(path: Path, data: dict, *, overwrite: bool) -> Path:
+    """Write `data` as YAML to `path`, respecting the `overwrite` flag."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    if path.exists() and not overwrite:
+        return path
+    path.write_text(
+        yaml.safe_dump(data, sort_keys=False, allow_unicode=True),
+        encoding="utf-8",
+    )
+    return path
+
+
+# -----------------------------
+# public API
+# -----------------------------
+
+
+def init_dataset(
+    base_dir: Path,
+    dataset_id: str,
+    *,
+    oem_version: str = "OEMetadata-2.0",
+    resources: Iterable[str] = (),
+    overwrite: bool = False,
+) -> InitResult:
+    """
+    Create or extend the split-files layout for one dataset.
+
+    Creates:
+
+    - datasets/<dataset_id>.dataset.yaml
+    - datasets/<dataset_id>.template.yaml
+    - resources/<dataset_id>/<resource_name>.resource.yaml for each requested resource.
+    """
+    # touch spec (also ensures the version string is valid)
+    _ = get_metadata_specification(oem_version)
+
+    dataset_yaml = base_dir / "datasets" / f"{dataset_id}.dataset.yaml"
+    template_yaml = base_dir / "datasets" / f"{dataset_id}.template.yaml"
+
+    dataset_doc = _dataset_stub_from_spec_template(oem_version, dataset_id)
+    resource_template_doc = _resource_template_from_spec(oem_version)
+
+    out_dataset = _dump_yaml(dataset_yaml, dataset_doc, overwrite=overwrite)
+    out_template = _dump_yaml(template_yaml, resource_template_doc, overwrite=overwrite)
+
+    created_resources: list[Path] = []
+    for res_name in resources:
+        res_doc = _resource_stub_from_spec(oem_version, res_name)
+        res_path = base_dir / "resources" / dataset_id / f"{res_name}.resource.yaml"
+        created_resources.append(_dump_yaml(res_path, res_doc, overwrite=overwrite))
+
+    return InitResult(dataset_yaml=out_dataset, template_yaml=out_template, resource_yamls=created_resources)
+
+
+def init_resources_from_files(
+    base_dir: Path,
+    dataset_id: str,
+    files: Iterable[Path],
+    *,
+    oem_version: str = "OEMetadata-2.0.4",
+    overwrite: bool = False,
+) -> list[Path]:
+    """
+    Create resource stubs for DATASET_ID from the given FILES.
+
+    Uses the spec resource template structure, prefills name/path/format hints,
+    and for CSV files also infers a schema (fields + types) using `omi.inspection`.
+    """
+    _ = get_metadata_specification(oem_version)
+
+    outputs: list[Path] = []
+    for f in files:
+        name = f.stem
+        ext = f.suffix.lower().lstrip(".")
+        res = _resource_stub_from_spec(oem_version, name)
+        res["path"] = str(f)
+
+        # Lightweight format hinting (non-authoritative; user should review)
+        if ext == "csv":
+            res.setdefault("format", "CSV")
+            res.setdefault("encoding", "UTF-8")
+            res.setdefault("scheme", "file")
+
+            # Use existing inspection: "OEP" == OEMetadata in this code base
+            try:
+                inferred = infer_metadata(str(f), metadata_format="OEP")
+            except InspectionError:
+                inferred = None
+
+            if inferred is not None:
+                # We only care about the *resource* part here
+                try:
+                    inferred_resource = inferred["resources"][0]
+                    inferred_schema = inferred_resource.get("schema")
+                except (KeyError, IndexError, TypeError):
+                    inferred_schema = None
+
+                if inferred_schema:
+                    # Overwrite/attach the schema from inspection to this resource stub
+                    res["schema"] = inferred_schema
+
+        elif ext == "json":
+            res.setdefault("format", "json")
+            res.setdefault("scheme", "file")
+        elif ext == "xlsx":
+            res.setdefault("format", "xlsx")
+            res.setdefault("scheme", "file")
+        else:
+            if ext:
+                res.setdefault("format", ext)
+            res.setdefault("scheme", "file")
+
+        out_path = base_dir / "resources" / dataset_id / f"{name}.resource.yaml"
+        outputs.append(_dump_yaml(out_path, res, overwrite=overwrite))
+
+    return outputs

From 2169357c3c59d41dfbb9db5ad5dfcaa465f868fb Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 12:07:14 +0100
Subject: [PATCH 34/37] #126: enhance docstrings

---
 src/omi/creation/utils.py | 63 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py
index a0e74a6a..fa591863 100644
--- a/src/omi/creation/utils.py
+++ b/src/omi/creation/utils.py
@@ -162,6 +162,26 @@ def resolve_from_index(
           - path/to/res2.yaml
 
     Paths are interpreted as relative to `base_dir`.
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+    dataset_id : str
+        Identifier for the dataset to load.
+    index_file : Optional[Union[str, Path]]
+        Optional path to an index YAML file for resolving dataset parts.
+
+    Returns
+    -------
+    tuple[Optional[Path], Optional[Path], list[Path]]
+        A tuple containing:
+        - dataset_path: Optional[Path]
+            Path to the dataset YAML (or None if not found).
+        - template_path: Optional[Path]
+            Path to the template YAML (or None if not found).
+        - resource_paths: list[Path]
+            List of paths to resource YAMLs.
     """
     if not index_file:
         return discover_paths(base_dir, dataset_id)
@@ -185,6 +205,29 @@ def load_parts(
     Load dataset YAML, optional template YAML, and all resource YAMLs.
 
     Returns a tuple: (version, dataset, resources, template).
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+    dataset_id : str
+        Identifier for the dataset to load.
+    index_file : Optional[Union[str, Path]], optional
+        Optional path to an index YAML file for resolving dataset parts,
+        by default None.
+
+    Returns
+    -------
+    tuple[str, dict[str, object], list[dict[str, object]], dict[str, object]]
+        A tuple containing:
+        - version: str
+            The OEMetadata version from the dataset YAML (default "OEMetadata-2.0.4").
+        - dataset: dict[str, object]
+            The dataset mapping from the dataset YAML.
+        - resources: list[dict[str, object]]
+            A list of resource mappings from the resource YAMLs.
+        - template: dict[str, object]
+            The template mapping from the template YAML (empty dict if none).
     """
     dataset_path, template_path, resource_paths = resolve_from_index(base_dir, dataset_id, index_file)

@@ -209,6 +252,16 @@ def discover_dataset_ids(base_dir: Union[str, Path]) -> list[str]:
     Discover dataset ids by scanning datasets/*.dataset.yaml.
 
     For 'datasets/powerplants.dataset.yaml' returns 'powerplants'.
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+
+    Returns
+    -------
+    list[str]
+        Sorted list of discovered dataset IDs.
     """
     base = Path(base_dir)
     datasets_dir = base / "datasets"
@@ -222,6 +275,16 @@ def discover_dataset_ids_from_index(index_file: Union[str, Path]) -> list[str]:
     Discover dataset ids from an explicit metadata_index.yaml.
 
     Returns the sorted list of top-level keys under `datasets`.
+
+    Parameters
+    ----------
+    index_file : Union[str, Path]
+        Path to an index YAML file for resolving dataset parts.
+
+    Returns
+    -------
+    list[str]
+        Sorted list of discovered dataset IDs.
     """
     idx_path = Path(index_file)
     if not idx_path.exists():

From 90f4ae0d8a9147225da13c512ebc2d0911260bd4 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 12:46:21 +0100
Subject: [PATCH 35/37] #126: add more tests to creation test module

---
 tests/test_create.py | 70 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tests/test_create.py b/tests/test_create.py
index 7cadf2d8..be25b7a5 100644
--- a/tests/test_create.py
+++ b/tests/test_create.py
@@ -1,5 +1,5 @@
 """
-Integration tests for OEMetadata assembly and entry point using real YAML.
+Integration tests for OEMetadata assembly and entry point using YAML test data.
 
 This test suite consumes the example YAML tree located at:
 tests/test_data/create/metadata/
@@ -10,10 +10,14 @@
 
 import json
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 from omi.create import build_from_yaml
 from omi.creation.assembler import assemble_metadata_dict
 
+if TYPE_CHECKING:
+    import pytest
+
 
 def _fixture_metadata_root() -> Path:
     """Return the absolute path to tests/test_data/create/metadata."""
@@ -74,3 +78,67 @@ def test_entrypoint_build_from_yaml_writes_file(tmp_path: Path) -> None:
         # stringify to inspect the character; ensure_ascii=False in writer preserves it
         text = json.dumps(licenses[0], ensure_ascii=False)
         assert "©" in text
+
+
+def test_build_from_yaml_writes_file_when_output_is_file(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Ensure build_from_yaml writes to the exact file path provided."""
+    from omi import create as create_mod
+
+    expected: dict[str, object] = {"name": "pp", "resources": []}
+
+    # Avoid needing real YAML on disk
+    def fake_assemble(
+        _base_dir: Path,
+        dataset_id: str,
+        _index_file: Path | None = None,
+    ) -> dict[str, object]:
+        assert dataset_id == "powerplants"
+        return expected
+
+    monkeypatch.setattr(create_mod, "assemble_metadata_dict", fake_assemble)
+
+    out = tmp_path / "out.json"
+    create_mod.build_from_yaml(tmp_path / "meta", "powerplants", out)
+
+    assert out.exists()
+    assert json.loads(out.read_text(encoding="utf-8")) == expected
+
+
+def test_build_many_from_yaml_writes_many_default_names(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Ensure build_many_from_yaml writes <dataset_id>.json files into output_dir."""
+    from omi import create as create_mod
+
+    canned: dict[str, dict[str, object]] = {
+        "a": {"name": "a", "resources": []},
+        "b": {"name": "b", "resources": []},
{"name": "b", "resources": []}, + } + + def fake_many( + _base_dir: Path, + *, + _dataset_ids: list[str] | None = None, + _index_file: Path | None = None, + as_dict: bool = True, + ) -> dict[str, dict[str, object]]: + # Called by build_many_from_yaml; return mapping id -> md + assert as_dict is True + return canned + + monkeypatch.setattr(create_mod, "assemble_many_metadata", fake_many) + + out_dir = tmp_path / "out" + create_mod.build_many_from_yaml(tmp_path / "meta", out_dir) + + a_path = out_dir / "a.json" + b_path = out_dir / "b.json" + assert a_path.exists() + assert b_path.exists() + + assert json.loads(a_path.read_text(encoding="utf-8")) == canned["a"] + assert json.loads(b_path.read_text(encoding="utf-8")) == canned["b"] From 1b2a38ff8ba6fb651504af41ce13d94ab330ab4c Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 19 Nov 2025 12:48:57 +0100 Subject: [PATCH 36/37] 126: fix test --- tests/test_create.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_create.py b/tests/test_create.py index be25b7a5..7fe2dd07 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -91,10 +91,12 @@ def test_build_from_yaml_writes_file_when_output_is_file( # Avoid needing real YAML on disk def fake_assemble( - _base_dir: Path, + base_dir: Path, dataset_id: str, - _index_file: Path | None = None, + index_file: Path | None = None, ) -> dict[str, object]: + # use args to avoid ARG001 + _ = base_dir, index_file assert dataset_id == "powerplants" return expected @@ -120,13 +122,14 @@ def test_build_many_from_yaml_writes_many_default_names( } def fake_many( - _base_dir: Path, + base_dir: Path, *, - _dataset_ids: list[str] | None = None, - _index_file: Path | None = None, + dataset_ids: list[str] | None = None, + index_file: Path | None = None, as_dict: bool = True, ) -> dict[str, dict[str, object]]: # Called by build_many_from_yaml; return mapping id -> md + _ = base_dir, dataset_ids, index_file # avoid ARG001 assert as_dict is True return canned From 32b3b539c7a2c615ed9c71f72da9c940ed811343 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 3 Dec 2025 10:22:29 +0100 Subject: [PATCH 37/37] #126: update changelog --- CHANGELOG.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e1c7c862..207bd1c5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,7 +4,8 @@ Changelog current -------------------- -* +* Add the creation module and create entry: They implement yaml based metadata creation, provide template feature to keep metadata creation DRY, provide functionality to setup the metadata structure & generate metadata from existing sources like datapackages and csv files, provide functionality to create the full datapackage.json and save it to file [(#127)](https://github.com/rl-institut/super-repo/pull/127) + 1.1.0 (2025-03-25) --------------------