diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e1c7c862..207bd1c5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -4,7 +4,8 @@ Changelog
 current
 --------------------
-*
+* Add the creation module and the create entry point: YAML-based metadata creation, a template feature to keep metadata definitions DRY, scaffolding of the metadata structure, generation of metadata from existing sources such as datapackages and CSV files, and assembly of the full datapackage.json with saving to file (`#127 <https://github.com/rl-institut/super-repo/pull/127>`_)
+
 1.1.0 (2025-03-25)
 --------------------
diff --git a/docs/create.md b/docs/create.md
new file mode 100644
index 00000000..20b816e7
--- /dev/null
+++ b/docs/create.md
@@ -0,0 +1,159 @@
+# OMI “Create” Entry Point
+
+This mini-guide explains how to use the **programmatic entry points** that turn your split YAML metadata (dataset + template + resources) into a single OEMetadata JSON document.
+
+> If you’re looking for how to author the YAML files and how templating works, see the main **Assembly Guide** in the `creation` module directory. This page just shows how to *call* the entry points.
+
+---
+
+## What it does
+
+The functions in `omi.create` wrap the full assembly pipeline:
+
+1. **Discover / load** your YAML parts (dataset, optional template, resources).
+2. **Apply the template** to each resource (deep merge; resource wins; keywords/topics/languages concatenate).
+3. **Generate & validate** the final OEMetadata JSON using the official schema (via `OEMetadataCreator`).
+4. **Write** the result to disk (`build_from_yaml`) or many results to a directory (`build_many_from_yaml`).
+
+---
+
+## API
+
+```python
+from omi.create import build_from_yaml, build_many_from_yaml
+```
+
+### `build_from_yaml(base_dir, dataset_id, output_file, *, index_file=None) -> None`
+
+Assemble **one** dataset and write it to `output_file` (JSON).
+
+* `base_dir` (`str | Path`): Root that contains:
+
+  * `datasets/<dataset_id>.dataset.yaml`
+  * `datasets/<dataset_id>.template.yaml` *(optional)*
+  * `resources/<dataset_id>/*.resource.yaml`
+* `dataset_id` (`str`): Logical dataset name (e.g. `"powerplants"`).
+* `output_file` (`str | Path`): Path to write the generated OEMetadata JSON.
+* `index_file` (`str | Path | None`): Optional explicit mapping file (`metadata_index.yaml`). If provided, paths are taken from the index instead of convention.
+
+### `build_many_from_yaml(base_dir, output_dir, *, dataset_ids=None, index_file=None) -> None`
+
+Assemble **multiple** datasets and write each as `<output_dir>/<dataset_id>.json`.
+
+* `base_dir` (`str | Path`): Same as above.
+* `output_dir` (`str | Path`): Destination directory for one JSON file per dataset.
+* `dataset_ids` (`list[str] | None`): Limit to specific datasets. If `None`, we:
+
+  * Use keys from `index_file` when provided, **else**
+  * Discover all `datasets/*.dataset.yaml` in `base_dir`.
+* `index_file` (`str | Path | None`): Optional `metadata_index.yaml`.
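+
+For reference, `index_file` points to an explicit mapping file whose format is documented in the Assembly Guide in the `creation` module. A minimal sketch (paths are relative to `base_dir`; names are illustrative):
+
+```yaml
+# metadata/metadata_index.yaml
+datasets:
+  powerplants:
+    dataset: datasets/powerplants.dataset.yaml
+    template: datasets/powerplants.template.yaml   # optional
+    resources:
+      - resources/powerplants/oemetadata_table.resource.yaml
+```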
+ +--- + +## Quick examples + +### One dataset (convention-based discovery) + +```python +from omi.create import build_from_yaml + +build_from_yaml( + base_dir="./metadata", + dataset_id="powerplants", + output_file="./out/powerplants.json", +) +``` + +Directory layout: + +```bash +metadata/ + datasets/ + powerplants.dataset.yaml + powerplants.template.yaml # optional + resources/ + powerplants/ + *.resource.yaml +``` + +### One dataset (explicit index) + +```python +from omi.create import build_from_yaml + +build_from_yaml( + base_dir="./metadata", + dataset_id="powerplants", + output_file="./out/powerplants.json", + index_file="./metadata/metadata_index.yaml", +) +``` + +### Many datasets (discover all) + +```python +from omi.create import build_many_from_yaml + +build_many_from_yaml( + base_dir="./metadata", + output_dir="./out", +) +# writes ./out/.json for each dataset found +``` + +### Many datasets (index + subset) + +```python +from omi.create import build_many_from_yaml + +build_many_from_yaml( + base_dir="./metadata", + output_dir="./out", + dataset_ids=["powerplants", "households"], + index_file="./metadata/metadata_index.yaml", +) +``` + +--- + +## Notes & behavior + +* Output JSON is written with `indent=2` and **`ensure_ascii=False`** to preserve characters like `©`. +* Validation happens via `OEMetadataCreator` using the official schema provided by `oemetadata` (imported through `omi.base.get_metadata_specification`). +* If a dataset YAML is missing, `FileNotFoundError` is raised. +* If schema validation fails, you’ll get an exception from `omi.validation`. Catch it where you call the entry point if you want to handle/report errors. + +--- + +## Using in 3rd Party code like data pipelines + +```python +from pathlib import Path +from omi.create import build_from_yaml + +def build_oemetadata_callable(**context): + base = Path("/project/metadata") + out = Path("/project/metadata/out/powerplants.json") + build_from_yaml(base, "powerplants", out) + # optionally push to airflow XCom, publish, upload, etc. +``` + +--- + +## Testing tips + +* For **unit tests** of `omi.create`, patch `omi.create.assemble_metadata_dict` / `assemble_many_metadata` and verify files are written. +* For **integration tests**, put real example YAMLs under `tests/test_data/create/metadata/` and call `build_from_yaml` end-to-end. + +--- + +## Troubleshooting + +* **“Dataset YAML not found”** + Check `base_dir/datasets/.dataset.yaml` exists, or supply the correct `index_file`. + +* **Unicode characters appear escaped (`\u00a9`)** + Ensure you’re not re-writing the JSON elsewhere with `ensure_ascii=True`. + +* **Template not applied** + Confirm your template file name matches `.template.yaml` (or is correctly referenced from the index), and the keys you expect to inherit aren’t already set in the resource (resource values win). diff --git a/pyproject.toml b/pyproject.toml index cd3fadfd..585b49fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,3 +78,6 @@ unfixable = ["UP007", "I001"] "*/__init__.py" = [ "D104", # Missing docstring in public package ] + +[omi.scripts] +omi = "omi.cli:main" diff --git a/src/omi/cli.py b/src/omi/cli.py index 6b4d0aac..fbfb722e 100644 --- a/src/omi/cli.py +++ b/src/omi/cli.py @@ -1,29 +1,118 @@ """ -Module that contains the command line app. +Command line interface for OMI. -Why does this file exist, and why not put this in __main__? 
+This CLI only supports the split-files layout: +- datasets/.dataset.yaml +- datasets/.template.yaml (optional) +- resources//*.resource.yaml +(optionally wired via metadata_index.yaml) - You might be tempted to import things from __main__ later, but that will cause - problems: the code will get executed twice: +Usage: +omi assemble \ + --base-dir ./metadata \ + --dataset-id powerplants \ + --output-file ./out/powerplants.json \ + --index-file ./metadata/metadata_index.yaml # optional - - When you run `python -m omi` python will execute - ``__main__.py`` as a script. That means there won't be any - ``omi.__main__`` in ``sys.modules``. - - When you import __main__ it will get executed again (as a module) because - there's no ``omi.__main__`` in ``sys.modules``. - - Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration """ +from __future__ import annotations + +from pathlib import Path +from typing import Optional + import click +from omi.creation.creator import OEMetadataCreator +from omi.creation.init import init_dataset, init_resources_from_files +from omi.creation.utils import apply_template_to_resources, load_parts + @click.group() def grp() -> None: - """Init click group.""" + """OMI CLI.""" + + +@grp.command("assemble") +@click.option( + "--base-dir", + required=True, + type=click.Path(file_okay=False, path_type=Path), + help="Root directory containing 'datasets/' and 'resources/'.", +) +@click.option("--dataset-id", required=True, help="Logical dataset id (e.g. 'powerplants').") +@click.option( + "--output-file", + required=True, + type=click.Path(dir_okay=False, path_type=Path), + help="Path to write the generated OEMetadata JSON.", +) +@click.option( + "--index-file", + default=None, + type=click.Path(dir_okay=False, path_type=Path), + help="Optional metadata index YAML for explicit mapping.", +) +def assemble_cmd(base_dir: Path, dataset_id: str, output_file: Path, index_file: Optional[Path]) -> None: + """Assemble OEMetadata from split YAML files and write JSON to OUTPUT_FILE.""" + # Load pieces + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) + merged_resources = apply_template_to_resources(resources, template) + + # Build & save with the correct spec version + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) + + +@click.group() +def init() -> None: + """Scaffold OEMetadata split-files layout.""" + + +@init.command("dataset") +@click.argument("base_dir", type=click.Path(file_okay=False, path_type=Path)) +@click.argument("dataset_id") +@click.option("--oem-version", default="OEMetadata-2.0", show_default=True) +@click.option("--resource", "resources", multiple=True, help="Initial resource names (repeatable).") +@click.option("--overwrite", is_flag=True, help="Overwrite existing files.") +def init_dataset_cmd( + base_dir: Path, + dataset_id: str, + oem_version: str, + resources: tuple[str, ...], + *, + overwrite: bool, +) -> None: + """Initialize a split-files OEMetadata dataset layout under BASE_DIR.""" + res = init_dataset(base_dir, dataset_id, oem_version=oem_version, resources=resources, overwrite=overwrite) + click.echo(f"dataset: {res.dataset_yaml}") + click.echo(f"template: {res.template_yaml}") + for p in res.resource_yamls: + click.echo(f"resource: {p}") + + +@init.command("resources") +@click.argument("base_dir", type=click.Path(file_okay=False, path_type=Path)) +@click.argument("dataset_id") 
+@click.argument("files", nargs=-1, type=click.Path(exists=True, dir_okay=False, path_type=Path)) +@click.option("--oem-version", default="OEMetadata-2.0", show_default=True) +@click.option("--overwrite", is_flag=True, help="Overwrite existing files.") +def init_resources_cmd( + base_dir: Path, + dataset_id: str, + files: tuple[Path, ...], + oem_version: str, + *, + overwrite: bool, +) -> None: + """Create resource YAML files for DATASET_ID from the given FILES.""" + outs = init_resources_from_files(base_dir, dataset_id, files, oem_version=oem_version, overwrite=overwrite) + for p in outs: + click.echo(p) -cli = click.CommandCollection(sources=[grp]) +# Keep CommandCollection for backwards compatibility with your entry point +cli = click.CommandCollection(sources=[grp, init]) def main() -> None: diff --git a/src/omi/create.py b/src/omi/create.py new file mode 100644 index 00000000..2f8faaaf --- /dev/null +++ b/src/omi/create.py @@ -0,0 +1,75 @@ +"""Entry point for OEMetadata creation (split-files layout only).""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Optional, Union + +from omi.creation.assembler import assemble_many_metadata, assemble_metadata_dict + + +def build_from_yaml( + base_dir: Union[str, Path], + dataset_id: str, + output_file: Union[str, Path], + *, + index_file: Optional[Union[str, Path]] = None, +) -> None: + """ + Assemble one dataset and write the resulting OEMetadata JSON to a file. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing the split-files dataset structure. + dataset_id : str + The dataset ID to assemble. + output_file : Union[str, Path] + Path to write the resulting OEMetadata JSON file. + index_file : Optional[Union[str, Path]], optional + Optional path to an index file for resolving cross-dataset references, + by default None. + """ + md = assemble_metadata_dict(base_dir, dataset_id, index_file=index_file) + Path(output_file).parent.mkdir(parents=True, exist_ok=True) + Path(output_file).write_text(json.dumps(md, indent=2, ensure_ascii=False), encoding="utf-8") + + +def build_many_from_yaml( + base_dir: Union[str, Path], + output_dir: Union[str, Path], + *, + dataset_ids: Optional[list[str]] = None, + index_file: Optional[Union[str, Path]] = None, +) -> None: + """ + Assemble multiple datasets and write each as .json to output_dir. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing the split-files dataset structure. + output_dir : Union[str, Path] + Directory to write the resulting OEMetadata JSON files. + dataset_ids : Optional[list[str]], optional + Optional list of dataset IDs to assemble. If None, all datasets found + in base_dir will be assembled, by default None. + index_file : Optional[Union[str, Path]], optional + Optional path to an index file for resolving cross-dataset references, + by default None. 
+ """ + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + results = assemble_many_metadata( + base_dir, + dataset_ids=dataset_ids, + index_file=index_file, + as_dict=True, # keep it as a mapping id -> metadata + ) + for ds_id, md in results.items(): + (out_dir / f"{ds_id}.json").write_text( + json.dumps(md, indent=2, ensure_ascii=False), + encoding="utf-8", + ) diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md new file mode 100644 index 00000000..7fedbfef --- /dev/null +++ b/src/omi/creation/README.md @@ -0,0 +1,496 @@ +# OMI OEMetadata Assembly Guide + +This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, multi-dataset usage, initialization scaffolding, testing, and common pitfalls. + +--- + +## Table of Contents + +1. [Overview](#overview) +2. [Concepts & Data Flow](#concepts--data-flow) +3. [Repository Layout](#repository-layout) +4. [YAML File Formats](#yaml-file-formats) + + * [Dataset YAML](#dataset-yaml) + * [Template YAML (optional)](#template-yaml-optional) + * [Resource YAML](#resource-yaml) + * [Index YAML (optional)](#index-yaml-optional) +5. [Templating Rules](#templating-rules) +6. [Discovery vs. Index Mapping](#discovery-vs-index-mapping) +7. [Programmatic Usage](#programmatic-usage) + + * [Minimal Usage](#minimal-usage) + * [With Index Mapping](#with-index-mapping) + * [Manual Loading (No Discovery)](#manual-loading-no-discovery) +8. [Multi-dataset Assembly](#multi-dataset-assembly) +9. [Spec-Driven Output Ordering](#spec-driven-output-ordering) +10. [Project Initialization (Scaffolding)](#project-initialization-scaffolding) +11. [Airflow Integration Example](#airflow-integration-example) +12. [Testing](#testing) +13. [Validation & Error Handling](#validation--error-handling) +14. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) +15. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) +16. [Design Notes & Extensibility](#design-notes--extensibility) +17. [FAQ](#faq) + +--- + +## Overview + +* **Goal:** Author OEMetadata as **YAML** (dataset + resources), keep it **DRY** via **templates**, assemble into a single **JSON** metadata document, and **validate** it with the official schema. +* **Core ideas:** + + * Maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. + * OMI assembles + validates metadata into final OEMetadata JSON. + * Works in pipelines (e.g., Airflow) and plain Python. + +--- + +## Concepts & Data Flow + +1. **Authoring:** + + * `datasets/.dataset.yaml` + * `datasets/.template.yaml` *(optional)* + * `resources//*.resource.yaml` +2. **Assembly:** + + * Load dataset, template, and resource YAML files. + * Apply template → deep merge; resource overrides. + * Create OEMetadata JSON via `OEMetadataCreator` and validate. +3. **Storage:** + + * Assembly returns a Python `dict`. Store wherever you like (file/DB/API). + +--- + +## Repository Layout + +```bash +metadata/ + datasets/ + .dataset.yaml + .template.yaml # optional + resources/ + / + .resource.yaml + .resource.yaml + metadata_index.yaml # optional explicit mapping +``` + +Use the **convention** above or an **index** file for explicit mapping. 
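+
+Under the convention, discovery is handled by helpers in `omi.creation.utils`; the sketch below shows what `discover_paths` resolves for a given dataset id (the `./metadata` path is illustrative):
+
+```python
+from omi.creation.utils import discover_paths
+
+dataset, template, resources = discover_paths("./metadata", "powerplants")
+# dataset   -> Path to datasets/powerplants.dataset.yaml, or None if missing
+# template  -> Path to datasets/powerplants.template.yaml, or None if missing
+# resources -> sorted list of resources/powerplants/*.resource.yaml paths
+```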
+ +--- + +## YAML File Formats + +### Dataset YAML + +```yaml +# metadata/datasets/powerplants.dataset.yaml +version: "OEMetadata-2.0.4" # optional (default: OEMetadata-2.0.4) +dataset: + name: oep_oemetadata + title: OEP OEMetadata + description: A dataset for the OEMetadata examples. + "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ +``` + +> Backwards compatibility: dataset fields can also be at top-level; OMI treats that as `dataset: {...}`. + +--- + +### Template YAML (optional) + +Applied to **every** resource (unless overridden). Keeps YAML DRY. + +```yaml +# metadata/datasets/powerplants.template.yaml +licenses: + - name: ODbL-1.0 + title: Open Data Commons Open Database License 1.0 + path: https://opendatacommons.org/licenses/odbl/1-0/index.html + instruction: > + You are free to share and change, but you must attribute, and + share derivations under the same license. See https://tldrlegal.com/license/odc-open-database-license-(odbl) + for further information. + attribution: © Reiner Lemoine Institut + copyrightStatement: https://github.com/OpenEnergyPlatform/oemetadata/blob/production/LICENSE.txt + +context: + title: NFDI4Energy + homepage: https://nfdi4energy.uol.de/ + documentation: https://nfdi4energy.uol.de/sites/about_us/ + sourceCode: https://github.com/NFDI4Energy + publisher: Open Energy Platform (OEP) + publisherLogo: https://github.com/OpenEnergyPlatform/organisation/blob/production/logo/OpenEnergyFamily_Logo_OpenEnergyPlatform.svg + contact: contact@example.com + fundingAgency: " Deutsche Forschungsgemeinschaft (DFG)" + fundingAgencyLogo: https://upload.wikimedia.org/wikipedia/commons/8/86/DFG-logo-blau.svg + grantNo: "501865131" + +topics: [model_draft] +languages: [en-GB, de-DE] +keywords: [example, ODbL-1.0, NFDI4Energy] +``` + +--- + +### Resource YAML + +```yaml +# metadata/resources/powerplants/oemetadata_table.resource.yaml +name: oemetadata_table +type: table +title: OEMetadata Table Template +description: Example table used to illustrate the OEMetadata structure and features. + +# Resource-specific attributes +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table +scheme: http +format: CSV +encoding: UTF-8 + +dialect: + decimalSeparator: "." + csv: + delimiter: ";" + +schema: + fields: + - name: id + type: integer + description: Unique identifier + nullable: false + # ... more fields ... + primaryKey: [id] + foreignKeys: + - fields: [id, version] + reference: + resource: model_draft.oep_oemetadata_table_example_version + fields: [id, version] + +"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv + +sources: + - title: IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report + authors: [Hoesung Lee, José Romero, The Core Writing Team] + publicationYear: "2023" + path: https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf + sourceLicenses: + - name: CC-BY-4.0 + title: Creative Commons Attribution 4.0 International + path: https://creativecommons.org/licenses/by/4.0/legalcode + instruction: > + You are free to share and change, but you must attribute. + See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. 
+ attribution: © Intergovernmental Panel on Climate Change 2023 + copyrightStatement: https://www.ipcc.ch/copyright/ +``` + +Second resource: + +```yaml +# metadata/resources/powerplants/data_2.resource.yaml +name: data_2 +type: table +title: My Second Resource +path: reGon/metadata/data_2.csv +scheme: file +format: csv +mediatype: text/csv +encoding: utf-8 +schema: + fields: + - name: id + type: integer + nullable: true + - name: i + type: integer + nullable: true + - name: o + type: string + nullable: true + primaryKey: [id] +``` + +--- + +### Index YAML (optional) + +Explicit mappings instead of convention: + +```yaml +# metadata/metadata_index.yaml +datasets: + powerplants: + dataset: datasets/powerplants.dataset.yaml + template: datasets/powerplants.template.yaml + resources: + - resources/powerplants/oemetadata_table.resource.yaml + - resources/powerplants/data_2.resource.yaml +``` + +--- + +## Templating Rules + +* **Deep merge** for dictionaries (e.g., `context`): + Resource **overrides**; missing nested keys are **filled** from template. +* **Lists**: + **Concatenate** for `keywords`, `topics`, `languages` (resource first, then template-only items). + For other lists (e.g., `licenses`, `contributors`): **resource wins** (no concat). + *(Modify via `DEFAULT_CONCAT_LIST_KEYS` if you want different behavior.)* +* **Scalars**: resource value **wins**. + +--- + +## Discovery vs. Index Mapping + +* **Discovery (convention):** + `datasets/.dataset.yaml`, `datasets/.template.yaml`, `resources//*.resource.yaml` + → No index needed. +* **Index (explicit):** + Provide `metadata_index.yaml` with explicit paths relative to your base directory. + +--- + +## Programmatic Usage + +### Minimal Usage + +```python +from omi.creation.assembly import assemble_metadata_dict + +metadata = assemble_metadata_dict(base_dir="./metadata", dataset_id="powerplants") +``` + +### With Index Mapping + +```python +metadata = assemble_metadata_dict( + base_dir="./metadata", + dataset_id="powerplants", + index_file="./metadata/metadata_index.yaml", +) +``` + +### Manual Loading (No Discovery) + +```python +from pathlib import Path +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import load_yaml, apply_template_to_resources + +dataset = load_yaml(Path("./metadata/datasets/powerplants.dataset.yaml")).get("dataset", {}) +template = load_yaml(Path("./metadata/datasets/powerplants.template.yaml")) +resources = [ + load_yaml(Path("./metadata/resources/powerplants/oemetadata_table.resource.yaml")), + load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")), +] +resources = apply_template_to_resources(resources, template) + +creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") +metadata = creator.generate_metadata(dataset, resources) +``` + +> `OEMetadataCreator` injects `@context` and `metaMetadata` from the spec and validates the result. + +--- + +## Multi-dataset Assembly + +Assemble **N datasets** in one call: + +```python +from omi.creation.assembly import assemble_many_metadata + +# Discover by convention (datasets/*.dataset.yaml) +all_metadata = assemble_many_metadata(base_dir="./metadata") + +# From explicit index +all_metadata = assemble_many_metadata( + base_dir="./metadata", index_file="./metadata/metadata_index.yaml" +) + +# Subset +some = assemble_many_metadata(base_dir="./metadata", dataset_ids=["powerplants", "households"]) +``` + +Result is a dict `{dataset_id: metadata}` by default. 
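+
+To persist each assembled document, iterate over the mapping yourself; this mirrors what `omi.create.build_many_from_yaml` does (the `./out` path is illustrative):
+
+```python
+import json
+from pathlib import Path
+
+out_dir = Path("./out")
+out_dir.mkdir(parents=True, exist_ok=True)
+for ds_id, md in all_metadata.items():
+    # one JSON file per dataset, unicode preserved
+    (out_dir / f"{ds_id}.json").write_text(
+        json.dumps(md, indent=2, ensure_ascii=False),
+        encoding="utf-8",
+    )
+```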
+ +--- + +## Spec-Driven Output Ordering + +For human-friendly JSON key order without hard-coded lists, order by the **official example** (fallback: schema `properties`): + +```python +from omi.creation.assembly import assemble_metadata_dict +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import order_with_spec + +creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") +metadata = assemble_metadata_dict("./metadata", "powerplants") + +ordered = order_with_spec(metadata, creator.oem_spec) # uses spec.example and schema +``` + +Write with preserved unicode: + +```python +import json, pathlib +out = pathlib.Path("./out/powerplants.json") +out.parent.mkdir(parents=True, exist_ok=True) +out.write_text(json.dumps(ordered, indent=2, ensure_ascii=False), encoding="utf-8") +``` + +--- + +## Project Initialization (Scaffolding) + +Create a metadata skeleton **from the spec** (no inline templates): + +```python +from omi.creation.scaffold import init_skeleton_from_spec + +paths = init_skeleton_from_spec( + base_dir="./metadata", + dataset_id="powerplants", + oem_version="OEMetadata-2.0.4", + resource_name="oemetadata_table", + with_index=True, # creates metadata_index.yaml + force=False, # do not overwrite +) +``` + +This imports the spec via: + +```python +from omi.base import get_metadata_specification +``` + +…and derives: + +* `datasets/.dataset.yaml` (with version from spec) +* `datasets/.template.yaml` (from `oem_spec.template` or pruned example resource) +* `resources//sample.resource.yaml` (sanitized from example) +* optional `metadata_index.yaml` + +You can expose a CLI command `omi init` that wraps `init_skeleton_from_spec`. + +--- + +## Airflow Integration Example + +```python +from omi.creation.assembly import assemble_metadata_dict + +def build_oemetadata_for_powerplants(**context): + md = assemble_metadata_dict( + base_dir="/opt/airflow/dags/metadata", + dataset_id="powerplants", + index_file="/opt/airflow/dags/metadata/metadata_index.yaml", + ) + context["ti"].xcom_push(key="oemetadata", value=md) +``` + +--- + +## Testing + +* **Assembly test** (uses a fake creator): see `tests/test_assembly.py` example in this doc. +* **Utils tests** (I/O, discovery, merging): see `tests/test_creation_utils.py`. + It covers: + + * `load_parts` (template application) + * `_merge_lists`, `deep_apply_template_to_resource`, `apply_template_to_resources` + * `load_yaml` + * `discover_paths`, `resolve_from_index`, `load_parts` + * `discover_dataset_ids`, `discover_dataset_ids_from_index` + +Run: + +```bash +pytest -q +``` + +--- + +## Validation & Error Handling + +`OEMetadataCreator.generate_metadata()` validates with the official schema: + +```python +from omi.validation import ValidationError + +try: + metadata = assemble_metadata_dict("./metadata", "powerplants") +except ValidationError as e: + print("Validation failed:", e) +``` + +**Common causes**: + +* Missing required field keys (e.g., a schema field without `"nullable"`). +* Wrong types (e.g., non-URI where `format: uri` is required). +* Invalid list shapes (e.g., `primaryKey`, `foreignKeys`). + +--- + +## Auto-Generation From Directory (Optional Onboarding) + +You can bootstrap YAMLs from a directory or zip: + +* infer resources from file names/extensions +* for CSV, infer a table schema +* emit dataset YAML + one resource YAML per file + +Use filters to skip temp/log/backup files (see next section). 
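+
+The `omi.creation.generator` module provides a helper for this: it scans a directory (or zip), infers one resource entry per data file, and writes a single combined YAML. A sketch (directory, output path, and dataset metadata are illustrative):
+
+```python
+from omi.creation.generator import FileFilterOptions, generate_oemetadata_yaml_from_datapackage
+
+generate_oemetadata_yaml_from_datapackage(
+    directory="./raw_data",                  # directory or zip file with data files
+    output_yaml="./generated_metadata.yaml",
+    dataset_metadata={
+        "name": "example_dataset",
+        "title": "Example Dataset",
+        "description": "Autogenerated from directory content.",
+        "@id": "https://example.org/dataset/example_dataset",
+    },
+    filter_opts=FileFilterOptions(exclude_hidden=True),
+)
+```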
+ +--- + +## Filtering Irrelevant Files (Optional) + +When scanning directories, exclude noise such as backup and editor artifacts: + +```python +exclude_extensions = {".log", ".tmp", ".bak", ".DS_Store", ".md"} +exclude_patterns = {"*_backup.*", "*~", "*.old", "*.ignore"} +exclude_hidden = True +``` + +--- + +## Design Notes & Extensibility + +* **Separation of concerns**: + + * `utils`: YAML loading, discovery, deep merge, ordering by spec. + * `assembly`: Orchestrates load → merge → create → (optionally) order. + * `creator`: Pulls spec via `get_metadata_specification`, injects `@context` and `metaMetadata`, validates. + * `scaffold`: Initializes a project from the **spec/example** (no inline strings). +* **Storage-agnostic**: assembly returns a dict; saving is up to you. +* **Configurable merging**: tweak `DEFAULT_CONCAT_LIST_KEYS` to change list concat behavior. + +--- + +## FAQ + +**Q: Can resource YAML override template-provided `licenses`?** +A: Yes. By default, resource lists override template lists except for `keywords`, `topics`, `languages` (which concatenate). Add `"licenses"` to `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. + +**Q: Where do `@context` and `metaMetadata` come from?** +A: `OEMetadataCreator` loads the spec (`get_metadata_specification(oem_version)`) and injects both before validation. + +**Q: Why does JSON show `\u00a9` instead of `©`?** +A: Use `ensure_ascii=False` in `json.dump` to preserve unicode characters. + +**Q: I got a validation error: `'nullable' is a required property`.** +A: Ensure each `schema.fields[]` has **`name`**, **`type`**, **`nullable`**. If you auto-generate, set `nullable: false` unless you detect nulls. + +**Q: Can I reorder output keys to match the official example?** +A: Yes. Use `order_with_spec(metadata, creator.oem_spec)` for spec-driven ordering (no hard-coded key lists). + +**Q: Can I manage multiple datasets in one metadata module?** +A: Yes. Use `assemble_many_metadata(...)` to discover/assemble **N datasets** at once (by convention or index). diff --git a/src/omi/creation/__init__.py b/src/omi/creation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/omi/creation/assembler.py b/src/omi/creation/assembler.py new file mode 100644 index 00000000..edaa4318 --- /dev/null +++ b/src/omi/creation/assembler.py @@ -0,0 +1,96 @@ +"""Assemble OEMetadata dictionary from parts: dataset, template, and resources.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional, Union + +from .creator import OEMetadataCreator +from .utils import ( + apply_template_to_resources, + discover_dataset_ids, + discover_dataset_ids_from_index, + load_parts, +) + +if TYPE_CHECKING: + from collections.abc import Iterable + + +def assemble_metadata_dict( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]] = None, +) -> dict[str, Any]: + """ + Load dataset/template/resources; apply template; validate via creator; return dict. + + Parameters + ---------- + base_dir: Union[str, Path] + Base directory containing datasets, templates, and resources. + dataset_id: str + Identifier for the dataset to load. + index_file: Optional[Union[str, Path]] + Optional path to an index YAML file for resolving dataset parts. + + Returns + ------- + Dict[str, Any] + The assembled and validated OEMetadata dictionary. 
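+
+    Examples
+    --------
+    Illustrative usage (paths are placeholders)::
+
+        metadata = assemble_metadata_dict("./metadata", "powerplants")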
+ """ + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file) + merged_resources = apply_template_to_resources(resources, template) + creator = OEMetadataCreator(oem_version=version) + return creator.generate_metadata(dataset, merged_resources) + + +def assemble_many_metadata( + base_dir: Union[str, Path], + dataset_ids: Optional[Iterable[str]] = None, + index_file: Optional[Union[str, Path]] = None, + *, + as_dict: bool = True, +) -> Union[dict[str, dict], list[tuple[str, dict]]]: + """ + Assemble OEMetadata for multiple datasets in one call. + + - If dataset_ids is None: + * when index_file is provided -> use keys from index + * otherwise -> discover by 'datasets/*.dataset.yaml' + - Returns a mapping {dataset_id: metadata} if as_dict=True, + else a list of (dataset_id, metadata) pairs in sorted id order. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing datasets, templates, and resources. + dataset_ids : Optional[Iterable[str]], optional + Optional iterable of dataset IDs to assemble. If None, all datasets found + in base_dir will be assembled, by default None. + index_file : Optional[Union[str, Path]], optional + Optional path to an index YAML file for resolving dataset parts. + as_dict : bool, optional + Whether to return results as a dict mapping dataset_id to metadata. If False, + returns a list of (dataset_id, metadata) tuples, by default True. + + Returns + ------- + Union[dict[str, dict], list[tuple[str, dict]]] + Assembled OEMetadata for each dataset. + """ + base = Path(base_dir) + + if dataset_ids is None: + ids = discover_dataset_ids_from_index(index_file) if index_file else discover_dataset_ids(base) + else: + ids = list(dataset_ids) + + results_pairs: list[tuple[str, dict]] = [] + for ds_id in ids: + md = assemble_metadata_dict(base, ds_id, index_file=index_file) + results_pairs.append((ds_id, md)) + + if as_dict: + return dict(results_pairs) + return results_pairs diff --git a/src/omi/creation/creator.py b/src/omi/creation/creator.py new file mode 100644 index 00000000..9d93b2b3 --- /dev/null +++ b/src/omi/creation/creator.py @@ -0,0 +1,68 @@ +"""Create OEMetadata JSON datapackage structure and return or store it.""" + +from __future__ import annotations + +import json +from pathlib import Path + +from omi.base import get_metadata_specification +from omi.validation import validate_metadata + + +class OEMetadataCreator: + """ + Create OEMetadata JSON datapackages. + + Output is based on dataset and resource descriptions and validated against + the official schema. + """ + + def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: + """Initialize the creator with a specific OEMetadata version.""" + self.oem_spec = get_metadata_specification(oem_version) + + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + """Generate OEMetadata JSON datapackage from dataset and resources.""" + metadata = { + "@context": self.oem_spec.schema["properties"]["@context"]["examples"][0], + **dataset, + "resources": resources, + "metaMetadata": self.oem_spec.example["metaMetadata"], + } + + validate_metadata(metadata, check_license=False) + return metadata + + def save( + self, + dataset: dict, + resources: list[dict], + output_file: Path | str, + **dump_kwargs, + ) -> None: + """ + Generate OEMetadata and save it to a JSON file. + + Parameters + ---------- + dataset : dict + Dataset metadata. + resources : list[dict] + List of resource metadata entries. 
+ output_file : Path | str + Path to the output JSON file. + **dump_kwargs : + Extra kwargs forwarded to `json.dump`. Defaults applied here: + - indent: 2 + - ensure_ascii: False + """ + metadata = self.generate_metadata(dataset, resources) + + # Defaults, can be overridden by caller via **dump_kwargs + indent = dump_kwargs.pop("indent", 2) + ensure_ascii = dump_kwargs.pop("ensure_ascii", False) + + with Path(output_file).open("w", encoding="utf-8") as f: + json.dump(metadata, f, indent=indent, ensure_ascii=ensure_ascii, **dump_kwargs) + + print(f"OEMetadata written to {output_file}") # noqa: T201 diff --git a/src/omi/creation/generator.py b/src/omi/creation/generator.py new file mode 100644 index 00000000..a141e9ad --- /dev/null +++ b/src/omi/creation/generator.py @@ -0,0 +1,205 @@ +""" +Generate an OEMetadata configuration file. + +Module for generating metadata files from resources like directories or zip files. +This used to get started from scratch - init metadata. +""" + +import fnmatch +import zipfile +from dataclasses import dataclass +from pathlib import Path +from typing import Union + +import yaml + +from omi.inspection import infer_metadata + + +@dataclass +class FileFilterOptions: + """ + Options for filtering files when reading directories or zip files. + + Attributes + ---------- + exclude_extensions: list[str] | None + List of file extensions to exclude (e.g., ['.log', '.tmp']). + exclude_patterns: list[str] | None + List of filename patterns to exclude (e.g., ['*_backup.*', '*.bak']). + exclude_hidden: bool + Whether to exclude hidden files/directories (default True). + """ + + exclude_extensions: list[str] | None = None + exclude_patterns: list[str] | None = None + exclude_hidden: bool = True + + +def read_directory( + directory: Union[str, Path], + filter_opts: FileFilterOptions, +) -> list[Path]: + """ + Recursively read files from the directory, applying optional filters. + + Parameters + ---------- + directory: Union[str, Path] + The directory to read files from. Can be a string or a Path object. + filter_opts: FileFilterOptions + Filtering options including extensions, patterns, and hidden files. + + Returns + ------- + list[Path] + A list of Path objects representing the files that match the criteria. + """ + directory = Path(directory) + + exclude_extensions = set(filter_opts.exclude_extensions or [".log", ".tmp", ".bak", ".DS_Store", ".md"]) + exclude_patterns = filter_opts.exclude_patterns or ["*_backup.*", "*~", "*.old", "*.ignore"] + + valid_files = [] + for file_path in directory.rglob("*"): + if not file_path.is_file(): + continue + + if filter_opts.exclude_hidden and any(part.startswith(".") for part in file_path.parts): + continue + + if file_path.suffix in exclude_extensions: + continue + + if any(fnmatch.fnmatch(file_path.name, pattern) for pattern in exclude_patterns): + continue + + valid_files.append(file_path) + + return valid_files + + +def read_zipfile( + zip_path: Union[str, Path], + extract_to: Union[str, Path], + filter_opts: FileFilterOptions, +) -> list[Path]: + """Extract a zip file and return list of extracted files.""" + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(extract_to) + return read_directory(extract_to, filter_opts) + + +def infer_file_metadata(file_path: Path) -> dict: + """ + Infer basic resource metadata from file name and type. + + Parameters + ---------- + file_path: Path + Path to the file for which metadata should be inferred. 
+ + Returns + ------- + dict + A dictionary containing inferred metadata for the resource. + """ + file_name = file_path.stem + file_format = file_path.suffix.replace(".", "").upper() + + resource = { + "name": file_name.lower().replace(" ", "_"), + "title": file_name.replace("_", " ").title(), + "path": file_path.as_posix(), + "description": f"Auto-generated description for {file_name}", + "type": "table" if file_format in ["CSV", "XLSX", "JSON"] else "file", + "format": file_format, + "encoding": "UTF-8", + } + + if file_format == "CSV": + with file_path.open("r") as f: + fields = infer_metadata(f, "OEP")["resources"][0]["schema"] + + resource["schema"] = fields + resource["dialect"] = {"delimiter": fields.get("delimiter", ","), "decimalSeparator": "."} + + return resource + + +def generate_oemetadata_yaml_from_datapackage( + directory: Union[str, Path], + output_yaml: Union[str, Path], + dataset_metadata: dict, + filter_opts: FileFilterOptions, +) -> None: + """ + Generate an OEMetadata YAML configuration file based on files in a directory or zipped directory. + + Parameters + ---------- + directory: Union[str, Path] + Path to the directory or zip file containing data files. + output_yaml: Union[str, Path] + Path to the output YAML file. + dataset_metadata: dict + Metadata for the dataset, including name, title, description, and ID. + filter_opts: FileFilterOptions + Filtering options for excluding files by extension, pattern, or hidden state. + """ + temp_dir = None + directory = Path(directory) + if zipfile.is_zipfile(directory): + temp_dir = Path("temp_extracted") + files = read_zipfile(directory, temp_dir, filter_opts) + files = read_directory(temp_dir, filter_opts) # Apply filtering after extraction + else: + files = read_directory(directory, filter_opts) + + resources = [] + for file in files: + resource_meta = infer_file_metadata(file) + + resources.append(resource_meta) + + yaml_structure = { + "dataset": dataset_metadata, + "template": { # TODO @jh-RLI: This section must be defined by user # noqa: TD003 + "context": { + "title": "Your Project Title", + "homepage": "https://yourhomepage.org", + "contact": "contact@yourproject.org", + }, + }, + "resources": resources, + } + + with open(output_yaml, "w", encoding="utf-8") as yaml_file: # noqa: PTH123 + yaml.dump(yaml_structure, yaml_file, sort_keys=False, allow_unicode=True) + + if temp_dir: + import shutil + + shutil.rmtree(temp_dir) + + print(f"YAML configuration generated at: {output_yaml}") # noqa: T201 + + +# Example usage +if __name__ == "__main__": + dataset_metadata_example = { + "name": "example_dataset", + "title": "Example Dataset", + "description": "This dataset was autogenerated from directory content.", + "@id": "https://example.org/dataset/example_dataset", + } + + generate_oemetadata_yaml_from_datapackage( + directory="/home/jh/projekte/SLE/postprocessed/", + output_yaml="generated_metadata.yaml", + dataset_metadata=dataset_metadata_example, + filter_opts=FileFilterOptions( + exclude_patterns=[".snake*"], + exclude_hidden=True, + ), + ) diff --git a/src/omi/creation/init.py b/src/omi/creation/init.py new file mode 100644 index 00000000..8be170ff --- /dev/null +++ b/src/omi/creation/init.py @@ -0,0 +1,229 @@ +""" +Initialization helpers for OEMetadata split-files layout. + +Provides functions to scaffold dataset and resource YAML files and to +infer resource information from existing data files. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import yaml + +from omi.base import get_metadata_specification +from omi.inspection import InspectionError, infer_metadata + +if TYPE_CHECKING: + from collections.abc import Iterable + from pathlib import Path + + +@dataclass +class InitResult: + """Paths to created or reused YAML files for a single dataset.""" + + dataset_yaml: Path + template_yaml: Path + resource_yamls: list[Path] + + +# ----------------------------- +# helpers +# ----------------------------- + + +def _blankify(obj: object) -> object: + """ + Return a copy of `obj` with the same structure but 'empty' leaf values. + + Rules: + - dict -> recursively blankify values + - list -> [] if scalar list; if list of dicts and non-empty, keep one blankified element; else [] + - str -> "" + - bool -> False + - int/float -> "" (prefer empty so users must choose proper types) + - None -> None + - everything else -> "" + """ + if isinstance(obj, dict): + blank: object = {k: _blankify(v) for k, v in obj.items()} + elif isinstance(obj, list): + if not obj: + blank = [] + else: + first = obj[0] + # show one skeleton item so users see the structure for list-of-dicts; + # scalar lists -> show empty by default + blank = [_blankify(first)] if isinstance(first, dict) else [] + elif isinstance(obj, str): + blank = "" + elif isinstance(obj, bool): + blank = False + elif obj is None: + blank = None + else: + # numbers / other scalars -> empty + blank = "" + return blank + + +def _load_spec_template(oem_version: str) -> dict: + """Return the raw OEMetadata template document for the given version.""" + spec = get_metadata_specification(oem_version) + return spec.template or {} + + +def _dataset_stub_from_spec_template(oem_version: str, dataset_id: str) -> dict: + """ + Build datasets/.dataset.yaml from top-level template (not from resources). + + Remove @context/resources/metaMetadata and blankify the rest. 
+ """ + t = _load_spec_template(oem_version).copy() + t.pop("@context", None) + t.pop("resources", None) # <-- filter out resource-level keys + t.pop("metaMetadata", None) + + blank = _blankify(t) + blank.setdefault("name", dataset_id) + blank.setdefault("title", "") + blank.setdefault("description", "") + blank.setdefault("@id", "") + + return {"version": oem_version, "dataset": blank} + + +def _resource_template_from_spec(oem_version: str) -> dict: + """Build datasets/.template.yaml from the *first* resource template only.""" + tmpl = _load_spec_template(oem_version) + resources = tmpl.get("resources") or [] + base = resources[0] if resources else {} + return _blankify(base) + + +def _resource_stub_from_spec(oem_version: str, resource_name: str) -> dict: + """Build resources//.resource.yaml from the resource template.""" + res = _resource_template_from_spec(oem_version) + res["name"] = resource_name + return res + + +def _dump_yaml(path: Path, data: dict, *, overwrite: bool) -> Path: + """Write `data` as YAML to `path`, respecting the `overwrite` flag.""" + path.parent.mkdir(parents=True, exist_ok=True) + if path.exists() and not overwrite: + return path + path.write_text( + yaml.safe_dump(data, sort_keys=False, allow_unicode=True), + encoding="utf-8", + ) + return path + + +# ----------------------------- +# public API +# ----------------------------- + + +def init_dataset( + base_dir: Path, + dataset_id: str, + *, + oem_version: str = "OEMetadata-2.0", + resources: Iterable[str] = (), + overwrite: bool = False, +) -> InitResult: + """ + Create or extend the split-files layout for one dataset. + + Creates: + + - datasets/.dataset.yaml + - datasets/.template.yaml + - resources//.resource.yaml for each requested resource. + """ + # touch spec (also ensures the version string is valid) + _ = get_metadata_specification(oem_version) + + dataset_yaml = base_dir / "datasets" / f"{dataset_id}.dataset.yaml" + template_yaml = base_dir / "datasets" / f"{dataset_id}.template.yaml" + + dataset_doc = _dataset_stub_from_spec_template(oem_version, dataset_id) + resource_template_doc = _resource_template_from_spec(oem_version) + + out_dataset = _dump_yaml(dataset_yaml, dataset_doc, overwrite=overwrite) + out_template = _dump_yaml(template_yaml, resource_template_doc, overwrite=overwrite) + + created_resources: list[Path] = [] + for res_name in resources: + res_doc = _resource_stub_from_spec(oem_version, res_name) + res_path = base_dir / "resources" / dataset_id / f"{res_name}.resource.yaml" + created_resources.append(_dump_yaml(res_path, res_doc, overwrite=overwrite)) + + return InitResult(dataset_yaml=out_dataset, template_yaml=out_template, resource_yamls=created_resources) + + +def init_resources_from_files( + base_dir: Path, + dataset_id: str, + files: Iterable[Path], + *, + oem_version: str = "OEMetadata-2.0.4", + overwrite: bool = False, +) -> list[Path]: + """ + Create resource stubs for DATASET_ID from the given FILES. + + Uses the spec resource template structure, prefills name/path/format hints, + and for CSV files also infers a schema (fields + types) using `omi.inspection`. 
+ """ + _ = get_metadata_specification(oem_version) + + outputs: list[Path] = [] + for f in files: + name = f.stem + ext = f.suffix.lower().lstrip(".") + res = _resource_stub_from_spec(oem_version, name) + res["path"] = str(f) + + # Lightweight format hinting (non-authoritative; user should review) + if ext == "csv": + res.setdefault("format", "CSV") + res.setdefault("encoding", "UTF-8") + res.setdefault("scheme", "file") + + # Use existing inspection: "OEP" == OEMetadata in this code base + try: + inferred = infer_metadata(str(f), metadata_format="OEP") + except InspectionError: + inferred = None + + if inferred is not None: + # We only care about the *resource* part here + try: + inferred_resource = inferred["resources"][0] + inferred_schema = inferred_resource.get("schema") + except (KeyError, IndexError, TypeError): + inferred_schema = None + + if inferred_schema: + # Overwrite/attach the schema from inspection to this resource stub + res["schema"] = inferred_schema + + elif ext == "json": + res.setdefault("format", "json") + res.setdefault("scheme", "file") + elif ext == "xlsx": + res.setdefault("format", "xlsx") + res.setdefault("scheme", "file") + else: + if ext: + res.setdefault("format", ext) + res.setdefault("scheme", "file") + + out_path = base_dir / "resources" / dataset_id / f"{name}.resource.yaml" + outputs.append(_dump_yaml(out_path, res, overwrite=overwrite)) + + return outputs diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py new file mode 100644 index 00000000..fa591863 --- /dev/null +++ b/src/omi/creation/utils.py @@ -0,0 +1,295 @@ +""" +Utility functions for the OMI creation module. + +This module provides deep-merge templating, YAML IO, and discovery helpers +for assembling OEMetadata from split YAML files (dataset/template/resources). +""" + +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import TYPE_CHECKING, Optional, Union + +import yaml + +if TYPE_CHECKING: + from collections.abc import Hashable + +# --- deep merge helpers ------------------------------------------------------- + +# List keys we concatenate (resource + template) instead of replacing. +DEFAULT_CONCAT_LIST_KEYS = {"keywords", "topics", "languages"} + + +def _hashable_key(x: object) -> Hashable | tuple: + """ + Return a hashable representation of `x` for deduplication purposes. + + - dict -> sorted tuple of (key, value) pairs + - list -> tuple(list) + - other -> value itself + """ + if isinstance(x, dict): + return tuple(sorted(x.items())) + if isinstance(x, list): + return tuple(x) + return x # type: ignore[return-value] + + +def _merge_lists( + template_list: list[object], + resource_list: list[object], + *, + deduplicate: bool = True, +) -> list[object]: + """ + Concatenate lists with resource-first priority. + + When `deduplicate` is True, only items that are not already present in + `resource_list` (by hashable representation) are appended from `template_list`. + """ + merged = list(resource_list) + if not template_list: + return merged + + if deduplicate: + existing = {_hashable_key(v) for v in merged} + for item in template_list: + k = _hashable_key(item) + if k not in existing: + merged.append(item) + else: + merged.extend(template_list) + return merged + + +def deep_apply_template_to_resource( + resource: dict[str, object], + template: dict[str, object], + concat_list_keys: Union[tuple[str, ...], set[str]] = DEFAULT_CONCAT_LIST_KEYS, +) -> dict[str, object]: + """ + Apply a resource template using deep-merge semantics. 
+ + Rules: + - Missing keys are copied from the template. + - Dicts are deep-merged (resource wins on conflicts). + - Lists are concatenated only for keys in `concat_list_keys`; otherwise, the + resource list is preserved as-is. + - Scalars: resource values win. + """ + if not template: + return resource + + result = deepcopy(resource) + for key, tval in template.items(): + if key not in result: + result[key] = deepcopy(tval) + continue + + rval = result[key] + if isinstance(rval, dict) and isinstance(tval, dict): + result[key] = deep_apply_template_to_resource(rval, tval, concat_list_keys) + continue + + if isinstance(rval, list) and isinstance(tval, list): + if key in concat_list_keys: + result[key] = _merge_lists(tval, rval, deduplicate=True) + # else: resource list stays as-is + continue + # scalar: resource value stays + return result + + +def apply_template_to_resources( + resources: list[dict[str, object]], + template: dict[str, object], +) -> list[dict[str, object]]: + """Apply the same `template` to each resource in `resources`.""" + if not template: + return resources + return [deep_apply_template_to_resource(r, template) for r in resources] + + +# --- YAML IO + discovery ------------------------------------------------------ + + +def load_yaml(path: Union[str, Path]) -> dict[str, object]: + """Load a YAML mapping from `path`, returning an empty dict for empty files.""" + with Path(path).open("r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + + +def discover_paths( + base_dir: Union[str, Path], + dataset_id: str, +) -> tuple[Optional[Path], Optional[Path], list[Path]]: + """ + Discover dataset/template/resources paths by convention. + + - dataset: datasets/{dataset_id}.dataset.yaml + - template: datasets/{dataset_id}.template.yaml (optional) + - resources: resources/{dataset_id}/*.resource.yaml + """ + base = Path(base_dir) + dataset_path = base / "datasets" / f"{dataset_id}.dataset.yaml" + template_path = base / "datasets" / f"{dataset_id}.template.yaml" + resources_dir = base / "resources" / dataset_id + + dataset = dataset_path if dataset_path.exists() else None + template = template_path if template_path.exists() else None + resources = sorted(resources_dir.glob("*.resource.yaml")) if resources_dir.exists() else [] + return dataset, template, resources + + +def resolve_from_index( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]], +) -> tuple[Optional[Path], Optional[Path], list[Path]]: + """ + Resolve dataset/template/resources using an explicit index YAML. + + Example YAML: + + datasets: + : + dataset: path/to/dataset.yaml + template: path/to/template.yaml # optional + resources: + - path/to/res1.yaml + - path/to/res2.yaml + + Paths are interpreted as relative to `base_dir`. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing datasets, templates, and resources. + dataset_id : str + Identifier for the dataset to load. + index_file : Optional[Union[str, Path]] + Optional path to an index YAML file for resolving dataset parts. + + Returns + ------- + tuple[Optional[Path], Optional[Path], list[Path]] + A tuple containing: + - dataset_path: Optional[Path] + Path to the dataset YAML (or None if not found). + - template_path: Optional[Path] + Path to the template YAML (or None if not found). + - resource_paths: list[Path] + List of paths to resource YAMLs. 
+ """ + if not index_file: + return discover_paths(base_dir, dataset_id) + + base = Path(base_dir) + index_path = Path(index_file) + index = load_yaml(index_path) + entry = (index.get("datasets") or {}).get(dataset_id, {}) + dataset = base / entry["dataset"] if "dataset" in entry else None + template = base / entry["template"] if "template" in entry else None + resources = [base / p for p in entry.get("resources", [])] + return dataset, template, resources + + +def load_parts( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]] = None, +) -> tuple[str, dict[str, object], list[dict[str, object]], dict[str, object]]: + """ + Load dataset YAML, optional template YAML, and all resource YAMLs. + + Returns a tuple: (version, dataset, resources, template). + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing datasets, templates, and resources. + dataset_id : str + Identifier for the dataset to load. + index_file : Optional[Union[str, Path]], optional + Optional path to an index YAML file for resolving dataset parts, + by default None. + + Returns + ------- + tuple[str, dict[str, object], list[dict[str, object]], dict[str, object]] + A tuple containing: + - version: str + The OEMetadata version from the dataset YAML (default "OEMetadata-2.0.4"). + - dataset: dict[str, object] + The dataset mapping from the dataset YAML. + - resources: list[dict[str, object]] + A list of resource mappings from the resource YAMLs. + - template: dict[str, object] + The template mapping from the template YAML (empty dict if none). + """ + dataset_path, template_path, resource_paths = resolve_from_index(base_dir, dataset_id, index_file) + + if dataset_path is None or not dataset_path.exists(): + raise FileNotFoundError(f"Dataset YAML not found for '{dataset_id}'") + + dataset_yaml = load_yaml(dataset_path) + version = str(dataset_yaml.get("version", "OEMetadata-2.0.4")) + # Support either dataset: {...} or flat style with top-level dataset keys. + dataset = dataset_yaml.get("dataset", dataset_yaml) + + template: dict[str, object] = {} + if template_path and template_path.exists(): + template = load_yaml(template_path) + + resources: list[dict[str, object]] = [load_yaml(p) for p in resource_paths] + return version, dataset, resources, template + + +def discover_dataset_ids(base_dir: Union[str, Path]) -> list[str]: + """ + Discover dataset ids by scanning datasets/*.dataset.yaml. + + For 'datasets/powerplants.dataset.yaml' returns 'powerplants'. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing datasets, templates, and resources. + + Returns + ------- + list[str] + Sorted list of discovered dataset IDs. + """ + base = Path(base_dir) + datasets_dir = base / "datasets" + if not datasets_dir.exists(): + return [] + return sorted([p.stem.replace(".dataset", "") for p in datasets_dir.glob("*.dataset.yaml")]) + + +def discover_dataset_ids_from_index(index_file: Union[str, Path]) -> list[str]: + """ + Discover dataset ids from an explicit metadata_index.yaml. + + Returns the sorted list of top-level keys under `datasets`. + + Parameters + ---------- + index_file : Union[str, Path] + Path to an index YAML file for resolving dataset parts. + + Returns + ------- + list[str] + Sorted list of discovered dataset IDs. 
+ """ + idx_path = Path(index_file) + if not idx_path.exists(): + return [] + with idx_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + ds = data.get("datasets") or {} + return sorted(ds.keys()) diff --git a/src/omi/inspection.py b/src/omi/inspection.py index f7b4dd52..e7b2b526 100644 --- a/src/omi/inspection.py +++ b/src/omi/inspection.py @@ -1,6 +1,7 @@ """Module to inspect data and create metadata from it.""" from collections.abc import Callable +from copy import deepcopy from typing import Any from frictionless import Detector, Dialect, Resource @@ -121,7 +122,9 @@ def convert_field(field: dict[str, str]) -> dict[str, str]: return {"name": field["name"], "type": f"array {type_mapping[item_type]}"} # All arrays are empty - so no further subtype can be detected return {"name": field["name"], "type": "array"} - return field + oem_field = deepcopy(metadata["resources"][0]["schema"]["fields"][0]) + oem_field.update(field) + return oem_field rows = resource.read_rows() fields = [convert_field(field) for field in fields] diff --git a/tests/test_assembler.py b/tests/test_assembler.py new file mode 100644 index 00000000..318c73e1 --- /dev/null +++ b/tests/test_assembler.py @@ -0,0 +1,316 @@ +""" +Assembly integration tests for split-files OEMetadata authoring. + +This module exercises the public assembler entry point by building a small +on-disk YAML tree, applying a template, and verifying the merged OEMetadata. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import yaml + +# We test the public assembler entry point +from omi.creation.assembler import assemble_many_metadata, assemble_metadata_dict + +if TYPE_CHECKING: + from pathlib import Path + + import pytest + + +# ---------- helpers ---------- + + +def write_yaml(p: Path, data: object) -> None: + """Write `data` (any YAML-serializable object) to path `p`.""" + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text( + yaml.safe_dump(data, sort_keys=False, allow_unicode=True), + encoding="utf-8", + ) + + +class FakeCreator: + """ + Minimal stand-in for OEMetadataCreator used via monkeypatching. + + It mimics `generate_metadata(dataset, resources)` and skips validation. + The constructor accepts the OEMetadata version to embed in metaMetadata. + """ + + def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: + """Initialize the fake creator with a specific OEMetadata version.""" + self.oem_version = oem_version + + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + """Return a small OEMetadata-like dict sufficient for assertions.""" + return { + "@context": "https://example.org/context.json", + **dataset, + "resources": resources, + "metaMetadata": {"metadataVersion": self.oem_version}, + } + + +# ---------- tests ---------- + + +def test_assemble_by_convention_with_template_merge( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """ + Assemble via convention and verify deep merge semantics. 
+ + Asserts: + - dataset is loaded from datasets/{id}.dataset.yaml + - template is applied deeply (resource wins on conflicts) + - keywords are concatenated (resource first, then template-only) + - licenses remain resource-provided if present (no concat by default) + - creator is invoked and returns a full dict + """ + # dataset + write_yaml( + tmp_path / "datasets" / "demo.dataset.yaml", + { + "version": "OEMetadata-2.0.4", + "dataset": {"name": "demo", "title": "Demo", "description": "Demo dataset"}, + }, + ) + + # template + write_yaml( + tmp_path / "datasets" / "demo.template.yaml", + { + "context": {"publisher": "OEP", "contact": "a@b"}, + "keywords": ["k1"], + "topics": ["model_draft"], + "languages": ["en-GB"], + "licenses": [{"name": "L1"}], # applies only if resource doesn't provide licenses + }, + ) + + # resources + write_yaml( + tmp_path / "resources" / "demo" / "r1.resource.yaml", + { + "name": "r1", + "title": "R1", + # overrides nested key, should still inherit contact from template + "context": {"publisher": "Other"}, + # resource provides its own licenses -> should NOT be concatenated by default + "licenses": [{"name": "R1-license"}], + # own keywords -> should concat with template keywords + "keywords": ["r1k"], + }, + ) + write_yaml( + tmp_path / "resources" / "demo" / "r2.resource.yaml", + { + "name": "r2", + "title": "R2", + # no licenses provided -> should get template licenses + }, + ) + + # Patch the creator used inside assembler to our Fake + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + md = assemble_metadata_dict(tmp_path, "demo") + + # dataset propagated + assert md["name"] == "demo" + assert md["title"] == "Demo" + + # resources present + assert isinstance(md["resources"], list) + assert len(md["resources"]) == 2 + r1, r2 = md["resources"] + + # deep merge for context: resource wins on conflicts, template fills missing keys + assert r1["context"]["publisher"] == "Other" + assert r1["context"]["contact"] == "a@b" + + # keywords/topics/languages concatenate (resource first, then template-only) + assert r1["keywords"] == ["r1k", "k1"] + # topics/languages inherited if missing + assert r1["topics"] == ["model_draft"] + assert r1["languages"] == ["en-GB"] + + # licenses: resource list wins (no concat by default) + assert r1["licenses"] == [{"name": "R1-license"}] + + # r2 inherits licenses from template (since none provided) + assert r2["licenses"] == [{"name": "L1"}] + # r2 inherits keywords/topics/languages from template + assert r2["keywords"] == ["k1"] + assert r2["topics"] == ["model_draft"] + assert r2["languages"] == ["en-GB"] + + # metaMetadata present from FakeCreator (assembler passes through the version) + assert md["metaMetadata"]["metadataVersion"] == "OEMetadata-2.0.4" + + +def test_assemble_with_index_mapping( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble using an explicit metadata_index.yaml mapping.""" + base = tmp_path + + # index mapping + write_yaml( + base / "metadata_index.yaml", + { + "datasets": { + "pp": { + "dataset": "datasets/powerplants.dataset.yaml", + "template": "datasets/powerplants.template.yaml", + "resources": [ + "resources/powerplants/a.resource.yaml", + "resources/powerplants/b.resource.yaml", + ], + }, + }, + }, + ) + + write_yaml( + base / "datasets" / "powerplants.dataset.yaml", + {"dataset": {"name": "pp", "title": "PP"}}, + ) + write_yaml( + base / "datasets" / "powerplants.template.yaml", + {"keywords": ["t-k"]}, + ) + write_yaml( + base / "resources" / 
"powerplants" / "a.resource.yaml", + {"name": "a", "title": "A", "keywords": ["a-k"]}, + ) + write_yaml( + base / "resources" / "powerplants" / "b.resource.yaml", + {"name": "b", "title": "B"}, + ) + + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + # Use the index explicitly + md = assemble_metadata_dict(base, "pp", index_file=base / "metadata_index.yaml") + + assert md["name"] == "pp" + names = [r["name"] for r in md["resources"]] + assert names == ["a", "b"] + + # keywords concatenated for 'a', inherited for 'b' + r_a = md["resources"][0] + r_b = md["resources"][1] + assert r_a["keywords"] == ["a-k", "t-k"] + assert r_b["keywords"] == ["t-k"] + + +def test_assemble_many_metadata_convention_as_dict( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble all datasets by convention; expect a dict keyed by dataset id.""" + # Dataset A + write_yaml( + tmp_path / "datasets" / "a.dataset.yaml", + {"version": "OEMetadata-2.0.4", "dataset": {"name": "a", "title": "A"}}, + ) + write_yaml( + tmp_path / "resources" / "a" / "r1.resource.yaml", + {"name": "r1", "title": "R1"}, + ) + + # Dataset B (with template) + write_yaml( + tmp_path / "datasets" / "b.dataset.yaml", + {"version": "OEMetadata-2.0.4", "dataset": {"name": "b", "title": "B"}}, + ) + write_yaml( + tmp_path / "datasets" / "b.template.yaml", + {"keywords": ["tk"]}, + ) + write_yaml( + tmp_path / "resources" / "b" / "r2.resource.yaml", + {"name": "r2", "title": "R2", "keywords": ["rk"]}, + ) + + # Use the FakeCreator inside the assembler + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + out = assemble_many_metadata(tmp_path) # dict[str, dict] + # discover_dataset_ids returns sorted ids + assert list(out.keys()) == ["a", "b"] + + # Dataset A checks + md_a = out["a"] + assert md_a["name"] == "a" + assert [r["name"] for r in md_a["resources"]] == ["r1"] + + # Dataset B checks (template applied with concat) + md_b = out["b"] + assert md_b["name"] == "b" + r2 = md_b["resources"][0] + assert r2["name"] == "r2" + assert r2["keywords"] == ["rk", "tk"] + + +def test_assemble_many_metadata_with_index_as_list( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble all datasets declared in index; expect a list of (id, md) pairs sorted by id.""" + base = tmp_path + + # Index with two datasets (note: keys will be sorted by helper) + write_yaml( + base / "metadata_index.yaml", + { + "datasets": { + "x": { + "dataset": "datasets/x.dataset.yaml", + "resources": ["resources/x/x1.resource.yaml"], + }, + "y": { + "dataset": "datasets/y.dataset.yaml", + "template": "datasets/y.template.yaml", + "resources": ["resources/y/y1.resource.yaml"], + }, + }, + }, + ) + + # Dataset x + write_yaml(base / "datasets" / "x.dataset.yaml", {"dataset": {"name": "x", "title": "X"}}) + write_yaml(base / "resources" / "x" / "x1.resource.yaml", {"name": "x1"}) + + # Dataset y (with template) + write_yaml(base / "datasets" / "y.dataset.yaml", {"dataset": {"name": "y", "title": "Y"}}) + write_yaml(base / "datasets" / "y.template.yaml", {"keywords": ["t"]}) + write_yaml(base / "resources" / "y" / "y1.resource.yaml", {"name": "y1", "keywords": ["r"]}) + + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + pairs = assemble_many_metadata( + base, + index_file=base / "metadata_index.yaml", + as_dict=False, + ) # list[tuple[str, dict]] + + # Expect sorted ids: ['x', 'y'] + ids = [ds_id for ds_id, _ in pairs] + assert ids == ["x", "y"] + + md_x = 
pairs[0][1] + md_y = pairs[1][1] + + assert md_x["name"] == "x" + assert [r["name"] for r in md_x["resources"]] == ["x1"] + + # Template concat for y + r_y1 = md_y["resources"][0] + assert r_y1["keywords"] == ["r", "t"] diff --git a/tests/test_create.py b/tests/test_create.py new file mode 100644 index 00000000..7fe2dd07 --- /dev/null +++ b/tests/test_create.py @@ -0,0 +1,147 @@ +""" +Integration tests for OEMetadata assembly and entry point using YAML test data. + +This test suite consumes the example YAML tree located at: +tests/test_data/create/metadata/ +and verifies that OMI assembles and writes a valid OEMetadata document. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import TYPE_CHECKING + +from omi.create import build_from_yaml +from omi.creation.assembler import assemble_metadata_dict + +if TYPE_CHECKING: + import pytest + + +def _fixture_metadata_root() -> Path: + """Return the absolute path to tests/test_data/create/metadata.""" + here = Path(__file__).resolve().parent + return here / "test_data" / "create" / "metadata" + + +def test_assemble_metadata_dict_with_fixture() -> None: + """Assemble OEMetadata dict from the real fixture and assert key content.""" + base = _fixture_metadata_root() + dataset_id = "powerplants" + + md = assemble_metadata_dict(base, dataset_id) + + # dataset-level checks (from powerplants.dataset.yaml) + assert md["name"] == "oep_oemetadata" + assert md["title"] == "OEP OEMetadata" + assert md["@id"].startswith("https://databus.openenergyplatform.org/") + + # context injected from template if not overridden in resource + assert "resources" in md + assert isinstance(md["resources"], list) + assert md["resources"] + r_names = {r["name"] for r in md["resources"]} + # Both resources from your example exist + assert {"oemetadata_table", "data_2"}.issubset(r_names) + + # Check one resource that should have inherited from template + r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table") + assert r1["context"]["title"] == "NFDI4Energy" # from template + assert "licenses" in r1 + assert isinstance(r1["licenses"], list) + assert r1["licenses"] + assert r1["licenses"][0]["name"] in {"ODbL-1.0", "ODbL-1.0".upper(), "ODBL-1.0"} + + # Meta metadata is present + assert "metaMetadata" in md + assert md["metaMetadata"]["metadataVersion"].startswith("OEMetadata-2.0") + + +def test_entrypoint_build_from_yaml_writes_file(tmp_path: Path) -> None: + """Use the real entry point to write JSON and compare basic structure.""" + base = _fixture_metadata_root() + out = tmp_path / "out" / "powerplants.json" + + build_from_yaml(base, "powerplants", out) + + assert out.exists(), "Entry point did not write the output file." 
+ written = json.loads(out.read_text(encoding="utf-8")) + + # Sanity checks on written JSON + assert written["name"] == "oep_oemetadata" + assert isinstance(written["resources"], list) + assert written["resources"] + # Ensure unicode is preserved (© should not be escaped) + licenses = written["resources"][0].get("licenses", []) + if licenses: + # stringify to inspect the character; ensure_ascii=False in writer preserves it + text = json.dumps(licenses[0], ensure_ascii=False) + assert "©" in text + + +def test_build_from_yaml_writes_file_when_output_is_file( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure build_from_yaml writes to the exact file path provided.""" + from omi import create as create_mod + + expected: dict[str, object] = {"name": "pp", "resources": []} + + # Avoid needing real YAML on disk + def fake_assemble( + base_dir: Path, + dataset_id: str, + index_file: Path | None = None, + ) -> dict[str, object]: + # use args to avoid ARG001 + _ = base_dir, index_file + assert dataset_id == "powerplants" + return expected + + monkeypatch.setattr(create_mod, "assemble_metadata_dict", fake_assemble) + + out = tmp_path / "out.json" + create_mod.build_from_yaml(tmp_path / "meta", "powerplants", out) + + assert out.exists() + assert json.loads(out.read_text(encoding="utf-8")) == expected + + +def test_build_many_from_yaml_writes_many_default_names( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure build_many_from_yaml writes .json files into output_dir.""" + from omi import create as create_mod + + canned: dict[str, dict[str, object]] = { + "a": {"name": "a", "resources": []}, + "b": {"name": "b", "resources": []}, + } + + def fake_many( + base_dir: Path, + *, + dataset_ids: list[str] | None = None, + index_file: Path | None = None, + as_dict: bool = True, + ) -> dict[str, dict[str, object]]: + # Called by build_many_from_yaml; return mapping id -> md + _ = base_dir, dataset_ids, index_file # avoid ARG001 + assert as_dict is True + return canned + + monkeypatch.setattr(create_mod, "assemble_many_metadata", fake_many) + + out_dir = tmp_path / "out" + create_mod.build_many_from_yaml(tmp_path / "meta", out_dir) + + a_path = out_dir / "a.json" + b_path = out_dir / "b.json" + assert a_path.exists() + assert b_path.exists() + + assert json.loads(a_path.read_text(encoding="utf-8")) == canned["a"] + assert json.loads(b_path.read_text(encoding="utf-8")) == canned["b"] diff --git a/tests/test_creation_utils.py b/tests/test_creation_utils.py new file mode 100644 index 00000000..de3c7a67 --- /dev/null +++ b/tests/test_creation_utils.py @@ -0,0 +1,202 @@ +"""Unit tests for the OMI creation utils (templating, IO, discovery).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +import yaml + +# Functions under test +from omi.creation.utils import ( + DEFAULT_CONCAT_LIST_KEYS, + _merge_lists, + apply_template_to_resources, + deep_apply_template_to_resource, + discover_dataset_ids, + discover_dataset_ids_from_index, + discover_paths, + load_parts, + load_yaml, + resolve_from_index, +) + +if TYPE_CHECKING: + from pathlib import Path + + +# ---------- helpers ---------- + + +def _write_yaml(p: Path, data: object) -> None: + """Write a YAML-serializable `data` object to `p`, creating parents.""" + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") + + +# ---------- tests: list merging + deep template ---------- + + +def 
test_merge_lists_deduplicates_and_respects_resource_first() -> None: + """`_merge_lists` keeps resource-first order and de-duplicates template items.""" + resource_list = ["a", "b"] + template_list = ["b", "c"] + merged = _merge_lists(template_list, resource_list, deduplicate=True) + assert merged == ["a", "b", "c"] + + +def test_deep_apply_template_to_resource_concat_for_keywords_topics_languages() -> None: + """Default concat keys (keywords/topics/languages) are concatenated; others are not.""" + resource = { + "name": "r", + "keywords": ["rk"], + "topics": ["rt"], + "languages": ["rl"], + "context": {"publisher": "R"}, + "list_no_concat": [1, 2], + } + template = { + "keywords": ["tk"], + "topics": ["tt"], + "languages": ["tl"], + "context": {"publisher": "T", "contact": "a@b"}, + "list_no_concat": [3, 4], + } + + out = deep_apply_template_to_resource(resource, template) + # concat lists for default concat keys + assert out["keywords"] == ["rk", "tk"] + assert out["topics"] == ["rt", "tt"] + assert out["languages"] == ["rl", "tl"] + # resource list wins for non-concat keys + assert out["list_no_concat"] == [1, 2] + # deep dict merge: resource wins on conflict, template fills missing + assert out["context"]["publisher"] == "R" + assert out["context"]["contact"] == "a@b" + + +def test_deep_apply_template_to_resource_custom_concat_keys() -> None: + """Custom concat set allows concatenating lists like `licenses`.""" + resource = {"licenses": [{"name": "R1"}]} + template = {"licenses": [{"name": "T1"}]} + # By default, 'licenses' is NOT concatenated + out_default = deep_apply_template_to_resource(resource, template) + assert out_default["licenses"] == [{"name": "R1"}] + + # If we opt-in, it concatenates (resource first, then template-only) + custom_keys = set(DEFAULT_CONCAT_LIST_KEYS) | {"licenses"} + out_custom = deep_apply_template_to_resource(resource, template, concat_list_keys=custom_keys) + assert out_custom["licenses"] == [{"name": "R1"}, {"name": "T1"}] + + +def test_apply_template_to_resources_applies_per_item() -> None: + """Template is applied to each resource; concat for `keywords` by default.""" + resources = [{"name": "a"}, {"name": "b", "keywords": ["bk"]}] + template = {"keywords": ["tk"]} + out = apply_template_to_resources(resources, template) + assert out[0]["keywords"] == ["tk"] # inherited from template + assert out[1]["keywords"] == ["bk", "tk"] # concatenated: resource first, then template-only + + +# ---------- tests: YAML IO + discovery ---------- + + +def test_load_yaml_reads_empty_as_empty_dict(tmp_path: Path) -> None: + """Empty YAML file is read as an empty dict.""" + p = tmp_path / "empty.yaml" + p.write_text("", encoding="utf-8") + data = load_yaml(p) + assert data == {} + + +def test_discover_paths_and_resolve_from_index(tmp_path: Path) -> None: + """Discovery by convention and resolution by index both return expected paths.""" + base = tmp_path + # convention files + ds = base / "datasets" / "powerplants.dataset.yaml" + tp = base / "datasets" / "powerplants.template.yaml" + rdir = base / "resources" / "powerplants" + r1 = rdir / "a.resource.yaml" + r2 = rdir / "b.resource.yaml" + + _write_yaml(ds, {"version": "OEMetadata-2.0.4", "dataset": {"name": "pp"}}) + _write_yaml(tp, {"keywords": ["k1"]}) + _write_yaml(r1, {"name": "a"}) + _write_yaml(r2, {"name": "b"}) + + dspath, tpath, rpaths = discover_paths(base, "powerplants") + assert dspath == ds + assert tpath == tp + assert rpaths == [r1, r2] + + # index mapping (deliberately flips resource order) + idx = 
base / "metadata_index.yaml" + _write_yaml( + idx, + { + "datasets": { + "powerplants": { + "dataset": "datasets/powerplants.dataset.yaml", + "template": "datasets/powerplants.template.yaml", + "resources": [ + "resources/powerplants/b.resource.yaml", + "resources/powerplants/a.resource.yaml", + ], + }, + }, + }, + ) + d2, t2, rs2 = resolve_from_index(base, "powerplants", idx) + assert d2 == ds + assert t2 == tp + assert rs2 == [base / "resources/powerplants/b.resource.yaml", base / "resources/powerplants/a.resource.yaml"] + + +def test_load_parts_returns_all_sections(tmp_path: Path) -> None: + """`load_parts` returns (version, dataset, resources, template) with expected contents.""" + base = tmp_path + ds = base / "datasets" / "households.dataset.yaml" + tp = base / "datasets" / "households.template.yaml" + rdir = base / "resources" / "households" + r1 = rdir / "hh1.resource.yaml" + + _write_yaml(ds, {"version": "OEMetadata-2.0.4", "dataset": {"name": "households", "title": "HH"}}) + _write_yaml(tp, {"context": {"publisher": "OEP"}}) + _write_yaml(r1, {"name": "hh1"}) + + version, dataset, resources, template = load_parts(base, "households") + assert version == "OEMetadata-2.0.4" + assert dataset == {"name": "households", "title": "HH"} + assert resources == [{"name": "hh1"}] + assert template == {"context": {"publisher": "OEP"}} + + +def test_load_parts_raises_when_dataset_missing(tmp_path: Path) -> None: + """`load_parts` raises FileNotFoundError if the dataset YAML is missing.""" + with pytest.raises(FileNotFoundError): + load_parts(tmp_path, "missing") + + +# ---------- tests: dataset id discovery ---------- + + +def test_discover_dataset_ids(tmp_path: Path) -> None: + """`discover_dataset_ids` finds dataset ids by scanning datasets/*.dataset.yaml.""" + _write_yaml(tmp_path / "datasets" / "a.dataset.yaml", {"dataset": {"name": "a"}}) + _write_yaml(tmp_path / "datasets" / "b.dataset.yaml", {"dataset": {"name": "b"}}) + ids = discover_dataset_ids(tmp_path) + assert ids == ["a", "b"] + + +def test_discover_dataset_ids_from_index(tmp_path: Path) -> None: + """`discover_dataset_ids_from_index` returns top-level 'datasets' keys in index YAML.""" + idx = tmp_path / "metadata_index.yaml" + _write_yaml(idx, {"datasets": {"x": {}, "y": {}}}) + ids = discover_dataset_ids_from_index(idx) + assert ids == ["x", "y"] + + +def test_discover_dataset_ids_from_index_missing_file(tmp_path: Path) -> None: + """Missing index file yields an empty list of dataset ids.""" + ids = discover_dataset_ids_from_index(tmp_path / "nope.yaml") + assert ids == [] diff --git a/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml b/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml new file mode 100644 index 00000000..38bb43a2 --- /dev/null +++ b/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml @@ -0,0 +1,6 @@ +version: "OEMetadata-2.0" +dataset: + name: oep_oemetadata + title: OEP OEMetadata + description: A dataset for the OEMetadata examples. 
+ "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ diff --git a/tests/test_data/create/metadata/datasets/powerplants.template.yaml b/tests/test_data/create/metadata/datasets/powerplants.template.yaml new file mode 100644 index 00000000..1b60853a --- /dev/null +++ b/tests/test_data/create/metadata/datasets/powerplants.template.yaml @@ -0,0 +1,26 @@ +licenses: + - name: ODbL-1.0 + title: Open Data Commons Open Database License 1.0 + path: https://opendatacommons.org/licenses/odbl/1-0/index.html + instruction: > + You are free to share and change, but you must attribute, and + share derivations under the same license. See https://tldrlegal.com/license/odc-open-database-license-(odbl) + for further information. + attribution: © Reiner Lemoine Institut + copyrightStatement: https://github.com/OpenEnergyPlatform/oemetadata/blob/production/LICENSE.txt + +context: + title: NFDI4Energy + homepage: https://nfdi4energy.uol.de/ + documentation: https://nfdi4energy.uol.de/sites/about_us/ + sourceCode: https://github.com/NFDI4Energy + publisher: Open Energy Platform (OEP) + publisherLogo: https://github.com/OpenEnergyPlatform/organisation/blob/production/logo/OpenEnergyFamily_Logo_OpenEnergyPlatform.svg + contact: contact@example.com + fundingAgency: " Deutsche Forschungsgemeinschaft (DFG)" + fundingAgencyLogo: https://upload.wikimedia.org/wikipedia/commons/8/86/DFG-logo-blau.svg + grantNo: "501865131" + +topics: [model_draft] +languages: [en-GB, de-DE] +keywords: [example, ODbL-1.0, NFDI4Energy] diff --git a/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml new file mode 100644 index 00000000..a03ee242 --- /dev/null +++ b/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml @@ -0,0 +1,22 @@ +name: data_2 +type: table +title: My Second Resource + +path: reGon/metadata/data_2.csv +scheme: file +format: csv +mediatype: text/csv +encoding: utf-8 + +schema: + fields: + - name: h + type: integer + nullable: true + - name: i + type: integer + nullable: true + - name: o + type: string + nullable: true + primaryKey: [id] diff --git a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml new file mode 100644 index 00000000..f28b8392 --- /dev/null +++ b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml @@ -0,0 +1,191 @@ +name: oemetadata_table +type: table +title: OEMetadata Table Template +description: Example table used to illustrate the OEMetadata structure and features. +"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv + +# Resource-specific attributes (template will add licenses/context/topics/languages/keywords) +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table +scheme: http +format: CSV +encoding: UTF-8 + +dialect: + decimalSeparator: "." 
+ delimiter: ";" + +schema: + fields: + - name: id + type: integer + description: Unique identifier + nullable: false + unit: null + isAbout: + - name: identifier + "@id": http://purl.obolibrary.org/obo/IAO_0020000 + valueReference: + - value: null + name: null + "@id": null + - name: name + type: string + description: Technology Name + nullable: true + unit: null + isAbout: + - name: power generation technology + "@id": http://openenergy-platform.org/ontology/oeo/OEO_00010423 + valueReference: + - value: wind + name: wind power technology + "@id": http://openenergyplatform.org/ontology/oeo/OEO_00010424 + - name: type + type: string + description: Type of wind farm + nullable: true + unit: null + isAbout: + - name: wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000447/ + valueReference: + - value: onshore + name: onshore wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000311/ + - value: offshore + name: offshore wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000308/ + - name: year + type: integer + description: Reference year + nullable: true + unit: null + isAbout: + - name: year + "@id": https://openenergyplatform.org/ontology/oeo/UO_0000036/ + valueReference: + - value: null + name: null + "@id": null + - name: value + type: number + description: Bruttoleistung + nullable: true + unit: MW + isAbout: + - name: nameplate capacity + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00230003/ + valueReference: + - value: null + name: null + "@id": null + - name: is_active + type: boolean + description: Betriebsstatus + nullable: false + unit: null + isAbout: + - name: Operating Mode Status + "@id": https://ontology.brickschema.org/brick/Operating_Mode_Status + valueReference: + - value: null + name: null + "@id": null + - name: version + type: integer + description: Version + nullable: true + unit: null + isAbout: + - name: version number + "@id": http://purl.obolibrary.org/obo/IAO_0000129 + valueReference: + - value: null + name: null + "@id": null + - name: comment + type: string + description: "" + nullable: true + unit: null + isAbout: + - name: comment + "@id": http://semanticscience.org/resource/SIO_001167 + valueReference: + - value: null + name: null + "@id": null + primaryKey: [id] + foreignKeys: + - fields: [id, version] + reference: + resource: model_draft.oep_oemetadata_table_example_version + fields: [id, version] + + +sources: + - title: IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report + authors: [Hoesung Lee, José Romero, The Core Writing Team] + description: A Report of the Intergovernmental Panel on Climate Change. + publicationYear: "2023" + path: https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf + sourceLicenses: + - name: CC-BY-4.0 + title: Creative Commons Attribution 4.0 International + path: https://creativecommons.org/licenses/by/4.0/legalcode + instruction: > + You are free to share and change, but you must attribute. + See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. 
+ attribution: © Intergovernmental Panel on Climate Change 2023 + copyrightStatement: https://www.ipcc.ch/copyright/ + +subject: + - name: energy + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000150 + +publicationDate: "2024-10-15" + +# embargoPeriod: +# start: "2024-10-11" +# end: "2025-01-01" +# isActive: true + +spatial: + location: + address: Rudower Chaussee 12, 12489 Berlin + "@id": https://www.wikidata.org/wiki/Q77077223 + latitude: "52.432822" + longitude: "13.5351004" + extent: + name: Berlin + "@id": https://www.wikidata.org/wiki/Q64 + resolutionValue: "100" + resolutionUnit: m + boundingBox: [13.08825, 52.33859, 13.76104, 52.6754] + crs: EPSG:4326 + +temporal: + referenceDate: "2020-01-01" + timeseries: + - start: "2020-01-01T00:00:00+01:00" + end: "2020-01-01T23:59:30+01:00" + resolutionValue: "15" + resolutionUnit: min + alignment: left + aggregationType: current + +contributors: + - title: Ludwig Hülk + path: https://github.com/Ludee + organization: Reiner Lemoine Institut + roles: [DataCollector] + date: "2024-11-19" + object: data + comment: Date of data creation + - title: Ludwig Hülk + path: https://github.com/Ludee + organization: Reiner Lemoine Institut + roles: [DataCurator] + date: "2024-11-30" + object: metadata + comment: Date of metadata creation diff --git a/tests/test_inspection.py b/tests/test_inspection.py index 8cf504ba..27d1afb1 100644 --- a/tests/test_inspection.py +++ b/tests/test_inspection.py @@ -34,3 +34,9 @@ def test_inspection(): assert metadata["resources"][0]["schema"]["fields"][6]["type"] == "object" assert metadata["resources"][0]["schema"]["fields"][7]["type"] == "date" assert metadata["resources"][0]["schema"]["fields"][8]["type"] == "boolean" + + +# TODO @jh-RLI: Add test for special cases in csv as e.g. this data will cause issues # noqa: TD003 +# cat objective.csv +# ;0 +# objective;97356714.15339188 diff --git a/tests/test_metadata_creation.py b/tests/test_metadata_creation.py new file mode 100644 index 00000000..0387906d --- /dev/null +++ b/tests/test_metadata_creation.py @@ -0,0 +1,141 @@ +"""Test suite for the OEMetadataCreator class in the OMI creation module (split-files layout).""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +import pytest +import yaml + +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import apply_template_to_resources, load_parts + +if TYPE_CHECKING: + from pathlib import Path + + +@pytest.fixture() +def sample_tree(tmp_path: Path) -> tuple[Path, str]: + """ + Create a split-files metadata tree. 
+ + metadata/ + datasets/ + demo.dataset.yaml + demo.template.yaml + resources/ + demo/ + table.resource.yaml + """ + base = tmp_path / "metadata" + ds_dir = base / "datasets" + rs_dir = base / "resources" / "demo" + + ds_dir.mkdir(parents=True, exist_ok=True) + rs_dir.mkdir(parents=True, exist_ok=True) + + # dataset yaml + (ds_dir / "demo.dataset.yaml").write_text( + yaml.safe_dump( + { + "version": "OEMetadata-2.0", + "dataset": { + "name": "test_dataset", + "title": "Test Dataset", + "description": "For unit testing", + "@id": "https://example.org/test_dataset", + }, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + # template yaml (applied to every resource) + (ds_dir / "demo.template.yaml").write_text( + yaml.safe_dump( + { + "languages": ["en-GB"], + "keywords": ["example"], + "context": {"publisher": "OEP", "contact": "contact@example.org"}, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + # one resource yaml + (rs_dir / "table.resource.yaml").write_text( + yaml.safe_dump( + { + "name": "test_resource", + "title": "Test Resource", + "type": "table", + "format": "CSV", + "schema": { + "fields": [ + {"name": "id", "type": "integer", "nullable": False}, + ], + "primaryKey": ["id"], + }, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + return base, "demo" + + +def test_generate_oemetadata_from_split_files(sample_tree: tuple[Path, str]) -> None: + """End-to-end: load parts, apply template, generate metadata via creator.""" + base_dir, dataset_id = sample_tree + + # Load version/dataset/resources/template from split-files layout + version, dataset, resources, template = load_parts(base_dir, dataset_id) + + # Deep-apply template to resources (dicts merge, lists concat for keywords/topics/languages) + merged_resources = apply_template_to_resources(resources, template) + + creator = OEMetadataCreator(oem_version=version) + result = creator.generate_metadata(dataset, merged_resources) + + # Basic assertions + assert result["@context"].startswith("https://") + assert result["name"] == "test_dataset" + assert "resources" in result + assert isinstance(result["resources"], list) + assert result["resources"][0]["name"] == "test_resource" + + # Template has been applied deeply (languages concatenated / context merged) + r0 = result["resources"][0] + assert r0["languages"] == ["en-GB"] + assert r0["keywords"] == ["example"] + assert r0["context"]["publisher"] == "OEP" + assert r0["context"]["contact"] == "contact@example.org" + + # Schema minimally intact + assert r0["schema"]["primaryKey"] == ["id"] + assert r0["schema"]["fields"][0]["name"] == "id" + assert r0["schema"]["fields"][0]["nullable"] is False + + +def test_creator_save_writes_json(sample_tree: tuple[Path, str]) -> None: + """Ensure creator.save writes JSON and preserves unicode.""" + base_dir, dataset_id = sample_tree + version, dataset, resources, template = load_parts(base_dir, dataset_id) + merged_resources = apply_template_to_resources(resources, template) + + out = base_dir / "out.json" + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, out, ensure_ascii=False, indent=2) + + assert out.exists() + data = json.loads(out.read_text(encoding="utf-8")) + assert data["name"] == "test_dataset" + # unicode preserved (no \u escapes because ensure_ascii=False) + assert "©" not in out.read_text(encoding="utf-8") # sanity check; none present here by default diff --git a/tests/test_metadata_validation.py 
b/tests/test_metadata_validation.py index 2a0de492..278f251d 100644
--- a/tests/test_metadata_validation.py
+++ b/tests/test_metadata_validation.py
@@ -110,11 +110,10 @@ def deactivate__test_metadata_against_oep_table():
     validation.validate_oep_table_against_metadata(oep_table=table, oep_schema="model_draft", metadata=metadata)
-def test_metadata_against_oep_table_using_metadata_from_oep():
-    """Test OEP table definition against OEP metadata, where metadata is taken from OEP."""
-    table = "x2x_p2gas_soec_1"
-    with pytest.raises(validation.ValidationError, match="None is not of type 'object'"):
-        validation.validate_oep_table_against_metadata(oep_table=table, oep_schema="model_draft")
+# Test always fails as the table no longer exists on the OEP.
+# def test_metadata_against_oep_table_using_metadata_from_oep():
+# """Test OEP table definition against OEP metadata, where metadata is taken from OEP."""
+# with pytest.raises(validation.ValidationError, match="None is not of type 'object'"):
 def test_metadata_against_oep_table_invalid_name():
diff --git a/tests/test_metadata_yaml_generation.py b/tests/test_metadata_yaml_generation.py
new file mode 100644
index 00000000..21131af8
--- /dev/null
+++ b/tests/test_metadata_yaml_generation.py
@@ -0,0 +1 @@
+"""Test for metadata yaml generation."""
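
The new `tests/test_metadata_yaml_generation.py` currently contains only a module docstring. A minimal sketch of a test it could grow into is shown below; it reuses the existing `tests/test_data/create/metadata/` fixture and the public `assemble_metadata_dict` entry point, while the YAML round-trip idea is an assumption about this placeholder's intended scope, not part of the changeset.

```python
"""Test for metadata yaml generation."""

from pathlib import Path

import yaml

from omi.creation.assembler import assemble_metadata_dict


def test_assembled_metadata_survives_yaml_round_trip(tmp_path: Path) -> None:
    """Dump assembled OEMetadata to YAML and reload it without losing content."""
    base = Path(__file__).resolve().parent / "test_data" / "create" / "metadata"
    metadata = assemble_metadata_dict(base, "powerplants")

    # Write the assembled document as YAML, preserving key order and unicode.
    out = tmp_path / "powerplants.yaml"
    out.write_text(yaml.safe_dump(metadata, sort_keys=False, allow_unicode=True), encoding="utf-8")

    # Reloading must reproduce the exact same mapping.
    assert yaml.safe_load(out.read_text(encoding="utf-8")) == metadata
```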