From 34988f595a49a70f06b6162fa698634debb7047e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 12:57:58 +0200 Subject: [PATCH 01/37] #126: add empty test for metadata yaml generation --- tests/test_metadata_yaml_generation.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/test_metadata_yaml_generation.py diff --git a/tests/test_metadata_yaml_generation.py b/tests/test_metadata_yaml_generation.py new file mode 100644 index 00000000..21131af8 --- /dev/null +++ b/tests/test_metadata_yaml_generation.py @@ -0,0 +1 @@ +"""Test for metadata yaml generation.""" From 720cb12e95b6f757ac1d2dfd47442eb5a56e8735 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:06:10 +0200 Subject: [PATCH 02/37] #126: Implement oemetadata creator class to create valid oemetadata json datapackages --- src/omi/creation/creator.py | 49 +++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/omi/creation/creator.py diff --git a/src/omi/creation/creator.py b/src/omi/creation/creator.py new file mode 100644 index 00000000..ec390251 --- /dev/null +++ b/src/omi/creation/creator.py @@ -0,0 +1,49 @@ +"""Create oemetadata json datapackage descriptions.""" + +from omi.base import get_metadata_specification +from omi.validation import validate_metadata + + +class OEMetadataCreator: + """ + Class to create oemetadata json datapackages. + + Output is based on datapackage and resource descriptions. + """ + + def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: + """ + Initialize the OEMetadataCreator with a specific version. + + Parameters + ---------- + oem_version:str + The version of the OEMetadata specification to use. + """ + self.oem_spec = get_metadata_specification(oem_version) + + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + """ + Generate oemetadata json datapackage from dataset and resources. + + Parameters + ---------- + dataset: dict + The dataset description. + resources: list[dict] + The list of resource descriptions. + + Returns + ------- + dict + The generated oemetadata json datapackage. + """ + metadata = { + "@context": self.oem_spec.schema["properties"]["@context"]["examples"][0], + **dataset, + "resources": resources, + "metaMetadata": self.oem_spec.example["metaMetadata"], + } + + validate_metadata(metadata, check_license=False) + return metadata From ad0fe47fccca3f05d224cdd365bef5eb0e1c856e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:16:25 +0200 Subject: [PATCH 03/37] #126: Add utility module which offers general purpose functionality. Currently includes: - loading metadata from yaml oemetadata definition for further processing --- src/omi/creation/utils.py | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/omi/creation/utils.py diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py new file mode 100644 index 00000000..3718e54c --- /dev/null +++ b/src/omi/creation/utils.py @@ -0,0 +1,45 @@ +"""Utility functions for OMI creation module.""" + +from pathlib import Path +from typing import Union + +import yaml + + +def load_yaml_metadata(file_path: Union[str, Path]) -> tuple[str, dict, list[dict], dict]: + """ + Load YAML file containing version, dataset, template, and resource metadata. + + This function reads a YAML file and extracts the version, dataset description, + resources, and template. It applies the template to each resource, merging any + specified fields. 
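+
+    A minimal input file (shape mirroring the test fixture added in this series) looks like:
+
+        version: OEMetadata-2.0.4
+        dataset: {name: test_dataset, title: Test Dataset}
+        template: {languages: [en-GB]}
+        resources:
+          - {name: test_resource, title: Test Resource, format: CSV, type: table}
+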
+ Returns: version, dataset, list of resources with merged template, and raw template. + + Parameters + ---------- + file_path: Union[str, Path] + Path to the YAML file. + + Returns + ------- + Tuple[str, Dict, List[Dict], Dict] + A tuple containing: + - version: The version of the metadata. + - dataset: The dataset description. + - resources: A list of resources with the template applied. + - template: The raw template used for resources. + """ + with Path.open(file_path, encoding="utf-8") as file: + data = yaml.safe_load(file) + + version = data.get("version", "OEMetadata-2.0.4") + dataset = data.get("dataset", {}) + template = data.get("template", {}) + resources = data.get("resources", []) + + # Apply template to each resource + for resource in resources: + for key, value in template.items(): + resource.setdefault(key, value) + + return version, dataset, resources From a722b0bd91d91f5ef8b43eebb1a0dcedfae19838 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:16:46 +0200 Subject: [PATCH 04/37] #126: setup creation module --- src/omi/creation/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/omi/creation/__init__.py diff --git a/src/omi/creation/__init__.py b/src/omi/creation/__init__.py new file mode 100644 index 00000000..e69de29b From 63b2a3051f2e399fb5886a1f3857be5a2128b0f7 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 13:26:35 +0200 Subject: [PATCH 05/37] #126: add test for creation functionality --- tests/test_metadata_creation.py | 47 +++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tests/test_metadata_creation.py diff --git a/tests/test_metadata_creation.py b/tests/test_metadata_creation.py new file mode 100644 index 00000000..2cf6a7d5 --- /dev/null +++ b/tests/test_metadata_creation.py @@ -0,0 +1,47 @@ +"""Test suite for the OEMetadataCreator class in the OMI creation module.""" + +from pathlib import Path + +import pytest +import yaml + +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import load_yaml_metadata + + +@pytest.fixture() +def sample_yaml(tmp_path: Path) -> Path: + """Fixture to create a sample YAML file for testing.""" + content = { + "version": "OEMetadata-2.0.4", + "dataset": { + "name": "test_dataset", + "title": "Test Dataset", + "description": "For unit testing", + "@id": "https://example.org/test_dataset", + }, + "template": {"languages": ["en-GB"]}, + "resources": [{"name": "test_resource", "title": "Test Resource", "format": "CSV", "type": "table"}], + } + + file_path = tmp_path / "metadata.yaml" + with Path.open(file_path, "w", encoding="utf-8") as f: + yaml.dump(content, f, sort_keys=False) + + return file_path + + +def test_generate_oemetadata(sample_yaml: Path) -> None: + """Test the generation of OEMetadata from a sample YAML file.""" + version, dataset, resources = load_yaml_metadata(sample_yaml) + creator = OEMetadataCreator() + + result = creator.generate_metadata(dataset, resources) + + # Basic assertions + assert result["@context"].startswith("https://") + assert result["name"] == "test_dataset" + assert "resources" in result + assert isinstance(result["resources"], list) + assert result["resources"][0]["name"] == "test_resource" + assert "languages" in result["resources"][0] From c0f559f9cde8e869471cc86b1737cbe7bd18ed83 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 15:02:20 +0200 Subject: [PATCH 06/37] #126: add generator for functionality to generate metadata. 
Currently implemented: generating YAML metadata, which can then be used to create JSON metadata
---
 src/omi/creation/generator.py | 205 ++++++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 src/omi/creation/generator.py

diff --git a/src/omi/creation/generator.py b/src/omi/creation/generator.py
new file mode 100644
index 00000000..a141e9ad
--- /dev/null
+++ b/src/omi/creation/generator.py
@@ -0,0 +1,205 @@
+"""
+Generate an OEMetadata configuration file.
+
+Module for generating metadata files from resources like directories or zip files.
+This is used to get started from scratch, i.e. to initialize metadata.
+"""
+
+import fnmatch
+import zipfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+
+import yaml
+
+from omi.inspection import infer_metadata
+
+
+@dataclass
+class FileFilterOptions:
+    """
+    Options for filtering files when reading directories or zip files.
+
+    Attributes
+    ----------
+    exclude_extensions: list[str] | None
+        List of file extensions to exclude (e.g., ['.log', '.tmp']).
+    exclude_patterns: list[str] | None
+        List of filename patterns to exclude (e.g., ['*_backup.*', '*.bak']).
+    exclude_hidden: bool
+        Whether to exclude hidden files/directories (default True).
+    """
+
+    exclude_extensions: list[str] | None = None
+    exclude_patterns: list[str] | None = None
+    exclude_hidden: bool = True
+
+
+def read_directory(
+    directory: Union[str, Path],
+    filter_opts: FileFilterOptions,
+) -> list[Path]:
+    """
+    Recursively read files from the directory, applying optional filters.
+
+    Parameters
+    ----------
+    directory: Union[str, Path]
+        The directory to read files from. Can be a string or a Path object.
+    filter_opts: FileFilterOptions
+        Filtering options including extensions, patterns, and hidden files.
+
+    Returns
+    -------
+    list[Path]
+        A list of Path objects representing the files that match the criteria.
+    """
+    directory = Path(directory)
+
+    exclude_extensions = set(filter_opts.exclude_extensions or [".log", ".tmp", ".bak", ".DS_Store", ".md"])
+    exclude_patterns = filter_opts.exclude_patterns or ["*_backup.*", "*~", "*.old", "*.ignore"]
+
+    valid_files = []
+    for file_path in directory.rglob("*"):
+        if not file_path.is_file():
+            continue
+
+        if filter_opts.exclude_hidden and any(part.startswith(".") for part in file_path.parts):
+            continue
+
+        if file_path.suffix in exclude_extensions:
+            continue
+
+        if any(fnmatch.fnmatch(file_path.name, pattern) for pattern in exclude_patterns):
+            continue
+
+        valid_files.append(file_path)
+
+    return valid_files
+
+
+def read_zipfile(
+    zip_path: Union[str, Path],
+    extract_to: Union[str, Path],
+    filter_opts: FileFilterOptions,
+) -> list[Path]:
+    """Extract a zip file and return the list of extracted files."""
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_to)
+    return read_directory(extract_to, filter_opts)
+
+
+def infer_file_metadata(file_path: Path) -> dict:
+    """
+    Infer basic resource metadata from file name and type.
+
+    Parameters
+    ----------
+    file_path: Path
+        Path to the file for which metadata should be inferred.
+
+    Returns
+    -------
+    dict
+        A dictionary containing inferred metadata for the resource.
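+        For a hypothetical ``data_2.csv`` the result is roughly
+        ``{"name": "data_2", "title": "Data 2", "path": "data_2.csv",
+        "type": "table", "format": "CSV", "encoding": "UTF-8", ...}``.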
+ """ + file_name = file_path.stem + file_format = file_path.suffix.replace(".", "").upper() + + resource = { + "name": file_name.lower().replace(" ", "_"), + "title": file_name.replace("_", " ").title(), + "path": file_path.as_posix(), + "description": f"Auto-generated description for {file_name}", + "type": "table" if file_format in ["CSV", "XLSX", "JSON"] else "file", + "format": file_format, + "encoding": "UTF-8", + } + + if file_format == "CSV": + with file_path.open("r") as f: + fields = infer_metadata(f, "OEP")["resources"][0]["schema"] + + resource["schema"] = fields + resource["dialect"] = {"delimiter": fields.get("delimiter", ","), "decimalSeparator": "."} + + return resource + + +def generate_oemetadata_yaml_from_datapackage( + directory: Union[str, Path], + output_yaml: Union[str, Path], + dataset_metadata: dict, + filter_opts: FileFilterOptions, +) -> None: + """ + Generate an OEMetadata YAML configuration file based on files in a directory or zipped directory. + + Parameters + ---------- + directory: Union[str, Path] + Path to the directory or zip file containing data files. + output_yaml: Union[str, Path] + Path to the output YAML file. + dataset_metadata: dict + Metadata for the dataset, including name, title, description, and ID. + filter_opts: FileFilterOptions + Filtering options for excluding files by extension, pattern, or hidden state. + """ + temp_dir = None + directory = Path(directory) + if zipfile.is_zipfile(directory): + temp_dir = Path("temp_extracted") + files = read_zipfile(directory, temp_dir, filter_opts) + files = read_directory(temp_dir, filter_opts) # Apply filtering after extraction + else: + files = read_directory(directory, filter_opts) + + resources = [] + for file in files: + resource_meta = infer_file_metadata(file) + + resources.append(resource_meta) + + yaml_structure = { + "dataset": dataset_metadata, + "template": { # TODO @jh-RLI: This section must be defined by user # noqa: TD003 + "context": { + "title": "Your Project Title", + "homepage": "https://yourhomepage.org", + "contact": "contact@yourproject.org", + }, + }, + "resources": resources, + } + + with open(output_yaml, "w", encoding="utf-8") as yaml_file: # noqa: PTH123 + yaml.dump(yaml_structure, yaml_file, sort_keys=False, allow_unicode=True) + + if temp_dir: + import shutil + + shutil.rmtree(temp_dir) + + print(f"YAML configuration generated at: {output_yaml}") # noqa: T201 + + +# Example usage +if __name__ == "__main__": + dataset_metadata_example = { + "name": "example_dataset", + "title": "Example Dataset", + "description": "This dataset was autogenerated from directory content.", + "@id": "https://example.org/dataset/example_dataset", + } + + generate_oemetadata_yaml_from_datapackage( + directory="/home/jh/projekte/SLE/postprocessed/", + output_yaml="generated_metadata.yaml", + dataset_metadata=dataset_metadata_example, + filter_opts=FileFilterOptions( + exclude_patterns=[".snake*"], + exclude_hidden=True, + ), + ) From 7ad24dc22408440ea3aeacf358fcd3fd8a055048 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 15:03:48 +0200 Subject: [PATCH 07/37] #126: make linter happy --- src/omi/creation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py index 3718e54c..97f1644d 100644 --- a/src/omi/creation/utils.py +++ b/src/omi/creation/utils.py @@ -29,7 +29,7 @@ def load_yaml_metadata(file_path: Union[str, Path]) -> tuple[str, dict, list[dic - resources: A list of resources with the template applied. 
         - template: The raw template used for resources.
     """
-    with Path.open(file_path, encoding="utf-8") as file:
+    with Path(file_path).open(encoding="utf-8") as file:
         data = yaml.safe_load(file)
 
     version = data.get("version", "OEMetadata-2.0.4")

From 6a51fbce0c209abaa7129d6ef1f4cf940c013aa1 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Tue, 29 Jul 2025 15:08:08 +0200
Subject: [PATCH 08/37] #126: add entry point for metadata creation with function to create JSON oemetadata from yaml file

---
 src/omi/create.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 src/omi/create.py

diff --git a/src/omi/create.py b/src/omi/create.py
new file mode 100644
index 00000000..2b9a4bd1
--- /dev/null
+++ b/src/omi/create.py
@@ -0,0 +1,29 @@
+"""Entry point for metadata creation."""
+
+import json
+from pathlib import Path
+from typing import Union
+
+from omi.creation.creator import OEMetadataCreator
+from omi.creation.utils import load_yaml_metadata
+
+
+def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None:
+    """
+    Generate OEMetadata from a YAML file and write it to an output file.
+
+    Parameters
+    ----------
+    yaml_file: Union[str, Path]
+        Path to the input YAML file containing dataset and resources.
+    output_file: Union[str, Path]
+        Path to the output file where the generated OEMetadata JSON will be saved.
+    """
+    version, dataset, resources = load_yaml_metadata(yaml_file)
+    creator = OEMetadataCreator(oem_version=version)
+    metadata = creator.generate_metadata(dataset, resources)
+
+    with Path(output_file).open("w", encoding="utf-8") as f:
+        json.dump(metadata, f, indent=2)
+
+    print(f"OEMetadata written to {output_file}")  # noqa: T201

From ac555f63e472d335f09d137365c0d52f0cc9f0e5 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Tue, 29 Jul 2025 15:11:41 +0200
Subject: [PATCH 09/37] #126: add cli function to create metadata json from yaml file

---
 src/omi/cli.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/omi/cli.py b/src/omi/cli.py
index 6b4d0aac..2a2093a5 100644
--- a/src/omi/cli.py
+++ b/src/omi/cli.py
@@ -15,8 +15,15 @@ Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration
 """
+import json
+from pathlib import Path
+from typing import Union
+
 import click
 
+from omi.creation.creator import OEMetadataCreator
+from omi.creation.utils import load_yaml_metadata
+
 
 @click.group()
 def grp() -> None:
@@ -29,3 +36,27 @@ def grp() -> None:
 def main() -> None:
     """Start click application."""
     cli()
+
+
+@click.command()
+@click.argument("yaml_file")
+@click.argument("output_file")
+def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None:
+    """
+    Generate OEMetadata from a YAML file and write it to an output file.
+
+    Parameters
+    ----------
+    yaml_file: Union[str, Path]
+        Path to the input YAML file containing dataset and resources.
+    output_file: Union[str, Path]
+        Path to the output file where the generated OEMetadata JSON will be saved.
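+
+    Example (hypothetical invocation; this patch defines the command but does
+    not yet register it on the click group):
+
+        $ omi from_yaml metadata.yaml metadata.json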
+ """ + version, dataset, resources = load_yaml_metadata(yaml_file) + generator = OEMetadataCreator() + metadata = generator.generate_metadata(dataset, resources) + + with Path(output_file).open("w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + print(f"OEMetadata written to {output_file}") # noqa: T201 From 107ab3324469f3bca9ee0348e83f66ac0776919f Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Tue, 29 Jul 2025 15:12:42 +0200 Subject: [PATCH 10/37] #126: make sure when inspecting data resources to infer the fields metadata into oemetadata format instead of plain frictionless --- src/omi/inspection.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/omi/inspection.py b/src/omi/inspection.py index f7b4dd52..e7b2b526 100644 --- a/src/omi/inspection.py +++ b/src/omi/inspection.py @@ -1,6 +1,7 @@ """Module to inspect data and create metadata from it.""" from collections.abc import Callable +from copy import deepcopy from typing import Any from frictionless import Detector, Dialect, Resource @@ -121,7 +122,9 @@ def convert_field(field: dict[str, str]) -> dict[str, str]: return {"name": field["name"], "type": f"array {type_mapping[item_type]}"} # All arrays are empty - so no further subtype can be detected return {"name": field["name"], "type": "array"} - return field + oem_field = deepcopy(metadata["resources"][0]["schema"]["fields"][0]) + oem_field.update(field) + return oem_field rows = resource.read_rows() fields = [convert_field(field) for field in fields] From 2191eac8b470021078e5aa93a13a0563d90f9c3c Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:00:01 +0100 Subject: [PATCH 11/37] #126: Add documentation for the creation module --- src/omi/creation/README.md | 471 +++++++++++++++++++++++++++++++++++++ 1 file changed, 471 insertions(+) create mode 100644 src/omi/creation/README.md diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md new file mode 100644 index 00000000..274283ff --- /dev/null +++ b/src/omi/creation/README.md @@ -0,0 +1,471 @@ +# OMI OEMetadata Assembly Guide + +This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, testing, and common pitfalls. You can drop this as a single `.md` file in your repo (e.g. `docs/oemetadata-assembly.md`) or split into multiple files later. + +--- + +## Table of Contents + +1. [Overview](#overview) +2. [Concepts & Data Flow](#concepts--data-flow) +3. [Repository Layout](#repository-layout) +4. [YAML File Formats](#yaml-file-formats) + + * [Dataset YAML](#dataset-yaml) + * [Template YAML (optional)](#template-yaml-optional) + * [Resource YAML](#resource-yaml) + * [Index YAML (optional)](#index-yaml-optional) +5. [Templating Rules](#templating-rules) +6. [Discovery vs. Index Mapping](#discovery-vs-index-mapping) +7. [Programmatic Usage](#programmatic-usage) + + * [Minimal Usage](#minimal-usage) + * [With Index Mapping](#with-index-mapping) + * [Manual Loading (No Discovery)](#manual-loading-no-discovery) +8. [Airflow Integration Example](#airflow-integration-example) +9. [Testing](#testing) +10. [Validation & Error Handling](#validation--error-handling) +11. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) +12. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) +13. [Design Notes & Extensibility](#design-notes--extensibility) +14. 
[FAQ](#faq) + +--- + +## Overview + +* **Goal:** Author OEMetadata as **YAML** (dataset + resources), keep it **DRY** via **templates**, assemble into a single **JSON** metadata document, and **validate** it with the official schema. +* **Core ideas:** + + * Authors maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. + * OMI assembles and validates metadata into a final OEMetadata JSON. + * Works well in pipelines (e.g., Airflow) and in regular Python. + +--- + +## Concepts & Data Flow + +1. **Authoring:** + + * `datasets/.dataset.yaml` + * `datasets/.template.yaml` *(optional)* + * `resources//*.resource.yaml` + +2. **Assembly:** + + * OMI **loads** dataset, template, and resource YAML files. + * OMI **applies the template** to each resource (deep merge; resource overrides template). + * OMI **generates and validates** OEMetadata JSON via `OEMetadataCreator`. + +3. **Storage:** + + * You decide where to store: file, DB, API, etc. (OMI returns a Python `dict`). + +--- + +## Repository Layout + +``` +metadata/ + datasets/ + .dataset.yaml + .template.yaml # optional + resources/ + / + .resource.yaml + .resource.yaml + metadata_index.yaml # optional explicit mapping +``` + +* You can use **convention** (the directory / filename structure above) or an **index** file for explicit mapping. + +--- + +## YAML File Formats + +### Dataset YAML + +```yaml +# metadata/datasets/powerplants.dataset.yaml +version: "OEMetadata-2.0.4" # optional (default: OEMetadata-2.0.4) +dataset: + name: oep_oemetadata + title: OEP OEMetadata + description: A dataset for the OEMetadata examples. + "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ +``` + +> Backwards compatibility: if you prefer, you may put dataset fields directly at the top level; OMI will treat that as `dataset: {...}`. + +--- + +### Template YAML (optional) + +Applied to **every** resource (unless the resource overrides specific fields). Keeps your YAML DRY. + +```yaml +# metadata/datasets/powerplants.template.yaml +licenses: + - name: ODbL-1.0 + title: Open Data Commons Open Database License 1.0 + path: https://opendatacommons.org/licenses/odbl/1-0/index.html + instruction: > + You are free to share and change, but you must attribute, and + share derivations under the same license. See https://tldrlegal.com/license/odc-open-database-license-(odbl) + for further information. + attribution: © Reiner Lemoine Institut + copyrightStatement: https://github.com/OpenEnergyPlatform/oemetadata/blob/production/LICENSE.txt + +context: + title: NFDI4Energy + homepage: https://nfdi4energy.uol.de/ + documentation: https://nfdi4energy.uol.de/sites/about_us/ + sourceCode: https://github.com/NFDI4Energy + publisher: Open Energy Platform (OEP) + publisherLogo: https://github.com/OpenEnergyPlatform/organisation/blob/production/logo/OpenEnergyFamily_Logo_OpenEnergyPlatform.svg + contact: contact@example.com + fundingAgency: " Deutsche Forschungsgemeinschaft (DFG)" + fundingAgencyLogo: https://upload.wikimedia.org/wikipedia/commons/8/86/DFG-logo-blau.svg + grantNo: "501865131" + +topics: [model_draft] +languages: [en-GB, de-DE] +keywords: [example, ODbL-1.0, NFDI4Energy] +``` + +--- + +### Resource YAML + +```yaml +# metadata/resources/powerplants/oemetadata_table_template.resource.yaml +name: oemetadata_table_template +type: table +title: OEMetadata Table Template +description: Example table used to illustrate the OEMetadata structure and features. 
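
# Any key omitted here (licenses, topics, languages, ...) can instead be
# inherited from the dataset template - see "Templating Rules" below.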
+ +# Resource-specific attributes +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template +scheme: http +format: CSV +encoding: UTF-8 + +dialect: + decimalSeparator: "." + csv: + delimiter: ";" + +schema: + fields: + - name: id + type: integer + description: Unique identifier + nullable: false + # ... more fields ... + primaryKey: [id] + foreignKeys: + - fields: [id, version] + reference: + resource: model_draft.oep_oemetadata_table_example_version + fields: [id, version] + +"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv + +sources: + - title: IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report + authors: [Hoesung Lee, José Romero, The Core Writing Team] + publicationYear: "2023" + path: https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf + sourceLicenses: + - name: CC-BY-4.0 + title: Creative Commons Attribution 4.0 International + path: https://creativecommons.org/licenses/by/4.0/legalcode + instruction: > + You are free to share and change, but you must attribute. + See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. + attribution: © Intergovernmental Panel on Climate Change 2023 + copyrightStatement: https://www.ipcc.ch/copyright/ + +# Other metadata like subject, publicationDate, spatial, temporal, contributors, review... +``` + +A second resource: + +```yaml +# metadata/resources/powerplants/data_2.resource.yaml +name: data_2 +type: table +title: My Second Resource + +path: reGon/metadata/data_2.csv +scheme: file +format: csv +mediatype: text/csv +encoding: utf-8 + +schema: + fields: + - name: id + type: integer + nullable: true + - name: i + type: integer + nullable: true + - name: o + type: string + nullable: true + primaryKey: [id] + +``` + +--- + +### Index YAML (optional) + +Use this if you want explicit mappings instead of convention-based discovery. + +```yaml +# metadata/metadata_index.yaml +datasets: + powerplants: + dataset: datasets/powerplants.dataset.yaml + template: datasets/powerplants.template.yaml + resources: + - resources/powerplants/oemetadata_table_template.resource.yaml + - resources/powerplants/data_2.resource.yaml +``` + +--- + +## Templating Rules + +* **Deep merge** for dictionaries (e.g., `context`): + + * Resource **overrides** template on conflicts. + * Missing nested keys are **filled** from template. + +* **Lists**: + + * **Concatenate** (resource first, then template-only items) for: + `keywords`, `topics`, `languages`. + * For other lists (e.g., `licenses`, `contributors`), **resource wins** (no concat). + * You can change this behavior in code by adding keys to `DEFAULT_CONCAT_LIST_KEYS`. + +* **Scalars**: resource value **wins**. + +This keeps YAML DRY while allowing fine-grained per-resource overrides. + +--- + +## Discovery vs. Index Mapping + +* **Discovery (convention):** + `datasets/.dataset.yaml`, `datasets/.template.yaml`, and `resources//*.resource.yaml` + → No index file needed. + +* **Index (explicit mapping):** + Use `metadata_index.yaml` to map dataset/template/resources by path, relative to the metadata base directory. + +--- + +## Programmatic Usage + +OMI exposes high-level assembly and creation utilities. 
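
The assembler entry point used below has, as of this PR, roughly the signature sketched here (a sketch only; parameter names are inferred from the calls in this guide and in the tests):

```python
from pathlib import Path
from typing import Optional, Union


def assemble_metadata_dict(
    base_dir: Union[str, Path],  # root of the metadata/ tree
    dataset_id: str,  # e.g. "powerplants"
    index_file: Optional[Union[str, Path]] = None,  # explicit mapping, else discovery
) -> dict:
    """Load dataset/template/resources, merge, validate, and return OEMetadata."""
    ...
```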
+ +### Minimal Usage + +```python +from omi.creation.assembly import assemble_metadata_dict + +metadata = assemble_metadata_dict( + base_dir="./metadata", + dataset_id="powerplants", +) # returns a Python dict with valid OEMetadata +``` + +### With Index Mapping + +```python +from omi.creation.assembly import assemble_metadata_dict + +metadata = assemble_metadata_dict( + base_dir="./metadata", + dataset_id="powerplants", + index_file="./metadata/metadata_index.yaml", +) +``` + +### Manual Loading (No Discovery) + +```python +from pathlib import Path +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import load_yaml, apply_template_to_resources + +dataset = load_yaml(Path("./metadata/datasets/powerplants.dataset.yaml")).get("dataset", {}) +template = load_yaml(Path("./metadata/datasets/powerplants.template.yaml")) +resources = [ + load_yaml(Path("./metadata/resources/powerplants/oemetadata_table_template.resource.yaml")), + load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")), +] + +resources = apply_template_to_resources(resources, template) +creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") +metadata = creator.generate_metadata(dataset, resources) +``` + +> The `OEMetadataCreator` injects `@context` and `metaMetadata` and calls validation. + +--- + +## Airflow Integration Example + +```python +# In a DAG task (PythonOperator callable) +from omi.creation.assembly import assemble_metadata_dict + +def build_oemetadata_for_powerplants(**context): + md = assemble_metadata_dict( + base_dir="/opt/airflow/dags/metadata", # your metadata module + dataset_id="powerplants", + index_file="/opt/airflow/dags/metadata/metadata_index.yaml", # or None for discovery + ) + # Store or pass downstream: write to file/DB/API, or XCom + context["ti"].xcom_push(key="oemetadata", value=md) +``` + +--- + +## Testing + +You can unit test assembly logic without depending on the real spec/validator by **monkeypatching** the creator. 
+ +**Example (`tests/test_assembly.py`):** + +```python +from pathlib import Path +import yaml +import pytest +from omi.creation.assembly import assemble_metadata_dict + +def write_yaml(p: Path, data) -> None: + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") + +class FakeCreator: + def __init__(self, oem_version: str = "OEMetadata-2.0.4"): + self.oem_version = oem_version + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + return {"@context": "...", **dataset, "resources": resources, "metaMetadata": {"metadataVersion": self.oem_version}} + +def test_assemble(tmp_path, monkeypatch): + write_yaml(tmp_path / "datasets" / "demo.dataset.yaml", {"dataset": {"name": "demo", "title": "Demo"}}) + write_yaml(tmp_path / "datasets" / "demo.template.yaml", {"keywords": ["k1"], "context": {"contact": "a@b"}}) + write_yaml(tmp_path / "resources" / "demo" / "a.resource.yaml", {"name": "a", "title": "A", "keywords": ["ak"]}) + write_yaml(tmp_path / "resources" / "demo" / "b.resource.yaml", {"name": "b", "title": "B", "context": {"publisher": "X"}}) + + monkeypatch.setattr("omi.creation.assembly.OEMetadataCreator", FakeCreator) + md = assemble_metadata_dict(tmp_path, "demo") + + assert md["name"] == "demo" + a, b = md["resources"] + assert a["keywords"] == ["ak", "k1"] # concat + assert b["context"]["contact"] == "a@b" # filled from template + assert b["context"]["publisher"] == "X" # resource wins +``` + +Run with: + +```bash +pytest -q +``` + +--- + +## Validation & Error Handling + +* `OEMetadataCreator.generate_metadata()` runs `validate_metadata(metadata, check_license=False)`. +* If validation fails, catch and inspect the exception from `omi.validation`: + +```python +from omi.validation import ValidationError + +try: + metadata = assemble_metadata_dict("./metadata", "powerplants") +except ValidationError as e: + print("Validation failed:", e) +``` + +**Common causes:** + +* Missing **required** keys (e.g., field missing `"nullable"`). +* Incorrect data types (e.g., non-URI in a field that requires `format: uri`). +* Invalid list shapes (`primaryKey`, `foreignKeys`, etc.). + +--- + +## Auto-Generation From Directory (Optional Onboarding) + +You can auto-generate a starter YAML for a dataset by scanning a directory or zip: + +* Infer resource entries based on file names & extensions. +* For CSVs, call your CSV inference to produce initial `schema.fields`. +* Write a `dataset` YAML + per-file `resource` YAMLs as a starting point. + +> Keep this as an onboarding tool; human review is still recommended. + +--- + +## Filtering Irrelevant Files (Optional) + +If auto-generating from a directory, filter out noise: + +```python +def read_directory(directory, exclude_extensions=None, exclude_patterns=None, exclude_hidden=True): + # ... + # exclude_extensions=['.log','.tmp','.bak','.DS_Store','.md'] + # exclude_patterns=['*_backup.*','*~','*.old','*.ignore'] + return files +``` + +Helps avoid including backups, temp files, editor artifacts, etc. + +--- + +## Design Notes & Extensibility + +* **Separation of concerns**: + + * `utils` covers loading YAML, discovery, merging/templating. + * `assembly` orchestrates the load → merge → create flow. + * `creator` handles schema-based assembly and validation. +* **Storage-agnostic**: assembly returns a dict; you decide where to store it (file/DB/API). +* **Configurable merge**: change list concat behavior by editing `DEFAULT_CONCAT_LIST_KEYS`. 
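
A minimal sketch of opting another list key into concatenation; it assumes `deep_apply_template_to_resource` accepts a `concat_list_keys` argument, which is the behavior the utils tests exercise:

```python
from omi.creation.utils import DEFAULT_CONCAT_LIST_KEYS, deep_apply_template_to_resource

resource = {"name": "r1", "licenses": [{"name": "R1-license"}]}
template = {"licenses": [{"name": "L1"}], "keywords": ["k1"]}

# Opt `licenses` into concatenation (resource items first, then template-only items)
# instead of letting the resource-provided list win outright.
custom_keys = set(DEFAULT_CONCAT_LIST_KEYS) | {"licenses"}
merged = deep_apply_template_to_resource(resource, template, concat_list_keys=custom_keys)
# merged["licenses"] == [{"name": "R1-license"}, {"name": "L1"}]
# merged["keywords"] == ["k1"]  (inherited, since the resource defines none)
```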
+ +--- + +## FAQ + +**Q:** Can a resource override template-provided `licenses`? +**A:** Yes. By default, **resource wins** for lists except `keywords`, `topics`, `languages` (which concatenate). You can include `"licenses"` in `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. + +**Q:** Where does `@context` and `metaMetadata` come from? +**A:** `OEMetadataCreator` reads the official spec via `get_metadata_specification(oem_version)` and injects `@context` and a `metaMetadata` block, then validates the final result. + +**Q:** The output JSON shows `\u00a9` instead of `©`. +**A:** Use `ensure_ascii=False` when dumping JSON: + +```python +json.dump(metadata, f, indent=2, ensure_ascii=False) +``` + +**Q:** I see validation errors about fields missing `nullable`. +**A:** Ensure each `schema.fields[]` has **`name`**, **`type`**, and **`nullable`** at minimum. If you auto-generate fields, set `nullable: false` as a safe default unless you detect nulls. + +**Q:** How do I run without a template YAML? +**A:** Just omit `datasets/.template.yaml`; assembly works without it. + +--- + +> If you want this split across multiple docs, consider: +> `docs/assembly-overview.md`, `docs/yaml-formats.md`, `docs/templating.md`, `docs/integration-airflow.md`, `docs/testing.md`, and `docs/troubleshooting.md`. From 79da3a2bf5ed249dd6c588d51127a4e92ddae33a Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:37:40 +0100 Subject: [PATCH 12/37] #126: Add test for yaml based oemetadata layout assembly module --- tests/test_assembly.py | 210 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 tests/test_assembly.py diff --git a/tests/test_assembly.py b/tests/test_assembly.py new file mode 100644 index 00000000..dce7efb9 --- /dev/null +++ b/tests/test_assembly.py @@ -0,0 +1,210 @@ +""" +Assembly integration tests for split-files OEMetadata authoring. + +This module exercises the public assembler entry point by building a small +on-disk YAML tree, applying a template, and verifying the merged OEMetadata. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import yaml + +# We test the public assembler entry point +from omi.creation.assembler import assemble_metadata_dict + +if TYPE_CHECKING: + from pathlib import Path + + import pytest + + +# ---------- helpers ---------- + + +def write_yaml(p: Path, data: object) -> None: + """Write `data` (any YAML-serializable object) to path `p`.""" + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text( + yaml.safe_dump(data, sort_keys=False, allow_unicode=True), + encoding="utf-8", + ) + + +class FakeCreator: + """ + Minimal stand-in for OEMetadataCreator used via monkeypatching. + + It mimics `generate_metadata(dataset, resources)` and skips validation. + The constructor accepts the OEMetadata version to embed in metaMetadata. 
+ """ + + def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: + """Initialize the fake creator with a specific OEMetadata version.""" + self.oem_version = oem_version + + def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: + """Return a small OEMetadata-like dict sufficient for assertions.""" + return { + "@context": "https://example.org/context.json", + **dataset, + "resources": resources, + "metaMetadata": {"metadataVersion": self.oem_version}, + } + + +# ---------- tests ---------- + + +def test_assemble_by_convention_with_template_merge( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """ + Assemble via convention and verify deep merge semantics. + + Asserts: + - dataset is loaded from datasets/{id}.dataset.yaml + - template is applied deeply (resource wins on conflicts) + - keywords are concatenated (resource first, then template-only) + - licenses remain resource-provided if present (no concat by default) + - creator is invoked and returns a full dict + """ + # dataset + write_yaml( + tmp_path / "datasets" / "demo.dataset.yaml", + { + "version": "OEMetadata-2.0.4", + "dataset": {"name": "demo", "title": "Demo", "description": "Demo dataset"}, + }, + ) + + # template + write_yaml( + tmp_path / "datasets" / "demo.template.yaml", + { + "context": {"publisher": "OEP", "contact": "a@b"}, + "keywords": ["k1"], + "topics": ["model_draft"], + "languages": ["en-GB"], + "licenses": [{"name": "L1"}], # applies only if resource doesn't provide licenses + }, + ) + + # resources + write_yaml( + tmp_path / "resources" / "demo" / "r1.resource.yaml", + { + "name": "r1", + "title": "R1", + # overrides nested key, should still inherit contact from template + "context": {"publisher": "Other"}, + # resource provides its own licenses -> should NOT be concatenated by default + "licenses": [{"name": "R1-license"}], + # own keywords -> should concat with template keywords + "keywords": ["r1k"], + }, + ) + write_yaml( + tmp_path / "resources" / "demo" / "r2.resource.yaml", + { + "name": "r2", + "title": "R2", + # no licenses provided -> should get template licenses + }, + ) + + # Patch the creator used inside assembler to our Fake + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + md = assemble_metadata_dict(tmp_path, "demo") + + # dataset propagated + assert md["name"] == "demo" + assert md["title"] == "Demo" + + # resources present + assert isinstance(md["resources"], list) + assert len(md["resources"]) == 2 + r1, r2 = md["resources"] + + # deep merge for context: resource wins on conflicts, template fills missing keys + assert r1["context"]["publisher"] == "Other" + assert r1["context"]["contact"] == "a@b" + + # keywords/topics/languages concatenate (resource first, then template-only) + assert r1["keywords"] == ["r1k", "k1"] + # topics/languages inherited if missing + assert r1["topics"] == ["model_draft"] + assert r1["languages"] == ["en-GB"] + + # licenses: resource list wins (no concat by default) + assert r1["licenses"] == [{"name": "R1-license"}] + + # r2 inherits licenses from template (since none provided) + assert r2["licenses"] == [{"name": "L1"}] + # r2 inherits keywords/topics/languages from template + assert r2["keywords"] == ["k1"] + assert r2["topics"] == ["model_draft"] + assert r2["languages"] == ["en-GB"] + + # metaMetadata present from FakeCreator (assembler passes through the version) + assert md["metaMetadata"]["metadataVersion"] == "OEMetadata-2.0.4" + + +def test_assemble_with_index_mapping( + 
tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble using an explicit metadata_index.yaml mapping.""" + base = tmp_path + + # index mapping + write_yaml( + base / "metadata_index.yaml", + { + "datasets": { + "pp": { + "dataset": "datasets/powerplants.dataset.yaml", + "template": "datasets/powerplants.template.yaml", + "resources": [ + "resources/powerplants/a.resource.yaml", + "resources/powerplants/b.resource.yaml", + ], + }, + }, + }, + ) + + write_yaml( + base / "datasets" / "powerplants.dataset.yaml", + {"dataset": {"name": "pp", "title": "PP"}}, + ) + write_yaml( + base / "datasets" / "powerplants.template.yaml", + {"keywords": ["t-k"]}, + ) + write_yaml( + base / "resources" / "powerplants" / "a.resource.yaml", + {"name": "a", "title": "A", "keywords": ["a-k"]}, + ) + write_yaml( + base / "resources" / "powerplants" / "b.resource.yaml", + {"name": "b", "title": "B"}, + ) + + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + # Use the index explicitly + md = assemble_metadata_dict(base, "pp", index_file=base / "metadata_index.yaml") + + assert md["name"] == "pp" + names = [r["name"] for r in md["resources"]] + assert names == ["a", "b"] + + # keywords concatenated for 'a', inherited for 'b' + r_a = md["resources"][0] + r_b = md["resources"][1] + assert r_a["keywords"] == ["a-k", "t-k"] + assert r_b["keywords"] == ["t-k"] From a90288517cc15e93d6f64a2c2d12fd7add5fd31d Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:40:49 +0100 Subject: [PATCH 13/37] #126: Add test for yaml based oemetadata creation -> as dict or save as file --- tests/test_creation_utils.py | 202 +++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 tests/test_creation_utils.py diff --git a/tests/test_creation_utils.py b/tests/test_creation_utils.py new file mode 100644 index 00000000..de3c7a67 --- /dev/null +++ b/tests/test_creation_utils.py @@ -0,0 +1,202 @@ +"""Unit tests for the OMI creation utils (templating, IO, discovery).""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +import yaml + +# Functions under test +from omi.creation.utils import ( + DEFAULT_CONCAT_LIST_KEYS, + _merge_lists, + apply_template_to_resources, + deep_apply_template_to_resource, + discover_dataset_ids, + discover_dataset_ids_from_index, + discover_paths, + load_parts, + load_yaml, + resolve_from_index, +) + +if TYPE_CHECKING: + from pathlib import Path + + +# ---------- helpers ---------- + + +def _write_yaml(p: Path, data: object) -> None: + """Write a YAML-serializable `data` object to `p`, creating parents.""" + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") + + +# ---------- tests: list merging + deep template ---------- + + +def test_merge_lists_deduplicates_and_respects_resource_first() -> None: + """`_merge_lists` keeps resource-first order and de-duplicates template items.""" + resource_list = ["a", "b"] + template_list = ["b", "c"] + merged = _merge_lists(template_list, resource_list, deduplicate=True) + assert merged == ["a", "b", "c"] + + +def test_deep_apply_template_to_resource_concat_for_keywords_topics_languages() -> None: + """Default concat keys (keywords/topics/languages) are concatenated; others are not.""" + resource = { + "name": "r", + "keywords": ["rk"], + "topics": ["rt"], + "languages": ["rl"], + "context": {"publisher": "R"}, + "list_no_concat": [1, 2], + } + template = { + 
"keywords": ["tk"], + "topics": ["tt"], + "languages": ["tl"], + "context": {"publisher": "T", "contact": "a@b"}, + "list_no_concat": [3, 4], + } + + out = deep_apply_template_to_resource(resource, template) + # concat lists for default concat keys + assert out["keywords"] == ["rk", "tk"] + assert out["topics"] == ["rt", "tt"] + assert out["languages"] == ["rl", "tl"] + # resource list wins for non-concat keys + assert out["list_no_concat"] == [1, 2] + # deep dict merge: resource wins on conflict, template fills missing + assert out["context"]["publisher"] == "R" + assert out["context"]["contact"] == "a@b" + + +def test_deep_apply_template_to_resource_custom_concat_keys() -> None: + """Custom concat set allows concatenating lists like `licenses`.""" + resource = {"licenses": [{"name": "R1"}]} + template = {"licenses": [{"name": "T1"}]} + # By default, 'licenses' is NOT concatenated + out_default = deep_apply_template_to_resource(resource, template) + assert out_default["licenses"] == [{"name": "R1"}] + + # If we opt-in, it concatenates (resource first, then template-only) + custom_keys = set(DEFAULT_CONCAT_LIST_KEYS) | {"licenses"} + out_custom = deep_apply_template_to_resource(resource, template, concat_list_keys=custom_keys) + assert out_custom["licenses"] == [{"name": "R1"}, {"name": "T1"}] + + +def test_apply_template_to_resources_applies_per_item() -> None: + """Template is applied to each resource; concat for `keywords` by default.""" + resources = [{"name": "a"}, {"name": "b", "keywords": ["bk"]}] + template = {"keywords": ["tk"]} + out = apply_template_to_resources(resources, template) + assert out[0]["keywords"] == ["tk"] # inherited from template + assert out[1]["keywords"] == ["bk", "tk"] # concatenated: resource first, then template-only + + +# ---------- tests: YAML IO + discovery ---------- + + +def test_load_yaml_reads_empty_as_empty_dict(tmp_path: Path) -> None: + """Empty YAML file is read as an empty dict.""" + p = tmp_path / "empty.yaml" + p.write_text("", encoding="utf-8") + data = load_yaml(p) + assert data == {} + + +def test_discover_paths_and_resolve_from_index(tmp_path: Path) -> None: + """Discovery by convention and resolution by index both return expected paths.""" + base = tmp_path + # convention files + ds = base / "datasets" / "powerplants.dataset.yaml" + tp = base / "datasets" / "powerplants.template.yaml" + rdir = base / "resources" / "powerplants" + r1 = rdir / "a.resource.yaml" + r2 = rdir / "b.resource.yaml" + + _write_yaml(ds, {"version": "OEMetadata-2.0.4", "dataset": {"name": "pp"}}) + _write_yaml(tp, {"keywords": ["k1"]}) + _write_yaml(r1, {"name": "a"}) + _write_yaml(r2, {"name": "b"}) + + dspath, tpath, rpaths = discover_paths(base, "powerplants") + assert dspath == ds + assert tpath == tp + assert rpaths == [r1, r2] + + # index mapping (deliberately flips resource order) + idx = base / "metadata_index.yaml" + _write_yaml( + idx, + { + "datasets": { + "powerplants": { + "dataset": "datasets/powerplants.dataset.yaml", + "template": "datasets/powerplants.template.yaml", + "resources": [ + "resources/powerplants/b.resource.yaml", + "resources/powerplants/a.resource.yaml", + ], + }, + }, + }, + ) + d2, t2, rs2 = resolve_from_index(base, "powerplants", idx) + assert d2 == ds + assert t2 == tp + assert rs2 == [base / "resources/powerplants/b.resource.yaml", base / "resources/powerplants/a.resource.yaml"] + + +def test_load_parts_returns_all_sections(tmp_path: Path) -> None: + """`load_parts` returns (version, dataset, resources, template) with expected 
contents.""" + base = tmp_path + ds = base / "datasets" / "households.dataset.yaml" + tp = base / "datasets" / "households.template.yaml" + rdir = base / "resources" / "households" + r1 = rdir / "hh1.resource.yaml" + + _write_yaml(ds, {"version": "OEMetadata-2.0.4", "dataset": {"name": "households", "title": "HH"}}) + _write_yaml(tp, {"context": {"publisher": "OEP"}}) + _write_yaml(r1, {"name": "hh1"}) + + version, dataset, resources, template = load_parts(base, "households") + assert version == "OEMetadata-2.0.4" + assert dataset == {"name": "households", "title": "HH"} + assert resources == [{"name": "hh1"}] + assert template == {"context": {"publisher": "OEP"}} + + +def test_load_parts_raises_when_dataset_missing(tmp_path: Path) -> None: + """`load_parts` raises FileNotFoundError if the dataset YAML is missing.""" + with pytest.raises(FileNotFoundError): + load_parts(tmp_path, "missing") + + +# ---------- tests: dataset id discovery ---------- + + +def test_discover_dataset_ids(tmp_path: Path) -> None: + """`discover_dataset_ids` finds dataset ids by scanning datasets/*.dataset.yaml.""" + _write_yaml(tmp_path / "datasets" / "a.dataset.yaml", {"dataset": {"name": "a"}}) + _write_yaml(tmp_path / "datasets" / "b.dataset.yaml", {"dataset": {"name": "b"}}) + ids = discover_dataset_ids(tmp_path) + assert ids == ["a", "b"] + + +def test_discover_dataset_ids_from_index(tmp_path: Path) -> None: + """`discover_dataset_ids_from_index` returns top-level 'datasets' keys in index YAML.""" + idx = tmp_path / "metadata_index.yaml" + _write_yaml(idx, {"datasets": {"x": {}, "y": {}}}) + ids = discover_dataset_ids_from_index(idx) + assert ids == ["x", "y"] + + +def test_discover_dataset_ids_from_index_missing_file(tmp_path: Path) -> None: + """Missing index file yields an empty list of dataset ids.""" + ids = discover_dataset_ids_from_index(tmp_path / "nope.yaml") + assert ids == [] From 8f5fc0c363643a7a4c3aa99cb4733f0da67835a4 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:42:20 +0100 Subject: [PATCH 14/37] #126: Add todo to extend inspection tests --- tests/test_inspection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_inspection.py b/tests/test_inspection.py index 8cf504ba..27d1afb1 100644 --- a/tests/test_inspection.py +++ b/tests/test_inspection.py @@ -34,3 +34,9 @@ def test_inspection(): assert metadata["resources"][0]["schema"]["fields"][6]["type"] == "object" assert metadata["resources"][0]["schema"]["fields"][7]["type"] == "date" assert metadata["resources"][0]["schema"]["fields"][8]["type"] == "boolean" + + +# TODO @jh-RLI: Add test for special cases in csv as e.g. 
this data will cause issues # noqa: TD003 +# cat objective.csv +# ;0 +# objective;97356714.15339188 From 1fdd91e8853a3b9276ef1c4a7c5e71b2e107d704 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:43:46 +0100 Subject: [PATCH 15/37] #126: Add test for yaml based oemetadata creation -> as dict or save as file --- tests/test_metadata_creation.py | 156 +++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 31 deletions(-) diff --git a/tests/test_metadata_creation.py b/tests/test_metadata_creation.py index 2cf6a7d5..0387906d 100644 --- a/tests/test_metadata_creation.py +++ b/tests/test_metadata_creation.py @@ -1,42 +1,108 @@ -"""Test suite for the OEMetadataCreator class in the OMI creation module.""" +"""Test suite for the OEMetadataCreator class in the OMI creation module (split-files layout).""" -from pathlib import Path +from __future__ import annotations + +import json +from typing import TYPE_CHECKING import pytest import yaml from omi.creation.creator import OEMetadataCreator -from omi.creation.utils import load_yaml_metadata +from omi.creation.utils import apply_template_to_resources, load_parts + +if TYPE_CHECKING: + from pathlib import Path @pytest.fixture() -def sample_yaml(tmp_path: Path) -> Path: - """Fixture to create a sample YAML file for testing.""" - content = { - "version": "OEMetadata-2.0.4", - "dataset": { - "name": "test_dataset", - "title": "Test Dataset", - "description": "For unit testing", - "@id": "https://example.org/test_dataset", - }, - "template": {"languages": ["en-GB"]}, - "resources": [{"name": "test_resource", "title": "Test Resource", "format": "CSV", "type": "table"}], - } - - file_path = tmp_path / "metadata.yaml" - with Path.open(file_path, "w", encoding="utf-8") as f: - yaml.dump(content, f, sort_keys=False) - - return file_path - - -def test_generate_oemetadata(sample_yaml: Path) -> None: - """Test the generation of OEMetadata from a sample YAML file.""" - version, dataset, resources = load_yaml_metadata(sample_yaml) - creator = OEMetadataCreator() - - result = creator.generate_metadata(dataset, resources) +def sample_tree(tmp_path: Path) -> tuple[Path, str]: + """ + Create a split-files metadata tree. 
+ + metadata/ + datasets/ + demo.dataset.yaml + demo.template.yaml + resources/ + demo/ + table.resource.yaml + """ + base = tmp_path / "metadata" + ds_dir = base / "datasets" + rs_dir = base / "resources" / "demo" + + ds_dir.mkdir(parents=True, exist_ok=True) + rs_dir.mkdir(parents=True, exist_ok=True) + + # dataset yaml + (ds_dir / "demo.dataset.yaml").write_text( + yaml.safe_dump( + { + "version": "OEMetadata-2.0", + "dataset": { + "name": "test_dataset", + "title": "Test Dataset", + "description": "For unit testing", + "@id": "https://example.org/test_dataset", + }, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + # template yaml (applied to every resource) + (ds_dir / "demo.template.yaml").write_text( + yaml.safe_dump( + { + "languages": ["en-GB"], + "keywords": ["example"], + "context": {"publisher": "OEP", "contact": "contact@example.org"}, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + # one resource yaml + (rs_dir / "table.resource.yaml").write_text( + yaml.safe_dump( + { + "name": "test_resource", + "title": "Test Resource", + "type": "table", + "format": "CSV", + "schema": { + "fields": [ + {"name": "id", "type": "integer", "nullable": False}, + ], + "primaryKey": ["id"], + }, + }, + sort_keys=False, + allow_unicode=True, + ), + encoding="utf-8", + ) + + return base, "demo" + + +def test_generate_oemetadata_from_split_files(sample_tree: tuple[Path, str]) -> None: + """End-to-end: load parts, apply template, generate metadata via creator.""" + base_dir, dataset_id = sample_tree + + # Load version/dataset/resources/template from split-files layout + version, dataset, resources, template = load_parts(base_dir, dataset_id) + + # Deep-apply template to resources (dicts merge, lists concat for keywords/topics/languages) + merged_resources = apply_template_to_resources(resources, template) + + creator = OEMetadataCreator(oem_version=version) + result = creator.generate_metadata(dataset, merged_resources) # Basic assertions assert result["@context"].startswith("https://") @@ -44,4 +110,32 @@ def test_generate_oemetadata(sample_yaml: Path) -> None: assert "resources" in result assert isinstance(result["resources"], list) assert result["resources"][0]["name"] == "test_resource" - assert "languages" in result["resources"][0] + + # Template has been applied deeply (languages concatenated / context merged) + r0 = result["resources"][0] + assert r0["languages"] == ["en-GB"] + assert r0["keywords"] == ["example"] + assert r0["context"]["publisher"] == "OEP" + assert r0["context"]["contact"] == "contact@example.org" + + # Schema minimally intact + assert r0["schema"]["primaryKey"] == ["id"] + assert r0["schema"]["fields"][0]["name"] == "id" + assert r0["schema"]["fields"][0]["nullable"] is False + + +def test_creator_save_writes_json(sample_tree: tuple[Path, str]) -> None: + """Ensure creator.save writes JSON and preserves unicode.""" + base_dir, dataset_id = sample_tree + version, dataset, resources, template = load_parts(base_dir, dataset_id) + merged_resources = apply_template_to_resources(resources, template) + + out = base_dir / "out.json" + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, out, ensure_ascii=False, indent=2) + + assert out.exists() + data = json.loads(out.read_text(encoding="utf-8")) + assert data["name"] == "test_dataset" + # unicode preserved (no \u escapes because ensure_ascii=False) + assert "©" not in out.read_text(encoding="utf-8") # sanity check; none present here 
by default From 2c180556b3fc98bde2f6451d2084b47f745f0ae1 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:48:32 +0100 Subject: [PATCH 16/37] #126: Move all utility functionality in creation module here. - Add utils to merge dataset, resource and template parts - Add io utils - Add utils to properly read version, dataset, template, resource from yaml directory - Add util to read info for many datasets from yaml directory --- src/omi/creation/utils.py | 249 +++++++++++++++++++++++++++++++++----- 1 file changed, 218 insertions(+), 31 deletions(-) diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py index 97f1644d..a0e74a6a 100644 --- a/src/omi/creation/utils.py +++ b/src/omi/creation/utils.py @@ -1,45 +1,232 @@ -"""Utility functions for OMI creation module.""" +""" +Utility functions for the OMI creation module. +This module provides deep-merge templating, YAML IO, and discovery helpers +for assembling OEMetadata from split YAML files (dataset/template/resources). +""" + +from __future__ import annotations + +from copy import deepcopy from pathlib import Path -from typing import Union +from typing import TYPE_CHECKING, Optional, Union import yaml +if TYPE_CHECKING: + from collections.abc import Hashable + +# --- deep merge helpers ------------------------------------------------------- + +# List keys we concatenate (resource + template) instead of replacing. +DEFAULT_CONCAT_LIST_KEYS = {"keywords", "topics", "languages"} + + +def _hashable_key(x: object) -> Hashable | tuple: + """ + Return a hashable representation of `x` for deduplication purposes. + + - dict -> sorted tuple of (key, value) pairs + - list -> tuple(list) + - other -> value itself + """ + if isinstance(x, dict): + return tuple(sorted(x.items())) + if isinstance(x, list): + return tuple(x) + return x # type: ignore[return-value] + -def load_yaml_metadata(file_path: Union[str, Path]) -> tuple[str, dict, list[dict], dict]: +def _merge_lists( + template_list: list[object], + resource_list: list[object], + *, + deduplicate: bool = True, +) -> list[object]: """ - Load YAML file containing version, dataset, template, and resource metadata. + Concatenate lists with resource-first priority. - This function reads a YAML file and extracts the version, dataset description, - resources, and template. It applies the template to each resource, merging any - specified fields. - Returns: version, dataset, list of resources with merged template, and raw template. + When `deduplicate` is True, only items that are not already present in + `resource_list` (by hashable representation) are appended from `template_list`. + """ + merged = list(resource_list) + if not template_list: + return merged - Parameters - ---------- - file_path: Union[str, Path] - Path to the YAML file. + if deduplicate: + existing = {_hashable_key(v) for v in merged} + for item in template_list: + k = _hashable_key(item) + if k not in existing: + merged.append(item) + else: + merged.extend(template_list) + return merged + + +def deep_apply_template_to_resource( + resource: dict[str, object], + template: dict[str, object], + concat_list_keys: Union[tuple[str, ...], set[str]] = DEFAULT_CONCAT_LIST_KEYS, +) -> dict[str, object]: + """ + Apply a resource template using deep-merge semantics. - Returns - ------- - Tuple[str, Dict, List[Dict], Dict] - A tuple containing: - - version: The version of the metadata. - - dataset: The dataset description. - - resources: A list of resources with the template applied. 
- - template: The raw template used for resources. + Rules: + - Missing keys are copied from the template. + - Dicts are deep-merged (resource wins on conflicts). + - Lists are concatenated only for keys in `concat_list_keys`; otherwise, the + resource list is preserved as-is. + - Scalars: resource values win. """ - with Path(file_path).open(encoding="utf-8") as file: - data = yaml.safe_load(file) + if not template: + return resource - version = data.get("version", "OEMetadata-2.0.4") - dataset = data.get("dataset", {}) - template = data.get("template", {}) - resources = data.get("resources", []) + result = deepcopy(resource) + for key, tval in template.items(): + if key not in result: + result[key] = deepcopy(tval) + continue + + rval = result[key] + if isinstance(rval, dict) and isinstance(tval, dict): + result[key] = deep_apply_template_to_resource(rval, tval, concat_list_keys) + continue + + if isinstance(rval, list) and isinstance(tval, list): + if key in concat_list_keys: + result[key] = _merge_lists(tval, rval, deduplicate=True) + # else: resource list stays as-is + continue + # scalar: resource value stays + return result + + +def apply_template_to_resources( + resources: list[dict[str, object]], + template: dict[str, object], +) -> list[dict[str, object]]: + """Apply the same `template` to each resource in `resources`.""" + if not template: + return resources + return [deep_apply_template_to_resource(r, template) for r in resources] + + +# --- YAML IO + discovery ------------------------------------------------------ + + +def load_yaml(path: Union[str, Path]) -> dict[str, object]: + """Load a YAML mapping from `path`, returning an empty dict for empty files.""" + with Path(path).open("r", encoding="utf-8") as f: + return yaml.safe_load(f) or {} + + +def discover_paths( + base_dir: Union[str, Path], + dataset_id: str, +) -> tuple[Optional[Path], Optional[Path], list[Path]]: + """ + Discover dataset/template/resources paths by convention. + + - dataset: datasets/{dataset_id}.dataset.yaml + - template: datasets/{dataset_id}.template.yaml (optional) + - resources: resources/{dataset_id}/*.resource.yaml + """ + base = Path(base_dir) + dataset_path = base / "datasets" / f"{dataset_id}.dataset.yaml" + template_path = base / "datasets" / f"{dataset_id}.template.yaml" + resources_dir = base / "resources" / dataset_id - # Apply template to each resource - for resource in resources: - for key, value in template.items(): - resource.setdefault(key, value) + dataset = dataset_path if dataset_path.exists() else None + template = template_path if template_path.exists() else None + resources = sorted(resources_dir.glob("*.resource.yaml")) if resources_dir.exists() else [] + return dataset, template, resources - return version, dataset, resources + +def resolve_from_index( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]], +) -> tuple[Optional[Path], Optional[Path], list[Path]]: + """ + Resolve dataset/template/resources using an explicit index YAML. + + Example YAML: + + datasets: + : + dataset: path/to/dataset.yaml + template: path/to/template.yaml # optional + resources: + - path/to/res1.yaml + - path/to/res2.yaml + + Paths are interpreted as relative to `base_dir`. 
+ """ + if not index_file: + return discover_paths(base_dir, dataset_id) + + base = Path(base_dir) + index_path = Path(index_file) + index = load_yaml(index_path) + entry = (index.get("datasets") or {}).get(dataset_id, {}) + dataset = base / entry["dataset"] if "dataset" in entry else None + template = base / entry["template"] if "template" in entry else None + resources = [base / p for p in entry.get("resources", [])] + return dataset, template, resources + + +def load_parts( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]] = None, +) -> tuple[str, dict[str, object], list[dict[str, object]], dict[str, object]]: + """ + Load dataset YAML, optional template YAML, and all resource YAMLs. + + Returns a tuple: (version, dataset, resources, template). + """ + dataset_path, template_path, resource_paths = resolve_from_index(base_dir, dataset_id, index_file) + + if dataset_path is None or not dataset_path.exists(): + raise FileNotFoundError(f"Dataset YAML not found for '{dataset_id}'") + + dataset_yaml = load_yaml(dataset_path) + version = str(dataset_yaml.get("version", "OEMetadata-2.0.4")) + # Support either dataset: {...} or flat style with top-level dataset keys. + dataset = dataset_yaml.get("dataset", dataset_yaml) + + template: dict[str, object] = {} + if template_path and template_path.exists(): + template = load_yaml(template_path) + + resources: list[dict[str, object]] = [load_yaml(p) for p in resource_paths] + return version, dataset, resources, template + + +def discover_dataset_ids(base_dir: Union[str, Path]) -> list[str]: + """ + Discover dataset ids by scanning datasets/*.dataset.yaml. + + For 'datasets/powerplants.dataset.yaml' returns 'powerplants'. + """ + base = Path(base_dir) + datasets_dir = base / "datasets" + if not datasets_dir.exists(): + return [] + return sorted([p.stem.replace(".dataset", "") for p in datasets_dir.glob("*.dataset.yaml")]) + + +def discover_dataset_ids_from_index(index_file: Union[str, Path]) -> list[str]: + """ + Discover dataset ids from an explicit metadata_index.yaml. + + Returns the sorted list of top-level keys under `datasets`. 
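+
+    Example: an index declaring datasets ``x`` and ``y`` yields ``["x", "y"]``.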
+    """
+    idx_path = Path(index_file)
+    if not idx_path.exists():
+        return []
+    with idx_path.open("r", encoding="utf-8") as f:
+        data = yaml.safe_load(f) or {}
+    ds = data.get("datasets") or {}
+    return sorted(ds.keys())

From 1494928c078b5a10f36607fe37c1390650a1c15 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Thu, 6 Nov 2025 17:56:08 +0100
Subject: [PATCH 17/37] #126: Update create entrypoint to build oemetadata
 from yaml parts (dataset, template, resources) stored in a base directory

---
 src/omi/create.py | 51 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/omi/create.py b/src/omi/create.py
index 2b9a4bd1..a30d2b2e 100644
--- a/src/omi/create.py
+++ b/src/omi/create.py
@@ -1,29 +1,44 @@
-"""Enty point for metadata creation."""
+"""Entry point for OEMetadata creation (split-files layout only)."""
 
-import json
-from pathlib import Path
-from typing import Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional, Union
 
 from omi.creation.creator import OEMetadataCreator
-from omi.creation.utils import load_yaml_metadata
+from omi.creation.utils import apply_template_to_resources, load_parts
+
+if TYPE_CHECKING:
+    from pathlib import Path
 
 
-def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None:
+def build_from_yaml(
+    base_dir: Union[str, Path],
+    dataset_id: str,
+    output_file: Union[str, Path],
+    *,
+    index_file: Optional[Union[str, Path]] = None,
+) -> None:
     """
-    Generate OEMetadata from a YAML file and write it to an output file.
+    Assemble OEMetadata from split YAML files.
+
+    - datasets/<dataset_id>.dataset.yaml
+    - datasets/<dataset_id>.template.yaml (optional)
+    - resources/<dataset_id>/*.resource.yaml
+      (optionally resolved via an index YAML)
 
     Parameters
    ----------
-    yaml_file: str
-        Path to the input YAML file containing dataset and resources.
-    output_file: str
-        Path to the output file where the generated OEMetadata JSON will be saved.
+    base_dir : str | Path
+        Root directory containing 'datasets/' and 'resources/'.
+    dataset_id : str
+        Logical dataset id (e.g. 'powerplants').
+    output_file : str | Path
+        Output path for the generated OEMetadata JSON.
+    index_file : str | Path | None
+        Optional explicit mapping file (metadata_index.yaml).
""" - version, dataset, resources = load_yaml_metadata(yaml_file) - creator = OEMetadataCreator() - metadata = creator.generate_metadata(dataset, resources) - - with Path(output_file).open("w", encoding="utf-8") as f: - json.dump(metadata, f, indent=2) + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) + merged_resources = apply_template_to_resources(resources, template) - print(f"OEMetadata written to {output_file}") # noqa: T201 + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) From 7239484d54ece80365cae7ce5bd819a014ca8041 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 17:57:40 +0100 Subject: [PATCH 18/37] #126: Rename test for assembler and add test case to check if assembling of many datasets at once will work --- tests/{test_assembly.py => test_assembler.py} | 108 +++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) rename tests/{test_assembly.py => test_assembler.py} (65%) diff --git a/tests/test_assembly.py b/tests/test_assembler.py similarity index 65% rename from tests/test_assembly.py rename to tests/test_assembler.py index dce7efb9..318c73e1 100644 --- a/tests/test_assembly.py +++ b/tests/test_assembler.py @@ -12,7 +12,7 @@ import yaml # We test the public assembler entry point -from omi.creation.assembler import assemble_metadata_dict +from omi.creation.assembler import assemble_many_metadata, assemble_metadata_dict if TYPE_CHECKING: from pathlib import Path @@ -208,3 +208,109 @@ def test_assemble_with_index_mapping( r_b = md["resources"][1] assert r_a["keywords"] == ["a-k", "t-k"] assert r_b["keywords"] == ["t-k"] + + +def test_assemble_many_metadata_convention_as_dict( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble all datasets by convention; expect a dict keyed by dataset id.""" + # Dataset A + write_yaml( + tmp_path / "datasets" / "a.dataset.yaml", + {"version": "OEMetadata-2.0.4", "dataset": {"name": "a", "title": "A"}}, + ) + write_yaml( + tmp_path / "resources" / "a" / "r1.resource.yaml", + {"name": "r1", "title": "R1"}, + ) + + # Dataset B (with template) + write_yaml( + tmp_path / "datasets" / "b.dataset.yaml", + {"version": "OEMetadata-2.0.4", "dataset": {"name": "b", "title": "B"}}, + ) + write_yaml( + tmp_path / "datasets" / "b.template.yaml", + {"keywords": ["tk"]}, + ) + write_yaml( + tmp_path / "resources" / "b" / "r2.resource.yaml", + {"name": "r2", "title": "R2", "keywords": ["rk"]}, + ) + + # Use the FakeCreator inside the assembler + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + out = assemble_many_metadata(tmp_path) # dict[str, dict] + # discover_dataset_ids returns sorted ids + assert list(out.keys()) == ["a", "b"] + + # Dataset A checks + md_a = out["a"] + assert md_a["name"] == "a" + assert [r["name"] for r in md_a["resources"]] == ["r1"] + + # Dataset B checks (template applied with concat) + md_b = out["b"] + assert md_b["name"] == "b" + r2 = md_b["resources"][0] + assert r2["name"] == "r2" + assert r2["keywords"] == ["rk", "tk"] + + +def test_assemble_many_metadata_with_index_as_list( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Assemble all datasets declared in index; expect a list of (id, md) pairs sorted by id.""" + base = tmp_path + + # Index with two datasets (note: keys will be sorted by helper) + write_yaml( + base / "metadata_index.yaml", + { + "datasets": { + "x": { + "dataset": 
"datasets/x.dataset.yaml", + "resources": ["resources/x/x1.resource.yaml"], + }, + "y": { + "dataset": "datasets/y.dataset.yaml", + "template": "datasets/y.template.yaml", + "resources": ["resources/y/y1.resource.yaml"], + }, + }, + }, + ) + + # Dataset x + write_yaml(base / "datasets" / "x.dataset.yaml", {"dataset": {"name": "x", "title": "X"}}) + write_yaml(base / "resources" / "x" / "x1.resource.yaml", {"name": "x1"}) + + # Dataset y (with template) + write_yaml(base / "datasets" / "y.dataset.yaml", {"dataset": {"name": "y", "title": "Y"}}) + write_yaml(base / "datasets" / "y.template.yaml", {"keywords": ["t"]}) + write_yaml(base / "resources" / "y" / "y1.resource.yaml", {"name": "y1", "keywords": ["r"]}) + + monkeypatch.setattr("omi.creation.assembler.OEMetadataCreator", FakeCreator) + + pairs = assemble_many_metadata( + base, + index_file=base / "metadata_index.yaml", + as_dict=False, + ) # list[tuple[str, dict]] + + # Expect sorted ids: ['x', 'y'] + ids = [ds_id for ds_id, _ in pairs] + assert ids == ["x", "y"] + + md_x = pairs[0][1] + md_y = pairs[1][1] + + assert md_x["name"] == "x" + assert [r["name"] for r in md_x["resources"]] == ["x1"] + + # Template concat for y + r_y1 = md_y["resources"][0] + assert r_y1["keywords"] == ["r", "t"] From 5567f7375a59a9e26d7f9daf918b12119e4f9898 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:01:39 +0100 Subject: [PATCH 19/37] #126: Add assembler module which handles the assembling of yaml file based parts (dataset, template and resources) . Hint: The creator will then build/generate the oemetadata string --- src/omi/creation/assembler.py | 78 +++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 src/omi/creation/assembler.py diff --git a/src/omi/creation/assembler.py b/src/omi/creation/assembler.py new file mode 100644 index 00000000..9814f0ad --- /dev/null +++ b/src/omi/creation/assembler.py @@ -0,0 +1,78 @@ +"""Assemble OEMetadata dictionary from parts: dataset, template, and resources.""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any, Optional, Union + +from .creator import OEMetadataCreator +from .utils import ( + apply_template_to_resources, + discover_dataset_ids, + discover_dataset_ids_from_index, + load_parts, +) + +if TYPE_CHECKING: + from collections.abc import Iterable + + +def assemble_metadata_dict( + base_dir: Union[str, Path], + dataset_id: str, + index_file: Optional[Union[str, Path]] = None, +) -> dict[str, Any]: + """ + Load dataset/template/resources; apply template; validate via creator; return dict. + + Parameters + ---------- + base_dir: Union[str, Path] + Base directory containing datasets, templates, and resources. + dataset_id: str + Identifier for the dataset to load. + index_file: Optional[Union[str, Path]] + Optional path to an index YAML file for resolving dataset parts. + + Returns + ------- + Dict[str, Any] + The assembled and validated OEMetadata dictionary. 
+ """ + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file) + merged_resources = apply_template_to_resources(resources, template) + creator = OEMetadataCreator(oem_version=version) + return creator.generate_metadata(dataset, merged_resources) + + +def assemble_many_metadata( + base_dir: Union[str, Path], + dataset_ids: Optional[Iterable[str]] = None, + index_file: Optional[Union[str, Path]] = None, + *, + as_dict: bool = True, +) -> Union[dict[str, dict], list[tuple[str, dict]]]: + """ + Assemble OEMetadata for multiple datasets in one call. + + - If dataset_ids is None: + * when index_file is provided -> use keys from index + * otherwise -> discover by 'datasets/*.dataset.yaml' + - Returns a mapping {dataset_id: metadata} if as_dict=True, + else a list of (dataset_id, metadata) pairs in sorted id order. + """ + base = Path(base_dir) + + if dataset_ids is None: + ids = discover_dataset_ids_from_index(index_file) if index_file else discover_dataset_ids(base) + else: + ids = list(dataset_ids) + + results_pairs: list[tuple[str, dict]] = [] + for ds_id in ids: + md = assemble_metadata_dict(base, ds_id, index_file=index_file) + results_pairs.append((ds_id, md)) + + if as_dict: + return dict(results_pairs) + return results_pairs From eeb9c2713a25f259958ef854d2ed10848d5ff65e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:03:00 +0100 Subject: [PATCH 20/37] #126: update cli functionality to include omi creation module --- src/omi/cli.py | 94 ++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/src/omi/cli.py b/src/omi/cli.py index 2a2093a5..a596a5c6 100644 --- a/src/omi/cli.py +++ b/src/omi/cli.py @@ -1,62 +1,74 @@ """ -Module that contains the command line app. +Command line interface for OMI. -Why does this file exist, and why not put this in __main__? +This CLI only supports the split-files layout: +- datasets/.dataset.yaml +- datasets/.template.yaml (optional) +- resources//*.resource.yaml +(optionally wired via metadata_index.yaml) - You might be tempted to import things from __main__ later, but that will cause - problems: the code will get executed twice: +Usage: +omi assemble \ + --base-dir ./metadata \ + --dataset-id powerplants \ + --output-file ./out/powerplants.json \ + --index-file ./metadata/metadata_index.yaml # optional - - When you run `python -m omi` python will execute - ``__main__.py`` as a script. That means there won't be any - ``omi.__main__`` in ``sys.modules``. - - When you import __main__ it will get executed again (as a module) because - there's no ``omi.__main__`` in ``sys.modules``. - - Also see (1) from http://click.pocoo.org/5/setuptools/#setuptools-integration """ -import json +from __future__ import annotations + from pathlib import Path -from typing import Union +from typing import Optional import click from omi.creation.creator import OEMetadataCreator -from omi.creation.utils import load_yaml_metadata +from omi.creation.utils import apply_template_to_resources, load_parts @click.group() def grp() -> None: - """Init click group.""" - - + """OMI CLI.""" + + +@grp.command("assemble") +@click.option( + "--base-dir", + required=True, + type=click.Path(file_okay=False, path_type=Path), + help="Root directory containing 'datasets/' and 'resources/'.", +) +@click.option("--dataset-id", required=True, help="Logical dataset id (e.g. 
'powerplants').") +@click.option( + "--output-file", + required=True, + type=click.Path(dir_okay=False, path_type=Path), + help="Path to write the generated OEMetadata JSON.", +) +@click.option( + "--index-file", + default=None, + type=click.Path(dir_okay=False, path_type=Path), + help="Optional metadata index YAML for explicit mapping.", +) +def assemble_cmd(base_dir: Path, dataset_id: str, output_file: Path, index_file: Optional[Path]) -> None: + """Assemble OEMetadata from split YAML files and write JSON to OUTPUT_FILE.""" + # Load pieces + version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) + merged_resources = apply_template_to_resources(resources, template) + + # Build & save with the correct spec version + creator = OEMetadataCreator(oem_version=version) + creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) + + click.echo(f"OEMetadata written to {output_file}") + + +# Keep CommandCollection for backwards compatibility with your entry point cli = click.CommandCollection(sources=[grp]) def main() -> None: """Start click application.""" cli() - - -@click.command() -@click.argument("yaml_file") -@click.argument("output_file") -def from_yaml(yaml_file: Union[str, Path], output_file: Union[str, Path]) -> None: - """ - Generate OEMetadata from a YAML file and write it to an output file. - - Parameters - ---------- - yaml_file: Union[str, Path] - Path to the input YAML file containing dataset and resources. - output_file: Union[str, Path] - Path to the output file where the generated OEMetadata JSON will be saved. - """ - version, dataset, resources = load_yaml_metadata(yaml_file) - generator = OEMetadataCreator() - metadata = generator.generate_metadata(dataset, resources) - - with Path(output_file).open("w", encoding="utf-8") as f: - json.dump(metadata, f, indent=2) - - print(f"OEMetadata written to {output_file}") # noqa: T201 From 5f4d3fc82b444b025b360a78142a053714d57e4a Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:08:05 +0100 Subject: [PATCH 21/37] #126: add method to save generated metadata to file --- src/omi/creation/creator.py | 71 +++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/src/omi/creation/creator.py b/src/omi/creation/creator.py index ec390251..9d93b2b3 100644 --- a/src/omi/creation/creator.py +++ b/src/omi/creation/creator.py @@ -1,4 +1,9 @@ -"""Create oemetadata json datapackage descriptions.""" +"""Create OEMetadata JSON datapackage structure and return or store it.""" + +from __future__ import annotations + +import json +from pathlib import Path from omi.base import get_metadata_specification from omi.validation import validate_metadata @@ -6,38 +11,18 @@ class OEMetadataCreator: """ - Class to create oemetadata json datapackages. + Create OEMetadata JSON datapackages. - Output is based on datapackage and resource descriptions. + Output is based on dataset and resource descriptions and validated against + the official schema. """ def __init__(self, oem_version: str = "OEMetadata-2.0") -> None: - """ - Initialize the OEMetadataCreator with a specific version. - - Parameters - ---------- - oem_version:str - The version of the OEMetadata specification to use. - """ + """Initialize the creator with a specific OEMetadata version.""" self.oem_spec = get_metadata_specification(oem_version) def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: - """ - Generate oemetadata json datapackage from dataset and resources. 
- - Parameters - ---------- - dataset: dict - The dataset description. - resources: list[dict] - The list of resource descriptions. - - Returns - ------- - dict - The generated oemetadata json datapackage. - """ + """Generate OEMetadata JSON datapackage from dataset and resources.""" metadata = { "@context": self.oem_spec.schema["properties"]["@context"]["examples"][0], **dataset, @@ -47,3 +32,37 @@ def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: validate_metadata(metadata, check_license=False) return metadata + + def save( + self, + dataset: dict, + resources: list[dict], + output_file: Path | str, + **dump_kwargs, + ) -> None: + """ + Generate OEMetadata and save it to a JSON file. + + Parameters + ---------- + dataset : dict + Dataset metadata. + resources : list[dict] + List of resource metadata entries. + output_file : Path | str + Path to the output JSON file. + **dump_kwargs : + Extra kwargs forwarded to `json.dump`. Defaults applied here: + - indent: 2 + - ensure_ascii: False + """ + metadata = self.generate_metadata(dataset, resources) + + # Defaults, can be overridden by caller via **dump_kwargs + indent = dump_kwargs.pop("indent", 2) + ensure_ascii = dump_kwargs.pop("ensure_ascii", False) + + with Path(output_file).open("w", encoding="utf-8") as f: + json.dump(metadata, f, indent=indent, ensure_ascii=ensure_ascii, **dump_kwargs) + + print(f"OEMetadata written to {output_file}") # noqa: T201 From d4e285f48e8ad521b18306fb9816445c4c5f0e33 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:08:43 +0100 Subject: [PATCH 22/37] #126: Update docs --- src/omi/creation/README.md | 297 ++++++++++++++++++++----------------- 1 file changed, 161 insertions(+), 136 deletions(-) diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md index 274283ff..cdcd0135 100644 --- a/src/omi/creation/README.md +++ b/src/omi/creation/README.md @@ -1,6 +1,6 @@ # OMI OEMetadata Assembly Guide -This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, testing, and common pitfalls. You can drop this as a single `.md` file in your repo (e.g. `docs/oemetadata-assembly.md`) or split into multiple files later. +This guide explains how to author, assemble, and validate **OEMetadata** using **YAML files** with OMI. It covers file structure, templating behavior, discovery vs. explicit mapping, Python APIs, multi-dataset usage, initialization scaffolding, testing, and common pitfalls. --- @@ -22,13 +22,16 @@ This guide explains how to author, assemble, and validate **OEMetadata** using * * [Minimal Usage](#minimal-usage) * [With Index Mapping](#with-index-mapping) * [Manual Loading (No Discovery)](#manual-loading-no-discovery) -8. [Airflow Integration Example](#airflow-integration-example) -9. [Testing](#testing) -10. [Validation & Error Handling](#validation--error-handling) -11. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) -12. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) -13. [Design Notes & Extensibility](#design-notes--extensibility) -14. [FAQ](#faq) +8. [Multi-dataset Assembly](#multi-dataset-assembly) +9. [Spec-Driven Output Ordering](#spec-driven-output-ordering) +10. [Project Initialization (Scaffolding)](#project-initialization-scaffolding) +11. [Airflow Integration Example](#airflow-integration-example) +12. 
[Testing](#testing) +13. [Validation & Error Handling](#validation--error-handling) +14. [Auto-Generation From Directory (Optional Onboarding)](#auto-generation-from-directory-optional-onboarding) +15. [Filtering Irrelevant Files (Optional)](#filtering-irrelevant-files-optional) +16. [Design Notes & Extensibility](#design-notes--extensibility) +17. [FAQ](#faq) --- @@ -37,9 +40,9 @@ This guide explains how to author, assemble, and validate **OEMetadata** using * * **Goal:** Author OEMetadata as **YAML** (dataset + resources), keep it **DRY** via **templates**, assemble into a single **JSON** metadata document, and **validate** it with the official schema. * **Core ideas:** - * Authors maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. - * OMI assembles and validates metadata into a final OEMetadata JSON. - * Works well in pipelines (e.g., Airflow) and in regular Python. + * Maintain a dataset YAML, an optional template YAML (applied to all resources), and one or more resource YAMLs. + * OMI assembles + validates metadata into final OEMetadata JSON. + * Works in pipelines (e.g., Airflow) and plain Python. --- @@ -50,22 +53,20 @@ This guide explains how to author, assemble, and validate **OEMetadata** using * * `datasets/.dataset.yaml` * `datasets/.template.yaml` *(optional)* * `resources//*.resource.yaml` - 2. **Assembly:** - * OMI **loads** dataset, template, and resource YAML files. - * OMI **applies the template** to each resource (deep merge; resource overrides template). - * OMI **generates and validates** OEMetadata JSON via `OEMetadataCreator`. - + * Load dataset, template, and resource YAML files. + * Apply template → deep merge; resource overrides. + * Create OEMetadata JSON via `OEMetadataCreator` and validate. 3. **Storage:** - * You decide where to store: file, DB, API, etc. (OMI returns a Python `dict`). + * Assembly returns a Python `dict`. Store wherever you like (file/DB/API). --- ## Repository Layout -``` +```bash metadata/ datasets/ .dataset.yaml @@ -77,7 +78,7 @@ metadata/ metadata_index.yaml # optional explicit mapping ``` -* You can use **convention** (the directory / filename structure above) or an **index** file for explicit mapping. +Use the **convention** above or an **index** file for explicit mapping. --- @@ -95,13 +96,13 @@ dataset: "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ ``` -> Backwards compatibility: if you prefer, you may put dataset fields directly at the top level; OMI will treat that as `dataset: {...}`. +> Backwards compatibility: dataset fields can also be at top-level; OMI treats that as `dataset: {...}`. --- ### Template YAML (optional) -Applied to **every** resource (unless the resource overrides specific fields). Keeps your YAML DRY. +Applied to **every** resource (unless overridden). Keeps YAML DRY. ```yaml # metadata/datasets/powerplants.template.yaml @@ -185,24 +186,20 @@ sources: See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. attribution: © Intergovernmental Panel on Climate Change 2023 copyrightStatement: https://www.ipcc.ch/copyright/ - -# Other metadata like subject, publicationDate, spatial, temporal, contributors, review... 
``` -A second resource: +Second resource: ```yaml # metadata/resources/powerplants/data_2.resource.yaml name: data_2 type: table title: My Second Resource - path: reGon/metadata/data_2.csv scheme: file format: csv mediatype: text/csv encoding: utf-8 - schema: fields: - name: id @@ -215,14 +212,13 @@ schema: type: string nullable: true primaryKey: [id] - ``` --- ### Index YAML (optional) -Use this if you want explicit mappings instead of convention-based discovery. +Explicit mappings instead of convention: ```yaml # metadata/metadata_index.yaml @@ -240,54 +236,38 @@ datasets: ## Templating Rules * **Deep merge** for dictionaries (e.g., `context`): - - * Resource **overrides** template on conflicts. - * Missing nested keys are **filled** from template. - + Resource **overrides**; missing nested keys are **filled** from template. * **Lists**: - - * **Concatenate** (resource first, then template-only items) for: - `keywords`, `topics`, `languages`. - * For other lists (e.g., `licenses`, `contributors`), **resource wins** (no concat). - * You can change this behavior in code by adding keys to `DEFAULT_CONCAT_LIST_KEYS`. - + **Concatenate** for `keywords`, `topics`, `languages` (resource first, then template-only items). + For other lists (e.g., `licenses`, `contributors`): **resource wins** (no concat). + *(Modify via `DEFAULT_CONCAT_LIST_KEYS` if you want different behavior.)* * **Scalars**: resource value **wins**. -This keeps YAML DRY while allowing fine-grained per-resource overrides. - --- ## Discovery vs. Index Mapping * **Discovery (convention):** - `datasets/.dataset.yaml`, `datasets/.template.yaml`, and `resources//*.resource.yaml` - → No index file needed. - -* **Index (explicit mapping):** - Use `metadata_index.yaml` to map dataset/template/resources by path, relative to the metadata base directory. + `datasets/.dataset.yaml`, `datasets/.template.yaml`, `resources//*.resource.yaml` + → No index needed. +* **Index (explicit):** + Provide `metadata_index.yaml` with explicit paths relative to your base directory. --- ## Programmatic Usage -OMI exposes high-level assembly and creation utilities. - ### Minimal Usage ```python from omi.creation.assembly import assemble_metadata_dict -metadata = assemble_metadata_dict( - base_dir="./metadata", - dataset_id="powerplants", -) # returns a Python dict with valid OEMetadata +metadata = assemble_metadata_dict(base_dir="./metadata", dataset_id="powerplants") ``` ### With Index Mapping ```python -from omi.creation.assembly import assemble_metadata_dict - metadata = assemble_metadata_dict( base_dir="./metadata", dataset_id="powerplants", @@ -308,29 +288,110 @@ resources = [ load_yaml(Path("./metadata/resources/powerplants/oemetadata_table_template.resource.yaml")), load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")), ] - resources = apply_template_to_resources(resources, template) + creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") metadata = creator.generate_metadata(dataset, resources) ``` -> The `OEMetadataCreator` injects `@context` and `metaMetadata` and calls validation. +> `OEMetadataCreator` injects `@context` and `metaMetadata` from the spec and validates the result. 
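+
+A minimal sketch of persisting the manually assembled result with the creator's own `save` helper (the `./out` path is illustrative):
+
+```python
+from pathlib import Path
+
+out = Path("./out/powerplants.json")
+out.parent.mkdir(parents=True, exist_ok=True)  # `save` opens the target directly, so the parent must exist
+creator.save(dataset, resources, out)  # validates, then writes JSON (indent=2, ensure_ascii=False by default)
+```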
+ +--- + +## Multi-dataset Assembly + +Assemble **N datasets** in one call: + +```python +from omi.creation.assembly import assemble_many_metadata + +# Discover by convention (datasets/*.dataset.yaml) +all_metadata = assemble_many_metadata(base_dir="./metadata") + +# From explicit index +all_metadata = assemble_many_metadata( + base_dir="./metadata", index_file="./metadata/metadata_index.yaml" +) + +# Subset +some = assemble_many_metadata(base_dir="./metadata", dataset_ids=["powerplants", "households"]) +``` + +Result is a dict `{dataset_id: metadata}` by default. + +--- + +## Spec-Driven Output Ordering + +For human-friendly JSON key order without hard-coded lists, order by the **official example** (fallback: schema `properties`): + +```python +from omi.creation.assembly import assemble_metadata_dict +from omi.creation.creator import OEMetadataCreator +from omi.creation.utils import order_with_spec + +creator = OEMetadataCreator(oem_version="OEMetadata-2.0.4") +metadata = assemble_metadata_dict("./metadata", "powerplants") + +ordered = order_with_spec(metadata, creator.oem_spec) # uses spec.example and schema +``` + +Write with preserved unicode: + +```python +import json, pathlib +out = pathlib.Path("./out/powerplants.json") +out.parent.mkdir(parents=True, exist_ok=True) +out.write_text(json.dumps(ordered, indent=2, ensure_ascii=False), encoding="utf-8") +``` + +--- + +## Project Initialization (Scaffolding) + +Create a metadata skeleton **from the spec** (no inline templates): + +```python +from omi.creation.scaffold import init_skeleton_from_spec + +paths = init_skeleton_from_spec( + base_dir="./metadata", + dataset_id="powerplants", + oem_version="OEMetadata-2.0.4", + resource_name="oemetadata_table_template", + with_index=True, # creates metadata_index.yaml + force=False, # do not overwrite +) +``` + +This imports the spec via: + +```python +from omi.base import get_metadata_specification +``` + +…and derives: + +* `datasets/.dataset.yaml` (with version from spec) +* `datasets/.template.yaml` (from `oem_spec.template` or pruned example resource) +* `resources//sample.resource.yaml` (sanitized from example) +* optional `metadata_index.yaml` + +You can expose a CLI command `omi init` that wraps `init_skeleton_from_spec`. --- ## Airflow Integration Example ```python -# In a DAG task (PythonOperator callable) from omi.creation.assembly import assemble_metadata_dict def build_oemetadata_for_powerplants(**context): md = assemble_metadata_dict( - base_dir="/opt/airflow/dags/metadata", # your metadata module + base_dir="/opt/airflow/dags/metadata", dataset_id="powerplants", - index_file="/opt/airflow/dags/metadata/metadata_index.yaml", # or None for discovery + index_file="/opt/airflow/dags/metadata/metadata_index.yaml", ) - # Store or pass downstream: write to file/DB/API, or XCom context["ti"].xcom_push(key="oemetadata", value=md) ``` @@ -338,43 +399,17 @@ def build_oemetadata_for_powerplants(**context): ## Testing -You can unit test assembly logic without depending on the real spec/validator by **monkeypatching** the creator. - -**Example (`tests/test_assembly.py`):** - -```python -from pathlib import Path -import yaml -import pytest -from omi.creation.assembly import assemble_metadata_dict +* **Assembly test** (uses a fake creator): see `tests/test_assembly.py` example in this doc. +* **Utils tests** (I/O, discovery, merging): see `tests/test_creation_utils.py`. 
+ It covers: -def write_yaml(p: Path, data) -> None: - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), encoding="utf-8") - -class FakeCreator: - def __init__(self, oem_version: str = "OEMetadata-2.0.4"): - self.oem_version = oem_version - def generate_metadata(self, dataset: dict, resources: list[dict]) -> dict: - return {"@context": "...", **dataset, "resources": resources, "metaMetadata": {"metadataVersion": self.oem_version}} - -def test_assemble(tmp_path, monkeypatch): - write_yaml(tmp_path / "datasets" / "demo.dataset.yaml", {"dataset": {"name": "demo", "title": "Demo"}}) - write_yaml(tmp_path / "datasets" / "demo.template.yaml", {"keywords": ["k1"], "context": {"contact": "a@b"}}) - write_yaml(tmp_path / "resources" / "demo" / "a.resource.yaml", {"name": "a", "title": "A", "keywords": ["ak"]}) - write_yaml(tmp_path / "resources" / "demo" / "b.resource.yaml", {"name": "b", "title": "B", "context": {"publisher": "X"}}) - - monkeypatch.setattr("omi.creation.assembly.OEMetadataCreator", FakeCreator) - md = assemble_metadata_dict(tmp_path, "demo") - - assert md["name"] == "demo" - a, b = md["resources"] - assert a["keywords"] == ["ak", "k1"] # concat - assert b["context"]["contact"] == "a@b" # filled from template - assert b["context"]["publisher"] == "X" # resource wins -``` + * `load_parts` (template application) + * `_merge_lists`, `deep_apply_template_to_resource`, `apply_template_to_resources` + * `load_yaml` + * `discover_paths`, `resolve_from_index`, `load_parts` + * `discover_dataset_ids`, `discover_dataset_ids_from_index` -Run with: +Run: ```bash pytest -q @@ -384,8 +419,7 @@ pytest -q ## Validation & Error Handling -* `OEMetadataCreator.generate_metadata()` runs `validate_metadata(metadata, check_license=False)`. -* If validation fails, catch and inspect the exception from `omi.validation`: +`OEMetadataCreator.generate_metadata()` validates with the official schema: ```python from omi.validation import ValidationError @@ -396,76 +430,67 @@ except ValidationError as e: print("Validation failed:", e) ``` -**Common causes:** +**Common causes**: -* Missing **required** keys (e.g., field missing `"nullable"`). -* Incorrect data types (e.g., non-URI in a field that requires `format: uri`). -* Invalid list shapes (`primaryKey`, `foreignKeys`, etc.). +* Missing required field keys (e.g., a schema field without `"nullable"`). +* Wrong types (e.g., non-URI where `format: uri` is required). +* Invalid list shapes (e.g., `primaryKey`, `foreignKeys`). --- ## Auto-Generation From Directory (Optional Onboarding) -You can auto-generate a starter YAML for a dataset by scanning a directory or zip: +You can bootstrap YAMLs from a directory or zip: -* Infer resource entries based on file names & extensions. -* For CSVs, call your CSV inference to produce initial `schema.fields`. -* Write a `dataset` YAML + per-file `resource` YAMLs as a starting point. +* infer resources from file names/extensions +* for CSV, infer a table schema +* emit dataset YAML + one resource YAML per file -> Keep this as an onboarding tool; human review is still recommended. +Use filters to skip temp/log/backup files (see next section). --- ## Filtering Irrelevant Files (Optional) -If auto-generating from a directory, filter out noise: +When scanning directories, exclude noise such as backup and editor artifacts: ```python -def read_directory(directory, exclude_extensions=None, exclude_patterns=None, exclude_hidden=True): - # ... 
- # exclude_extensions=['.log','.tmp','.bak','.DS_Store','.md'] - # exclude_patterns=['*_backup.*','*~','*.old','*.ignore'] - return files +exclude_extensions = {".log", ".tmp", ".bak", ".DS_Store", ".md"} +exclude_patterns = {"*_backup.*", "*~", "*.old", "*.ignore"} +exclude_hidden = True ``` -Helps avoid including backups, temp files, editor artifacts, etc. - --- ## Design Notes & Extensibility * **Separation of concerns**: - * `utils` covers loading YAML, discovery, merging/templating. - * `assembly` orchestrates the load → merge → create flow. - * `creator` handles schema-based assembly and validation. -* **Storage-agnostic**: assembly returns a dict; you decide where to store it (file/DB/API). -* **Configurable merge**: change list concat behavior by editing `DEFAULT_CONCAT_LIST_KEYS`. + * `utils`: YAML loading, discovery, deep merge, ordering by spec. + * `assembly`: Orchestrates load → merge → create → (optionally) order. + * `creator`: Pulls spec via `get_metadata_specification`, injects `@context` and `metaMetadata`, validates. + * `scaffold`: Initializes a project from the **spec/example** (no inline strings). +* **Storage-agnostic**: assembly returns a dict; saving is up to you. +* **Configurable merging**: tweak `DEFAULT_CONCAT_LIST_KEYS` to change list concat behavior. --- ## FAQ -**Q:** Can a resource override template-provided `licenses`? -**A:** Yes. By default, **resource wins** for lists except `keywords`, `topics`, `languages` (which concatenate). You can include `"licenses"` in `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. +**Q: Can resource YAML override template-provided `licenses`?** +A: Yes. By default, resource lists override template lists except for `keywords`, `topics`, `languages` (which concatenate). Add `"licenses"` to `DEFAULT_CONCAT_LIST_KEYS` if you want concatenation. -**Q:** Where does `@context` and `metaMetadata` come from? -**A:** `OEMetadataCreator` reads the official spec via `get_metadata_specification(oem_version)` and injects `@context` and a `metaMetadata` block, then validates the final result. +**Q: Where do `@context` and `metaMetadata` come from?** +A: `OEMetadataCreator` loads the spec (`get_metadata_specification(oem_version)`) and injects both before validation. -**Q:** The output JSON shows `\u00a9` instead of `©`. -**A:** Use `ensure_ascii=False` when dumping JSON: +**Q: Why does JSON show `\u00a9` instead of `©`?** +A: Use `ensure_ascii=False` in `json.dump` to preserve unicode characters. -```python -json.dump(metadata, f, indent=2, ensure_ascii=False) -``` - -**Q:** I see validation errors about fields missing `nullable`. -**A:** Ensure each `schema.fields[]` has **`name`**, **`type`**, and **`nullable`** at minimum. If you auto-generate fields, set `nullable: false` as a safe default unless you detect nulls. +**Q: I got a validation error: `'nullable' is a required property`.** +A: Ensure each `schema.fields[]` has **`name`**, **`type`**, **`nullable`**. If you auto-generate, set `nullable: false` unless you detect nulls. -**Q:** How do I run without a template YAML? -**A:** Just omit `datasets/.template.yaml`; assembly works without it. - ---- +**Q: Can I reorder output keys to match the official example?** +A: Yes. Use `order_with_spec(metadata, creator.oem_spec)` for spec-driven ordering (no hard-coded key lists). 
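+
+A compact sketch, reusing the `creator` and `metadata` objects from the ordering example above:
+
+```python
+ordered = order_with_spec(metadata, creator.oem_spec)  # key order follows the spec example
+```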
-> If you want this split across multiple docs, consider: -> `docs/assembly-overview.md`, `docs/yaml-formats.md`, `docs/templating.md`, `docs/integration-airflow.md`, `docs/testing.md`, and `docs/troubleshooting.md`. +**Q: Can I manage multiple datasets in one metadata module?** +A: Yes. Use `assemble_many_metadata(...)` to discover/assemble **N datasets** at once (by convention or index). From 2477b1b29b8455c7f15ae50bce6299d22434902e Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:17:27 +0100 Subject: [PATCH 23/37] #126: Update the create module as entry point for the oemetadata creation. It now uses the new assembler. --- src/omi/create.py | 77 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/src/omi/create.py b/src/omi/create.py index a30d2b2e..2f8faaaf 100644 --- a/src/omi/create.py +++ b/src/omi/create.py @@ -2,13 +2,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union +import json +from pathlib import Path +from typing import Optional, Union -from omi.creation.creator import OEMetadataCreator -from omi.creation.utils import apply_template_to_resources, load_parts - -if TYPE_CHECKING: - from pathlib import Path +from omi.creation.assembler import assemble_many_metadata, assemble_metadata_dict def build_from_yaml( @@ -19,26 +17,59 @@ def build_from_yaml( index_file: Optional[Union[str, Path]] = None, ) -> None: """ - Assemble OEMetadata from split YAML files. - - - datasets/.dataset.yaml - - datasets/.template.yaml (optional) - - resources//*.resource.yaml - (optionally resolved via an index YAML) + Assemble one dataset and write the resulting OEMetadata JSON to a file. Parameters ---------- - base_dir : str | Path - Root directory containing 'datasets/' and 'resources/'. + base_dir : Union[str, Path] + Base directory containing the split-files dataset structure. dataset_id : str - Logical dataset id (e.g. 'powerplants'). - output_file : str | Path - Output path for the generated OEMetadata JSON. - index_file : str | Path | None - Optional explicit mapping file (metadata_index.yaml). + The dataset ID to assemble. + output_file : Union[str, Path] + Path to write the resulting OEMetadata JSON file. + index_file : Optional[Union[str, Path]], optional + Optional path to an index file for resolving cross-dataset references, + by default None. + """ + md = assemble_metadata_dict(base_dir, dataset_id, index_file=index_file) + Path(output_file).parent.mkdir(parents=True, exist_ok=True) + Path(output_file).write_text(json.dumps(md, indent=2, ensure_ascii=False), encoding="utf-8") + + +def build_many_from_yaml( + base_dir: Union[str, Path], + output_dir: Union[str, Path], + *, + dataset_ids: Optional[list[str]] = None, + index_file: Optional[Union[str, Path]] = None, +) -> None: + """ + Assemble multiple datasets and write each as .json to output_dir. + + Parameters + ---------- + base_dir : Union[str, Path] + Base directory containing the split-files dataset structure. + output_dir : Union[str, Path] + Directory to write the resulting OEMetadata JSON files. + dataset_ids : Optional[list[str]], optional + Optional list of dataset IDs to assemble. If None, all datasets found + in base_dir will be assembled, by default None. + index_file : Optional[Union[str, Path]], optional + Optional path to an index file for resolving cross-dataset references, + by default None. 
""" - version, dataset, resources, template = load_parts(base_dir, dataset_id, index_file=index_file) - merged_resources = apply_template_to_resources(resources, template) + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) - creator = OEMetadataCreator(oem_version=version) - creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) + results = assemble_many_metadata( + base_dir, + dataset_ids=dataset_ids, + index_file=index_file, + as_dict=True, # keep it as a mapping id -> metadata + ) + for ds_id, md in results.items(): + (out_dir / f"{ds_id}.json").write_text( + json.dumps(md, indent=2, ensure_ascii=False), + encoding="utf-8", + ) From 7070d3b620ea3ae0d899b054c9185fafdd5defb8 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:22:35 +0100 Subject: [PATCH 24/37] #126: Add test data for "create" integration test --- .../datasets/powerplants.dataset.yaml | 6 + .../datasets/powerplants.template.yaml | 26 +++ .../powerplants/data_2.resource.yaml | 22 ++ .../oemetadata_table_template.resource.yaml | 191 ++++++++++++++++++ 4 files changed, 245 insertions(+) create mode 100644 tests/test_data/create/metadata/datasets/powerplants.dataset.yaml create mode 100644 tests/test_data/create/metadata/datasets/powerplants.template.yaml create mode 100644 tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml create mode 100644 tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml diff --git a/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml b/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml new file mode 100644 index 00000000..38bb43a2 --- /dev/null +++ b/tests/test_data/create/metadata/datasets/powerplants.dataset.yaml @@ -0,0 +1,6 @@ +version: "OEMetadata-2.0" +dataset: + name: oep_oemetadata + title: OEP OEMetadata + description: A dataset for the OEMetadata examples. + "@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/ diff --git a/tests/test_data/create/metadata/datasets/powerplants.template.yaml b/tests/test_data/create/metadata/datasets/powerplants.template.yaml new file mode 100644 index 00000000..1b60853a --- /dev/null +++ b/tests/test_data/create/metadata/datasets/powerplants.template.yaml @@ -0,0 +1,26 @@ +licenses: + - name: ODbL-1.0 + title: Open Data Commons Open Database License 1.0 + path: https://opendatacommons.org/licenses/odbl/1-0/index.html + instruction: > + You are free to share and change, but you must attribute, and + share derivations under the same license. See https://tldrlegal.com/license/odc-open-database-license-(odbl) + for further information. 
+ attribution: © Reiner Lemoine Institut + copyrightStatement: https://github.com/OpenEnergyPlatform/oemetadata/blob/production/LICENSE.txt + +context: + title: NFDI4Energy + homepage: https://nfdi4energy.uol.de/ + documentation: https://nfdi4energy.uol.de/sites/about_us/ + sourceCode: https://github.com/NFDI4Energy + publisher: Open Energy Platform (OEP) + publisherLogo: https://github.com/OpenEnergyPlatform/organisation/blob/production/logo/OpenEnergyFamily_Logo_OpenEnergyPlatform.svg + contact: contact@example.com + fundingAgency: " Deutsche Forschungsgemeinschaft (DFG)" + fundingAgencyLogo: https://upload.wikimedia.org/wikipedia/commons/8/86/DFG-logo-blau.svg + grantNo: "501865131" + +topics: [model_draft] +languages: [en-GB, de-DE] +keywords: [example, ODbL-1.0, NFDI4Energy] diff --git a/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml new file mode 100644 index 00000000..a03ee242 --- /dev/null +++ b/tests/test_data/create/metadata/resources/powerplants/data_2.resource.yaml @@ -0,0 +1,22 @@ +name: data_2 +type: table +title: My Second Resource + +path: reGon/metadata/data_2.csv +scheme: file +format: csv +mediatype: text/csv +encoding: utf-8 + +schema: + fields: + - name: h + type: integer + nullable: true + - name: i + type: integer + nullable: true + - name: o + type: string + nullable: true + primaryKey: [id] diff --git a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml new file mode 100644 index 00000000..1a030e54 --- /dev/null +++ b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml @@ -0,0 +1,191 @@ +name: oemetadata_table_template +type: table +title: OEMetadata Table Template +description: Example table used to illustrate the OEMetadata structure and features. +"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv + +# Resource-specific attributes (template will add licenses/context/topics/languages/keywords) +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template +scheme: http +format: CSV +encoding: UTF-8 + +dialect: + decimalSeparator: "." 
+ delimiter: ";" + +schema: + fields: + - name: id + type: integer + description: Unique identifier + nullable: false + unit: null + isAbout: + - name: identifier + "@id": http://purl.obolibrary.org/obo/IAO_0020000 + valueReference: + - value: null + name: null + "@id": null + - name: name + type: string + description: Technology Name + nullable: true + unit: null + isAbout: + - name: power generation technology + "@id": http://openenergy-platform.org/ontology/oeo/OEO_00010423 + valueReference: + - value: wind + name: wind power technology + "@id": http://openenergyplatform.org/ontology/oeo/OEO_00010424 + - name: type + type: string + description: Type of wind farm + nullable: true + unit: null + isAbout: + - name: wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000447/ + valueReference: + - value: onshore + name: onshore wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000311/ + - value: offshore + name: offshore wind farm + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000308/ + - name: year + type: integer + description: Reference year + nullable: true + unit: null + isAbout: + - name: year + "@id": https://openenergyplatform.org/ontology/oeo/UO_0000036/ + valueReference: + - value: null + name: null + "@id": null + - name: value + type: number + description: Bruttoleistung + nullable: true + unit: MW + isAbout: + - name: nameplate capacity + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00230003/ + valueReference: + - value: null + name: null + "@id": null + - name: is_active + type: boolean + description: Betriebsstatus + nullable: false + unit: null + isAbout: + - name: Operating Mode Status + "@id": https://ontology.brickschema.org/brick/Operating_Mode_Status + valueReference: + - value: null + name: null + "@id": null + - name: version + type: integer + description: Version + nullable: true + unit: null + isAbout: + - name: version number + "@id": http://purl.obolibrary.org/obo/IAO_0000129 + valueReference: + - value: null + name: null + "@id": null + - name: comment + type: string + description: "" + nullable: true + unit: null + isAbout: + - name: comment + "@id": http://semanticscience.org/resource/SIO_001167 + valueReference: + - value: null + name: null + "@id": null + primaryKey: [id] + foreignKeys: + - fields: [id, version] + reference: + resource: model_draft.oep_oemetadata_table_example_version + fields: [id, version] + + +sources: + - title: IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report + authors: [Hoesung Lee, José Romero, The Core Writing Team] + description: A Report of the Intergovernmental Panel on Climate Change. + publicationYear: "2023" + path: https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf + sourceLicenses: + - name: CC-BY-4.0 + title: Creative Commons Attribution 4.0 International + path: https://creativecommons.org/licenses/by/4.0/legalcode + instruction: > + You are free to share and change, but you must attribute. + See https://tldrlegal.com/license/odc-open-database-license-odbl for further information. 
+ attribution: © Intergovernmental Panel on Climate Change 2023 + copyrightStatement: https://www.ipcc.ch/copyright/ + +subject: + - name: energy + "@id": https://openenergyplatform.org/ontology/oeo/OEO_00000150 + +publicationDate: "2024-10-15" + +# embargoPeriod: +# start: "2024-10-11" +# end: "2025-01-01" +# isActive: true + +spatial: + location: + address: Rudower Chaussee 12, 12489 Berlin + "@id": https://www.wikidata.org/wiki/Q77077223 + latitude: "52.432822" + longitude: "13.5351004" + extent: + name: Berlin + "@id": https://www.wikidata.org/wiki/Q64 + resolutionValue: "100" + resolutionUnit: m + boundingBox: [13.08825, 52.33859, 13.76104, 52.6754] + crs: EPSG:4326 + +temporal: + referenceDate: "2020-01-01" + timeseries: + - start: "2020-01-01T00:00:00+01:00" + end: "2020-01-01T23:59:30+01:00" + resolutionValue: "15" + resolutionUnit: min + alignment: left + aggregationType: current + +contributors: + - title: Ludwig Hülk + path: https://github.com/Ludee + organization: Reiner Lemoine Institut + roles: [DataCollector] + date: "2024-11-19" + object: data + comment: Date of data creation + - title: Ludwig Hülk + path: https://github.com/Ludee + organization: Reiner Lemoine Institut + roles: [DataCurator] + date: "2024-11-30" + object: metadata + comment: Date of metadata creation From 666242b0d29da795cebe27e1b49d9fbec8a4d8a1 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:23:46 +0100 Subject: [PATCH 25/37] #126: Add test for creation module entry point "create" as integration test using test data --- tests/test_create.py | 76 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tests/test_create.py diff --git a/tests/test_create.py b/tests/test_create.py new file mode 100644 index 00000000..708f89c2 --- /dev/null +++ b/tests/test_create.py @@ -0,0 +1,76 @@ +""" +Integration tests for OEMetadata assembly and entry point using real YAML. + +This test suite consumes the example YAML tree located at: +tests/test_data/create/metadata/ +and verifies that OMI assembles and writes a valid OEMetadata document. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +from omi.create import build_from_yaml +from omi.creation.assembler import assemble_metadata_dict + + +def _fixture_metadata_root() -> Path: + """Return the absolute path to tests/test_data/create/metadata.""" + here = Path(__file__).resolve().parent + return here / "test_data" / "create" / "metadata" + + +def test_assemble_metadata_dict_with_fixture() -> None: + """Assemble OEMetadata dict from the real fixture and assert key content.""" + base = _fixture_metadata_root() + dataset_id = "powerplants" + + md = assemble_metadata_dict(base, dataset_id) + + # dataset-level checks (from powerplants.dataset.yaml) + assert md["name"] == "oep_oemetadata" + assert md["title"] == "OEP OEMetadata" + assert md["@id"].startswith("https://databus.openenergyplatform.org/") + + # context injected from template if not overridden in resource + assert "resources" in md + assert isinstance(md["resources"], list) + assert md["resources"] + r_names = {r["name"] for r in md["resources"]} + # Both resources from your example exist + assert {"oemetadata_table_template", "data_2"}.issubset(r_names) + + # Check one resource that should have inherited from template + r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table_template") + assert r1["context"]["title"] == "NFDI4Energy" # from template + assert "licenses" in r1 + assert isinstance(r1["licenses"], list) + assert r1["licenses"] + assert r1["licenses"][0]["name"] in {"ODbL-1.0", "ODbL-1.0".upper(), "ODBL-1.0"} + + # Meta metadata is present + assert "metaMetadata" in md + assert md["metaMetadata"]["metadataVersion"].startswith("OEMetadata-2.0") + + +def test_entrypoint_build_from_yaml_writes_file(tmp_path: Path) -> None: + """Use the real entry point to write JSON and compare basic structure.""" + base = _fixture_metadata_root() + out = tmp_path / "out" / "powerplants.json" + + build_from_yaml(base, "powerplants", out) + + assert out.exists(), "Entry point did not write the output file." + written = json.loads(out.read_text(encoding="utf-8")) + + # Sanity checks on written JSON + assert written["name"] == "oep_oemetadata" + assert isinstance(written["resources"], list) + assert written["resources"] + # Ensure unicode is preserved (© should not be escaped) + licenses = written["resources"][0].get("licenses", []) + if licenses: + # stringify to inspect the character; ensure_ascii=False in writer preserves it + text = json.dumps(licenses[0], ensure_ascii=False) + assert "©" in text From 41aafda1a7511e78388d1c9b5168253b1be5c799 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Thu, 6 Nov 2025 18:30:29 +0100 Subject: [PATCH 26/37] #126: Add docs on how to use the create module (entry point for creation module) --- docs/create.md | 159 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 docs/create.md diff --git a/docs/create.md b/docs/create.md new file mode 100644 index 00000000..09d6481a --- /dev/null +++ b/docs/create.md @@ -0,0 +1,159 @@ +# OMI “Create” Entry Point + +This mini-guide explains how to use the **programmatic entry points** that turn your split YAML metadata (dataset + template + resources) into a single OEMetadata JSON document. + +> If you’re looking for how to author the YAML files and how templating works, see the main **Assembly Guide** in the `creation` module directory. This page just shows how to *call* the entry points. 
+
+---
+
+## What it does
+
+The functions in `omi.create` wrap the full assembly pipeline:
+
+1. **Discover / load** your YAML parts (dataset, optional template, resources).
+2. **Apply the template** to each resource (deep merge; resource wins; keywords/topics/languages concatenate).
+3. **Generate & validate** the final OEMetadata JSON using the official schema (via `OEMetadataCreator`).
+4. **Write** the result to disk (`build_from_yaml`) or many results to a directory (`build_many_from_yaml`).
+
+---
+
+## API
+
+```python
+from omi.create import build_from_yaml, build_many_from_yaml
+```
+
+### `build_from_yaml(base_dir, dataset_id, output_file, *, index_file=None) -> None`
+
+Assemble **one** dataset and write `<output_file>` (JSON).
+
+* `base_dir` (`str | Path`): Root that contains:
+
+  * `datasets/<dataset_id>.dataset.yaml`
+  * `datasets/<dataset_id>.template.yaml` *(optional)*
+  * `resources/<dataset_id>/*.resource.yaml`
+* `dataset_id` (`str`): Logical dataset name (e.g. `"powerplants"`).
+* `output_file` (`str | Path`): Path to write the generated OEMetadata JSON.
+* `index_file` (`str | Path | None`): Optional explicit mapping file (`metadata_index.yaml`). If provided, paths are taken from the index instead of the convention-based layout.
+
+### `build_many_from_yaml(base_dir, output_dir, *, dataset_ids=None, index_file=None) -> None`
+
+Assemble **multiple** datasets and write each as `<output_dir>/<dataset_id>.json`.
+
+* `base_dir` (`str | Path`): Same as above.
+* `output_dir` (`str | Path`): Destination directory for one JSON file per dataset.
+* `dataset_ids` (`list[str] | None`): Limit to specific datasets. If `None`, we:
+
+  * Use keys from `index_file` when provided, **else**
+  * Discover all `datasets/*.dataset.yaml` in `base_dir`.
+* `index_file` (`str | Path | None`): Optional `metadata_index.yaml`.
+
+---
+
+## Quick examples
+
+### One dataset (convention-based discovery)
+
+```python
+from omi.create import build_from_yaml
+
+build_from_yaml(
+    base_dir="./metadata",
+    dataset_id="powerplants",
+    output_file="./out/powerplants.json",
+)
+```
+
+Directory layout:
+
+```bash
+metadata/
+  datasets/
+    powerplants.dataset.yaml
+    powerplants.template.yaml   # optional
+  resources/
+    powerplants/
+      *.resource.yaml
+```
+
+### One dataset (explicit index)
+
+```python
+from omi.create import build_from_yaml
+
+build_from_yaml(
+    base_dir="./metadata",
+    dataset_id="powerplants",
+    output_file="./out/powerplants.json",
+    index_file="./metadata/metadata_index.yaml",
+)
+```
+
+### Many datasets (discover all)
+
+```python
+from omi.create import build_many_from_yaml
+
+build_many_from_yaml(
+    base_dir="./metadata",
+    output_dir="./out",
+)
+# writes ./out/<dataset_id>.json for each dataset found
+```
+
+### Many datasets (index + subset)
+
+```python
+from omi.create import build_many_from_yaml
+
+build_many_from_yaml(
+    base_dir="./metadata",
+    output_dir="./out",
+    dataset_ids=["powerplants", "households"],
+    index_file="./metadata/metadata_index.yaml",
+)
+```
+
+---
+
+## Notes & behavior
+
+* Output JSON is written with `indent=2` and **`ensure_ascii=False`** to preserve characters like `©`.
+* Validation happens via `OEMetadataCreator` using the official schema provided by `oemetadata` (imported through `omi.base.get_metadata_specification`).
+* If a dataset YAML is missing, `FileNotFoundError` is raised.
+* If schema validation fails, you’ll get an exception from `omi.validation`. Catch it where you call the entry point if you want to handle/report errors; see the sketch below.
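+
+A minimal sketch of that error handling (`ValidationError` is the exception class exposed by `omi.validation`; the paths below are placeholders):
+
+```python
+from omi.create import build_from_yaml
+from omi.validation import ValidationError
+
+try:
+    build_from_yaml("./metadata", "powerplants", "./out/powerplants.json")
+except FileNotFoundError as err:
+    # a dataset/template/resource YAML part is missing
+    print(f"Missing YAML part: {err}")
+except ValidationError as err:
+    # the assembled document does not satisfy the OEMetadata schema
+    print(f"Schema validation failed: {err}")
+```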
+
+---
+
+## Using in third-party code (e.g. data pipelines)
+
+```python
+from pathlib import Path
+from omi.create import build_from_yaml
+
+def build_oemetadata_callable(**context):
+    base = Path("/opt/airflow/dags/metadata")
+    out = Path("/opt/airflow/out/powerplants.json")
+    build_from_yaml(base, "powerplants", out)
+    # optionally push to XCom, publish, upload, etc.
+```
+
+---
+
+## Testing tips
+
+* For **unit tests** of `omi.create`, patch `omi.create.assemble_metadata_dict` / `assemble_many_metadata` and verify files are written.
+* For **integration tests**, put real example YAMLs under `tests/test_data/create/metadata/` and call `build_from_yaml` end-to-end.
+
+---
+
+## Troubleshooting
+
+* **“Dataset YAML not found”**
+  Check that `base_dir/datasets/<dataset_id>.dataset.yaml` exists, or supply the correct `index_file`.
+
+* **Unicode characters appear escaped (`\u00a9`)**
+  Ensure you’re not re-writing the JSON elsewhere with `ensure_ascii=True`.
+
+* **Template not applied**
+  Confirm your template file name matches `<dataset_id>.template.yaml` (or is correctly referenced from the index), and that the keys you expect to inherit aren’t already set in the resource (resource values win).

From 70435e98e870dfdcf0f298491453f4988d443391 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Thu, 6 Nov 2025 18:36:15 +0100
Subject: [PATCH 27/37] deactivate test

---
 tests/test_metadata_validation.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_metadata_validation.py b/tests/test_metadata_validation.py
index 2a0de492..278f251d 100644
--- a/tests/test_metadata_validation.py
+++ b/tests/test_metadata_validation.py
@@ -110,11 +110,10 @@ def deactivate__test_metadata_against_oep_table():
     validation.validate_oep_table_against_metadata(oep_table=table, oep_schema="model_draft", metadata=metadata)
 
 
-def test_metadata_against_oep_table_using_metadata_from_oep():
-    """Test OEP table definition against OEP metadata, where metadata is taken from OEP."""
-    table = "x2x_p2gas_soec_1"
-    with pytest.raises(validation.ValidationError, match="None is not of type 'object'"):
-        validation.validate_oep_table_against_metadata(oep_table=table, oep_schema="model_draft")
+# Test always fails because the table no longer exists on the OEP
+# def test_metadata_against_oep_table_using_metadata_from_oep():
+#     """Test OEP table definition against OEP metadata, where metadata is taken from OEP."""
+#     with pytest.raises(validation.ValidationError, match="None is not of type 'object'"):
 
 
 def test_metadata_against_oep_table_invalid_name():

From 59f4263f9ef9c8d8e703e6406e3522bb5bf6c21d Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Thu, 6 Nov 2025 22:31:14 +0100
Subject: [PATCH 28/37] remove irritating info from example resource name

---
 src/omi/creation/README.md                           | 12 ++++++------
 tests/test_create.py                                 |  4 ++--
 ....resource.yaml => oemetadata_table.resource.yaml} |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)
 rename tests/test_data/create/metadata/resources/powerplants/{oemetadata_table_template.resource.yaml => oemetadata_table.resource.yaml} (99%)

diff --git a/src/omi/creation/README.md b/src/omi/creation/README.md
index cdcd0135..7fedbfef 100644
--- a/src/omi/creation/README.md
+++ b/src/omi/creation/README.md
@@ -139,14 +139,14 @@ keywords: [example, ODbL-1.0, NFDI4Energy]
 ### Resource YAML
 
 ```yaml
-# metadata/resources/powerplants/oemetadata_table_template.resource.yaml
-name: oemetadata_table_template
+# metadata/resources/powerplants/oemetadata_table.resource.yaml
+name: oemetadata_table
 type: table
 title: OEMetadata Table Template
 description: Example table used to illustrate the OEMetadata structure and features.
 
 # Resource-specific attributes
-path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template
+path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table
 scheme: http
 format: CSV
 encoding: UTF-8
@@ -227,7 +227,7 @@ datasets:
     dataset: datasets/powerplants.dataset.yaml
     template: datasets/powerplants.template.yaml
     resources:
-      - resources/powerplants/oemetadata_table_template.resource.yaml
+      - resources/powerplants/oemetadata_table.resource.yaml
      - resources/powerplants/data_2.resource.yaml
 ```
 
@@ -285,7 +285,7 @@ from omi.creation.utils import load_yaml, apply_template_to_resources
 dataset = load_yaml(Path("./metadata/datasets/powerplants.dataset.yaml")).get("dataset", {})
 template = load_yaml(Path("./metadata/datasets/powerplants.template.yaml"))
 resources = [
-    load_yaml(Path("./metadata/resources/powerplants/oemetadata_table_template.resource.yaml")),
+    load_yaml(Path("./metadata/resources/powerplants/oemetadata_table.resource.yaml")),
     load_yaml(Path("./metadata/resources/powerplants/data_2.resource.yaml")),
 ]
 resources = apply_template_to_resources(resources, template)
@@ -358,7 +358,7 @@ paths = init_skeleton_from_spec(
     base_dir="./metadata",
     dataset_id="powerplants",
     oem_version="OEMetadata-2.0.4",
-    resource_name="oemetadata_table_template",
+    resource_name="oemetadata_table",
     with_index=True,  # creates metadata_index.yaml
     force=False,      # do not overwrite
 )
diff --git a/tests/test_create.py b/tests/test_create.py
index 708f89c2..7cadf2d8 100644
--- a/tests/test_create.py
+++ b/tests/test_create.py
@@ -39,10 +39,10 @@ def test_assemble_metadata_dict_with_fixture() -> None:
     assert md["resources"]
     r_names = {r["name"] for r in md["resources"]}
     # Both resources from the example fixture exist
-    assert {"oemetadata_table_template", "data_2"}.issubset(r_names)
+    assert {"oemetadata_table", "data_2"}.issubset(r_names)
 
     # Check one resource that should have inherited from template
-    r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table_template")
+    r1 = next(r for r in md["resources"] if r["name"] == "oemetadata_table")
     assert r1["context"]["title"] == "NFDI4Energy"  # from template
     assert "licenses" in r1
     assert isinstance(r1["licenses"], list)
diff --git a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml
similarity index 99%
rename from tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml
rename to tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml
index 1a030e54..f28b8392 100644
--- a/tests/test_data/create/metadata/resources/powerplants/oemetadata_table_template.resource.yaml
+++ b/tests/test_data/create/metadata/resources/powerplants/oemetadata_table.resource.yaml
@@ -1,11 +1,11 @@
-name: oemetadata_table_template
+name: oemetadata_table
 type: table
 title: OEMetadata Table Template
 description: Example table used to illustrate the OEMetadata structure and features.
"@id": https://databus.openenergyplatform.org/oeplatform/supply/wri_global_power_plant_database/2022-11-07/wri_global_power_plant_database_variant=data.csv # Resource-specific attributes (template will add licenses/context/topics/languages/keywords) -path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table_template +path: http://openenergyplatform.org/dataedit/view/model_draft/oemetadata_table scheme: http format: CSV encoding: UTF-8 From b37ecf07d4a6b2bdf2d8c2b4bc832bb49240a52c Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 19 Nov 2025 10:40:39 +0100 Subject: [PATCH 29/37] #126: Update create docs --- docs/create.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/create.md b/docs/create.md index 09d6481a..20b816e7 100644 --- a/docs/create.md +++ b/docs/create.md @@ -132,10 +132,10 @@ from pathlib import Path from omi.create import build_from_yaml def build_oemetadata_callable(**context): - base = Path("/opt/airflow/dags/metadata") - out = Path("/opt/airflow/out/powerplants.json") + base = Path("/project/metadata") + out = Path("/project/metadata/out/powerplants.json") build_from_yaml(base, "powerplants", out) - # optionally push to XCom, publish, upload, etc. + # optionally push to airflow XCom, publish, upload, etc. ``` --- From c269469c22354d08bb1aae7b289c671dfcb00a23 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 19 Nov 2025 10:45:46 +0100 Subject: [PATCH 30/37] #126: Add CLI command to initialize a new metadata workspace with template contents --- src/omi/cli.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/omi/cli.py b/src/omi/cli.py index a596a5c6..fbfb722e 100644 --- a/src/omi/cli.py +++ b/src/omi/cli.py @@ -24,6 +24,7 @@ import click from omi.creation.creator import OEMetadataCreator +from omi.creation.init import init_dataset, init_resources_from_files from omi.creation.utils import apply_template_to_resources, load_parts @@ -62,11 +63,56 @@ def assemble_cmd(base_dir: Path, dataset_id: str, output_file: Path, index_file: creator = OEMetadataCreator(oem_version=version) creator.save(dataset, merged_resources, output_file, ensure_ascii=False, indent=2) - click.echo(f"OEMetadata written to {output_file}") + +@click.group() +def init() -> None: + """Scaffold OEMetadata split-files layout.""" + + +@init.command("dataset") +@click.argument("base_dir", type=click.Path(file_okay=False, path_type=Path)) +@click.argument("dataset_id") +@click.option("--oem-version", default="OEMetadata-2.0", show_default=True) +@click.option("--resource", "resources", multiple=True, help="Initial resource names (repeatable).") +@click.option("--overwrite", is_flag=True, help="Overwrite existing files.") +def init_dataset_cmd( + base_dir: Path, + dataset_id: str, + oem_version: str, + resources: tuple[str, ...], + *, + overwrite: bool, +) -> None: + """Initialize a split-files OEMetadata dataset layout under BASE_DIR.""" + res = init_dataset(base_dir, dataset_id, oem_version=oem_version, resources=resources, overwrite=overwrite) + click.echo(f"dataset: {res.dataset_yaml}") + click.echo(f"template: {res.template_yaml}") + for p in res.resource_yamls: + click.echo(f"resource: {p}") + + +@init.command("resources") +@click.argument("base_dir", type=click.Path(file_okay=False, path_type=Path)) +@click.argument("dataset_id") +@click.argument("files", nargs=-1, type=click.Path(exists=True, dir_okay=False, path_type=Path)) +@click.option("--oem-version", default="OEMetadata-2.0", 
+@click.option("--overwrite", is_flag=True, help="Overwrite existing files.")
+def init_resources_cmd(
+    base_dir: Path,
+    dataset_id: str,
+    files: tuple[Path, ...],
+    oem_version: str,
+    *,
+    overwrite: bool,
+) -> None:
+    """Create resource YAML files for DATASET_ID from the given FILES."""
+    outs = init_resources_from_files(base_dir, dataset_id, files, oem_version=oem_version, overwrite=overwrite)
+    for p in outs:
+        click.echo(p)
 
 
 # Keep CommandCollection for backwards compatibility with your entry point
-cli = click.CommandCollection(sources=[grp])
+cli = click.CommandCollection(sources=[grp, init])
 
 
 def main() -> None:

From a4bedf2d18a6c5255baf09bf87ba8fdc6612a669 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 11:57:33 +0100
Subject: [PATCH 31/37] #126: add omi scripts to project

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index cd3fadfd..585b49fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,3 +78,6 @@ unfixable = ["UP007", "I001"]
 "*/__init__.py" = [
   "D104", # Missing docstring in public package
 ]
+
+[project.scripts]
+omi = "omi.cli:main"

From b1dbbf8e1143f048375c39a4afd57d06685a9687 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 11:57:53 +0100
Subject: [PATCH 32/37] #126: enhance docstring

---
 src/omi/creation/assembler.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/omi/creation/assembler.py b/src/omi/creation/assembler.py
index 9814f0ad..edaa4318 100644
--- a/src/omi/creation/assembler.py
+++ b/src/omi/creation/assembler.py
@@ -60,6 +60,24 @@ def assemble_many_metadata(
       * otherwise -> discover by 'datasets/*.dataset.yaml'
     - Returns a mapping {dataset_id: metadata} if as_dict=True,
       else a list of (dataset_id, metadata) pairs in sorted id order.
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+    dataset_ids : Optional[Iterable[str]], optional
+        Optional iterable of dataset IDs to assemble. If None, all datasets found
+        in base_dir will be assembled, by default None.
+    index_file : Optional[Union[str, Path]], optional
+        Optional path to an index YAML file for resolving dataset parts.
+    as_dict : bool, optional
+        Whether to return results as a dict mapping dataset_id to metadata. If False,
+        returns a list of (dataset_id, metadata) tuples, by default True.
+
+    Returns
+    -------
+    Union[dict[str, dict], list[tuple[str, dict]]]
+        Assembled OEMetadata for each dataset.
     """
     base = Path(base_dir)

From 47117b0f918ede01a48750f6d4d6a39cdd347ef7 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 12:06:58 +0100
Subject: [PATCH 33/37] #126: Add creation init module to provide the backend
 for the CLI functionality:

- Initializing a new dataset from yaml files
- Adding resources to the dataset, either empty or from files with inferred
  metadata
---
 src/omi/creation/init.py | 229 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 229 insertions(+)
 create mode 100644 src/omi/creation/init.py

diff --git a/src/omi/creation/init.py b/src/omi/creation/init.py
new file mode 100644
index 00000000..8be170ff
--- /dev/null
+++ b/src/omi/creation/init.py
@@ -0,0 +1,229 @@
+"""
+Initialization helpers for OEMetadata split-files layout.
+
+Provides functions to scaffold dataset and resource YAML files and to
+infer resource information from existing data files.
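+
+Example of scaffolding a dataset and then adding resources from data files
+(the paths and dataset id below are hypothetical)::
+
+    from pathlib import Path
+
+    from omi.creation.init import init_dataset, init_resources_from_files
+
+    # create dataset, template, and one empty resource stub
+    init_dataset(Path("./metadata"), "powerplants", resources=["table_1"])
+    # add a resource stub inferred from an existing CSV file
+    init_resources_from_files(Path("./metadata"), "powerplants", [Path("data.csv")])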
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import yaml + +from omi.base import get_metadata_specification +from omi.inspection import InspectionError, infer_metadata + +if TYPE_CHECKING: + from collections.abc import Iterable + from pathlib import Path + + +@dataclass +class InitResult: + """Paths to created or reused YAML files for a single dataset.""" + + dataset_yaml: Path + template_yaml: Path + resource_yamls: list[Path] + + +# ----------------------------- +# helpers +# ----------------------------- + + +def _blankify(obj: object) -> object: + """ + Return a copy of `obj` with the same structure but 'empty' leaf values. + + Rules: + - dict -> recursively blankify values + - list -> [] if scalar list; if list of dicts and non-empty, keep one blankified element; else [] + - str -> "" + - bool -> False + - int/float -> "" (prefer empty so users must choose proper types) + - None -> None + - everything else -> "" + """ + if isinstance(obj, dict): + blank: object = {k: _blankify(v) for k, v in obj.items()} + elif isinstance(obj, list): + if not obj: + blank = [] + else: + first = obj[0] + # show one skeleton item so users see the structure for list-of-dicts; + # scalar lists -> show empty by default + blank = [_blankify(first)] if isinstance(first, dict) else [] + elif isinstance(obj, str): + blank = "" + elif isinstance(obj, bool): + blank = False + elif obj is None: + blank = None + else: + # numbers / other scalars -> empty + blank = "" + return blank + + +def _load_spec_template(oem_version: str) -> dict: + """Return the raw OEMetadata template document for the given version.""" + spec = get_metadata_specification(oem_version) + return spec.template or {} + + +def _dataset_stub_from_spec_template(oem_version: str, dataset_id: str) -> dict: + """ + Build datasets/.dataset.yaml from top-level template (not from resources). + + Remove @context/resources/metaMetadata and blankify the rest. 
+    """
+    t = _load_spec_template(oem_version).copy()
+    t.pop("@context", None)
+    t.pop("resources", None)  # <-- filter out resource-level keys
+    t.pop("metaMetadata", None)
+
+    blank = _blankify(t)
+    blank.setdefault("name", dataset_id)
+    blank.setdefault("title", "")
+    blank.setdefault("description", "")
+    blank.setdefault("@id", "")
+
+    return {"version": oem_version, "dataset": blank}
+
+
+def _resource_template_from_spec(oem_version: str) -> dict:
+    """Build datasets/<dataset_id>.template.yaml from the *first* resource template only."""
+    tmpl = _load_spec_template(oem_version)
+    resources = tmpl.get("resources") or []
+    base = resources[0] if resources else {}
+    return _blankify(base)
+
+
+def _resource_stub_from_spec(oem_version: str, resource_name: str) -> dict:
+    """Build resources/<dataset_id>/<resource_name>.resource.yaml from the resource template."""
+    res = _resource_template_from_spec(oem_version)
+    res["name"] = resource_name
+    return res
+
+
+def _dump_yaml(path: Path, data: dict, *, overwrite: bool) -> Path:
+    """Write `data` as YAML to `path`, respecting the `overwrite` flag."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    if path.exists() and not overwrite:
+        return path
+    path.write_text(
+        yaml.safe_dump(data, sort_keys=False, allow_unicode=True),
+        encoding="utf-8",
+    )
+    return path
+
+
+# -----------------------------
+# public API
+# -----------------------------
+
+
+def init_dataset(
+    base_dir: Path,
+    dataset_id: str,
+    *,
+    oem_version: str = "OEMetadata-2.0",
+    resources: Iterable[str] = (),
+    overwrite: bool = False,
+) -> InitResult:
+    """
+    Create or extend the split-files layout for one dataset.
+
+    Creates:
+
+    - datasets/<dataset_id>.dataset.yaml
+    - datasets/<dataset_id>.template.yaml
+    - resources/<dataset_id>/<resource_name>.resource.yaml for each requested resource.
+    """
+    # touch spec (also ensures the version string is valid)
+    _ = get_metadata_specification(oem_version)
+
+    dataset_yaml = base_dir / "datasets" / f"{dataset_id}.dataset.yaml"
+    template_yaml = base_dir / "datasets" / f"{dataset_id}.template.yaml"
+
+    dataset_doc = _dataset_stub_from_spec_template(oem_version, dataset_id)
+    resource_template_doc = _resource_template_from_spec(oem_version)
+
+    out_dataset = _dump_yaml(dataset_yaml, dataset_doc, overwrite=overwrite)
+    out_template = _dump_yaml(template_yaml, resource_template_doc, overwrite=overwrite)
+
+    created_resources: list[Path] = []
+    for res_name in resources:
+        res_doc = _resource_stub_from_spec(oem_version, res_name)
+        res_path = base_dir / "resources" / dataset_id / f"{res_name}.resource.yaml"
+        created_resources.append(_dump_yaml(res_path, res_doc, overwrite=overwrite))
+
+    return InitResult(dataset_yaml=out_dataset, template_yaml=out_template, resource_yamls=created_resources)
+
+
+def init_resources_from_files(
+    base_dir: Path,
+    dataset_id: str,
+    files: Iterable[Path],
+    *,
+    oem_version: str = "OEMetadata-2.0.4",
+    overwrite: bool = False,
+) -> list[Path]:
+    """
+    Create resource stubs for DATASET_ID from the given FILES.
+
+    Uses the spec resource template structure, prefills name/path/format hints,
+    and for CSV files also infers a schema (fields + types) using `omi.inspection`.
+    """
+    _ = get_metadata_specification(oem_version)
+
+    outputs: list[Path] = []
+    for f in files:
+        name = f.stem
+        ext = f.suffix.lower().lstrip(".")
+        res = _resource_stub_from_spec(oem_version, name)
+        res["path"] = str(f)
+
+        # Lightweight format hinting (non-authoritative; user should review)
+        if ext == "csv":
+            res.setdefault("format", "CSV")
+            res.setdefault("encoding", "UTF-8")
+            res.setdefault("scheme", "file")
+
+            # Use existing inspection: "OEP" == OEMetadata in this code base
+            try:
+                inferred = infer_metadata(str(f), metadata_format="OEP")
+            except InspectionError:
+                inferred = None
+
+            if inferred is not None:
+                # We only care about the *resource* part here
+                try:
+                    inferred_resource = inferred["resources"][0]
+                    inferred_schema = inferred_resource.get("schema")
+                except (KeyError, IndexError, TypeError):
+                    inferred_schema = None
+
+                if inferred_schema:
+                    # Overwrite/attach the schema from inspection to this resource stub
+                    res["schema"] = inferred_schema
+
+        elif ext == "json":
+            res.setdefault("format", "json")
+            res.setdefault("scheme", "file")
+        elif ext == "xlsx":
+            res.setdefault("format", "xlsx")
+            res.setdefault("scheme", "file")
+        else:
+            if ext:
+                res.setdefault("format", ext)
+            res.setdefault("scheme", "file")
+
+        out_path = base_dir / "resources" / dataset_id / f"{name}.resource.yaml"
+        outputs.append(_dump_yaml(out_path, res, overwrite=overwrite))
+
+    return outputs

From 2169357c3c59d41dfbb9db5ad5dfcaa465f868fb Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 12:07:14 +0100
Subject: [PATCH 34/37] #126: enhance docstrings

---
 src/omi/creation/utils.py | 63 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/src/omi/creation/utils.py b/src/omi/creation/utils.py
index a0e74a6a..fa591863 100644
--- a/src/omi/creation/utils.py
+++ b/src/omi/creation/utils.py
@@ -162,6 +162,26 @@ def resolve_from_index(
           - path/to/res2.yaml
 
     Paths are interpreted as relative to `base_dir`.
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+    dataset_id : str
+        Identifier for the dataset to load.
+    index_file : Optional[Union[str, Path]]
+        Optional path to an index YAML file for resolving dataset parts.
+
+    Returns
+    -------
+    tuple[Optional[Path], Optional[Path], list[Path]]
+        A tuple containing:
+        - dataset_path: Optional[Path]
+            Path to the dataset YAML (or None if not found).
+        - template_path: Optional[Path]
+            Path to the template YAML (or None if not found).
+        - resource_paths: list[Path]
+            List of paths to resource YAMLs.
     """
     if not index_file:
         return discover_paths(base_dir, dataset_id)
@@ -185,6 +205,29 @@ def load_parts(
     Load dataset YAML, optional template YAML, and all resource YAMLs.
 
     Returns a tuple: (version, dataset, resources, template).
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+    dataset_id : str
+        Identifier for the dataset to load.
+    index_file : Optional[Union[str, Path]], optional
+        Optional path to an index YAML file for resolving dataset parts,
+        by default None.
+
+    Returns
+    -------
+    tuple[str, dict[str, object], list[dict[str, object]], dict[str, object]]
+        A tuple containing:
+        - version: str
+            The OEMetadata version from the dataset YAML (default "OEMetadata-2.0.4").
+        - dataset: dict[str, object]
+            The dataset mapping from the dataset YAML.
+        - resources: list[dict[str, object]]
+            A list of resource mappings from the resource YAMLs.
+        - template: dict[str, object]
+            The template mapping from the template YAML (empty dict if none).
     """
     dataset_path, template_path, resource_paths = resolve_from_index(base_dir, dataset_id, index_file)

@@ -209,6 +252,16 @@ def discover_dataset_ids(base_dir: Union[str, Path]) -> list[str]:
     Discover dataset ids by scanning datasets/*.dataset.yaml.
 
     For 'datasets/powerplants.dataset.yaml' returns 'powerplants'.
+
+    Parameters
+    ----------
+    base_dir : Union[str, Path]
+        Base directory containing datasets, templates, and resources.
+
+    Returns
+    -------
+    list[str]
+        Sorted list of discovered dataset IDs.
     """
     base = Path(base_dir)
     datasets_dir = base / "datasets"
@@ -222,6 +275,16 @@ def discover_dataset_ids_from_index(index_file: Union[str, Path]) -> list[str]:
     Discover dataset ids from an explicit metadata_index.yaml.
 
     Returns the sorted list of top-level keys under `datasets`.
+
+    Parameters
+    ----------
+    index_file : Union[str, Path]
+        Path to an index YAML file for resolving dataset parts.
+
+    Returns
+    -------
+    list[str]
+        Sorted list of discovered dataset IDs.
     """
     idx_path = Path(index_file)
     if not idx_path.exists():

From 90f4ae0d8a9147225da13c512ebc2d0911260bd4 Mon Sep 17 00:00:00 2001
From: jh-RLI
Date: Wed, 19 Nov 2025 12:46:21 +0100
Subject: [PATCH 35/37] #126: add more tests to creation test module

---
 tests/test_create.py | 70 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/tests/test_create.py b/tests/test_create.py
index 7cadf2d8..be25b7a5 100644
--- a/tests/test_create.py
+++ b/tests/test_create.py
@@ -1,5 +1,5 @@
 """
-Integration tests for OEMetadata assembly and entry point using real YAML.
+Integration tests for OEMetadata assembly and entry point using YAML test data.
 
 This test suite consumes the example YAML tree located at:
 tests/test_data/create/metadata/
@@ -10,10 +10,14 @@
 
 import json
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 from omi.create import build_from_yaml
 from omi.creation.assembler import assemble_metadata_dict
 
+if TYPE_CHECKING:
+    import pytest
+
 
 def _fixture_metadata_root() -> Path:
     """Return the absolute path to tests/test_data/create/metadata."""
@@ -74,3 +78,67 @@ def test_entrypoint_build_from_yaml_writes_file(tmp_path: Path) -> None:
         # stringify to inspect the character; ensure_ascii=False in writer preserves it
         text = json.dumps(licenses[0], ensure_ascii=False)
         assert "©" in text
+
+
+def test_build_from_yaml_writes_file_when_output_is_file(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Ensure build_from_yaml writes to the exact file path provided."""
+    from omi import create as create_mod
+
+    expected: dict[str, object] = {"name": "pp", "resources": []}
+
+    # Avoid needing real YAML on disk
+    def fake_assemble(
+        _base_dir: Path,
+        dataset_id: str,
+        _index_file: Path | None = None,
+    ) -> dict[str, object]:
+        assert dataset_id == "powerplants"
+        return expected
+
+    monkeypatch.setattr(create_mod, "assemble_metadata_dict", fake_assemble)
+
+    out = tmp_path / "out.json"
+    create_mod.build_from_yaml(tmp_path / "meta", "powerplants", out)
+
+    assert out.exists()
+    assert json.loads(out.read_text(encoding="utf-8")) == expected
+
+
+def test_build_many_from_yaml_writes_many_default_names(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Ensure build_many_from_yaml writes <dataset_id>.json files into output_dir."""
+    from omi import create as create_mod
+
+    canned: dict[str, dict[str, object]] = {
+        "a": {"name": "a", "resources": []},
+        "b": {"name": "b", "resources": []},
{"name": "b", "resources": []}, + } + + def fake_many( + _base_dir: Path, + *, + _dataset_ids: list[str] | None = None, + _index_file: Path | None = None, + as_dict: bool = True, + ) -> dict[str, dict[str, object]]: + # Called by build_many_from_yaml; return mapping id -> md + assert as_dict is True + return canned + + monkeypatch.setattr(create_mod, "assemble_many_metadata", fake_many) + + out_dir = tmp_path / "out" + create_mod.build_many_from_yaml(tmp_path / "meta", out_dir) + + a_path = out_dir / "a.json" + b_path = out_dir / "b.json" + assert a_path.exists() + assert b_path.exists() + + assert json.loads(a_path.read_text(encoding="utf-8")) == canned["a"] + assert json.loads(b_path.read_text(encoding="utf-8")) == canned["b"] From 1b2a38ff8ba6fb651504af41ce13d94ab330ab4c Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 19 Nov 2025 12:48:57 +0100 Subject: [PATCH 36/37] 126: fix test --- tests/test_create.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_create.py b/tests/test_create.py index be25b7a5..7fe2dd07 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -91,10 +91,12 @@ def test_build_from_yaml_writes_file_when_output_is_file( # Avoid needing real YAML on disk def fake_assemble( - _base_dir: Path, + base_dir: Path, dataset_id: str, - _index_file: Path | None = None, + index_file: Path | None = None, ) -> dict[str, object]: + # use args to avoid ARG001 + _ = base_dir, index_file assert dataset_id == "powerplants" return expected @@ -120,13 +122,14 @@ def test_build_many_from_yaml_writes_many_default_names( } def fake_many( - _base_dir: Path, + base_dir: Path, *, - _dataset_ids: list[str] | None = None, - _index_file: Path | None = None, + dataset_ids: list[str] | None = None, + index_file: Path | None = None, as_dict: bool = True, ) -> dict[str, dict[str, object]]: # Called by build_many_from_yaml; return mapping id -> md + _ = base_dir, dataset_ids, index_file # avoid ARG001 assert as_dict is True return canned From 32b3b539c7a2c615ed9c71f72da9c940ed811343 Mon Sep 17 00:00:00 2001 From: jh-RLI Date: Wed, 3 Dec 2025 10:22:29 +0100 Subject: [PATCH 37/37] #126: update changelog --- CHANGELOG.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index e1c7c862..207bd1c5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,7 +4,8 @@ Changelog current -------------------- -* +* Add the creation module and create entry: They implement yaml based metadata creation, provide template feature to keep metadata creation DRY, provide functionality to setup the metadata structure & generate metadata from existing sources like datapackages and csv files, provide functionality to create the full datapackage.json and save it to file [(#127)](https://github.com/rl-institut/super-repo/pull/127) + 1.1.0 (2025-03-25) --------------------