diff --git a/tests/test_artifacts_policy.py b/tests/test_artifacts_policy.py index 7cccc30..e06eb87 100644 --- a/tests/test_artifacts_policy.py +++ b/tests/test_artifacts_policy.py @@ -1,6 +1,8 @@ import shutil from pathlib import Path +import duckdb + from toolkit.clean.run import run_clean from toolkit.cli.cmd_profile import profile as profile_cmd from toolkit.core.config import load_config @@ -134,3 +136,56 @@ def test_artifacts_policy_standard_keeps_current_debug_artifacts(tmp_path: Path, assert (profile_dir / "suggested_read.yml").exists() assert (clean_dir / "_run" / "clean_rendered.sql").exists() assert any((mart_dir / "_run").glob("*_rendered.sql")) + + +def test_run_mart_supports_root_posix_placeholder(tmp_path: Path) -> None: + config_path = tmp_path / "dataset.yml" + sql_dir = tmp_path / "sql" + sql_dir.mkdir(parents=True, exist_ok=True) + root_dir = tmp_path / "out" + dataset = "demo_ds" + year = 2022 + + clean_dir = root_dir / "data" / "clean" / dataset / str(year) + clean_dir.mkdir(parents=True, exist_ok=True) + clean_path = clean_dir / f"{dataset}_{year}_clean.parquet" + duckdb.execute(f"COPY (SELECT 1 AS value) TO '{clean_path.as_posix()}' (FORMAT PARQUET)") + + lookup_path = root_dir / "lookup" / "mart_lookup_2022.parquet" + lookup_path.parent.mkdir(parents=True, exist_ok=True) + duckdb.execute( + f"COPY (SELECT 'ok' AS marker) TO '{lookup_path.as_posix()}' (FORMAT PARQUET)" + ) + + (sql_dir / "mart_example.sql").write_text( + "select * from read_parquet('{root_posix}/lookup/mart_lookup_2022.parquet')", + encoding="utf-8", + ) + config_path.write_text( + "\n".join( + [ + f'root: "{root_dir.as_posix()}"', + "dataset:", + f' name: "{dataset}"', + f" years: [{year}]", + "raw: {}", + "clean:", + ' sql: "sql/clean.sql"', + "mart:", + " tables:", + ' - name: "mart_example"', + ' sql: "sql/mart_example.sql"', + ] + ), + encoding="utf-8", + ) + (tmp_path / "sql" / "clean.sql").write_text("select 1 as value", encoding="utf-8") + + cfg = 
def test_resolve_root_canonicalizes_relative_path(tmp_path, monkeypatch):
    """A relative root is canonicalized against the current working directory."""
    # tmp_path is absolute, so the expectation can be computed before chdir.
    expected_root = (tmp_path / "out").resolve()
    monkeypatch.chdir(tmp_path)
    assert resolve_root("out") == expected_root
def test_render_template_raises_clear_error_for_unresolved_placeholder():
    """Rendering fails with a ValueError naming the placeholder the context lacks."""
    sql_template = "select * from read_parquet('{root_posix}/file.parquet')"
    partial_ctx = {"year": 2024}
    expected_message = r"unresolved placeholders.*\{root_posix\}"

    with pytest.raises(ValueError, match=expected_message):
        render_template(sql_template, partial_ctx)
str | None: @@ -40,6 +40,7 @@ def _load_clean_sql( *, dataset: str, year: int, + root: str | Path | None, base_dir: Path | None, ) -> tuple[Path, str, dict[str, Any]]: sql_ref = clean_cfg.get("sql") @@ -50,7 +51,12 @@ def _load_clean_sql( if not sql_path_obj.exists(): raise FileNotFoundError(f"CLEAN SQL file not found: {sql_path_obj}") - template_ctx = {"year": year, "dataset": dataset} + template_ctx = build_runtime_template_ctx( + dataset=dataset, + year=year, + root=root, + base_dir=base_dir, + ) sql = render_template(sql_path_obj.read_text(encoding="utf-8"), template_ctx) return sql_path_obj, sql, template_ctx @@ -160,7 +166,7 @@ def _clean_metadata_payload( "year": year, "sql": _serialize_metadata_path(sql_path_obj, base_dir), "sql_rendered": _serialize_metadata_path(rendered_sql_path, root_dir), - "template_ctx": template_ctx, + "template_ctx": public_template_ctx(template_ctx), "read": clean_cfg.get("read"), "read_mode": read_mode, "read_params_source": read_params_source, @@ -232,6 +238,7 @@ def run_clean( clean_cfg, dataset=dataset, year=year, + root=root_dir, base_dir=base_dir, ) rendered_sql_path = _write_rendered_sql( diff --git a/toolkit/cli/sql_dry_run.py b/toolkit/cli/sql_dry_run.py index f3c4ef1..400614e 100644 --- a/toolkit/cli/sql_dry_run.py +++ b/toolkit/cli/sql_dry_run.py @@ -6,6 +6,7 @@ import duckdb from toolkit.clean.run import _load_clean_sql +from toolkit.core.template import build_runtime_template_ctx from toolkit.core.template import render_template from toolkit.mart.run import _resolve_sql_path as _resolve_mart_sql_path @@ -80,6 +81,7 @@ def _build_clean_preview( cfg.clean, dataset=cfg.dataset, year=year, + root=cfg.root, base_dir=cfg.base_dir, ) clean_sql = _normalize_sql(clean_sql) @@ -111,7 +113,12 @@ def _validate_mart_sql(cfg, *, year: int, con: duckdb.DuckDBPyConnection) -> Non con.execute("CREATE OR REPLACE VIEW clean AS SELECT * FROM clean_input") tables = cfg.mart.get("tables") or [] - template_ctx = {"year": year, "dataset": 
cfg.dataset} + template_ctx = build_runtime_template_ctx( + dataset=cfg.dataset, + year=year, + root=cfg.root, + base_dir=cfg.base_dir, + ) for table in tables: name = table.get("name") diff --git a/toolkit/core/paths.py b/toolkit/core/paths.py index b9dcd5f..36ade6f 100644 --- a/toolkit/core/paths.py +++ b/toolkit/core/paths.py @@ -26,7 +26,7 @@ def from_root_relative(rel: str, root: Path) -> Path: def resolve_root(root: str | os.PathLike[str]) -> Path: - return Path(root).expanduser() + return Path(root).expanduser().resolve() def dataset_dir(root: str | os.PathLike[str], layer: str, dataset: str) -> Path: diff --git a/toolkit/core/template.py b/toolkit/core/template.py index 0623c1e..45f9413 100644 --- a/toolkit/core/template.py +++ b/toolkit/core/template.py @@ -1,7 +1,11 @@ from __future__ import annotations +import re +from pathlib import Path from typing import Any +_UNRESOLVED_PLACEHOLDER_RE = re.compile(r"\{[A-Za-z_][A-Za-z0-9_]*\}") + def render_template(text: str, ctx: dict[str, Any]) -> str: """ @@ -13,4 +17,50 @@ def render_template(text: str, ctx: dict[str, Any]) -> str: out = text for k, v in ctx.items(): out = out.replace("{" + k + "}", str(v)) + unresolved = sorted(set(_UNRESOLVED_PLACEHOLDER_RE.findall(out))) + if unresolved: + raise ValueError( + "Template contains unresolved placeholders after render: " + + ", ".join(unresolved) + ) return out + + +def build_runtime_template_ctx( + *, + dataset: str, + year: int, + root: str | Path | None = None, + base_dir: Path | None = None, +) -> dict[str, Any]: + """ + Build the minimal deterministic template context exposed to SQL runtime. + + Existing placeholders `{year}` and `{dataset}` remain stable; additional + path placeholders are additive-only and let SQL bind to the effective root + without depending on the current working directory. + + Path placeholders trust that `root` and `base_dir` are already canonical + runtime paths. Callers should resolve them before building the context. 
+ """ + ctx: dict[str, Any] = {"year": year, "dataset": dataset} + if root is not None: + root_path = Path(root) + ctx["root"] = str(root_path) + ctx["root_posix"] = root_path.as_posix() + if base_dir is not None: + ctx["base_dir"] = str(base_dir) + ctx["base_dir_posix"] = base_dir.as_posix() + return ctx + + +def public_template_ctx(ctx: dict[str, Any]) -> dict[str, Any]: + """ + Return the stable public subset safe to persist in metadata. + + Runtime-only path helpers such as `root_posix` and `base_dir_posix` are + intentionally excluded so metadata stays portable and does not leak + absolute filesystem paths. + """ + public_keys = ("year", "dataset") + return {key: ctx[key] for key in public_keys if key in ctx} diff --git a/toolkit/mart/run.py b/toolkit/mart/run.py index 327fc63..c557a74 100644 --- a/toolkit/mart/run.py +++ b/toolkit/mart/run.py @@ -8,7 +8,7 @@ from toolkit.core.artifacts import ARTIFACT_POLICY_DEBUG, resolve_artifact_policy, should_write from toolkit.core.metadata import config_hash_for_year, file_record, write_layer_manifest, write_metadata from toolkit.core.paths import layer_year_dir, resolve_root, to_root_relative -from toolkit.core.template import render_template +from toolkit.core.template import build_runtime_template_ctx, public_template_ctx, render_template def _serialize_metadata_path(path: Path | None, rel_root: Path | None) -> str | None: @@ -67,7 +67,12 @@ def run_mart( if not isinstance(tables, list) or not tables: raise ValueError("mart.tables missing or empty in dataset.yml") - template_ctx = {"year": year, "dataset": dataset} + template_ctx = build_runtime_template_ctx( + dataset=dataset, + year=year, + root=root_dir, + base_dir=base_dir, + ) run_dir: Path | None = None if should_write("mart", "rendered_sql", policy, {"output": output_cfg or {}}): @@ -139,7 +144,7 @@ def run_mart( "inputs": [file_record(p) for p in clean_files], "outputs": outputs, "output_paths": [_serialize_metadata_path(p, root_dir) for p in written], - 
"template_ctx": template_ctx, + "template_ctx": public_template_ctx(template_ctx), "tables": executed, } if policy == ARTIFACT_POLICY_DEBUG: