Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions tests/test_artifacts_policy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import shutil
from pathlib import Path

import duckdb

from toolkit.clean.run import run_clean
from toolkit.cli.cmd_profile import profile as profile_cmd
from toolkit.core.config import load_config
Expand Down Expand Up @@ -134,3 +136,56 @@ def test_artifacts_policy_standard_keeps_current_debug_artifacts(tmp_path: Path,
assert (profile_dir / "suggested_read.yml").exists()
assert (clean_dir / "_run" / "clean_rendered.sql").exists()
assert any((mart_dir / "_run").glob("*_rendered.sql"))


def test_run_mart_supports_root_posix_placeholder(tmp_path: Path) -> None:
    """End-to-end check that mart SQL may bind files via the `{root_posix}` placeholder."""
    cfg_file = tmp_path / "dataset.yml"
    sql_root = tmp_path / "sql"
    sql_root.mkdir(parents=True, exist_ok=True)
    out_root = tmp_path / "out"
    ds_name = "demo_ds"
    ds_year = 2022

    # Seed a minimal CLEAN layer output so run_mart has an input to read.
    clean_year_dir = out_root / "data" / "clean" / ds_name / str(ds_year)
    clean_year_dir.mkdir(parents=True, exist_ok=True)
    clean_parquet = clean_year_dir / f"{ds_name}_{ds_year}_clean.parquet"
    duckdb.execute(f"COPY (SELECT 1 AS value) TO '{clean_parquet.as_posix()}' (FORMAT PARQUET)")

    # A lookup parquet under the effective root; the mart SQL reaches it
    # through {root_posix} instead of relying on the working directory.
    lookup_parquet = out_root / "lookup" / "mart_lookup_2022.parquet"
    lookup_parquet.parent.mkdir(parents=True, exist_ok=True)
    duckdb.execute(
        f"COPY (SELECT 'ok' AS marker) TO '{lookup_parquet.as_posix()}' (FORMAT PARQUET)"
    )

    (sql_root / "mart_example.sql").write_text(
        "select * from read_parquet('{root_posix}/lookup/mart_lookup_2022.parquet')",
        encoding="utf-8",
    )
    (sql_root / "clean.sql").write_text("select 1 as value", encoding="utf-8")

    cfg_lines = [
        f'root: "{out_root.as_posix()}"',
        "dataset:",
        f' name: "{ds_name}"',
        f" years: [{ds_year}]",
        "raw: {}",
        "clean:",
        ' sql: "sql/clean.sql"',
        "mart:",
        " tables:",
        ' - name: "mart_example"',
        ' sql: "sql/mart_example.sql"',
    ]
    cfg_file.write_text("\n".join(cfg_lines), encoding="utf-8")

    cfg = load_config(cfg_file)
    logger = _NoopLogger()
    result = run_mart(cfg.dataset, ds_year, cfg.root, cfg.mart, logger, base_dir=cfg.base_dir, output_cfg=cfg.output)

    mart_output = out_root / "data" / "mart" / ds_name / str(ds_year) / "mart_example.parquet"
    assert mart_output.exists()
    assert duckdb.execute(f"SELECT marker FROM read_parquet('{mart_output.as_posix()}')").fetchone() == ("ok",)
    assert result["output_rows"] == 1
8 changes: 7 additions & 1 deletion tests/test_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@

def test_resolve_root_returns_expanded_explicit_path(tmp_path):
    """An explicit path must come back expanded and fully canonicalized."""
    # The pasted diff left the stale pre-change assertion stacked above the
    # new one; only the resolved comparison reflects the intended behavior.
    root = resolve_root(tmp_path / "out")
    assert root == (tmp_path / "out").resolve()


def test_resolve_root_canonicalizes_relative_path(tmp_path, monkeypatch):
    """A relative root resolves against the current working directory."""
    monkeypatch.chdir(tmp_path)
    expected = (tmp_path / "out").resolve()
    assert resolve_root("out") == expected


def test_resolve_root_requires_explicit_value():
Expand Down
44 changes: 44 additions & 0 deletions tests/test_run_dry_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
from pathlib import Path

import duckdb
from typer.testing import CliRunner

from toolkit.cli.app import app
Expand Down Expand Up @@ -165,6 +166,49 @@ def test_run_dry_run_accepts_unquoted_raw_columns_without_read_columns(tmp_path:
assert "sql_validation: OK" in result.output


def test_run_dry_run_accepts_mart_sql_with_root_posix_placeholder(tmp_path: Path) -> None:
    """`run all --dry-run` must validate mart SQL that uses `{root_posix}`."""
    mart_sql_dir = tmp_path / "sql" / "mart"
    mart_sql_dir.mkdir(parents=True, exist_ok=True)
    out_root = tmp_path / "out"

    # Materialize the lookup parquet the placeholder-based SQL points at,
    # so dry-run validation can actually bind the read_parquet reference.
    lookup = out_root / "lookup" / "mart_lookup_2022.parquet"
    lookup.parent.mkdir(parents=True, exist_ok=True)
    duckdb.execute(
        f"COPY (SELECT 1 AS lookup_value) TO '{lookup.as_posix()}' (FORMAT PARQUET)"
    )

    (tmp_path / "sql" / "clean.sql").write_text("select 1 as value", encoding="utf-8")
    (mart_sql_dir / "mart_example.sql").write_text(
        "select * from read_parquet('{root_posix}/lookup/mart_lookup_2022.parquet')",
        encoding="utf-8",
    )

    cfg_file = tmp_path / "dataset.yml"
    cfg_lines = [
        f'root: "{out_root.as_posix()}"',
        "dataset:",
        ' name: "demo_ds"',
        " years: [2022]",
        "raw: {}",
        "clean:",
        ' sql: "sql/clean.sql"',
        "mart:",
        " tables:",
        ' - name: "mart_example"',
        ' sql: "sql/mart/mart_example.sql"',
    ]
    cfg_file.write_text("\n".join(cfg_lines), encoding="utf-8")

    cli = CliRunner()
    outcome = cli.invoke(app, ["run", "all", "--config", str(cfg_file), "--dry-run"])

    assert outcome.exit_code == 0
    assert "sql_validation: OK" in outcome.output


def test_run_year_logs_effective_root_context(tmp_path: Path, caplog) -> None:
sql_dir = tmp_path / "sql" / "mart"
sql_dir.mkdir(parents=True, exist_ok=True)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pytest

from toolkit.core.template import render_template


def test_render_template_raises_clear_error_for_unresolved_placeholder():
    """Rendering with a missing key must fail loudly and name the placeholder."""
    template = "select * from read_parquet('{root_posix}/file.parquet')"
    with pytest.raises(ValueError, match=r"unresolved placeholders.*\{root_posix\}"):
        render_template(template, {"year": 2024})
13 changes: 10 additions & 3 deletions toolkit/clean/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from toolkit.core.artifacts import ARTIFACT_POLICY_DEBUG, resolve_artifact_policy, should_write
from toolkit.core.metadata import config_hash_for_year, file_record, write_layer_manifest, write_metadata
from toolkit.core.paths import layer_year_dir, resolve_root, to_root_relative
from toolkit.core.template import render_template
from toolkit.core.template import build_runtime_template_ctx, public_template_ctx, render_template


def _serialize_metadata_path(path: Path | None, rel_root: Path | None) -> str | None:
Expand All @@ -40,6 +40,7 @@ def _load_clean_sql(
*,
dataset: str,
year: int,
root: str | Path | None,
base_dir: Path | None,
) -> tuple[Path, str, dict[str, Any]]:
sql_ref = clean_cfg.get("sql")
Expand All @@ -50,7 +51,12 @@ def _load_clean_sql(
if not sql_path_obj.exists():
raise FileNotFoundError(f"CLEAN SQL file not found: {sql_path_obj}")

template_ctx = {"year": year, "dataset": dataset}
template_ctx = build_runtime_template_ctx(
dataset=dataset,
year=year,
root=root,
base_dir=base_dir,
)
sql = render_template(sql_path_obj.read_text(encoding="utf-8"), template_ctx)
return sql_path_obj, sql, template_ctx

Expand Down Expand Up @@ -160,7 +166,7 @@ def _clean_metadata_payload(
"year": year,
"sql": _serialize_metadata_path(sql_path_obj, base_dir),
"sql_rendered": _serialize_metadata_path(rendered_sql_path, root_dir),
"template_ctx": template_ctx,
"template_ctx": public_template_ctx(template_ctx),
"read": clean_cfg.get("read"),
"read_mode": read_mode,
"read_params_source": read_params_source,
Expand Down Expand Up @@ -232,6 +238,7 @@ def run_clean(
clean_cfg,
dataset=dataset,
year=year,
root=root_dir,
base_dir=base_dir,
)
rendered_sql_path = _write_rendered_sql(
Expand Down
9 changes: 8 additions & 1 deletion toolkit/cli/sql_dry_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import duckdb

from toolkit.clean.run import _load_clean_sql
from toolkit.core.template import build_runtime_template_ctx
from toolkit.core.template import render_template
from toolkit.mart.run import _resolve_sql_path as _resolve_mart_sql_path

Expand Down Expand Up @@ -80,6 +81,7 @@ def _build_clean_preview(
cfg.clean,
dataset=cfg.dataset,
year=year,
root=cfg.root,
base_dir=cfg.base_dir,
)
clean_sql = _normalize_sql(clean_sql)
Expand Down Expand Up @@ -111,7 +113,12 @@ def _validate_mart_sql(cfg, *, year: int, con: duckdb.DuckDBPyConnection) -> Non
con.execute("CREATE OR REPLACE VIEW clean AS SELECT * FROM clean_input")

tables = cfg.mart.get("tables") or []
template_ctx = {"year": year, "dataset": cfg.dataset}
template_ctx = build_runtime_template_ctx(
dataset=cfg.dataset,
year=year,
root=cfg.root,
base_dir=cfg.base_dir,
)

for table in tables:
name = table.get("name")
Expand Down
2 changes: 1 addition & 1 deletion toolkit/core/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def from_root_relative(rel: str, root: Path) -> Path:


def resolve_root(root: str | os.PathLike[str]) -> Path:
    """Expand ``~`` and canonicalize *root* to an absolute, symlink-free path.

    Resolving here (rather than at each call site) keeps downstream path
    handling independent of the current working directory. The pasted diff
    had the old unresolved ``return`` stacked above this one, which would
    have made the ``.resolve()`` change unreachable.
    """
    return Path(root).expanduser().resolve()


def dataset_dir(root: str | os.PathLike[str], layer: str, dataset: str) -> Path:
Expand Down
50 changes: 50 additions & 0 deletions toolkit/core/template.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from __future__ import annotations

import re
from pathlib import Path
from typing import Any

_UNRESOLVED_PLACEHOLDER_RE = re.compile(r"\{[A-Za-z_][A-Za-z0-9_]*\}")


def render_template(text: str, ctx: dict[str, Any]) -> str:
"""
Expand All @@ -13,4 +17,50 @@ def render_template(text: str, ctx: dict[str, Any]) -> str:
out = text
for k, v in ctx.items():
out = out.replace("{" + k + "}", str(v))
unresolved = sorted(set(_UNRESOLVED_PLACEHOLDER_RE.findall(out)))
if unresolved:
raise ValueError(
"Template contains unresolved placeholders after render: "
+ ", ".join(unresolved)
)
return out


def build_runtime_template_ctx(
    *,
    dataset: str,
    year: int,
    root: str | Path | None = None,
    base_dir: str | Path | None = None,
) -> dict[str, Any]:
    """
    Build the minimal deterministic template context exposed to SQL runtime.

    Existing placeholders `{year}` and `{dataset}` remain stable; additional
    path placeholders are additive-only and let SQL bind to the effective root
    without depending on the current working directory.

    Path placeholders trust that `root` and `base_dir` are already canonical
    runtime paths. Callers should resolve them before building the context.

    `base_dir` accepts `str` as well as `Path` for symmetry with `root`
    (backward compatible: `Path` arguments behave exactly as before).
    """
    ctx: dict[str, Any] = {"year": year, "dataset": dataset}
    if root is not None:
        root_path = Path(root)
        ctx["root"] = str(root_path)
        ctx["root_posix"] = root_path.as_posix()
    if base_dir is not None:
        # Normalize before formatting so a str base_dir gains `.as_posix()`.
        base_path = Path(base_dir)
        ctx["base_dir"] = str(base_path)
        ctx["base_dir_posix"] = base_path.as_posix()
    return ctx


def public_template_ctx(ctx: dict[str, Any]) -> dict[str, Any]:
    """
    Project *ctx* down to the stable public keys safe to persist in metadata.

    Runtime-only path helpers such as `root_posix` and `base_dir_posix` are
    intentionally excluded so metadata stays portable and does not leak
    absolute filesystem paths.
    """
    subset: dict[str, Any] = {}
    for key in ("year", "dataset"):
        if key in ctx:
            subset[key] = ctx[key]
    return subset
11 changes: 8 additions & 3 deletions toolkit/mart/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from toolkit.core.artifacts import ARTIFACT_POLICY_DEBUG, resolve_artifact_policy, should_write
from toolkit.core.metadata import config_hash_for_year, file_record, write_layer_manifest, write_metadata
from toolkit.core.paths import layer_year_dir, resolve_root, to_root_relative
from toolkit.core.template import render_template
from toolkit.core.template import build_runtime_template_ctx, public_template_ctx, render_template


def _serialize_metadata_path(path: Path | None, rel_root: Path | None) -> str | None:
Expand Down Expand Up @@ -67,7 +67,12 @@ def run_mart(
if not isinstance(tables, list) or not tables:
raise ValueError("mart.tables missing or empty in dataset.yml")

template_ctx = {"year": year, "dataset": dataset}
template_ctx = build_runtime_template_ctx(
dataset=dataset,
year=year,
root=root_dir,
base_dir=base_dir,
)

run_dir: Path | None = None
if should_write("mart", "rendered_sql", policy, {"output": output_cfg or {}}):
Expand Down Expand Up @@ -139,7 +144,7 @@ def run_mart(
"inputs": [file_record(p) for p in clean_files],
"outputs": outputs,
"output_paths": [_serialize_metadata_path(p, root_dir) for p in written],
"template_ctx": template_ctx,
"template_ctx": public_template_ctx(template_ctx),
"tables": executed,
}
if policy == ARTIFACT_POLICY_DEBUG:
Expand Down
Loading