Skip to content

Commit 4f443e9

Browse files
authored
test: hardening CI path semantics per candidate depth-2, sources depth-3, support dataset (#61)
* test: hardening CI path semantics per candidate depth-2, sources depth-3, support dataset - aggiunge guard opzionale repo_root in DatasetConfig per validare che effective_root resti dentro la repo in contesti CI - espone il guard tramite load_config(..., repo_root=...) - aggiunge docstring esplicito: il guard è pensato per caller esterni (es. CI di monorepo come dataset-incubator), non per uso generale - copre con test i layout reali: candidate depth-2, sources/* depth-3, support dataset, path assoluto dentro repo, errore fuori repo, retrocompatibilità senza guard Closes #54 * fix: risolvi double-resolve e aggiungi is_dir check su repo_root - _ensure_root_within_repo ora riceve path già risolti dal caller (docstring esplicito, no double expanduser/resolve nel guard) - call site in load_config_model risolve repo_root prima di passarlo - aggiunto check is_dir() su repo_root con errore esplicito se il path non esiste o non è una directory
1 parent 69ce8f4 commit 4f443e9

3 files changed

Lines changed: 162 additions & 3 deletions

File tree

tests/test_config.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,117 @@ def test_load_config_uses_base_dir_when_root_missing_and_dcl_root_missing(tmp_pa
293293
assert cfg.root_source == "base_dir_fallback"
294294

295295

296+
@pytest.mark.parametrize(
297+
("dataset_rel", "root_value"),
298+
[
299+
("candidates/demo_dataset", "../../out"),
300+
("candidates/demo_dataset/sources/demo_source", "../../../../out"),
301+
("support_datasets/demo_support", "../../out"),
302+
],
303+
)
304+
def test_load_config_resolves_repo_out_for_dataset_incubator_layouts(
305+
tmp_path: Path,
306+
dataset_rel: str,
307+
root_value: str,
308+
):
309+
repo_root = tmp_path / "dataset-incubator"
310+
dataset_dir = repo_root / Path(dataset_rel)
311+
dataset_dir.mkdir(parents=True, exist_ok=True)
312+
yml = dataset_dir / "dataset.yml"
313+
yml.write_text(
314+
f"""
315+
root: "{root_value}"
316+
dataset:
317+
name: demo
318+
years: [2022]
319+
raw: {{}}
320+
clean: {{}}
321+
mart: {{}}
322+
""".strip(),
323+
encoding="utf-8",
324+
)
325+
326+
cfg = load_config(yml, repo_root=repo_root)
327+
328+
assert cfg.root == (repo_root / "out").resolve()
329+
assert cfg.root_source == "yml"
330+
331+
332+
def test_load_config_accepts_absolute_root_within_repo_when_repo_root_is_provided(tmp_path: Path):
333+
repo_root = tmp_path / "dataset-incubator"
334+
dataset_dir = repo_root / "candidates" / "demo_dataset"
335+
dataset_dir.mkdir(parents=True, exist_ok=True)
336+
allowed_root = (repo_root / "out").resolve()
337+
yml = dataset_dir / "dataset.yml"
338+
yml.write_text(
339+
f"""
340+
root: "{allowed_root.as_posix()}"
341+
dataset:
342+
name: demo
343+
years: [2022]
344+
raw: {{}}
345+
clean: {{}}
346+
mart: {{}}
347+
""".strip(),
348+
encoding="utf-8",
349+
)
350+
351+
cfg = load_config(yml, repo_root=repo_root)
352+
353+
assert cfg.root == allowed_root
354+
assert cfg.root_source == "yml"
355+
356+
357+
def test_load_config_rejects_root_outside_repo_when_repo_root_is_provided(tmp_path: Path):
358+
repo_root = tmp_path / "dataset-incubator"
359+
dataset_dir = repo_root / "candidates" / "demo_dataset"
360+
dataset_dir.mkdir(parents=True, exist_ok=True)
361+
outside_root = tmp_path / "outside"
362+
yml = dataset_dir / "dataset.yml"
363+
yml.write_text(
364+
f"""
365+
root: "{outside_root.as_posix()}"
366+
dataset:
367+
name: demo
368+
years: [2022]
369+
raw: {{}}
370+
clean: {{}}
371+
mart: {{}}
372+
""".strip(),
373+
encoding="utf-8",
374+
)
375+
376+
with pytest.raises(ValueError) as exc:
377+
load_config(yml, repo_root=repo_root)
378+
379+
assert "root resolves outside repo_root" in str(exc.value)
380+
assert str(outside_root.resolve()) in str(exc.value)
381+
assert str(repo_root.resolve()) in str(exc.value)
382+
383+
384+
def test_load_config_allows_root_outside_repo_without_repo_root_guard(tmp_path: Path):
385+
repo_root = tmp_path / "dataset-incubator"
386+
dataset_dir = repo_root / "candidates" / "demo_dataset"
387+
dataset_dir.mkdir(parents=True, exist_ok=True)
388+
yml = dataset_dir / "dataset.yml"
389+
yml.write_text(
390+
"""
391+
root: "../../../outside"
392+
dataset:
393+
name: demo
394+
years: [2022]
395+
raw: {}
396+
clean: {}
397+
mart: {}
398+
""".strip(),
399+
encoding="utf-8",
400+
)
401+
402+
cfg = load_config(yml)
403+
404+
assert cfg.root == (tmp_path / "outside").resolve()
405+
406+
296407
def test_load_config_rejects_legacy_clean_read_csv_shape(tmp_path: Path):
297408
project_dir = tmp_path / "project"
298409
project_dir.mkdir()

toolkit/core/config.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,13 @@ def _compat_cross_year(model: ToolkitConfigModel) -> dict[str, Any]:
7070
)
7171

7272

73-
def load_config(path: str | Path, *, strict_config: bool = False) -> ToolkitConfig:
74-
model = load_config_model(path, strict_config=strict_config)
73+
def load_config(
74+
path: str | Path,
75+
*,
76+
strict_config: bool = False,
77+
repo_root: str | Path | None = None,
78+
) -> ToolkitConfig:
79+
model = load_config_model(path, strict_config=strict_config, repo_root=repo_root)
7580
return ToolkitConfig(
7681
base_dir=model.base_dir,
7782
schema_version=model.schema_version,

toolkit/core/config_models.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,26 @@ def _resolve_root(root: Any, *, base_dir: Path) -> tuple[Path, str]:
585585
source = "env:TOOLKIT_OUTDIR" if os.environ.get("TOOLKIT_OUTDIR") else "env:DCL_OUTDIR"
586586
return Path(managed_outdir).expanduser().resolve(), source
587587
return _resolve_path_value(root, base_dir=base_dir), "yml"
588+
589+
590+
def _ensure_root_within_repo(root: Path, *, repo_root: Path, path: Path) -> Path:
591+
"""
592+
Verify that `root` is contained within `repo_root` using resolved paths.
593+
Both `root` and `repo_root` must already be fully resolved by the caller.
594+
Returns `root` unchanged on success; raises ValueError on violation.
595+
`path` is the config file path, used only for error context.
596+
Note: this guard checks only the output root directory, not SQL input paths.
597+
"""
598+
try:
599+
root.relative_to(repo_root)
600+
except ValueError as exc:
601+
raise _err(
602+
f"root resolves outside repo_root: root={root} repo_root={repo_root}",
603+
path=path,
604+
) from exc
605+
return root
606+
607+
588608
def _emit_deprecation_notice(
589609
key: str,
590610
*,
@@ -730,7 +750,22 @@ def _read_strict_config(data: dict[str, Any], *, path: Path) -> bool:
730750
return parse_bool(strict_value, "config.strict")
731751

732752

733-
def load_config_model(path: str | Path, *, strict_config: bool = False) -> ToolkitConfigModel:
753+
def load_config_model(
754+
path: str | Path,
755+
*,
756+
strict_config: bool = False,
757+
repo_root: str | Path | None = None,
758+
) -> ToolkitConfigModel:
759+
"""
760+
Load and normalize toolkit config.
761+
762+
repo_root is an optional guardrail for callers that need to enforce that
763+
the resolved effective root stays inside a known repository tree. This is
764+
intentionally opt-in so the toolkit can still support valid workflows that
765+
write outputs outside the project directory. A typical caller is external
766+
CI that validates dataset.yml contracts for monorepos such as
767+
dataset-incubator.
768+
"""
734769
p = Path(path)
735770
base_dir = p.parent.resolve()
736771

@@ -752,6 +787,14 @@ def load_config_model(path: str | Path, *, strict_config: bool = False) -> Toolk
752787
normalized = _normalize_legacy_payload(data, path=p, strict_config=strict_mode)
753788
normalized = _warn_or_reject_unknown_keys(normalized, path=p, strict_config=strict_mode)
754789
root_path, root_source = _resolve_root(normalized.get("root"), base_dir=base_dir)
790+
if repo_root is not None:
791+
repo_root_path = Path(repo_root).expanduser().resolve()
792+
if not repo_root_path.is_dir():
793+
raise _err(
794+
f"repo_root does not exist or is not a directory: {repo_root_path}",
795+
path=p,
796+
)
797+
root_path = _ensure_root_within_repo(root_path, repo_root=repo_root_path, path=p)
755798

756799
raw = normalized.get("raw", {}) or {}
757800
clean = normalized.get("clean", {}) or {}

0 commit comments

Comments
 (0)