From 2a5dd2c45ba62956fe4119f12a9d6743054e3784 Mon Sep 17 00:00:00 2001
From: Zio Gabber <78922322+Gabrymi93@users.noreply.github.com>
Date: Thu, 2 Apr 2026 17:12:56 +0100
Subject: [PATCH] Guard mart table names and clean_input usage

---
Notes:
    Reviewer summary. This section sits after the three-dash separator and
    is not part of the commit message.

    toolkit/core/config_models.py
      * Adds the module-level pattern string _SAFE_SQL_IDENTIFIER_RE.
      * Adds a "name" field validator to MartTableConfig. It strips
        surrounding whitespace, raises ValueError when the result is empty
        or does not fully match _SAFE_SQL_IDENTIFIER_RE, and returns the
        stripped text.
      * The validator imports "re" inside its own body rather than at
        module scope.

    toolkit/mart/run.py
      * Imports "re" at module scope and compiles _CLEAN_INPUT_TOKEN_RE, a
        case-insensitive, word-bounded match for "clean_input".
      * In run_mart, right after render_template(), raises ValueError when
        clean_sql_configured is falsy and the rendered SQL matches
        _CLEAN_INPUT_TOKEN_RE.
      * NOTE(review): the guard is a regex search over the whole rendered
        SQL text, so an occurrence of clean_input inside a SQL comment or
        a string literal also triggers the error. Confirm that is intended.
      * NOTE(review): clean_sql_configured is not defined in any hunk shown
        here; presumably it is assigned earlier in run_mart. Verify.

    NOTE(review): this copy was rebuilt from a whitespace-collapsed version
    of the patch. Indentation and blank context lines are reconstructed;
    the indentation depth of the run_mart hunk in particular is assumed.
    Regenerate the patch from the upstream commit before applying it.

 toolkit/core/config_models.py | 16 ++++++++++++++++
 toolkit/mart/run.py           |  9 +++++++++
 2 files changed, 25 insertions(+)

diff --git a/toolkit/core/config_models.py b/toolkit/core/config_models.py
index f2a33ee..58279ed 100644
--- a/toolkit/core/config_models.py
+++ b/toolkit/core/config_models.py
@@ -14,6 +14,7 @@
 logger = logging.getLogger("toolkit.core.config")
 
 _MANAGED_OUTPUT_ROOTS = {"_smoke_out", "_test_out"}
+_SAFE_SQL_IDENTIFIER_RE = r"^[A-Za-z_][A-Za-z0-9_]*$"
 
 
 @dataclass(frozen=True)
@@ -319,6 +320,21 @@ class MartTableConfig(BaseModel):
     name: str
     sql: Path
 
+    @field_validator("name")
+    @classmethod
+    def _validate_name(cls, value: str) -> str:
+        text = value.strip()
+        if not text:
+            raise ValueError("mart.tables[].name must not be empty")
+        import re
+
+        if not re.fullmatch(_SAFE_SQL_IDENTIFIER_RE, text):
+            raise ValueError(
+                "mart.tables[].name must be a safe SQL identifier "
+                "(letters, numbers, underscore; cannot start with a number)"
+            )
+        return text
+
 
 class CrossYearTableConfig(BaseModel):
     model_config = ConfigDict(extra="forbid")
diff --git a/toolkit/mart/run.py b/toolkit/mart/run.py
index 4471a27..bee21e2 100644
--- a/toolkit/mart/run.py
+++ b/toolkit/mart/run.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from pathlib import Path
 from typing import Any
 
@@ -11,6 +12,9 @@
 from toolkit.core.template import build_runtime_template_ctx, public_template_ctx, render_template
 
 
+_CLEAN_INPUT_TOKEN_RE = re.compile(r"\bclean_input\b", re.IGNORECASE)
+
+
 def _serialize_metadata_path(path: Path | None, rel_root: Path | None) -> str | None:
     if path is None:
         return None
@@ -106,6 +110,11 @@ def run_mart(
         sql = sql_path.read_text(encoding="utf-8")
         sql = render_template(sql, template_ctx)
 
+        if not clean_sql_configured and _CLEAN_INPUT_TOKEN_RE.search(sql):
+            raise ValueError(
+                "MART SQL references clean_input but clean.sql is not configured in dataset.yml"
+            )
+
         # Save rendered SQL for audit/debug
         rendered_sql_path: Path | None = None
         if run_dir is not None: