diff --git a/CHANGELOG.md b/CHANGELOG.md index 13d8039..3172aef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,19 @@ All notable changes to this project will be documented in this file. +## [Unreleased] + +### Removed + +- Legacy config forms below no longer emit deprecation warnings and now fail with explicit config errors: + - `raw.source` + - `raw.sources[].plugin` + - `raw.sources[].id` + - scalar `clean.read` + - `clean.read.csv.*` + - `clean.sql_path` + - `mart.sql_dir` + ## [1.0.0] - 2026-02-28 ### Added @@ -41,6 +54,4 @@ All notable changes to this project will be documented in this file. - `raw.sources[].id` in favor of `raw.sources[].name` - scalar `clean.read` in favor of `clean.read.source` - `clean.read.csv.*` in favor of `clean.read.*` -- `clean.sql_path` -- `mart.sql_dir` - `bq` diff --git a/docs/advanced-workflows.md b/docs/advanced-workflows.md index fdf1bb3..3fd48f2 100644 --- a/docs/advanced-workflows.md +++ b/docs/advanced-workflows.md @@ -95,8 +95,6 @@ Regola pratica: ## Compat legacy -Il toolkit mantiene compatibilita` con alcune forme legacy del config per facilitare la migrazione. - Per i repo nuovi: - usa la shape canonica documentata in [config-schema.md](./config-schema.md) diff --git a/docs/config-schema.md b/docs/config-schema.md index 112a123..651fd2c 100644 --- a/docs/config-schema.md +++ b/docs/config-schema.md @@ -222,16 +222,23 @@ Con `config.strict: true` o `--strict-config`, gli stessi casi diventano errori. | Code | Legacy | Replacement | Status | |---|---|---|---| -| `DCL001` | `raw.source` | `raw.sources` | deprecated | -| `DCL002` | `raw.sources[].plugin` | `raw.sources[].type` | deprecated | -| `DCL003` | `raw.sources[].id` | `raw.sources[].name` | deprecated | -| `DCL004` | `clean.read: "auto"` | `clean.read.source: auto` | deprecated | -| `DCL005` | `clean.read.csv.*` | `clean.read.*` | deprecated | -| `DCL006` | `clean.sql_path` | `clean.sql` | ignored | -| `DCL007` | `mart.sql_dir` | `mart.tables[].sql` | ignored | | `DCL008` | `bq` | rimuovere il campo | ignored | | `DCL013` | `cross_year.* unknown keys` | rimuovere il campo | ignored | +## Legacy rimosso + +Le forme seguenti non sono piu supportate. Non generano warning legacy: falliscono subito con errore di config e va usata la shape canonica. + +| Legacy rimosso | Usa invece | +|---|---| +| `raw.source` | `raw.sources` | +| `raw.sources[].plugin` | `raw.sources[].type` | +| `raw.sources[].id` | `raw.sources[].name` | +| `clean.read: "auto"` | `clean.read.source: auto` | +| `clean.read.csv.*` | `clean.read.*` | +| `clean.sql_path` | `clean.sql` | +| `mart.sql_dir` | `mart.tables[].sql` | + ## Esempi minimi ### RAW only @@ -312,7 +319,7 @@ Esempi tipici: - `Config validation failed: output.unknown_flag: Extra inputs are not permitted` - `Config validation failed: raw.sources: Input should be a valid list` - `Config validation failed: clean.validate.primary_key: clean.validate.primary_key must be a string or a list of strings` -- `DCL001 raw.source is deprecated, usare raw.sources` +- `Config validation failed: raw.sources: Input should be a valid list` Regola pratica: diff --git a/tests/test_clean_csv_columns.py b/tests/test_clean_csv_columns.py index 8bb741c..6d34354 100644 --- a/tests/test_clean_csv_columns.py +++ b/tests/test_clean_csv_columns.py @@ -33,16 +33,14 @@ def test_run_clean_csv_columns_reads_trailing_delimiter_csv(tmp_path: Path): "sql": str(sql_path), "read": { "mode": "latest", - "csv": { - "delim": ";", - "header": True, - "ignore_errors": True, - "null_padding": True, - "trim_whitespace": True, - "columns": { - "a": "VARCHAR", - "b": "VARCHAR", - }, + "delim": ";", + "header": True, + "ignore_errors": True, + "null_padding": True, + "trim_whitespace": True, + "columns": { + "a": "VARCHAR", + "b": "VARCHAR", }, }, }, diff --git a/tests/test_clean_duckdb_read.py b/tests/test_clean_duckdb_read.py index ad36bc5..be6fffc 100644 --- a/tests/test_clean_duckdb_read.py +++ b/tests/test_clean_duckdb_read.py @@ -370,7 +370,7 @@ def test_resolve_clean_read_cfg_config_only_ignores_suggested(tmp_path: Path): _, relation_cfg, params_source = duckdb_read.resolve_clean_read_cfg( raw_dir, - {"read": "config_only"}, + {"read": {"source": "config_only"}}, logging.getLogger("tests.clean.duckdb_read.config_only"), ) diff --git a/tests/test_config.py b/tests/test_config.py index a4e79c0..88a259a 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,11 +3,18 @@ import logging import pytest -from toolkit.cli.cmd_run import run as run_cmd from toolkit.core.config import ensure_str_list, load_config, parse_bool from toolkit.core.config_models import load_config_model +def _bind_config_logger(caplog, monkeypatch): + module_logger = logging.getLogger("toolkit.core.config") + monkeypatch.setattr(module_logger, "handlers", [caplog.handler]) + monkeypatch.setattr(module_logger, "propagate", False) + module_logger.setLevel(logging.WARNING) + caplog.set_level(logging.WARNING, logger="toolkit.core.config") + + def test_load_config_ok(tmp_path: Path): yml = tmp_path / "dataset.yml" yml.write_text( @@ -64,10 +71,10 @@ def test_load_config_resolves_relative_paths_from_dataset_dir(tmp_path: Path): name: demo years: [2022] raw: - source: - type: local_file - args: - path: "data/raw.csv" + sources: + - type: local_file + args: + path: "data/raw.csv" clean: sql: "sql/clean.sql" mart: @@ -88,7 +95,7 @@ def test_load_config_resolves_relative_paths_from_dataset_dir(tmp_path: Path): assert cfg.base_dir == project_dir.resolve() assert cfg.root == (project_dir / "out").resolve() assert cfg.root_source == "yml" - assert cfg.raw["source"]["args"]["path"] == (project_dir / "data" / "raw.csv").resolve() + assert cfg.raw["sources"][0]["args"]["path"] == (project_dir / "data" / "raw.csv").resolve() assert cfg.clean["sql"] == (project_dir / "sql" / "clean.sql").resolve() assert cfg.mart["tables"][0]["sql"] == (project_dir / "sql" / "mart" / "demo.sql").resolve() assert cfg.cross_year["tables"][0]["sql"] == (project_dir / "sql" / "cross" / "demo_cross.sql").resolve() @@ -106,11 +113,11 @@ def test_load_config_does_not_transform_non_whitelisted_path_like_fields(tmp_pat name: demo years: [2022] raw: - source: - type: local_file - args: - path: "data/raw.csv" - filename: "nested/raw.csv" + sources: + - type: local_file + args: + path: "data/raw.csv" + filename: "nested/raw.csv" clean: sql: "sql/clean.sql" note_path: "docs/clean.md" @@ -125,8 +132,8 @@ def test_load_config_does_not_transform_non_whitelisted_path_like_fields(tmp_pat cfg = load_config(yml) - assert cfg.raw["source"]["args"]["path"] == (project_dir / "data" / "raw.csv").resolve() - assert cfg.raw["source"]["args"]["filename"] == "nested/raw.csv" + assert cfg.raw["sources"][0]["args"]["path"] == (project_dir / "data" / "raw.csv").resolve() + assert cfg.raw["sources"][0]["args"]["filename"] == "nested/raw.csv" assert cfg.clean["note_path"] == "docs/clean.md" assert cfg.mart["label_path"] == "labels/mart.txt" @@ -180,7 +187,6 @@ def test_load_config_logs_normalized_whitelist_fields(tmp_path: Path, caplog, mo clean: sql: "sql/clean.sql" mart: - sql_dir: "sql/mart" tables: - name: demo_mart sql: "sql/mart/demo.sql" @@ -200,14 +206,12 @@ def test_load_config_logs_normalized_whitelist_fields(tmp_path: Path, caplog, mo assert cfg.root == (project_dir / "out").resolve() assert cfg.raw["sources"][0]["args"]["path"] == (project_dir / "data" / "raw_a.csv").resolve() assert cfg.clean["sql"] == (project_dir / "sql" / "clean.sql").resolve() - assert cfg.mart["sql_dir"] == (project_dir / "sql" / "mart").resolve() assert cfg.mart["tables"][0]["sql"] == (project_dir / "sql" / "mart" / "demo.sql").resolve() assert "Normalized config paths:" in caplog.text assert "root=" in caplog.text assert "raw.sources[0].args.path=" in caplog.text assert "clean.sql=" in caplog.text - assert "mart.sql_dir=" in caplog.text assert "mart.tables[0].sql=" in caplog.text @@ -289,7 +293,7 @@ def test_load_config_uses_base_dir_when_root_missing_and_dcl_root_missing(tmp_pa assert cfg.root_source == "base_dir_fallback" -def test_load_config_normalizes_legacy_clean_read_csv_and_warns(tmp_path: Path, caplog): +def test_load_config_rejects_legacy_clean_read_csv_shape(tmp_path: Path): project_dir = tmp_path / "project" project_dir.mkdir() yml = project_dir / "dataset.yml" @@ -311,21 +315,10 @@ def test_load_config_normalizes_legacy_clean_read_csv_and_warns(tmp_path: Path, encoding="utf-8", ) - module_logger = logging.getLogger("toolkit.core.config") - module_logger.handlers = [caplog.handler] - module_logger.propagate = True - module_logger.setLevel(logging.WARNING) - - with caplog.at_level(logging.WARNING, logger="toolkit.core.config"): - cfg = load_config(yml) + with pytest.raises(ValueError) as exc: + load_config(yml) - assert cfg.clean["read"] == { - "source": "auto", - "columns": {"amount": "DOUBLE"}, - "delim": ";", - } - assert "DCL005" in caplog.text - assert "deprecated, usare clean.read.*" in caplog.text + assert "clean.read.csv" in str(exc.value) def test_load_config_canonical_clean_read_has_no_deprecation_warning(tmp_path: Path, caplog): @@ -440,7 +433,7 @@ def test_load_config_normalizes_bool_and_string_list_fields(tmp_path: Path): assert cfg.mart["validate"]["table_rules"]["mart_ok"]["primary_key"] == ["key_id"] -def test_load_config_warns_on_zombie_fields(tmp_path: Path, caplog): +def test_load_config_warns_on_zombie_field_bq(tmp_path: Path, caplog, monkeypatch): yml = tmp_path / "dataset.yml" yml.write_text( """ @@ -448,86 +441,70 @@ def test_load_config_warns_on_zombie_fields(tmp_path: Path, caplog): name: demo years: [2022] raw: {} -clean: - sql_path: sql/legacy_clean.sql -mart: - sql_dir: sql/mart bq: dataset: ignored +clean: {} +mart: {} """.strip(), encoding="utf-8", ) + _bind_config_logger(caplog, monkeypatch) + with caplog.at_level(logging.WARNING, logger="toolkit.core.config"): load_config(yml) - assert "DCL006" in caplog.text - assert "DCL007" in caplog.text assert "DCL008" in caplog.text - assert "deprecated/ignored, usare clean.sql" in caplog.text - assert "deprecated/ignored, usare mart.tables[].sql" in caplog.text assert "deprecated/ignored, usare remove field" in caplog.text -def test_load_config_model_normalizes_legacy_aliases_to_canonical_shape(tmp_path: Path): +def test_load_config_rejects_clean_sql_path(tmp_path: Path): yml = tmp_path / "dataset.yml" yml.write_text( """ dataset: name: demo years: [2022] -raw: - source: - id: src_legacy - plugin: local_file - args: - path: data/raw.csv +raw: {} clean: - read: auto + sql_path: sql/legacy_clean.sql mart: {} +bq: + dataset: ignored """.strip(), encoding="utf-8", ) - model = load_config_model(yml) + with pytest.raises(ValueError) as exc: + load_config(yml) - assert len(model.raw.sources) == 1 - assert model.raw.sources[0].name == "src_legacy" - assert model.raw.sources[0].type == "local_file" - assert model.clean.read is not None - assert model.clean.read.source == "auto" + assert "clean.sql_path" in str(exc.value) -def test_load_config_logs_deprecation_codes_for_legacy_normalization(tmp_path: Path, caplog): +def test_load_config_rejects_mart_sql_dir(tmp_path: Path): yml = tmp_path / "dataset.yml" yml.write_text( """ dataset: name: demo years: [2022] -raw: - source: - id: src_legacy - plugin: local_file - args: - path: data/raw.csv -clean: - read: auto -mart: {} +raw: {} +clean: {} +mart: + sql_dir: sql/mart +bq: + dataset: ignored """.strip(), encoding="utf-8", ) - with caplog.at_level(logging.WARNING, logger="toolkit.core.config"): + with pytest.raises(ValueError) as exc: load_config(yml) - assert "DCL001" in caplog.text - assert "DCL002" in caplog.text - assert "DCL003" in caplog.text - assert "DCL004" in caplog.text + assert "mart.sql_dir" in str(exc.value) -def test_load_config_model_strict_config_rejects_legacy_normalization(tmp_path: Path): +def test_load_config_model_rejects_legacy_raw_source_plugin_id_shape(tmp_path: Path): yml = tmp_path / "dataset.yml" yml.write_text( """ @@ -536,7 +513,8 @@ def test_load_config_model_strict_config_rejects_legacy_normalization(tmp_path: years: [2022] raw: source: - type: local_file + id: src_legacy + plugin: local_file args: path: data/raw.csv clean: {} @@ -546,23 +524,25 @@ def test_load_config_model_strict_config_rejects_legacy_normalization(tmp_path: ) with pytest.raises(ValueError) as exc: - load_config_model(yml, strict_config=True) + load_config_model(yml) - assert "DCL001" in str(exc.value) - assert "raw.source is deprecated, usare raw.sources" in str(exc.value) + assert "raw.sources" in str(exc.value) or "raw.source" in str(exc.value) -def test_load_config_model_config_strict_rejects_legacy_normalization(tmp_path: Path): +def test_load_config_model_rejects_legacy_raw_sources_plugin_id_fields(tmp_path: Path): yml = tmp_path / "dataset.yml" yml.write_text( """ -config: - strict: true dataset: name: demo years: [2022] -clean: - read: auto +raw: + sources: + - id: src_legacy + plugin: local_file + args: + path: data/raw.csv +clean: {} mart: {} """.strip(), encoding="utf-8", @@ -571,34 +551,27 @@ def test_load_config_model_config_strict_rejects_legacy_normalization(tmp_path: with pytest.raises(ValueError) as exc: load_config_model(yml) - assert "DCL004" in str(exc.value) - assert "clean.read scalar form is deprecated" in str(exc.value) + assert "raw.sources.0" in str(exc.value) -def test_cli_strict_config_rejects_legacy_config(tmp_path: Path): - project_dir = tmp_path / "project" - project_dir.mkdir() - yml = project_dir / "dataset.yml" +def test_load_config_rejects_legacy_clean_read_scalar_form(tmp_path: Path): + yml = tmp_path / "dataset.yml" yml.write_text( """ dataset: name: demo years: [2022] -raw: - source: - type: local_file - args: - path: data/raw.csv -clean: {} +clean: + read: auto mart: {} """.strip(), encoding="utf-8", ) with pytest.raises(ValueError) as exc: - run_cmd(step="raw", config=str(yml), strict_config=True) + load_config(yml) - assert "DCL001" in str(exc.value) + assert "clean.read" in str(exc.value) def test_project_example_config_parses_in_strict_mode(): @@ -608,7 +581,7 @@ def test_project_example_config_parses_in_strict_mode(): assert len(model.raw.sources) == 1 -def test_load_config_warns_on_unknown_top_level_keys_in_non_strict_mode(tmp_path: Path, caplog): +def test_load_config_warns_on_unknown_top_level_keys_in_non_strict_mode(tmp_path: Path, caplog, monkeypatch): yml = tmp_path / "dataset.yml" yml.write_text( """ @@ -623,6 +596,8 @@ def test_load_config_warns_on_unknown_top_level_keys_in_non_strict_mode(tmp_path encoding="utf-8", ) + _bind_config_logger(caplog, monkeypatch) + with caplog.at_level(logging.WARNING, logger="toolkit.core.config"): cfg = load_config(yml) @@ -703,6 +678,7 @@ def test_load_config_model_rejects_unknown_top_level_keys_in_strict_mode(tmp_pat def test_load_config_warns_on_unknown_section_keys_in_non_strict_mode( tmp_path: Path, caplog, + monkeypatch, section: str, yaml_text: str, code: str, @@ -711,6 +687,8 @@ def test_load_config_warns_on_unknown_section_keys_in_non_strict_mode( yml = tmp_path / "dataset.yml" yml.write_text(yaml_text, encoding="utf-8") + _bind_config_logger(caplog, monkeypatch) + with caplog.at_level(logging.WARNING, logger="toolkit.core.config"): cfg = load_config(yml) diff --git a/toolkit/clean/duckdb_read.py b/toolkit/clean/duckdb_read.py index b253b8d..94577dd 100644 --- a/toolkit/clean/duckdb_read.py +++ b/toolkit/clean/duckdb_read.py @@ -46,17 +46,13 @@ def _read_source_mode(clean_cfg: dict[str, Any], logger=None) -> tuple[str, dict if raw_read_cfg is None: pass - elif isinstance(raw_read_cfg, str): - if logger is not None: - logger.warning("clean.read scalar form is deprecated; use clean.read.source") - read_source = raw_read_cfg elif isinstance(raw_read_cfg, dict): explicit_cfg = dict(raw_read_cfg) nested_source = explicit_cfg.pop("source", None) if nested_source is not None: read_source = nested_source else: - raise ValueError("clean.read must be either a mapping (dict) or one of: auto, config_only") + raise ValueError("clean.read must be a mapping (dict)") normalized_source = str(read_source or "auto") if normalized_source not in READ_SOURCE_MODES: diff --git a/toolkit/core/config_models.py b/toolkit/core/config_models.py index e964347..da2780e 100644 --- a/toolkit/core/config_models.py +++ b/toolkit/core/config_models.py @@ -26,55 +26,6 @@ class ConfigDeprecation: _CONFIG_DEPRECATIONS: dict[str, ConfigDeprecation] = { - "raw.source": ConfigDeprecation( - code="DCL001", - legacy="raw.source", - replacement="raw.sources", - status="deprecated", - message="raw.source is deprecated, usare raw.sources", - ), - "raw.sources[].plugin": ConfigDeprecation( - code="DCL002", - legacy="raw.sources[].plugin", - replacement="raw.sources[].type", - status="deprecated", - message="raw.sources[].plugin is deprecated, usare raw.sources[].type", - ), - "raw.sources[].id": ConfigDeprecation( - code="DCL003", - legacy="raw.sources[].id", - replacement="raw.sources[].name", - status="deprecated", - message="raw.sources[].id is deprecated, usare raw.sources[].name", - ), - "clean.read": ConfigDeprecation( - code="DCL004", - legacy="clean.read: ", - replacement="clean.read.source", - status="deprecated", - message="clean.read scalar form is deprecated, usare clean.read.source", - ), - "clean.read.csv": ConfigDeprecation( - code="DCL005", - legacy="clean.read.csv.*", - replacement="clean.read.*", - status="deprecated", - message="clean.read.csv.* is deprecated, usare clean.read.*", - ), - "clean.sql_path": ConfigDeprecation( - code="DCL006", - legacy="clean.sql_path", - replacement="clean.sql", - status="ignored", - message="clean.sql_path is deprecated/ignored, usare clean.sql", - ), - "mart.sql_dir": ConfigDeprecation( - code="DCL007", - legacy="mart.sql_dir", - replacement="mart.tables[].sql", - status="ignored", - message="mart.sql_dir is deprecated/ignored, usare mart.tables[].sql", - ), "bq": ConfigDeprecation( code="DCL008", legacy="bq", @@ -506,10 +457,8 @@ def _resolve_path_value(value: Any, *, base_dir: Path) -> Any: ), "clean": ( ("sql",), - ("sql_path",), ), "mart": ( - ("sql_dir",), ("tables", "*", "sql"), ), "cross_year": ( @@ -644,19 +593,6 @@ def _resolve_root(root: Any, *, base_dir: Path) -> tuple[Path, str]: source = "env:TOOLKIT_OUTDIR" if os.environ.get("TOOLKIT_OUTDIR") else "env:DCL_OUTDIR" return Path(managed_outdir).expanduser().resolve(), source return _resolve_path_value(root, base_dir=base_dir), "yml" - - -def _normalize_legacy_source(source: dict[str, Any]) -> dict[str, Any]: - normalized = dict(source) - plugin = normalized.pop("plugin", None) - if plugin is not None and "type" not in normalized: - normalized["type"] = plugin - source_id = normalized.pop("id", None) - if source_id is not None and "name" not in normalized: - normalized["name"] = source_id - return normalized - - def _emit_deprecation_notice( key: str, *, @@ -708,44 +644,11 @@ def _declared_model_keys(model_cls: type[BaseModel]) -> set[str]: "bq", } _RAW_ALLOWED_KEYS = _declared_model_keys(RawConfig) -_CLEAN_ALLOWED_KEYS = _declared_model_keys(CleanConfig) | {"sql_path"} -_MART_ALLOWED_KEYS = _declared_model_keys(MartConfig) | {"sql_dir"} +_CLEAN_ALLOWED_KEYS = _declared_model_keys(CleanConfig) +_MART_ALLOWED_KEYS = _declared_model_keys(MartConfig) _CROSS_YEAR_ALLOWED_KEYS = _declared_model_keys(CrossYearConfig) -def _normalize_legacy_clean_read( - clean: dict[str, Any], - *, - path: Path, - strict_config: bool, -) -> dict[str, Any]: - normalized = dict(clean) - read_cfg = normalized.get("read") - - if isinstance(read_cfg, str): - _emit_deprecation_notice("clean.read", strict_config=strict_config, path=path) - normalized["read"] = {"source": read_cfg} - read_cfg = normalized["read"] - - if not isinstance(read_cfg, dict): - return normalized - - csv_cfg = read_cfg.get("csv") - if csv_cfg is None: - return normalized - if not isinstance(csv_cfg, dict): - raise _err("clean.read.csv deve essere una mappa YAML (oggetto).", path=path) - - merged_read = dict(read_cfg) - merged_read.pop("csv", None) - for key, value in csv_cfg.items(): - merged_read.setdefault(key, value) - - _emit_deprecation_notice("clean.read.csv", strict_config=strict_config, path=path) - normalized["read"] = merged_read - return normalized - - def _normalize_legacy_payload( data: dict[str, Any], *, @@ -756,41 +659,14 @@ def _normalize_legacy_payload( raw = normalized.get("raw") if isinstance(raw, dict): - updated_raw = dict(raw) - if "source" in updated_raw: - source = updated_raw.pop("source") - if "sources" in updated_raw: - raise _err("Use either raw.source or raw.sources, not both.", path=path) - updated_raw["sources"] = [source] - _emit_deprecation_notice("raw.source", strict_config=strict_config, path=path) - sources = updated_raw.get("sources") - if isinstance(sources, list): - normalized_sources: list[Any] = [] - for source in sources: - if not isinstance(source, dict): - normalized_sources.append(source) - continue - original = dict(source) - normalized_source = _normalize_legacy_source(source) - if "plugin" in original and "type" not in original: - _emit_deprecation_notice("raw.sources[].plugin", strict_config=strict_config, path=path) - if "id" in original and "name" not in original: - _emit_deprecation_notice("raw.sources[].id", strict_config=strict_config, path=path) - normalized_sources.append(normalized_source) - updated_raw["sources"] = normalized_sources - normalized["raw"] = updated_raw + normalized["raw"] = dict(raw) clean = normalized.get("clean") if isinstance(clean, dict): - updated_clean = _normalize_legacy_clean_read(clean, path=path, strict_config=strict_config) - if "sql_path" in updated_clean: - _emit_deprecation_notice("clean.sql_path", strict_config=strict_config, path=path) - normalized["clean"] = updated_clean + normalized["clean"] = dict(clean) mart = normalized.get("mart") if isinstance(mart, dict): - if "sql_dir" in mart: - _emit_deprecation_notice("mart.sql_dir", strict_config=strict_config, path=path) normalized["mart"] = dict(mart) if "bq" in normalized: @@ -828,6 +704,12 @@ def _warn_or_reject_unknown_keys( if not isinstance(section, dict): continue extras = [key for key in section.keys() if key not in allowed_keys] + if section_name == "raw" and "source" in extras: + raise _err("raw.source is no longer supported; use raw.sources", path=path) + if section_name == "clean" and "sql_path" in extras: + raise _err("clean.sql_path is no longer supported; use clean.sql", path=path) + if section_name == "mart" and "sql_dir" in extras: + raise _err("mart.sql_dir is no longer supported; use mart.tables[].sql", path=path) if extras: _emit_unknown_keys_notice( notice_key, diff --git a/toolkit/core/csv_read.py b/toolkit/core/csv_read.py index f88350a..02ceaef 100644 --- a/toolkit/core/csv_read.py +++ b/toolkit/core/csv_read.py @@ -23,33 +23,12 @@ "prefer_from_raw_run", "allow_ambiguous", "include", - "csv", "columns", "normalize_rows_to_columns", "trim_whitespace", "sample_size", "sheet_name", } -ALLOWED_NESTED_CSV_KEYS = { - "delim", - "header", - "encoding", - "decimal", - "skip", - "auto_detect", - "quote", - "escape", - "comment", - "ignore_errors", - "strict_mode", - "null_padding", - "parallel", - "nullstr", - "columns", - "normalize_rows_to_columns", - "trim_whitespace", - "sheet_name", -} FORMAT_HINT_KEYS = { "delim", "header", @@ -118,12 +97,8 @@ def normalize_columns_spec(columns: object) -> dict[str, str] | None: def normalize_read_cfg(read_cfg: dict[str, Any] | None) -> dict[str, Any]: cfg = dict(read_cfg or {}) - csv_cfg = cfg.get("csv") or {} - if csv_cfg and not isinstance(csv_cfg, dict): - raise ValueError( - "clean.read must be a mapping (dict) in dataset.yml; " - "legacy clean.read.csv must also be a mapping if used" - ) + if "csv" in cfg: + raise ValueError("clean.read.csv is no longer supported; use clean.read.* directly") unknown_top = sorted(set(cfg.keys()) - ALLOWED_READ_CSV_KEYS) if unknown_top: @@ -131,21 +106,8 @@ def normalize_read_cfg(read_cfg: dict[str, Any] | None) -> dict[str, Any]: "Unsupported clean.read options for CSV reader: " f"{unknown_top}. Allowed keys: {sorted(ALLOWED_READ_CSV_KEYS)}" ) - - if csv_cfg: - unknown_nested = sorted(set(csv_cfg.keys()) - ALLOWED_NESTED_CSV_KEYS) - if unknown_nested: - raise ValueError( - "Unsupported legacy clean.read.csv options: " - f"{unknown_nested}. Allowed keys: {sorted(ALLOWED_NESTED_CSV_KEYS)}" - ) - - merged = dict(csv_cfg) - for key in ALLOWED_NESTED_CSV_KEYS: - if key in cfg: - merged[key] = cfg[key] - merged["columns"] = normalize_columns_spec(merged.get("columns")) - return merged + cfg["columns"] = normalize_columns_spec(cfg.get("columns")) + return cfg def filter_suggested_format_keys(cfg: dict[str, Any] | None) -> dict[str, Any]: