diff --git a/tests/test_clean_input_selection.py b/tests/test_clean_input_selection.py index 6e75431..b527a3a 100644 --- a/tests/test_clean_input_selection.py +++ b/tests/test_clean_input_selection.py @@ -177,6 +177,67 @@ def test_run_clean_uses_manifest_primary(tmp_path: Path, monkeypatch): assert seen["input_files"] == [selected_file] +def test_run_clean_mode_all_ignores_manifest_primary(tmp_path: Path, monkeypatch): + raw_dir = tmp_path / "data" / "raw" / "demo" / "2024" + raw_dir.mkdir(parents=True, exist_ok=True) + first_file = raw_dir / "a.csv" + first_file.write_text("a\n1\n", encoding="utf-8") + second_file = raw_dir / "b.csv" + second_file.write_text("a\n2\n", encoding="utf-8") + write_raw_manifest( + raw_dir, + { + "dataset": "demo", + "year": 2024, + "run_id": "run-1", + "created_at": "2026-02-28T00:00:00+00:00", + "sources": [{"name": "source_1", "output_file": "a.csv"}], + "primary_output_file": "a.csv", + }, + ) + + sql_path = _write_clean_sql(tmp_path) + seen = _run_clean_capture_inputs( + monkeypatch, + tmp_path, + {"sql": str(sql_path), "read": {"mode": "all"}}, + ) + + assert seen["input_files"] == [first_file, second_file] + + +def test_run_clean_explicit_include_ignores_manifest_primary(tmp_path: Path, monkeypatch): + raw_dir = tmp_path / "data" / "raw" / "demo" / "2024" + raw_dir.mkdir(parents=True, exist_ok=True) + preferred_file = raw_dir / "preferred.csv" + preferred_file.write_text("a\n1\n", encoding="utf-8") + target_file = raw_dir / "uscite.csv" + target_file.write_text("a\n2\n", encoding="utf-8") + write_raw_manifest( + raw_dir, + { + "dataset": "demo", + "year": 2024, + "run_id": "run-1", + "created_at": "2026-02-28T00:00:00+00:00", + "sources": [{"name": "source_1", "output_file": "preferred.csv"}], + "primary_output_file": "preferred.csv", + }, + ) + + sql_path = _write_clean_sql(tmp_path) + seen = _run_clean_capture_inputs( + monkeypatch, + tmp_path, + { + "sql": str(sql_path), + "read": {"mode": "explicit", "include": ["uscite.csv"]}, + }, + ) + + assert seen["input_files"] == [target_file] + + def test_run_clean_rejects_php_only_inputs_with_clear_error(tmp_path: Path): raw_dir = tmp_path / "data" / "raw" / "demo" / "2024" raw_dir.mkdir(parents=True, exist_ok=True) diff --git a/toolkit/clean/input_selection.py b/toolkit/clean/input_selection.py index 222ba8c..c239cbb 100644 --- a/toolkit/clean/input_selection.py +++ b/toolkit/clean/input_selection.py @@ -138,6 +138,14 @@ def _manifest_primary_input(raw_year_dir: Path) -> tuple[Path | None, str | None ) +def _should_use_manifest_primary(mode: str, include=None) -> bool: + if mode == "all": + return False + if mode == "explicit" and include is not None: + return False + return True + + def select_raw_input( raw_year_dir: Path, logger, @@ -165,11 +173,12 @@ def select_raw_input( prefer_from_raw_run=prefer_from_raw_run, ) - manifest_primary, manifest_warning = _manifest_primary_input(raw_year_dir) - if manifest_primary is not None: - return [manifest_primary] - if manifest_warning is not None: - logger.warning(manifest_warning) + if _should_use_manifest_primary(mode, include=include): + manifest_primary, manifest_warning = _manifest_primary_input(raw_year_dir) + if manifest_primary is not None: + return [manifest_primary] + if manifest_warning is not None: + logger.warning(manifest_warning) selected = select_inputs( selected_candidates,