diff --git a/docs/config-schema.md b/docs/config-schema.md index 9a43f8c..d08b552 100644 --- a/docs/config-schema.md +++ b/docs/config-schema.md @@ -90,12 +90,20 @@ I path relativi sono sempre risolti rispetto alla directory che contiene `datase | `columns` | `dict[string,string] \| null` | `null` | | `trim_whitespace` | `bool` | `true` | | `sample_size` | `int \| null` | `null` | +| `sheet_name` | `string \| int \| null` | `null` | | `mode` | `explicit \| latest \| largest \| all \| null` | `null` | | `glob` | `string` | `*` | | `prefer_from_raw_run` | `bool` | `true` | | `allow_ambiguous` | `bool` | `false` | | `include` | `list[string] \| null` | `null` | +Note pratiche: + +- i file `.xlsx` sono supportati nel layer CLEAN +- RAW conserva il workbook originale senza convertirlo +- per `.xlsx`, le opzioni utili sono soprattutto `header`, `skip`, `columns`, `trim_whitespace`, `sheet_name` +- `sheet_name` usa il primo foglio se omesso + `CleanValidate`: | Campo | Tipo | Default | diff --git a/docs/runtime-boundaries.md b/docs/runtime-boundaries.md index a3c64f5..ce33f23 100644 --- a/docs/runtime-boundaries.md +++ b/docs/runtime-boundaries.md @@ -37,3 +37,8 @@ Le sorgenti builtin supportate dal runtime canonico sono: - `local_file` - `http_file` + +Nota: + +- il runtime canonico puo' conservare file `.xlsx` in RAW e leggerli in CLEAN +- questo non cambia il ruolo del layer RAW: il file originale resta l'artefatto sorgente diff --git a/pyproject.toml b/pyproject.toml index 1829f7c..0d2911e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "pyyaml>=6.0", "pydantic>=2.8.0", "pandas>=2.0", + "openpyxl>=3.1.0", "pyarrow>=14.0", "duckdb>=0.10.0", "requests>=2.31", diff --git a/tests/test_clean_duckdb_read.py b/tests/test_clean_duckdb_read.py index 43dd2d0..dddfe68 100644 --- a/tests/test_clean_duckdb_read.py +++ b/tests/test_clean_duckdb_read.py @@ -4,6 +4,7 @@ from pathlib import Path import duckdb +import pandas as pd import pytest 
def test_read_raw_to_relation_reads_xlsx_first_sheet(tmp_path: Path):
    """An ``.xlsx`` input read without ``sheet_name`` comes from the first sheet."""
    input_file = tmp_path / "ok.xlsx"
    pd.DataFrame(
        [
            {"Anno": 2022, "Regione": "Lazio", "Domanda": 123.4},
            {"Anno": 2022, "Regione": "Umbria", "Domanda": 56.7},
        ]
    ).to_excel(input_file, index=False)

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx")
    # Close the connection even when the read or the query raises, so a
    # failing test does not leak it.
    try:
        info = duckdb_read.read_raw_to_relation(
            con,
            [input_file],
            {"header": True},
            "fallback",
            logger,
        )
        rows = con.execute(
            'SELECT "Anno", "Regione", "Domanda" FROM raw_input ORDER BY "Regione"'
        ).fetchall()
    finally:
        con.close()

    assert info.source == "excel"
    assert info.params_used["sheet_name"] == 0
    assert rows == [(2022, "Lazio", 123.4), (2022, "Umbria", 56.7)]


def test_read_raw_to_relation_reads_xlsx_with_explicit_sheet_and_columns(tmp_path: Path):
    """A named sheet without a header row is read and relabelled via ``columns``."""
    input_file = tmp_path / "sheeted.xlsx"
    with pd.ExcelWriter(input_file, engine="openpyxl") as writer:
        # Decoy first sheet: reading it instead of "Export" would break the
        # column-count check and the row assertions below.
        pd.DataFrame({"skipme": ["ignore"]}).to_excel(writer, sheet_name="Other", index=False)
        pd.DataFrame([["A", 1], ["B", 2]]).to_excel(
            writer,
            sheet_name="Export",
            header=False,
            index=False,
        )

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx_sheet")
    try:
        info = duckdb_read.read_raw_to_relation(
            con,
            [input_file],
            {
                "header": False,
                "sheet_name": "Export",
                "columns": {"col0": "VARCHAR", "col1": "VARCHAR"},
            },
            "fallback",
            logger,
        )
        rows = con.execute("SELECT col0, col1 FROM raw_input ORDER BY col0").fetchall()
    finally:
        con.close()

    assert info.source == "excel"
    assert info.params_used["sheet_name"] == "Export"
    assert info.params_used["columns"] == {"col0": "VARCHAR", "col1": "VARCHAR"}
    # Only the keys of ``columns`` are applied as labels for Excel input; the
    # second column is still returned as integers despite the VARCHAR entry.
    assert rows == [("A", 1), ("B", 2)]
def _normalize_excel_sheet_name(value: Any) -> str | int:
    """Resolve ``clean.read.sheet_name`` to a value accepted by ``pandas.read_excel``.

    ``None`` and blank strings select the first sheet (index ``0``). Integers
    are passed through as sheet indexes; non-blank strings are returned
    stripped and used as sheet names.

    Raises:
        ValueError: if *value* is a bool or any other unsupported type.
    """
    invalid = "clean.read.sheet_name must be a string, integer, or null"
    if value is None:
        return 0
    # bool is a subclass of int: reject it before the int check so that
    # ``sheet_name: true`` is not silently read as sheet index 1.
    if isinstance(value, bool):
        raise ValueError(invalid)
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        return value.strip() or 0
    raise ValueError(invalid)


def _trim_excel_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with surrounding whitespace stripped from string cells.

    Non-string cells are returned unchanged and column labels are not touched.
    """

    def _strip_cell(cell: Any) -> Any:
        return cell.strip() if isinstance(cell, str) else cell

    return df.apply(lambda column: column.map(_strip_cell))


def _resolve_excel_read_options(read_cfg: dict[str, Any]) -> dict[str, Any]:
    """Resolve the Excel-relevant read options, treating null like a missing key.

    Returns a dict with the keys ``sheet_name``, ``header``, ``skip``,
    ``trim_whitespace`` and ``columns`` (``None`` when no columns are
    configured).

    Raises:
        ValueError: if ``sheet_name`` has an unsupported type or ``skip``
            cannot be converted to an integer.
    """

    def _flag(key: str, default: bool) -> bool:
        # An explicit null must fall back to the default instead of being
        # coerced to False, consistently with ``skip`` and ``sheet_name``.
        raw = read_cfg.get(key)
        return default if raw is None else bool(raw)

    header = _flag("header", True)
    raw_skip = read_cfg.get("skip")
    skip = 0 if raw_skip is None else int(raw_skip)
    trim_whitespace = _flag("trim_whitespace", True)
    columns = read_cfg.get("columns")
    sheet_name = _normalize_excel_sheet_name(read_cfg.get("sheet_name"))

    return {
        "sheet_name": sheet_name,
        "header": header,
        "skip": skip,
        "trim_whitespace": trim_whitespace,
        "columns": dict(columns) if columns else None,
    }


def _load_excel_frame(
    input_file: Path,
    read_cfg: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Read one ``.xlsx`` file into a DataFrame according to *read_cfg*.

    Honoured options are ``header``, ``skip``, ``trim_whitespace``,
    ``columns`` and ``sheet_name``. Only the keys of ``columns`` are used, as
    replacement column labels; the configured type strings are not applied
    here.

    Returns:
        The loaded frame and the resolved parameters that were used.

    Raises:
        ValueError: if the number of configured columns differs from the
            number of columns found in the sheet, or an option is invalid.
    """
    options = _resolve_excel_read_options(read_cfg)
    header = options["header"]
    columns = options["columns"]

    df = pd.read_excel(
        input_file,
        sheet_name=options["sheet_name"],
        header=0 if header else None,
        skiprows=options["skip"],
        # Keep cell values as stored in the workbook rather than letting
        # pandas infer a dtype per column.
        dtype=object,
        engine="openpyxl",
    )

    if columns:
        expected_columns = list(columns)
        if len(expected_columns) != len(df.columns):
            raise ValueError(
                "Excel input columns mismatch. "
                f"Configured={len(expected_columns)} detected={len(df.columns)} file={input_file}"
            )
        df.columns = expected_columns
    elif not header:
        # No header row and no configured names: use positional labels.
        df.columns = [f"col{i}" for i in range(len(df.columns))]

    if options["trim_whitespace"]:
        df = _trim_excel_dataframe(df)

    return df, options


def _execute_excel_read(
    con: duckdb.DuckDBPyConnection,
    input_files: list[Path],
    read_cfg: dict[str, Any],
    *,
    logger,
) -> ReadInfo:
    """Load the given ``.xlsx`` files and expose them as the ``raw_input`` view.

    Every file is read with the same *read_cfg*; the frames are concatenated
    row-wise, registered on *con* as ``raw_input_df`` and wrapped in the
    ``raw_input`` view.

    Returns:
        ``ReadInfo`` with ``source="excel"`` and the parameters used (the
        ``columns`` entry is omitted when no columns were configured).

    Raises:
        ValueError: if *input_files* is empty or a file cannot be loaded with
            the given configuration.
    """
    if not input_files:
        raise ValueError("Excel read requires at least one input file")

    loaded = [_load_excel_frame(input_file, read_cfg) for input_file in input_files]
    frames = [frame for frame, _ in loaded]
    # The same read_cfg is applied to every file, so the first result is
    # representative of the parameters used.
    params_used = loaded[0][1]

    # pd.concat aligns on column labels and fills the gaps with nulls; make
    # that visible instead of letting mismatched workbooks merge silently.
    first_columns = set(frames[0].columns)
    for input_file, frame in zip(input_files[1:], frames[1:]):
        if set(frame.columns) != first_columns:
            logger.warning(
                "read_excel column labels differ from first input; "
                "missing cells become nulls: file=%s",
                input_file,
            )

    combined = pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
    con.register("raw_input_df", combined)
    con.execute("CREATE OR REPLACE VIEW raw_input AS SELECT * FROM raw_input_df;")

    used = dict(params_used)
    if used.get("columns") is None:
        used.pop("columns", None)
    logger.info(
        "read_excel params used: source=excel params=%s",
        json.dumps(used, ensure_ascii=False, sort_keys=True),
    )
    return ReadInfo(source="excel", params_used=used)
-14,7 +14,7 @@ def is_supported_input_file(path: Path) -> bool: return False if name.endswith((".csv.gz", ".tsv.gz", ".txt.gz")): return True - if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet"}: + if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet", ".xlsx"}: return True return False diff --git a/toolkit/core/config_models.py b/toolkit/core/config_models.py index a4aece4..26a1e4c 100644 --- a/toolkit/core/config_models.py +++ b/toolkit/core/config_models.py @@ -272,6 +272,7 @@ class CleanReadConfig(BaseModel): columns: dict[str, str] | None = None trim_whitespace: bool = True sample_size: int | None = None + sheet_name: str | int | None = None mode: Literal["explicit", "latest", "largest", "all"] | None = None glob: str = "*" prefer_from_raw_run: bool = True diff --git a/toolkit/core/csv_read.py b/toolkit/core/csv_read.py index d3e1f83..de8e4a1 100644 --- a/toolkit/core/csv_read.py +++ b/toolkit/core/csv_read.py @@ -27,6 +27,7 @@ "columns", "trim_whitespace", "sample_size", + "sheet_name", } ALLOWED_NESTED_CSV_KEYS = { "delim", @@ -45,6 +46,7 @@ "nullstr", "columns", "trim_whitespace", + "sheet_name", } FORMAT_HINT_KEYS = { "delim",