Skip to content

Commit 72ef3be

Browse files
authored
feat: add xlsx support in clean layer (#10)
1 parent 7233ecb commit 72ef3be

9 files changed

Lines changed: 200 additions & 2 deletions

docs/config-schema.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,20 @@ I path relativi sono sempre risolti rispetto alla directory che contiene `datase
9090
| `columns` | `dict[string,string] \| null` | `null` |
9191
| `trim_whitespace` | `bool` | `true` |
9292
| `sample_size` | `int \| null` | `null` |
93+
| `sheet_name` | `string \| int \| null` | `null` |
9394
| `mode` | `explicit \| latest \| largest \| all \| null` | `null` |
9495
| `glob` | `string` | `*` |
9596
| `prefer_from_raw_run` | `bool` | `true` |
9697
| `allow_ambiguous` | `bool` | `false` |
9798
| `include` | `list[string] \| null` | `null` |
9899

100+
Note pratiche:
101+
102+
- i file `.xlsx` sono supportati nel layer CLEAN
103+
- RAW conserva il workbook originale senza convertirlo
104+
- per `.xlsx`, le opzioni utili sono soprattutto `header`, `skip`, `columns`, `trim_whitespace`, `sheet_name`
105+
- `sheet_name` usa il primo foglio se omesso
106+
99107
`CleanValidate`:
100108

101109
| Campo | Tipo | Default |

docs/runtime-boundaries.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,8 @@ Le sorgenti builtin supportate dal runtime canonico sono:
3737

3838
- `local_file`
3939
- `http_file`
40+
41+
Nota:
42+
43+
- il runtime canonico può conservare file `.xlsx` in RAW e leggerli in CLEAN
44+
- questo non cambia il ruolo del layer RAW: il file originale resta l'artefatto sorgente

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ dependencies = [
2727
"pyyaml>=6.0",
2828
"pydantic>=2.8.0",
2929
"pandas>=2.0",
30+
"openpyxl>=3.1.0",
3031
"pyarrow>=14.0",
3132
"duckdb>=0.10.0",
3233
"requests>=2.31",

tests/test_clean_duckdb_read.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pathlib import Path
55

66
import duckdb
7+
import pandas as pd
78
import pytest
89
import yaml
910

@@ -175,6 +176,67 @@ def test_read_raw_to_relation_handles_no_header_fixed_schema_without_extra_colum
175176
con.close()
176177

177178

179+
def test_read_raw_to_relation_reads_xlsx_first_sheet(tmp_path: Path):
    """With default read options, an .xlsx input is loaded from its first
    sheet (sheet index 0) and exposed as the ``raw_input`` relation with
    cell types preserved."""
    input_file = tmp_path / "ok.xlsx"
    pd.DataFrame(
        [
            {"Anno": 2022, "Regione": "Lazio", "Domanda": 123.4},
            {"Anno": 2022, "Regione": "Umbria", "Domanda": 56.7},
        ]
    ).to_excel(input_file, index=False)

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx")

    info = duckdb_read.read_raw_to_relation(
        con,
        [input_file],
        {"header": True},
        "fallback",
        logger,
    )

    # Excel reads must report source="excel" and default sheet_name to 0.
    rows = con.execute('SELECT "Anno", "Regione", "Domanda" FROM raw_input ORDER BY "Regione"').fetchall()
    assert info.source == "excel"
    assert info.params_used["sheet_name"] == 0
    assert rows == [(2022, "Lazio", 123.4), (2022, "Umbria", 56.7)]
    con.close()
204+
205+
206+
def test_read_raw_to_relation_reads_xlsx_with_explicit_sheet_and_columns(tmp_path: Path):
    """An explicit ``sheet_name`` selects the named sheet (not the first one),
    and with ``header: false`` the configured ``columns`` mapping supplies the
    column names for the headerless data."""
    input_file = tmp_path / "sheeted.xlsx"
    # Write two sheets; "Other" comes first so the test proves the named
    # sheet "Export" is actually the one being read.
    with pd.ExcelWriter(input_file, engine="openpyxl") as writer:
        pd.DataFrame({"skipme": ["ignore"]}).to_excel(writer, sheet_name="Other", index=False)
        pd.DataFrame([["A", 1], ["B", 2]]).to_excel(
            writer,
            sheet_name="Export",
            header=False,
            index=False,
        )

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx_sheet")

    info = duckdb_read.read_raw_to_relation(
        con,
        [input_file],
        {
            "header": False,
            "sheet_name": "Export",
            "columns": {"col0": "VARCHAR", "col1": "VARCHAR"},
        },
        "fallback",
        logger,
    )

    # params_used must echo back the effective sheet and column mapping.
    rows = con.execute("SELECT col0, col1 FROM raw_input ORDER BY col0").fetchall()
    assert info.source == "excel"
    assert info.params_used["sheet_name"] == "Export"
    assert info.params_used["columns"] == {"col0": "VARCHAR", "col1": "VARCHAR"}
    assert rows == [("A", 1), ("B", 2)]
    con.close()
238+
239+
178240
def test_resolve_clean_read_cfg_uses_suggested_hints_in_auto_mode(tmp_path: Path):
179241
raw_dir = tmp_path / "raw" / "demo" / "2024"
180242
profile_dir = raw_dir / "_profile"

tests/test_clean_input_selection.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,22 @@ def test_run_clean_accepts_csv_gz_inputs(tmp_path: Path, monkeypatch):
114114
assert seen["input_files"] == [gz_file]
115115

116116

117+
def test_run_clean_accepts_xlsx_inputs(tmp_path: Path, monkeypatch):
    """An .xlsx file placed under RAW is selected as a clean-layer input.

    Only input *selection* is exercised here, so a fake byte payload is
    enough — the file is never actually parsed as a workbook.
    """
    raw_dir = tmp_path / "data" / "raw" / "demo" / "2024"
    raw_dir.mkdir(parents=True, exist_ok=True)
    xlsx_file = raw_dir / "data.xlsx"
    xlsx_file.write_bytes(b"fake-xlsx-content")

    sql_path = _write_clean_sql(tmp_path)
    seen = _run_clean_capture_inputs(
        monkeypatch,
        tmp_path,
        {"sql": str(sql_path), "read": {}},
    )

    assert seen["input_files"] == [xlsx_file]
131+
132+
117133
def test_run_clean_include_pattern_restricts_to_matching_input(tmp_path: Path, monkeypatch):
118134
raw_dir = tmp_path / "data" / "raw" / "demo" / "2024"
119135
raw_dir.mkdir(parents=True, exist_ok=True)

toolkit/clean/duckdb_read.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any
77

88
import duckdb
9+
import pandas as pd
910
import yaml
1011
from toolkit.core.csv_read import (
1112
READ_SELECTION_KEYS,
@@ -19,7 +20,16 @@
1920
)
2021

2122

22-
SUPPORTED_INPUT_EXTS = {".csv", ".tsv", ".txt", ".parquet", ".csv.gz", ".tsv.gz", ".txt.gz"}
23+
# File extensions accepted as CLEAN-layer inputs (compared lowercase,
# including the compound gzip suffixes); ".xlsx" workbooks are routed to
# the pandas/openpyxl reader rather than DuckDB's CSV/Parquet readers.
SUPPORTED_INPUT_EXTS = {
    ".csv",
    ".tsv",
    ".txt",
    ".parquet",
    ".csv.gz",
    ".tsv.gz",
    ".txt.gz",
    ".xlsx",
}
2333

2434

2535
@dataclass(frozen=True)
@@ -277,6 +287,97 @@ def _execute_parquet_read(
277287
return ReadInfo(source="parquet", params_used={})
278288

279289

290+
def _normalize_excel_sheet_name(value: Any) -> str | int:
291+
if value is None:
292+
return 0
293+
if isinstance(value, bool):
294+
raise ValueError("clean.read.sheet_name must be a string, integer, or null")
295+
if isinstance(value, int):
296+
return value
297+
if isinstance(value, str):
298+
text = value.strip()
299+
if not text:
300+
return 0
301+
return text
302+
raise ValueError("clean.read.sheet_name must be a string, integer, or null")
303+
304+
305+
def _trim_excel_dataframe(df: pd.DataFrame) -> pd.DataFrame:
306+
return df.apply(lambda column: column.map(lambda value: value.strip() if isinstance(value, str) else value))
307+
308+
309+
def _load_excel_frame(
    input_file: Path,
    read_cfg: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Read a single ``.xlsx`` file into a DataFrame according to *read_cfg*.

    Honours the ``header``, ``skip``, ``trim_whitespace``, ``columns`` and
    ``sheet_name`` read options and returns both the frame and the dict of
    effective parameters (used for logging / ``ReadInfo``).

    Raises:
        ValueError: when ``columns`` is configured but its size differs from
            the number of columns detected in the sheet, or when
            ``sheet_name`` has an unsupported type.
    """
    has_header = bool(read_cfg.get("header", True))
    skip_value = read_cfg.get("skip")
    rows_to_skip = 0 if skip_value is None else int(skip_value)
    should_trim = read_cfg.get("trim_whitespace", True)
    column_mapping = read_cfg.get("columns")
    sheet = _normalize_excel_sheet_name(read_cfg.get("sheet_name"))

    # dtype=object keeps raw cell values; typing is left to the SQL layer.
    frame = pd.read_excel(
        input_file,
        sheet_name=sheet,
        header=0 if has_header else None,
        skiprows=rows_to_skip,
        dtype=object,
        engine="openpyxl",
    )

    if column_mapping:
        configured = list(column_mapping.keys())
        if len(configured) != len(frame.columns):
            raise ValueError(
                "Excel input columns mismatch. "
                f"Configured={len(configured)} detected={len(frame.columns)} file={input_file}"
            )
        frame.columns = configured
    elif not has_header:
        # Headerless sheet without an explicit mapping: use col0, col1, ...
        frame.columns = [f"col{index}" for index in range(len(frame.columns))]

    if should_trim:
        frame = _trim_excel_dataframe(frame)

    effective_params = {
        "sheet_name": sheet,
        "header": has_header,
        "skip": rows_to_skip,
        "trim_whitespace": bool(should_trim),
        "columns": dict(column_mapping) if column_mapping else None,
    }
    return frame, effective_params
349+
350+
351+
def _execute_excel_read(
    con: duckdb.DuckDBPyConnection,
    input_files: list[Path],
    read_cfg: dict[str, Any],
    *,
    logger,
) -> ReadInfo:
    """Load one or more ``.xlsx`` files into the DuckDB ``raw_input`` view.

    Each file is read with the same *read_cfg* options via
    ``_load_excel_frame``; multiple files are concatenated row-wise before
    being registered with DuckDB.

    Args:
        con: open DuckDB connection that receives the ``raw_input`` view.
        input_files: non-empty list of ``.xlsx`` paths to read.
        read_cfg: clean-layer read options (header/skip/columns/sheet_name/...).
        logger: logger used to record the effective read parameters.

    Returns:
        ReadInfo with ``source="excel"`` and the effective parameters
        (the ``columns`` key is omitted when no mapping was configured).

    Raises:
        ValueError: when *input_files* is empty.
    """
    # Guard explicitly: an empty list would otherwise surface as an obscure
    # IndexError at frames[0] below.
    if not input_files:
        raise ValueError("No .xlsx input files provided for the CLEAN read")

    frames: list[pd.DataFrame] = []
    params_used: dict[str, Any] | None = None

    for input_file in input_files:
        frame, frame_params = _load_excel_frame(input_file, read_cfg)
        frames.append(frame)
        # Every file is read with the same read_cfg, so the first file's
        # effective parameters are representative of the whole batch.
        if params_used is None:
            params_used = frame_params

    combined = pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
    con.register("raw_input_df", combined)
    con.execute("CREATE OR REPLACE VIEW raw_input AS SELECT * FROM raw_input_df;")

    used = dict(params_used or {})
    # Drop "columns" entirely when no mapping was configured so the logged
    # and returned parameters stay compact.
    if used.get("columns") is None:
        used.pop("columns", None)
    logger.info(
        "read_excel params used: source=excel params=%s",
        json.dumps(used, ensure_ascii=False, sort_keys=True),
    )
    return ReadInfo(source="excel", params_used=used)
379+
380+
280381
def _validate_read_mode(mode: str) -> str:
281382
normalized_mode = str(mode or "fallback")
282383
if normalized_mode not in {"strict", "fallback", "robust"}:
@@ -386,6 +487,8 @@ def read_raw_to_relation(
386487
info = _execute_parquet_read(con, input_files)
387488
logger.info("read_csv params used: source=parquet params={}")
388489
return info
490+
if exts <= {".xlsx"}:
491+
return _execute_excel_read(con, input_files, read_cfg, logger=logger)
389492

390493
normalized_mode = _validate_read_mode(mode)
391494
return _read_csv_relation(

toolkit/clean/input_selection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def is_supported_input_file(path: Path) -> bool:
1414
return False
1515
if name.endswith((".csv.gz", ".tsv.gz", ".txt.gz")):
1616
return True
17-
if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet"}:
17+
if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet", ".xlsx"}:
1818
return True
1919
return False
2020

toolkit/core/config_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ class CleanReadConfig(BaseModel):
272272
columns: dict[str, str] | None = None
273273
trim_whitespace: bool = True
274274
sample_size: int | None = None
275+
sheet_name: str | int | None = None
275276
mode: Literal["explicit", "latest", "largest", "all"] | None = None
276277
glob: str = "*"
277278
prefer_from_raw_run: bool = True

toolkit/core/csv_read.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"columns",
2828
"trim_whitespace",
2929
"sample_size",
30+
"sheet_name",
3031
}
3132
ALLOWED_NESTED_CSV_KEYS = {
3233
"delim",
@@ -45,6 +46,7 @@
4546
"nullstr",
4647
"columns",
4748
"trim_whitespace",
49+
"sheet_name",
4850
}
4951
FORMAT_HINT_KEYS = {
5052
"delim",

0 commit comments

Comments
 (0)