Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/config-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,20 @@ I path relativi sono sempre risolti rispetto alla directory che contiene `datase
| `columns` | `dict[string,string] \| null` | `null` |
| `trim_whitespace` | `bool` | `true` |
| `sample_size` | `int \| null` | `null` |
| `sheet_name` | `string \| int \| null` | `null` |
| `mode` | `explicit \| latest \| largest \| all \| null` | `null` |
| `glob` | `string` | `*` |
| `prefer_from_raw_run` | `bool` | `true` |
| `allow_ambiguous` | `bool` | `false` |
| `include` | `list[string] \| null` | `null` |

Note pratiche:

- i file `.xlsx` sono supportati nel layer CLEAN
- RAW conserva il workbook originale senza convertirlo
- per `.xlsx`, le opzioni utili sono soprattutto `header`, `skip`, `columns`, `trim_whitespace`, `sheet_name`
- `sheet_name` usa il primo foglio se omesso

`CleanValidate`:

| Campo | Tipo | Default |
Expand Down
5 changes: 5 additions & 0 deletions docs/runtime-boundaries.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@ Le sorgenti builtin supportate dal runtime canonico sono:

- `local_file`
- `http_file`

Nota:

- il runtime canonico può conservare file `.xlsx` in RAW e leggerli in CLEAN
- questo non cambia il ruolo del layer RAW: il file originale resta l'artefatto sorgente
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ dependencies = [
"pyyaml>=6.0",
"pydantic>=2.8.0",
"pandas>=2.0",
"openpyxl>=3.1.0",
"pyarrow>=14.0",
"duckdb>=0.10.0",
"requests>=2.31",
Expand Down
62 changes: 62 additions & 0 deletions tests/test_clean_duckdb_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

import duckdb
import pandas as pd
import pytest
import yaml

Expand Down Expand Up @@ -175,6 +176,67 @@ def test_read_raw_to_relation_handles_no_header_fixed_schema_without_extra_colum
con.close()


def test_read_raw_to_relation_reads_xlsx_first_sheet(tmp_path: Path):
    """An .xlsx input with a header row is read from its first sheet by default."""
    xlsx_path = tmp_path / "ok.xlsx"
    frame = pd.DataFrame(
        {
            "Anno": [2022, 2022],
            "Regione": ["Lazio", "Umbria"],
            "Domanda": [123.4, 56.7],
        }
    )
    frame.to_excel(xlsx_path, index=False)

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx")

    info = duckdb_read.read_raw_to_relation(
        con,
        [xlsx_path],
        {"header": True},
        "fallback",
        logger,
    )

    fetched = con.execute(
        'SELECT "Anno", "Regione", "Domanda" FROM raw_input ORDER BY "Regione"'
    ).fetchall()
    assert info.source == "excel"
    # sheet_name was omitted, so the reader must fall back to sheet index 0.
    assert info.params_used["sheet_name"] == 0
    assert fetched == [(2022, "Lazio", 123.4), (2022, "Umbria", 56.7)]
    con.close()


def test_read_raw_to_relation_reads_xlsx_with_explicit_sheet_and_columns(tmp_path: Path):
    """An explicit sheet_name plus a columns mapping selects and renames the target sheet."""
    xlsx_path = tmp_path / "sheeted.xlsx"
    with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
        # Decoy sheet first: the reader must NOT pick it up.
        pd.DataFrame({"skipme": ["ignore"]}).to_excel(writer, sheet_name="Other", index=False)
        export_frame = pd.DataFrame([["A", 1], ["B", 2]])
        export_frame.to_excel(writer, sheet_name="Export", header=False, index=False)

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx_sheet")

    read_cfg = {
        "header": False,
        "sheet_name": "Export",
        "columns": {"col0": "VARCHAR", "col1": "VARCHAR"},
    }
    info = duckdb_read.read_raw_to_relation(
        con,
        [xlsx_path],
        read_cfg,
        "fallback",
        logger,
    )

    fetched = con.execute("SELECT col0, col1 FROM raw_input ORDER BY col0").fetchall()
    assert info.source == "excel"
    assert info.params_used["sheet_name"] == "Export"
    assert info.params_used["columns"] == {"col0": "VARCHAR", "col1": "VARCHAR"}
    assert fetched == [("A", 1), ("B", 2)]
    con.close()


def test_resolve_clean_read_cfg_uses_suggested_hints_in_auto_mode(tmp_path: Path):
raw_dir = tmp_path / "raw" / "demo" / "2024"
profile_dir = raw_dir / "_profile"
Expand Down
16 changes: 16 additions & 0 deletions tests/test_clean_input_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,22 @@ def test_run_clean_accepts_csv_gz_inputs(tmp_path: Path, monkeypatch):
assert seen["input_files"] == [gz_file]


def test_run_clean_accepts_xlsx_inputs(tmp_path: Path, monkeypatch):
    """A lone .xlsx file in RAW is selected as a CLEAN input."""
    raw_dir = tmp_path / "data" / "raw" / "demo" / "2024"
    raw_dir.mkdir(parents=True, exist_ok=True)
    workbook = raw_dir / "data.xlsx"
    # Content is never parsed here: input selection only inspects the filename.
    workbook.write_bytes(b"fake-xlsx-content")

    clean_cfg = {"sql": str(_write_clean_sql(tmp_path)), "read": {}}
    captured = _run_clean_capture_inputs(monkeypatch, tmp_path, clean_cfg)

    assert captured["input_files"] == [workbook]


def test_run_clean_include_pattern_restricts_to_matching_input(tmp_path: Path, monkeypatch):
raw_dir = tmp_path / "data" / "raw" / "demo" / "2024"
raw_dir.mkdir(parents=True, exist_ok=True)
Expand Down
105 changes: 104 additions & 1 deletion toolkit/clean/duckdb_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Any

import duckdb
import pandas as pd
import yaml
from toolkit.core.csv_read import (
READ_SELECTION_KEYS,
Expand All @@ -19,7 +20,16 @@
)


SUPPORTED_INPUT_EXTS = {".csv", ".tsv", ".txt", ".parquet", ".csv.gz", ".tsv.gz", ".txt.gz"}
# File extensions the CLEAN layer can ingest directly from RAW.
# Gzipped variants are matched on the full compound suffix
# (e.g. "name.csv.gz"), so they are listed separately from ".csv".
SUPPORTED_INPUT_EXTS = {
    ".csv",
    ".tsv",
    ".txt",
    ".parquet",
    ".csv.gz",
    ".tsv.gz",
    ".txt.gz",
    ".xlsx",
}


@dataclass(frozen=True)
Expand Down Expand Up @@ -277,6 +287,97 @@ def _execute_parquet_read(
return ReadInfo(source="parquet", params_used={})


def _normalize_excel_sheet_name(value: Any) -> str | int:
if value is None:
return 0
if isinstance(value, bool):
raise ValueError("clean.read.sheet_name must be a string, integer, or null")
if isinstance(value, int):
return value
if isinstance(value, str):
text = value.strip()
if not text:
return 0
return text
raise ValueError("clean.read.sheet_name must be a string, integer, or null")


def _trim_excel_dataframe(df: pd.DataFrame) -> pd.DataFrame:
return df.apply(lambda column: column.map(lambda value: value.strip() if isinstance(value, str) else value))


def _load_excel_frame(
    input_file: Path,
    read_cfg: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Read one .xlsx file into a DataFrame, honoring the clean.read options.

    Returns the frame together with the normalized parameters actually used,
    so the caller can log/report them. Raises ValueError when an explicit
    ``columns`` mapping does not match the detected column count.
    """
    has_header = bool(read_cfg.get("header", True))
    raw_skip = read_cfg.get("skip")
    skip_rows = 0 if raw_skip is None else int(raw_skip)
    trim = read_cfg.get("trim_whitespace", True)
    column_map = read_cfg.get("columns")
    sheet = _normalize_excel_sheet_name(read_cfg.get("sheet_name"))

    frame = pd.read_excel(
        input_file,
        sheet_name=sheet,
        header=0 if has_header else None,
        skiprows=skip_rows,
        dtype=object,  # keep raw cell values; typing happens later in SQL
        engine="openpyxl",
    )

    if column_map:
        configured = list(column_map)
        if len(configured) != len(frame.columns):
            raise ValueError(
                "Excel input columns mismatch. "
                f"Configured={len(configured)} detected={len(frame.columns)} file={input_file}"
            )
        frame.columns = configured
    elif not has_header:
        # No header row and no explicit mapping: synthesize positional names.
        frame.columns = [f"col{index}" for index in range(len(frame.columns))]

    if trim:
        frame = _trim_excel_dataframe(frame)

    params = {
        "sheet_name": sheet,
        "header": has_header,
        "skip": skip_rows,
        "trim_whitespace": bool(trim),
        "columns": dict(column_map) if column_map else None,
    }
    return frame, params


def _execute_excel_read(
    con: duckdb.DuckDBPyConnection,
    input_files: list[Path],
    read_cfg: dict[str, Any],
    *,
    logger,
) -> ReadInfo:
    """Load one or more .xlsx inputs and expose them to DuckDB as ``raw_input``.

    Every file is read with the same read_cfg; the parameters reported in the
    returned ReadInfo come from the first file.
    """
    loaded: list[pd.DataFrame] = []
    first_params: dict[str, Any] | None = None

    for path in input_files:
        frame, used = _load_excel_frame(path, read_cfg)
        if first_params is None:
            first_params = used
        loaded.append(frame)

    if len(loaded) > 1:
        combined = pd.concat(loaded, ignore_index=True)
    else:
        combined = loaded[0]
    con.register("raw_input_df", combined)
    con.execute("CREATE OR REPLACE VIEW raw_input AS SELECT * FROM raw_input_df;")

    reported = dict(first_params or {})
    # Drop "columns" from the report entirely when no mapping was configured.
    if reported.get("columns") is None:
        reported.pop("columns", None)
    logger.info(
        "read_excel params used: source=excel params=%s",
        json.dumps(reported, ensure_ascii=False, sort_keys=True),
    )
    return ReadInfo(source="excel", params_used=reported)


def _validate_read_mode(mode: str) -> str:
normalized_mode = str(mode or "fallback")
if normalized_mode not in {"strict", "fallback", "robust"}:
Expand Down Expand Up @@ -386,6 +487,8 @@ def read_raw_to_relation(
info = _execute_parquet_read(con, input_files)
logger.info("read_csv params used: source=parquet params={}")
return info
if exts <= {".xlsx"}:
return _execute_excel_read(con, input_files, read_cfg, logger=logger)

normalized_mode = _validate_read_mode(mode)
return _read_csv_relation(
Expand Down
2 changes: 1 addition & 1 deletion toolkit/clean/input_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def is_supported_input_file(path: Path) -> bool:
return False
if name.endswith((".csv.gz", ".tsv.gz", ".txt.gz")):
return True
if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet"}:
if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet", ".xlsx"}:
return True
return False

Expand Down
1 change: 1 addition & 0 deletions toolkit/core/config_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ class CleanReadConfig(BaseModel):
columns: dict[str, str] | None = None
trim_whitespace: bool = True
sample_size: int | None = None
sheet_name: str | int | None = None
mode: Literal["explicit", "latest", "largest", "all"] | None = None
glob: str = "*"
prefer_from_raw_run: bool = True
Expand Down
2 changes: 2 additions & 0 deletions toolkit/core/csv_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"columns",
"trim_whitespace",
"sample_size",
"sheet_name",
}
ALLOWED_NESTED_CSV_KEYS = {
"delim",
Expand All @@ -45,6 +46,7 @@
"nullstr",
"columns",
"trim_whitespace",
"sheet_name",
}
FORMAT_HINT_KEYS = {
"delim",
Expand Down