Skip to content

Commit 72ef3be

Browse files
authored
feat: add xlsx support in clean layer (#10)
1 parent 7233ecb commit 72ef3be

9 files changed

Lines changed: 200 additions & 2 deletions

docs/config-schema.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,20 @@ I path relativi sono sempre risolti rispetto alla directory che contiene `datase
9090
| `columns` | `dict[string,string] \| null` | `null` |
9191
| `trim_whitespace` | `bool` | `true` |
9292
| `sample_size` | `int \| null` | `null` |
93+
| `sheet_name` | `string \| int \| null` | `null` |
9394
| `mode` | `explicit \| latest \| largest \| all \| null` | `null` |
9495
| `glob` | `string` | `*` |
9596
| `prefer_from_raw_run` | `bool` | `true` |
9697
| `allow_ambiguous` | `bool` | `false` |
9798
| `include` | `list[string] \| null` | `null` |
9899

100+
Note pratiche:
101+
102+
- i file `.xlsx` sono supportati nel layer CLEAN
103+
- RAW conserva il workbook originale senza convertirlo
104+
- per `.xlsx`, le opzioni utili sono soprattutto `header`, `skip`, `columns`, `trim_whitespace`, `sheet_name`
105+
- `sheet_name` usa il primo foglio se omesso
106+
99107
`CleanValidate`:
100108

101109
| Campo | Tipo | Default |

docs/runtime-boundaries.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,8 @@ Le sorgenti builtin supportate dal runtime canonico sono:
3737

3838
- `local_file`
3939
- `http_file`
40+
41+
Nota:
42+
43+
- il runtime canonico può conservare file `.xlsx` in RAW e leggerli in CLEAN
44+
- questo non cambia il ruolo del layer RAW: il file originale resta l'artefatto sorgente

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ dependencies = [
2727
"pyyaml>=6.0",
2828
"pydantic>=2.8.0",
2929
"pandas>=2.0",
30+
"openpyxl>=3.1.0",
3031
"pyarrow>=14.0",
3132
"duckdb>=0.10.0",
3233
"requests>=2.31",

tests/test_clean_duckdb_read.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pathlib import Path
55

66
import duckdb
7+
import pandas as pd
78
import pytest
89
import yaml
910

@@ -175,6 +176,67 @@ def test_read_raw_to_relation_handles_no_header_fixed_schema_without_extra_colum
175176
con.close()
176177

177178

179+
def test_read_raw_to_relation_reads_xlsx_first_sheet(tmp_path: Path):
    """With default read options, an .xlsx input is loaded from its first
    sheet (sheet index 0) and exposed as the ``raw_input`` relation with
    cell types preserved."""
    input_file = tmp_path / "ok.xlsx"
    pd.DataFrame(
        [
            {"Anno": 2022, "Regione": "Lazio", "Domanda": 123.4},
            {"Anno": 2022, "Regione": "Umbria", "Domanda": 56.7},
        ]
    ).to_excel(input_file, index=False)

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx")

    info = duckdb_read.read_raw_to_relation(
        con,
        [input_file],
        {"header": True},
        "fallback",
        logger,
    )

    # Excel reads must report source="excel" and default sheet_name to 0.
    rows = con.execute('SELECT "Anno", "Regione", "Domanda" FROM raw_input ORDER BY "Regione"').fetchall()
    assert info.source == "excel"
    assert info.params_used["sheet_name"] == 0
    assert rows == [(2022, "Lazio", 123.4), (2022, "Umbria", 56.7)]
    con.close()
204+
205+
206+
def test_read_raw_to_relation_reads_xlsx_with_explicit_sheet_and_columns(tmp_path: Path):
    """An explicit ``sheet_name`` selects the named sheet (not the first one),
    and with ``header: false`` the configured ``columns`` mapping supplies the
    column names for the headerless data."""
    input_file = tmp_path / "sheeted.xlsx"
    # Write two sheets; "Other" comes first so the test proves the named
    # sheet "Export" is actually the one being read.
    with pd.ExcelWriter(input_file, engine="openpyxl") as writer:
        pd.DataFrame({"skipme": ["ignore"]}).to_excel(writer, sheet_name="Other", index=False)
        pd.DataFrame([["A", 1], ["B", 2]]).to_excel(
            writer,
            sheet_name="Export",
            header=False,
            index=False,
        )

    con = duckdb.connect(":memory:")
    logger = logging.getLogger("tests.clean.duckdb_read.xlsx_sheet")

    info = duckdb_read.read_raw_to_relation(
        con,
        [input_file],
        {
            "header": False,
            "sheet_name": "Export",
            "columns": {"col0": "VARCHAR", "col1": "VARCHAR"},
        },
        "fallback",
        logger,
    )

    # params_used must echo back the effective sheet and column mapping.
    rows = con.execute("SELECT col0, col1 FROM raw_input ORDER BY col0").fetchall()
    assert info.source == "excel"
    assert info.params_used["sheet_name"] == "Export"
    assert info.params_used["columns"] == {"col0": "VARCHAR", "col1": "VARCHAR"}
    assert rows == [("A", 1), ("B", 2)]
    con.close()
238+
239+
178240
def test_resolve_clean_read_cfg_uses_suggested_hints_in_auto_mode(tmp_path: Path):
179241
raw_dir = tmp_path / "raw" / "demo" / "2024"
180242
profile_dir = raw_dir / "_profile"

tests/test_clean_input_selection.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,22 @@ def test_run_clean_accepts_csv_gz_inputs(tmp_path: Path, monkeypatch):
114114
assert seen["input_files"] == [gz_file]
115115

116116

117+
def test_run_clean_accepts_xlsx_inputs(tmp_path: Path, monkeypatch):
    """An .xlsx file placed under RAW is selected as a clean-layer input.

    Only input *selection* is exercised here, so a fake byte payload is
    enough — the file is never actually parsed as a workbook.
    """
    raw_dir = tmp_path / "data" / "raw" / "demo" / "2024"
    raw_dir.mkdir(parents=True, exist_ok=True)
    xlsx_file = raw_dir / "data.xlsx"
    xlsx_file.write_bytes(b"fake-xlsx-content")

    sql_path = _write_clean_sql(tmp_path)
    seen = _run_clean_capture_inputs(
        monkeypatch,
        tmp_path,
        {"sql": str(sql_path), "read": {}},
    )

    assert seen["input_files"] == [xlsx_file]
131+
132+
117133
def test_run_clean_include_pattern_restricts_to_matching_input(tmp_path: Path, monkeypatch):
118134
raw_dir = tmp_path / "data" / "raw" / "demo" / "2024"
119135
raw_dir.mkdir(parents=True, exist_ok=True)

toolkit/clean/duckdb_read.py

Lines changed: 104 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Any
77

88
import duckdb
9+
import pandas as pd
910
import yaml
1011
from toolkit.core.csv_read import (
1112
READ_SELECTION_KEYS,
@@ -19,7 +20,16 @@
1920
)
2021

2122

22-
SUPPORTED_INPUT_EXTS = {".csv", ".tsv", ".txt", ".parquet", ".csv.gz", ".tsv.gz", ".txt.gz"}
23+
# File extensions accepted as CLEAN-layer inputs (compared lowercase,
# including the compound gzip suffixes); ".xlsx" workbooks are routed to
# the pandas/openpyxl reader rather than DuckDB's CSV/Parquet readers.
SUPPORTED_INPUT_EXTS = {
    ".csv",
    ".tsv",
    ".txt",
    ".parquet",
    ".csv.gz",
    ".tsv.gz",
    ".txt.gz",
    ".xlsx",
}
2333

2434

2535
@dataclass(frozen=True)
@@ -277,6 +287,97 @@ def _execute_parquet_read(
277287
return ReadInfo(source="parquet", params_used={})
278288

279289

290+
def _normalize_excel_sheet_name(value: Any) -> str | int:
291+
if value is None:
292+
return 0
293+
if isinstance(value, bool):
294+
raise ValueError("clean.read.sheet_name must be a string, integer, or null")
295+
if isinstance(value, int):
296+
return value
297+
if isinstance(value, str):
298+
text = value.strip()
299+
if not text:
300+
return 0
301+
return text
302+
raise ValueError("clean.read.sheet_name must be a string, integer, or null")
303+
304+
305+
def _trim_excel_dataframe(df: pd.DataFrame) -> pd.DataFrame:
306+
return df.apply(lambda column: column.map(lambda value: value.strip() if isinstance(value, str) else value))
307+
308+
309+
def _load_excel_frame(
    input_file: Path,
    read_cfg: dict[str, Any],
) -> tuple[pd.DataFrame, dict[str, Any]]:
    """Read a single ``.xlsx`` file into a DataFrame according to *read_cfg*.

    Honours the ``header``, ``skip``, ``trim_whitespace``, ``columns`` and
    ``sheet_name`` read options and returns both the frame and the dict of
    effective parameters (used for logging / ``ReadInfo``).

    Raises:
        ValueError: when ``columns`` is configured but its size differs from
            the number of columns detected in the sheet, or when
            ``sheet_name`` has an unsupported type.
    """
    has_header = bool(read_cfg.get("header", True))
    skip_value = read_cfg.get("skip")
    rows_to_skip = 0 if skip_value is None else int(skip_value)
    should_trim = read_cfg.get("trim_whitespace", True)
    column_mapping = read_cfg.get("columns")
    sheet = _normalize_excel_sheet_name(read_cfg.get("sheet_name"))

    # dtype=object keeps raw cell values; typing is left to the SQL layer.
    frame = pd.read_excel(
        input_file,
        sheet_name=sheet,
        header=0 if has_header else None,
        skiprows=rows_to_skip,
        dtype=object,
        engine="openpyxl",
    )

    if column_mapping:
        configured = list(column_mapping.keys())
        if len(configured) != len(frame.columns):
            raise ValueError(
                "Excel input columns mismatch. "
                f"Configured={len(configured)} detected={len(frame.columns)} file={input_file}"
            )
        frame.columns = configured
    elif not has_header:
        # Headerless sheet without an explicit mapping: use col0, col1, ...
        frame.columns = [f"col{index}" for index in range(len(frame.columns))]

    if should_trim:
        frame = _trim_excel_dataframe(frame)

    effective_params = {
        "sheet_name": sheet,
        "header": has_header,
        "skip": rows_to_skip,
        "trim_whitespace": bool(should_trim),
        "columns": dict(column_mapping) if column_mapping else None,
    }
    return frame, effective_params
349+
350+
351+
def _execute_excel_read(
    con: duckdb.DuckDBPyConnection,
    input_files: list[Path],
    read_cfg: dict[str, Any],
    *,
    logger,
) -> ReadInfo:
    """Load one or more ``.xlsx`` files into the DuckDB ``raw_input`` view.

    Each file is read with the same *read_cfg* options via
    ``_load_excel_frame``; multiple files are concatenated row-wise before
    being registered with DuckDB.

    Args:
        con: open DuckDB connection that receives the ``raw_input`` view.
        input_files: non-empty list of ``.xlsx`` paths to read.
        read_cfg: clean-layer read options (header/skip/columns/sheet_name/...).
        logger: logger used to record the effective read parameters.

    Returns:
        ReadInfo with ``source="excel"`` and the effective parameters
        (the ``columns`` key is omitted when no mapping was configured).

    Raises:
        ValueError: when *input_files* is empty.
    """
    # Guard explicitly: an empty list would otherwise surface as an obscure
    # IndexError at frames[0] below.
    if not input_files:
        raise ValueError("No .xlsx input files provided for the CLEAN read")

    frames: list[pd.DataFrame] = []
    params_used: dict[str, Any] | None = None

    for input_file in input_files:
        frame, frame_params = _load_excel_frame(input_file, read_cfg)
        frames.append(frame)
        # Every file is read with the same read_cfg, so the first file's
        # effective parameters are representative of the whole batch.
        if params_used is None:
            params_used = frame_params

    combined = pd.concat(frames, ignore_index=True) if len(frames) > 1 else frames[0]
    con.register("raw_input_df", combined)
    con.execute("CREATE OR REPLACE VIEW raw_input AS SELECT * FROM raw_input_df;")

    used = dict(params_used or {})
    # Drop "columns" entirely when no mapping was configured so the logged
    # and returned parameters stay compact.
    if used.get("columns") is None:
        used.pop("columns", None)
    logger.info(
        "read_excel params used: source=excel params=%s",
        json.dumps(used, ensure_ascii=False, sort_keys=True),
    )
    return ReadInfo(source="excel", params_used=used)
379+
380+
280381
def _validate_read_mode(mode: str) -> str:
281382
normalized_mode = str(mode or "fallback")
282383
if normalized_mode not in {"strict", "fallback", "robust"}:
@@ -386,6 +487,8 @@ def read_raw_to_relation(
386487
info = _execute_parquet_read(con, input_files)
387488
logger.info("read_csv params used: source=parquet params={}")
388489
return info
490+
if exts <= {".xlsx"}:
491+
return _execute_excel_read(con, input_files, read_cfg, logger=logger)
389492

390493
normalized_mode = _validate_read_mode(mode)
391494
return _read_csv_relation(

toolkit/clean/input_selection.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def is_supported_input_file(path: Path) -> bool:
1414
return False
1515
if name.endswith((".csv.gz", ".tsv.gz", ".txt.gz")):
1616
return True
17-
if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet"}:
17+
if path.suffix.lower() in {".csv", ".tsv", ".txt", ".parquet", ".xlsx"}:
1818
return True
1919
return False
2020

toolkit/core/config_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,7 @@ class CleanReadConfig(BaseModel):
272272
columns: dict[str, str] | None = None
273273
trim_whitespace: bool = True
274274
sample_size: int | None = None
275+
sheet_name: str | int | None = None
275276
mode: Literal["explicit", "latest", "largest", "all"] | None = None
276277
glob: str = "*"
277278
prefer_from_raw_run: bool = True

toolkit/core/csv_read.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
"columns",
2828
"trim_whitespace",
2929
"sample_size",
30+
"sheet_name",
3031
}
3132
ALLOWED_NESTED_CSV_KEYS = {
3233
"delim",
@@ -45,6 +46,7 @@
4546
"nullstr",
4647
"columns",
4748
"trim_whitespace",
49+
"sheet_name",
4850
}
4951
FORMAT_HINT_KEYS = {
5052
"delim",

0 commit comments

Comments
 (0)