Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions tests/test_clean_duckdb_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,31 @@ def test_read_raw_to_relation_passes_parallel_flag(tmp_path: Path):
con.close()


def test_read_raw_to_relation_keeps_explicit_columns_unchanged(tmp_path: Path):
input_file = tmp_path / "ok.csv"
input_file.write_text("a;b\n1;2\n", encoding="utf-8")

con = duckdb.connect(":memory:")
logger = logging.getLogger("tests.clean.duckdb_read.columns")

info = duckdb_read.read_raw_to_relation(
con,
[input_file],
{
"delim": ";",
"encoding": "utf-8",
"header": True,
"columns": {"a": "VARCHAR", "b": "VARCHAR"},
},
"strict",
logger,
)

assert info.source == "strict"
assert info.params_used["columns"] == {"a": "VARCHAR", "b": "VARCHAR"}
con.close()


def test_read_raw_to_relation_strict_error_message_uses_current_config_keys(tmp_path: Path):
input_file = tmp_path / "bad.csv"
input_file.write_text("a;b\n1;2;3\n", encoding="utf-8")
Expand Down Expand Up @@ -117,6 +142,39 @@ def _fail_execute_csv_read(_con, _input_files, _read_cfg):
con.close()


def test_read_raw_to_relation_handles_no_header_fixed_schema_without_extra_column(tmp_path: Path):
input_file = tmp_path / "fixed.csv"
input_file.write_text("A,2024,1,123,45.6\nB,2024,2,456,78.9\n", encoding="utf-8")

con = duckdb.connect(":memory:")
logger = logging.getLogger("tests.clean.duckdb_read.fixed_schema")

info = duckdb_read.read_raw_to_relation(
con,
[input_file],
{
"delim": ",",
"encoding": "utf-8",
"header": False,
"auto_detect": False,
"columns": {
"col0": "VARCHAR",
"col1": "VARCHAR",
"col2": "VARCHAR",
"col3": "VARCHAR",
"col4": "VARCHAR",
},
},
"fallback",
logger,
)

rows = con.execute("SELECT col0, col1, col2, col3, col4 FROM raw_input ORDER BY col0").fetchall()
assert info.source == "strict"
assert rows == [("A", "2024", "1", "123", "45.6"), ("B", "2024", "2", "456", "78.9")]
con.close()


def test_resolve_clean_read_cfg_uses_suggested_hints_in_auto_mode(tmp_path: Path):
raw_dir = tmp_path / "raw" / "demo" / "2024"
profile_dir = raw_dir / "_profile"
Expand Down
2 changes: 0 additions & 2 deletions toolkit/clean/duckdb_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,6 @@ def _csv_read_options(read_cfg: dict[str, Any]) -> tuple[list[str], dict[str, An
source_columns = None
if columns:
source_columns = dict(columns)
if "__extra" not in source_columns:
source_columns["__extra"] = "VARCHAR"
cols_sql = ", ".join(
[f"'{sql_str(name)}': '{sql_str(dtype)}'" for name, dtype in source_columns.items()]
)
Expand Down