dataciviclab · Gabrymi93 · Mar 3, 2026 · Mar 3, 2026
diff --git a/tests/test_clean_duckdb_read.py b/tests/test_clean_duckdb_read.py
@@ -89,6 +89,31 @@ def test_read_raw_to_relation_passes_parallel_flag(tmp_path: Path):
     con.close()
 
 
+def test_read_raw_to_relation_keeps_explicit_columns_unchanged(tmp_path: Path):
+    """Strict read must report exactly the caller-declared ``columns`` mapping."""
+    expected_columns = {"a": "VARCHAR", "b": "VARCHAR"}
+    csv_path = tmp_path / "ok.csv"
+    csv_path.write_text("a;b\n1;2\n", encoding="utf-8")
+    read_cfg = {
+        "delim": ";",
+        "encoding": "utf-8",
+        "header": True,
+        "columns": dict(expected_columns),
+    }
+
+    con = duckdb.connect(":memory:")
+    read_info = duckdb_read.read_raw_to_relation(
+        con,
+        [csv_path],
+        read_cfg,
+        "strict",
+        logging.getLogger("tests.clean.duckdb_read.columns"),
+    )
+    assert read_info.source == "strict"
+    assert read_info.params_used["columns"] == expected_columns
+    con.close()
+
+
 def test_read_raw_to_relation_strict_error_message_uses_current_config_keys(tmp_path: Path):
     input_file = tmp_path / "bad.csv"
     input_file.write_text("a;b\n1;2;3\n", encoding="utf-8")
@@ -117,6 +142,39 @@ def _fail_execute_csv_read(_con, _input_files, _read_cfg):
     con.close()
 
 
+def test_read_raw_to_relation_handles_no_header_fixed_schema_without_extra_column(tmp_path: Path):
+    """Headerless CSV with a five-column explicit schema loads with rows intact."""
+    csv_path = tmp_path / "fixed.csv"
+    csv_path.write_text("A,2024,1,123,45.6\nB,2024,2,456,78.9\n", encoding="utf-8")
+    column_names = ("col0", "col1", "col2", "col3", "col4")
+    read_cfg = {
+        "delim": ",",
+        "encoding": "utf-8",
+        "header": False,
+        "auto_detect": False,
+        "columns": dict.fromkeys(column_names, "VARCHAR"),
+    }
+
+    con = duckdb.connect(":memory:")
+    read_info = duckdb_read.read_raw_to_relation(
+        con,
+        [csv_path],
+        read_cfg,
+        "fallback",
+        logging.getLogger("tests.clean.duckdb_read.fixed_schema"),
+    )
+
+    # Mode is "fallback" but source must be "strict" (presumably tried first; confirm).
+    query = "SELECT col0, col1, col2, col3, col4 FROM raw_input ORDER BY col0"
+    fetched = con.execute(query).fetchall()
+    assert read_info.source == "strict"
+    assert fetched == [
+        ("A", "2024", "1", "123", "45.6"),
+        ("B", "2024", "2", "456", "78.9"),
+    ]
+    con.close()
+
+
 def test_resolve_clean_read_cfg_uses_suggested_hints_in_auto_mode(tmp_path: Path):
     raw_dir = tmp_path / "raw" / "demo" / "2024"
     profile_dir = raw_dir / "_profile"

diff --git a/toolkit/clean/duckdb_read.py b/toolkit/clean/duckdb_read.py
@@ -200,8 +200,6 @@ def _csv_read_options(read_cfg: dict[str, Any]) -> tuple[list[str], dict[str, An
     source_columns = None
     if columns:
         source_columns = dict(columns)
-        if "__extra" not in source_columns:
-            source_columns["__extra"] = "VARCHAR"
         cols_sql = ", ".join(
             [f"'{sql_str(name)}': '{sql_str(dtype)}'" for name, dtype in source_columns.items()]
         )