From 49374a9e8fdc42f54cdbba40b5e24608424b2559 Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Thu, 26 Mar 2026 16:46:07 +0000 Subject: [PATCH 01/12] feat: add CSV validation checks for allowed values and organisation ranges --- .gitignore | 2 +- digital_land/expectations/checkpoints/csv.py | 4 + digital_land/expectations/operations/csv.py | 129 ++++++++++++++++++ .../expectations/operations/test_csv.py | 100 ++++++++++++++ 4 files changed, 234 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9cb740a8..6dda51b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ __pycache__/ *.py[cod] - +.history/ .cache .coverage build diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py index 08e47d2d..15ebebd0 100644 --- a/digital_land/expectations/checkpoints/csv.py +++ b/digital_land/expectations/checkpoints/csv.py @@ -9,6 +9,8 @@ check_unique, check_no_shared_values, check_no_overlapping_ranges, + check_lookup_entities_are_within_organisation_ranges, + check_allowed_values ) @@ -24,6 +26,8 @@ def operation_factory(self, operation_string: str): "check_unique": check_unique, "check_no_shared_values": check_no_shared_values, "check_no_overlapping_ranges": check_no_overlapping_ranges, + "check_lookup_entities_are_within_organisation_ranges":check_lookup_entities_are_within_organisation_ranges, + "check_allowed_values":check_allowed_values } if operation_string not in operation_map: raise ValueError( diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 5e6267b9..ddae091e 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -157,3 +157,132 @@ def check_no_overlapping_ranges(conn, file_path: Path, min_field: str, max_field } return passed, message, details + + +def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list): + """ + Checks that a field contains only values from an allowed set. + + Args: + conn: duckdb connection + file_path: path to the CSV file + field: the column name to validate + allowed_values: allowed values for the field + """ + cleaned_allowed_values = [ + str(value).strip().replace("'", "''") + for value in (allowed_values or []) + if str(value).strip() != "" + ] + + if not cleaned_allowed_values: + raise ValueError("allowed_values must contain at least one non-empty value") + + allowed_values_sql = ",".join("'" + value + "'" for value in cleaned_allowed_values) + + result = conn.execute( + f""" + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + TRIM(COALESCE("{field}", '')) AS value + FROM {_read_csv(file_path)} + WHERE TRIM(COALESCE("{field}", '')) NOT IN ({allowed_values_sql}) + """ + ).fetchall() + + invalid_rows = [{"line_number": row[0], "value": row[1]} for row in result] + invalid_values = sorted({row["value"] for row in invalid_rows}) + + if len(invalid_rows) == 0: + passed = True + message = f"all values in '{field}' are allowed" + else: + passed = False + message = ( + f"there were {len(invalid_rows)} invalid values in '{field}'" + ) + + details = { + "field": field, + "allowed_values": sorted({value for value in cleaned_allowed_values}), + "invalid_values": invalid_values, + "invalid_rows": invalid_rows, + } + + return passed, message, details + + +def check_lookup_entities_are_within_organisation_ranges( + conn, file_path: Path, organisation_file: Path, ignored_organisations: list = None +): + """ + Checks that lookup entities are within any valid range from an organisation file. + + Args: + conn: duckdb connection + file_path: path to the lookup CSV file + organisation_file: path to the entity-organisation CSV file + ignored_organisations: list of organisations to ignore (i.e. not check that their entities are within a valid range) + """ + ignored_values = [ + org.replace("'", "''") + for org in (ignored_organisations or []) + if isinstance(org, str) and org.strip() + ] + ignored_clause = "" + if ignored_values: + ignored_values_sql = ",".join("'" + org + "'" for org in ignored_values) + ignored_clause = ( + " AND TRIM(COALESCE(\"organisation\", '')) NOT IN " + + f"({ignored_values_sql})" + ) + + result = conn.execute( + f""" + WITH ranges AS ( + SELECT + TRY_CAST("entity-minimum" AS BIGINT) AS min_entity, + TRY_CAST("entity-maximum" AS BIGINT) AS max_entity + FROM {_read_csv(organisation_file)} + WHERE TRY_CAST("entity-minimum" AS BIGINT) IS NOT NULL + AND TRY_CAST("entity-maximum" AS BIGINT) IS NOT NULL + ), + lookup_rows AS ( + SELECT + TRY_CAST("entity" AS BIGINT) AS entity, + TRIM(COALESCE("organisation", '')) AS organisation, + COALESCE("reference", '') AS reference + FROM {_read_csv(file_path)} + WHERE TRIM(COALESCE("organisation", '')) != '' + {ignored_clause} + ) + SELECT entity, organisation, reference + FROM lookup_rows l + WHERE organisation != '' + AND entity IS NOT NULL + AND NOT EXISTS ( + SELECT 1 + FROM ranges r + WHERE l.entity BETWEEN r.min_entity AND r.max_entity + ) + """ + ).fetchall() + + out_of_range_rows = [ + {"entity": row[0], "organisation": row[1], "reference": row[2]} + for row in result + ] + + if len(out_of_range_rows) == 0: + passed = True + message = "all lookup entities are within allowed ranges" + else: + passed = False + message = f"there were {len(out_of_range_rows)} out-of-range rows found" + + details = { + "invalid_rows": out_of_range_rows, + } + + return passed, message, details + diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 77a9677c..7cd57437 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -7,6 +7,8 @@ check_unique, check_no_shared_values, check_no_overlapping_ranges, + check_allowed_values, + check_lookup_entities_are_within_organisation_ranges, ) @@ -201,3 +203,101 @@ def test_check_no_overlapping_ranges_adjacent_fails(tmp_path): ) assert passed is False assert len(details["overlaps"]) == 1 + + +def test_check_lookup_entities_are_within_organisation_ranges_fails(tmp_path): + lookup_file = tmp_path / "lookup.csv" + with open(lookup_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "organisation", "reference"]) + writer.writerow(["150", "org-1", "ok-ref"]) + writer.writerow(["999", "org-2", "bad-ref"]) + + organisation_file = tmp_path / "entity-organisation.csv" + with open(organisation_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum", "organisation"]) + writer.writerow(["100", "200", "org-1"]) + writer.writerow(["300", "400", "org-2"]) + + conn = duckdb.connect() + passed, message, details = check_lookup_entities_are_within_organisation_ranges( + conn, file_path=lookup_file, organisation_file=organisation_file + ) + + assert passed is False + assert "out-of-range" in message + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["entity"] == 999 + assert details["invalid_rows"][0]["organisation"] == "org-2" + + +def test_check_lookup_entities_are_within_organisation_ranges_ignores_org(tmp_path): + lookup_file = tmp_path / "lookup.csv" + with open(lookup_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "organisation", "reference"]) + writer.writerow(["150", "org-1", "ok-ref"]) + writer.writerow(["999", "org-2", "ignored-ref"]) + + organisation_file = tmp_path / "entity-organisation.csv" + with open(organisation_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum", "organisation"]) + writer.writerow(["100", "200", "org-1"]) + writer.writerow(["300", "400", "org-2"]) + + conn = duckdb.connect() + passed, message, details = check_lookup_entities_are_within_organisation_ranges( + conn, + file_path=lookup_file, + organisation_file=organisation_file, + ignored_organisations=["org-2"], + ) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_allowed_values_fails_for_old_entity_status(tmp_path): + file_path = tmp_path / "old-entity.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["old-entity", "status", "entity"]) + writer.writerow(["1001", "301", "2001"]) + writer.writerow(["1002", "410", "2002"]) + writer.writerow(["1003", "302", "2003"]) + + conn = duckdb.connect() + passed, message, details = check_allowed_values( + conn, + file_path=file_path, + field="status", + allowed_values=["301", "410"], + ) + + assert passed is False + assert "invalid values" in message + assert details["invalid_values"] == ["302"] + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["value"] == "302" + + +def test_check_allowed_values_passes_for_old_entity_status(tmp_path): + file_path = tmp_path / "old-entity.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["old-entity", "status", "entity"]) + writer.writerow(["1001", "301", "2001"]) + writer.writerow(["1002", "410", "2002"]) + + conn = duckdb.connect() + passed, message, details = check_allowed_values( + conn, + file_path=file_path, + field="status", + allowed_values=["301", "410"], + ) + + assert passed is True + assert details["invalid_rows"] == [] From 38bcf9e43de754e10cd0b053d0f2378435a84702 Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Fri, 27 Mar 2026 11:41:55 +0000 Subject: [PATCH 02/12] refactor: replace check_lookup_entities_are_within_organisation_ranges with check_field_is_within_range in CSV operations and tests --- digital_land/expectations/checkpoints/csv.py | 4 +- digital_land/expectations/operations/csv.py | 224 +++++++++++++----- .../expectations/operations/test_csv.py | 188 +++++++++++++-- 3 files changed, 332 insertions(+), 84 deletions(-) diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py index 15ebebd0..50394fc8 100644 --- a/digital_land/expectations/checkpoints/csv.py +++ b/digital_land/expectations/checkpoints/csv.py @@ -9,7 +9,7 @@ check_unique, check_no_shared_values, check_no_overlapping_ranges, - check_lookup_entities_are_within_organisation_ranges, + check_field_is_within_range, check_allowed_values ) @@ -26,7 +26,7 @@ def operation_factory(self, operation_string: str): "check_unique": check_unique, "check_no_shared_values": check_no_shared_values, "check_no_overlapping_ranges": check_no_overlapping_ranges, - "check_lookup_entities_are_within_organisation_ranges":check_lookup_entities_are_within_organisation_ranges, + "check_field_is_within_range":check_field_is_within_range, "check_allowed_values":check_allowed_values } if operation_string not in operation_map: diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index ddae091e..d3d2a693 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -5,6 +5,41 @@ def _read_csv(file_path: Path) -> str: return f"read_csv_auto('{str(file_path)}',all_varchar=true,delim=',',quote='\"',escape='\"')" +def _get_csv_columns(conn, file_path: Path) -> list: + """Get column names from CSV file.""" + return [col[0] for col in conn.execute( + f"SELECT * FROM {_read_csv(file_path)} LIMIT 0" + ).description] + + +def _build_exclude_clause(exclude: list) -> str: + """Build SQL NOT clause from exclude conditions. Each dict is AND group; list is OR between groups.""" + if not exclude: + return "" + exclude_conditions = [] + for exclude_dict in exclude: + and_parts = [] + for k, v in exclude_dict.items(): + cleaned = str(v).strip().replace("'", "''") + and_parts.append(f'"{k}" = \'{cleaned}\'') + if and_parts: + exclude_conditions.append(f"({' AND '.join(and_parts)})") + return f" AND NOT ({' OR '.join(exclude_conditions)})" if exclude_conditions else "" + + +def _build_key_sql(cols: list, prefix: str) -> tuple: + """Build SQL key SELECT and WHERE fragments. Returns (select, where_not_empty).""" + select = ",\n ".join( + f'TRIM(COALESCE("{col}", \'\')) AS {prefix}_key_{i}' + for i, col in enumerate(cols) + ) + where = "\n AND ".join( + f'TRIM(COALESCE("{col}", \'\')) != \'\'' + for col in cols + ) + return select, where + + def count_rows( conn, file_path: Path, expected: int, comparison_rule: str = "greater_than" ): @@ -212,77 +247,150 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list return passed, message, details -def check_lookup_entities_are_within_organisation_ranges( - conn, file_path: Path, organisation_file: Path, ignored_organisations: list = None +def check_field_is_within_range( + conn, + file_path: Path, + field: str, + external_file: Path, + min_field: str, + max_field: str, + join_on: dict = None, + exclude: list = None, ): """ - Checks that lookup entities are within any valid range from an organisation file. + Checks that a field's values are within any valid range from an external file. Args: conn: duckdb connection - file_path: path to the lookup CSV file - organisation_file: path to the entity-organisation CSV file - ignored_organisations: list of organisations to ignore (i.e. not check that their entities are within a valid range) + file_path: path to the CSV file containing the field to validate + external_file: path to the CSV file containing the ranges + min_field: the column name for the range minimum + max_field: the column name for the range maximum + field: the column name to validate + join_on: optional dict with keys {"file": [...], "external": [...]} specifying columns to match for range validation + exclude: optional list of dicts specifying row conditions to exclude from validation. Each dict is an AND group; the list is OR between groups. + Example: [{"prefix": "conservationarea", "organisation": "orgA"}, {"prefix": "conservationarea", "organisation": "orgB"}] """ - ignored_values = [ - org.replace("'", "''") - for org in (ignored_organisations or []) - if isinstance(org, str) and org.strip() - ] - ignored_clause = "" - if ignored_values: - ignored_values_sql = ",".join("'" + org + "'" for org in ignored_values) - ignored_clause = ( - " AND TRIM(COALESCE(\"organisation\", '')) NOT IN " - + f"({ignored_values_sql})" - ) - - result = conn.execute( - f""" - WITH ranges AS ( - SELECT - TRY_CAST("entity-minimum" AS BIGINT) AS min_entity, - TRY_CAST("entity-maximum" AS BIGINT) AS max_entity - FROM {_read_csv(organisation_file)} - WHERE TRY_CAST("entity-minimum" AS BIGINT) IS NOT NULL - AND TRY_CAST("entity-maximum" AS BIGINT) IS NOT NULL - ), - lookup_rows AS ( - SELECT - TRY_CAST("entity" AS BIGINT) AS entity, - TRIM(COALESCE("organisation", '')) AS organisation, - COALESCE("reference", '') AS reference - FROM {_read_csv(file_path)} - WHERE TRIM(COALESCE("organisation", '')) != '' - {ignored_clause} + file_cols_list = _get_csv_columns(conn, file_path) + external_cols_list = _get_csv_columns(conn, external_file) + exclude_clause = _build_exclude_clause(exclude) + + # Validate and extract join_on + file_cols = external_cols = None + if join_on is not None: + if not isinstance(join_on, dict): + raise ValueError("join_on must be a dictionary") + file_cols = join_on.get("file") + external_cols = join_on.get("external") + if file_cols is None or external_cols is None: + raise ValueError( + 'join_on must have keys "file" and "external" with column lists' + ) + if not file_cols or not external_cols: + raise ValueError( + 'join_on "file" and "external" lists must be non-empty' + ) + if len(file_cols) != len(external_cols): + raise ValueError( + 'join_on "file" and "external" lists must have the same length' + ) + for col in file_cols: + if col not in file_cols_list: + raise ValueError( + f"Column '{col}' not found in file. Available columns: {file_cols_list}" + ) + for col in external_cols: + if col not in external_cols_list: + raise ValueError( + f"Column '{col}' not found in external file. Available columns: {external_cols_list}" + ) + + # Simple range check without key matching + if join_on is None: + result = conn.execute( + f""" + WITH ranges AS ( + SELECT + TRY_CAST("{min_field}" AS BIGINT) AS min_value, + TRY_CAST("{max_field}" AS BIGINT) AS max_value + FROM {_read_csv(external_file)} + WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL + AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL + ), + lookup_rows AS ( + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + TRY_CAST("{field}" AS BIGINT) AS value + FROM {_read_csv(file_path)} + WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause} + ) + SELECT line_number, value + FROM lookup_rows l + WHERE value IS NOT NULL + AND NOT EXISTS ( + SELECT 1 + FROM ranges r + WHERE l.value BETWEEN r.min_value AND r.max_value + ) + """ + ).fetchall() + out_of_range_rows = [{"line_number": row[0], "value": row[1]} for row in result] + else: + # Key-matched range check + range_keys, range_empty = _build_key_sql(external_cols, "range") + lookup_keys, lookup_empty = _build_key_sql(file_cols, "lookup") + key_join = "\n AND ".join( + f"l.lookup_key_{i} = r.range_key_{i}" + for i in range(len(file_cols)) ) - SELECT entity, organisation, reference - FROM lookup_rows l - WHERE organisation != '' - AND entity IS NOT NULL - AND NOT EXISTS ( - SELECT 1 - FROM ranges r - WHERE l.entity BETWEEN r.min_entity AND r.max_entity - ) - """ - ).fetchall() - - out_of_range_rows = [ - {"entity": row[0], "organisation": row[1], "reference": row[2]} - for row in result - ] + key_proj = ", ".join(f"lookup_key_{i}" for i in range(len(file_cols))) + + result = conn.execute( + f""" + WITH ranges AS ( + SELECT + {range_keys}, + TRY_CAST("{min_field}" AS BIGINT) AS min_value, + TRY_CAST("{max_field}" AS BIGINT) AS max_value + FROM {_read_csv(external_file)} + WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL + AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL + AND {range_empty} + ), + lookup_rows AS ( + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + TRY_CAST("{field}" AS BIGINT) AS value, + {lookup_keys} + FROM {_read_csv(file_path)} + WHERE {lookup_empty} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause} + ) + SELECT line_number, value, {key_proj} + FROM lookup_rows l + WHERE value IS NOT NULL + AND NOT EXISTS ( + SELECT 1 + FROM ranges r + WHERE {key_join} + AND l.value BETWEEN r.min_value AND r.max_value + ) + """ + ).fetchall() + + out_of_range_rows = [] + for row in result: + invalid_row = {"line_number": row[0], field: row[1]} + for i, col_name in enumerate(file_cols): + invalid_row[col_name] = row[i + 2] + out_of_range_rows.append(invalid_row) if len(out_of_range_rows) == 0: passed = True - message = "all lookup entities are within allowed ranges" + message = f"all values in '{field}' are within allowed ranges" else: passed = False message = f"there were {len(out_of_range_rows)} out-of-range rows found" - details = { - "invalid_rows": out_of_range_rows, - } - + details = {"invalid_rows": out_of_range_rows} return passed, message, details diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 7cd57437..348b823c 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -8,7 +8,7 @@ check_no_shared_values, check_no_overlapping_ranges, check_allowed_values, - check_lookup_entities_are_within_organisation_ranges, + check_field_is_within_range, ) @@ -205,58 +205,68 @@ def test_check_no_overlapping_ranges_adjacent_fails(tmp_path): assert len(details["overlaps"]) == 1 -def test_check_lookup_entities_are_within_organisation_ranges_fails(tmp_path): +def test_check_field_is_within_ranges_fails(tmp_path): lookup_file = tmp_path / "lookup.csv" with open(lookup_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity", "organisation", "reference"]) - writer.writerow(["150", "org-1", "ok-ref"]) - writer.writerow(["999", "org-2", "bad-ref"]) + writer.writerow(["entity"]) + writer.writerow(["150"]) + writer.writerow(["999"]) organisation_file = tmp_path / "entity-organisation.csv" with open(organisation_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity-minimum", "entity-maximum", "organisation"]) - writer.writerow(["100", "200", "org-1"]) - writer.writerow(["300", "400", "org-2"]) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + writer.writerow(["300", "400"]) conn = duckdb.connect() - passed, message, details = check_lookup_entities_are_within_organisation_ranges( - conn, file_path=lookup_file, organisation_file=organisation_file + passed, message, details = check_field_is_within_range( + conn, + file_path=lookup_file, + external_file=organisation_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", ) assert passed is False assert "out-of-range" in message assert len(details["invalid_rows"]) == 1 - assert details["invalid_rows"][0]["entity"] == 999 - assert details["invalid_rows"][0]["organisation"] == "org-2" + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["value"] == 999 -def test_check_lookup_entities_are_within_organisation_ranges_ignores_org(tmp_path): +def test_check_field_is_within_ranges_ignores_org(tmp_path): lookup_file = tmp_path / "lookup.csv" with open(lookup_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity", "organisation", "reference"]) - writer.writerow(["150", "org-1", "ok-ref"]) - writer.writerow(["999", "org-2", "ignored-ref"]) + writer.writerow(["entity"]) + writer.writerow(["150"]) + writer.writerow(["250"]) organisation_file = tmp_path / "entity-organisation.csv" with open(organisation_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity-minimum", "entity-maximum", "organisation"]) - writer.writerow(["100", "200", "org-1"]) - writer.writerow(["300", "400", "org-2"]) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + writer.writerow(["300", "400"]) conn = duckdb.connect() - passed, message, details = check_lookup_entities_are_within_organisation_ranges( + # Test without match_fields - simple range check + passed, message, details = check_field_is_within_range( conn, file_path=lookup_file, - organisation_file=organisation_file, - ignored_organisations=["org-2"], + external_file=organisation_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", ) - assert passed is True - assert details["invalid_rows"] == [] + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["value"] == 250 def test_check_allowed_values_fails_for_old_entity_status(tmp_path): @@ -301,3 +311,133 @@ def test_check_allowed_values_passes_for_old_entity_status(tmp_path): assert passed is True assert details["invalid_rows"] == [] + + +def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "prefix", "organisation", "reference"]) + writer.writerow(["150", "dataset-a", "org-1", "ok-ref"]) + writer.writerow(["250", "dataset-a", "org-1", "bad-ref"]) + writer.writerow(["999", "dataset-a", "org-2", "other-org-ref"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["dataset", "organisation", "entity-minimum", "entity-maximum"]) + writer.writerow(["dataset-a", "org-1", "100", "200"]) + writer.writerow(["dataset-a", "org-2", "900", "1000"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]}, + ) + + assert passed is False + assert "out-of-range" in message + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["entity"] == 250 + assert details["invalid_rows"][0]["prefix"] == "dataset-a" + assert details["invalid_rows"][0]["organisation"] == "org-1" + + +def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "prefix", "organisation", "reference"]) + writer.writerow(["150", "dataset-a", "org-1", "ok-ref"]) + writer.writerow(["950", "dataset-a", "org-2", "ok-ref-2"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["dataset", "organisation", "entity-minimum", "entity-maximum"]) + writer.writerow(["dataset-a", "org-1", "100", "200"]) + writer.writerow(["dataset-a", "org-2", "900", "1000"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]}, + ) + + +def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path): + file_path = tmp_path / "lookup_custom.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity_value", "dataset_key", "org_key", "ref_code"]) + writer.writerow(["55", "dataset-x", "org-a", "ok-ref"]) + writer.writerow(["250", "dataset-x", "org-a", "bad-ref"]) + + external_file = tmp_path / "ranges_custom.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["dataset_name", "org_name", "entity-minimum", "entity-maximum"]) + writer.writerow(["dataset-x", "org-a", "50", "100"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity_value", + join_on={"file": ["dataset_key", "org_key"], "external": ["dataset_name", "org_name"]}, + ) + + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["entity_value"] == 250 + assert details["invalid_rows"][0]["dataset_key"] == "dataset-x" + assert details["invalid_rows"][0]["org_key"] == "org-a" + + +def test_check_field_is_within_ranges_excludes_rows(tmp_path): + """Test that exclude skips rows matching specified field conditions during validation.""" + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "status"]) + writer.writerow(["150", "active"]) + writer.writerow(["250", "active"]) # out of range but not excluded + writer.writerow(["350", "inactive"]) # out of range but excluded + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + exclude=[{"status": "inactive"}], + ) + + # Should fail due to entity 250 which is out of range and not excluded + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["value"] == 250 + assert details["invalid_rows"][0]["line_number"] == 3 From 3089bc8723075bc8db93b2a6d9fbc25aaf29dbca Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Fri, 27 Mar 2026 13:53:10 +0000 Subject: [PATCH 03/12] check_field_is_within_range to support structured rules for filtering and matching --- digital_land/expectations/operations/csv.py | 183 +++++++++++++----- .../expectations/operations/test_csv.py | 139 ++++++++++++- 2 files changed, 261 insertions(+), 61 deletions(-) diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index d3d2a693..13e88a67 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -12,32 +12,91 @@ def _get_csv_columns(conn, file_path: Path) -> list: ).description] -def _build_exclude_clause(exclude: list) -> str: - """Build SQL NOT clause from exclude conditions. Each dict is AND group; list is OR between groups.""" - if not exclude: +def _sql_string(value) -> str: + cleaned = str(value).strip().replace("'", "''") + return f"'{cleaned}'" + + +def _normalize_condition_groups(conditions, name: str) -> list: + if conditions is None: + return [] + if isinstance(conditions, dict): + return [conditions] + if isinstance(conditions, list): + return conditions + raise ValueError(f"{name} must be a dict, list of dicts, or None") + + +def _build_field_condition(field_name: str, spec) -> str: + if isinstance(spec, dict): + op = str(spec.get("op", spec.get("operation", ""))).strip().lower() + value = spec.get("value") + if not op: + raise ValueError( + f"Condition for '{field_name}' must include 'op' when using dict format" + ) + else: + op = "=" + value = spec + + if op in ("=", "=="): + return f'"{field_name}" = {_sql_string(value)}' + if op in ("!=", "<>"): + return f'"{field_name}" != {_sql_string(value)}' + if op in ("in", "not in"): + if not isinstance(value, (list, tuple, set)) or len(value) == 0: + raise ValueError( + f"Condition for '{field_name}' with op '{op}' must use a non-empty list" + ) + values_sql = ", ".join(_sql_string(item) for item in value) + return f'"{field_name}" {op.upper()} ({values_sql})' + + raise ValueError( + f"Unsupported operator '{op}' for field '{field_name}'. Supported: =, !=, in, not in" + ) + + +def _build_condition_group(group: dict, file_columns: list) -> str: + if not isinstance(group, dict) or not group: + raise ValueError("Each condition group must be a non-empty dict") + + parts = [] + for field_name, spec in group.items(): + if field_name not in file_columns: + raise ValueError( + f"Column '{field_name}' not found in file. Available columns: {file_columns}" + ) + parts.append(_build_field_condition(field_name, spec)) + + return f"({' AND '.join(parts)})" + + +def _build_filter_clause(filter_spec, file_columns: list, name: str) -> str: + """Build SQL clause that keeps rows matching structured conditions.""" + groups = _normalize_condition_groups(filter_spec, name) + if not groups: return "" - exclude_conditions = [] - for exclude_dict in exclude: - and_parts = [] - for k, v in exclude_dict.items(): - cleaned = str(v).strip().replace("'", "''") - and_parts.append(f'"{k}" = \'{cleaned}\'') - if and_parts: - exclude_conditions.append(f"({' AND '.join(and_parts)})") - return f" AND NOT ({' OR '.join(exclude_conditions)})" if exclude_conditions else "" - - -def _build_key_sql(cols: list, prefix: str) -> tuple: - """Build SQL key SELECT and WHERE fragments. Returns (select, where_not_empty).""" - select = ",\n ".join( - f'TRIM(COALESCE("{col}", \'\')) AS {prefix}_key_{i}' - for i, col in enumerate(cols) + clauses = [_build_condition_group(group, file_columns) for group in groups] + return f" AND ({' OR '.join(clauses)})" + + +def _build_match_column_sql_parts(columns: list, alias_prefix: str) -> tuple: + """Build SQL fragments for match-key columns. + + Returns: + tuple[str, str]: + - SELECT projection fragment with normalized key aliases. + - WHERE fragment ensuring key columns are non-empty. + """ + select_fragment = ",\n ".join( + f'TRIM(COALESCE("{column}", \'\')) AS {alias_prefix}_key_{i}' + for i, column in enumerate(columns) ) - where = "\n AND ".join( - f'TRIM(COALESCE("{col}", \'\')) != \'\'' - for col in cols + non_empty_filter_fragment = "\n AND ".join( + f'TRIM(COALESCE("{column}", \'\')) != \'\'' + for column in columns ) - return select, where + return select_fragment, non_empty_filter_fragment def count_rows( @@ -254,8 +313,7 @@ def check_field_is_within_range( external_file: Path, min_field: str, max_field: str, - join_on: dict = None, - exclude: list = None, + rules: dict = None, ): """ Checks that a field's values are within any valid range from an external file. @@ -267,32 +325,49 @@ def check_field_is_within_range( min_field: the column name for the range minimum max_field: the column name for the range maximum field: the column name to validate - join_on: optional dict with keys {"file": [...], "external": [...]} specifying columns to match for range validation - exclude: optional list of dicts specifying row conditions to exclude from validation. Each dict is an AND group; the list is OR between groups. - Example: [{"prefix": "conservationarea", "organisation": "orgA"}, {"prefix": "conservationarea", "organisation": "orgB"}] + rules: optional dict that controls subset selection and key matching. + Supported keys: + - lookup_rules: dict or list[dict] of structured conditions for file_path rows. + - match_columns: dict with keys {"lookup": [...], "range": [...]} specifying columns to match. + lookup columns come from file_path (the rows being validated). + range columns come from external_file (the rows providing valid ranges). + Examples: + {"lookup_rules": {"prefix": "conservationarea"}} + {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}} + {"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}} + Use operators like != and not in when you want to exclude rows. """ file_cols_list = _get_csv_columns(conn, file_path) external_cols_list = _get_csv_columns(conn, external_file) - exclude_clause = _build_exclude_clause(exclude) + rules = rules or {} + if not isinstance(rules, dict): + raise ValueError("rules must be a dictionary or None") + + lookup_clause = _build_filter_clause( + rules.get("lookup_rules"), + file_cols_list, + "rules.lookup_rules", + ) - # Validate and extract join_on + # Validate and extract match_columns file_cols = external_cols = None - if join_on is not None: - if not isinstance(join_on, dict): - raise ValueError("join_on must be a dictionary") - file_cols = join_on.get("file") - external_cols = join_on.get("external") + match_columns = rules.get("match_columns") + if match_columns is not None: + if not isinstance(match_columns, dict): + raise ValueError("rules.match_columns must be a dictionary") + file_cols = match_columns.get("lookup") + external_cols = match_columns.get("range") if file_cols is None or external_cols is None: raise ValueError( - 'join_on must have keys "file" and "external" with column lists' + 'rules.match_columns must have keys "lookup" and "range" with column lists' ) if not file_cols or not external_cols: raise ValueError( - 'join_on "file" and "external" lists must be non-empty' + 'rules.match_columns "lookup" and "range" lists must be non-empty' ) if len(file_cols) != len(external_cols): raise ValueError( - 'join_on "file" and "external" lists must have the same length' + 'rules.match_columns "lookup" and "range" lists must have the same length' ) for col in file_cols: if col not in file_cols_list: @@ -306,7 +381,7 @@ def check_field_is_within_range( ) # Simple range check without key matching - if join_on is None: + if match_columns is None: result = conn.execute( f""" WITH ranges AS ( @@ -322,7 +397,7 @@ def check_field_is_within_range( ROW_NUMBER() OVER () + 1 AS line_number, TRY_CAST("{field}" AS BIGINT) AS value FROM {_read_csv(file_path)} - WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause} + WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause} ) SELECT line_number, value FROM lookup_rows l @@ -337,41 +412,47 @@ def check_field_is_within_range( out_of_range_rows = [{"line_number": row[0], "value": row[1]} for row in result] else: # Key-matched range check - range_keys, range_empty = _build_key_sql(external_cols, "range") - lookup_keys, lookup_empty = _build_key_sql(file_cols, "lookup") - key_join = "\n AND ".join( + range_key_select_sql, range_keys_non_empty_sql = _build_match_column_sql_parts( + external_cols, "range" + ) + lookup_key_select_sql, lookup_keys_non_empty_sql = _build_match_column_sql_parts( + file_cols, "lookup" + ) + key_match_condition_sql = "\n AND ".join( f"l.lookup_key_{i} = r.range_key_{i}" for i in range(len(file_cols)) ) - key_proj = ", ".join(f"lookup_key_{i}" for i in range(len(file_cols))) + key_projection_sql = ", ".join( + f"lookup_key_{i}" for i in range(len(file_cols)) + ) result = conn.execute( f""" WITH ranges AS ( SELECT - {range_keys}, + {range_key_select_sql}, TRY_CAST("{min_field}" AS BIGINT) AS min_value, TRY_CAST("{max_field}" AS BIGINT) AS max_value FROM {_read_csv(external_file)} WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL - AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL - AND {range_empty} + AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL + AND {range_keys_non_empty_sql} ), lookup_rows AS ( SELECT ROW_NUMBER() OVER () + 1 AS line_number, TRY_CAST("{field}" AS BIGINT) AS value, - {lookup_keys} + {lookup_key_select_sql} FROM {_read_csv(file_path)} - WHERE {lookup_empty} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause} + WHERE {lookup_keys_non_empty_sql} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause} ) - SELECT line_number, value, {key_proj} + SELECT line_number, value, {key_projection_sql} FROM lookup_rows l WHERE value IS NOT NULL AND NOT EXISTS ( SELECT 1 FROM ranges r - WHERE {key_join} + WHERE {key_match_condition_sql} AND l.value BETWEEN r.min_value AND r.max_value ) """ diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 348b823c..424fb096 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -337,7 +337,7 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_ min_field="entity-minimum", max_field="entity-maximum", field="entity", - join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]}, + rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}}, ) assert passed is False @@ -372,7 +372,7 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp min_field="entity-minimum", max_field="entity-maximum", field="entity", - join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]}, + rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}}, ) @@ -398,7 +398,7 @@ def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path): min_field="entity-minimum", max_field="entity-maximum", field="entity_value", - join_on={"file": ["dataset_key", "org_key"], "external": ["dataset_name", "org_name"]}, + rules={"match_columns": {"lookup": ["dataset_key", "org_key"], "range": ["dataset_name", "org_name"]}}, ) assert passed is False @@ -409,15 +409,15 @@ def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path): assert details["invalid_rows"][0]["org_key"] == "org-a" -def test_check_field_is_within_ranges_excludes_rows(tmp_path): - """Test that exclude skips rows matching specified field conditions during validation.""" +def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path): + """Test filtering rows with lookup_rules during validation.""" file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["entity", "status"]) writer.writerow(["150", "active"]) - writer.writerow(["250", "active"]) # out of range but not excluded - writer.writerow(["350", "inactive"]) # out of range but excluded + writer.writerow(["250", "active"]) + writer.writerow(["350", "inactive"]) external_file = tmp_path / "ranges.csv" with open(external_file, "w", newline="") as f: @@ -433,11 +433,130 @@ def test_check_field_is_within_ranges_excludes_rows(tmp_path): min_field="entity-minimum", max_field="entity-maximum", field="entity", - exclude=[{"status": "inactive"}], + rules={"lookup_rules": {"status": "active"}}, ) - - # Should fail due to entity 250 which is out of range and not excluded assert passed is False assert len(details["invalid_rows"]) == 1 assert details["invalid_rows"][0]["value"] == 250 assert details["invalid_rows"][0]["line_number"] == 3 + + +def test_check_field_is_within_ranges_lookup_rules_operator_eq_shape(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "prefix"]) + writer.writerow(["150", "conservationarea"]) + writer.writerow(["350", "other"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + rules={"lookup_rules": {"prefix": {"op": "=", "value": "conservationarea"}}}, + ) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_field_is_within_ranges_lookup_rules_exact_match(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "prefix"]) + writer.writerow(["150", "conservationarea"]) + writer.writerow(["350", "other"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + rules={"lookup_rules": {"prefix": "conservationarea"}}, + ) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "organisation"]) + writer.writerow(["150", "org-a"]) + writer.writerow(["350", "org-b"]) + writer.writerow(["350", "org-c"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + rules={"lookup_rules": {"organisation": {"op": "in", "value": ["org-a", "org-b"]}}}, + ) + + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["value"] == 350 + +def test_check_field_is_within_ranges_for_only_staus_301(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "status"]) + writer.writerow(["150", "301"]) + writer.writerow(["250", "301"]) + writer.writerow(["350", "410"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + + conn = duckdb.connect() + passed, message, details = check_field_is_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity", + rules={"lookup_rules": {"status": {"op": "=", "value": "301"}}}, + ) + + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["value"] == 250 From 3be0e3230d81aef7918834e2ba7e9b3f0d4e874d Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Fri, 27 Mar 2026 15:56:03 +0000 Subject: [PATCH 04/12] refactor: update CSV validation functions to improve range checks and add dataset organization matching --- digital_land/expectations/checkpoints/csv.py | 6 +- digital_land/expectations/operations/csv.py | 400 +++++++++++------- .../expectations/operations/test_csv.py | 90 ++-- 3 files changed, 324 insertions(+), 172 deletions(-) diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py index 50394fc8..b2258cb9 100644 --- a/digital_land/expectations/checkpoints/csv.py +++ b/digital_land/expectations/checkpoints/csv.py @@ -9,7 +9,8 @@ check_unique, check_no_shared_values, check_no_overlapping_ranges, - check_field_is_within_range, + check_fields_are_within_range, + check_field_is_within_range_by_dataset_org, check_allowed_values ) @@ -26,7 +27,8 @@ def operation_factory(self, operation_string: str): "check_unique": check_unique, "check_no_shared_values": check_no_shared_values, "check_no_overlapping_ranges": check_no_overlapping_ranges, - "check_field_is_within_range":check_field_is_within_range, + "check_fields_are_within_range": check_fields_are_within_range, + "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org, "check_allowed_values":check_allowed_values } if operation_string not in operation_map: diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 13e88a67..dfb9d949 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -17,6 +17,10 @@ def _sql_string(value) -> str: return f"'{cleaned}'" +def _sql_identifier(name: str) -> str: + return '"' + str(name).replace('"', '""') + '"' + + def _normalize_condition_groups(conditions, name: str) -> list: if conditions is None: return [] @@ -80,23 +84,61 @@ def _build_filter_clause(filter_spec, file_columns: list, name: str) -> str: return f" AND ({' OR '.join(clauses)})" -def _build_match_column_sql_parts(columns: list, alias_prefix: str) -> tuple: - """Build SQL fragments for match-key columns. +def _normalize_fields_for_validation(field_spec, file_columns: list) -> list: + """Normalize a field spec into a list of column names to validate.""" + if isinstance(field_spec, str): + fields = [item.strip() for item in field_spec.split(",") if item.strip()] + elif isinstance(field_spec, (list, tuple, set)): + fields = [str(item).strip() for item in field_spec if str(item).strip()] + else: + raise ValueError("field must be a string, comma-separated string, or list of strings") - Returns: - tuple[str, str]: - - SELECT projection fragment with normalized key aliases. - - WHERE fragment ensuring key columns are non-empty. - """ - select_fragment = ",\n ".join( - f'TRIM(COALESCE("{column}", \'\')) AS {alias_prefix}_key_{i}' - for i, column in enumerate(columns) - ) - non_empty_filter_fragment = "\n AND ".join( - f'TRIM(COALESCE("{column}", \'\')) != \'\'' - for column in columns - ) - return select_fragment, non_empty_filter_fragment + if not fields: + raise ValueError("field must include at least one column name") + + seen = set() + normalized_fields = [] + for field_name in fields: + if field_name not in seen: + seen.add(field_name) + normalized_fields.append(field_name) + + missing_fields = [field_name for field_name in normalized_fields if field_name not in file_columns] + if missing_fields: + raise ValueError( + f"Column(s) {missing_fields} not found in file. Available columns: {file_columns}" + ) + + return normalized_fields + + +def _build_range_invalid_rows( + result: list, + validating_multiple_fields: bool, + has_match_columns: bool, + lookup_match_columns: list = None, +) -> list: + """Format query rows into expectation invalid_rows shape.""" + out_of_range_rows = [] + + for row in result: + field_name = row[1] + + if has_match_columns: + if validating_multiple_fields: + invalid_row = {"line_number": row[0], "field": field_name, "value": row[2]} + else: + invalid_row = {"line_number": row[0], field_name: row[2]} + for i, col_name in enumerate(lookup_match_columns): + invalid_row[col_name] = row[i + 3] + else: + invalid_row = {"line_number": row[0], "value": row[2]} + if validating_multiple_fields: + invalid_row["field"] = field_name + + out_of_range_rows.append(invalid_row) + + return out_of_range_rows def count_rows( @@ -306,7 +348,7 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list return passed, message, details -def check_field_is_within_range( +def check_fields_are_within_range( conn, file_path: Path, field: str, @@ -316,154 +358,222 @@ def check_field_is_within_range( rules: dict = None, ): """ - Checks that a field's values are within any valid range from an external file. + Check that one or more lookup fields are within ranges from an external file. Args: conn: duckdb connection - file_path: path to the CSV file containing the field to validate - external_file: path to the CSV file containing the ranges + file_path: path to the CSV file containing fields to validate + field: column name(s) to validate. + You can pass a single name ("entity") or a comma-separated list + ("entity, end-entity"). All specified fields must be within range. + external_file: path to the CSV file containing valid ranges min_field: the column name for the range minimum max_field: the column name for the range maximum - field: the column name to validate - rules: optional dict that controls subset selection and key matching. - Supported keys: - - lookup_rules: dict or list[dict] of structured conditions for file_path rows. - - match_columns: dict with keys {"lookup": [...], "range": [...]} specifying columns to match. - lookup columns come from file_path (the rows being validated). - range columns come from external_file (the rows providing valid ranges). - Examples: - {"lookup_rules": {"prefix": "conservationarea"}} - {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}} - {"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}} - Use operators like != and not in when you want to exclude rows. + rules: optional dict controlling subset selection on lookup rows. + Supported keys: + - lookup_rules: dict or list[dict] of structured conditions. + Fields in one dict are AND'ed; multiple dicts are OR'ed. + Examples: + {"lookup_rules": {"prefix": "conservationarea"}} + {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}} + Use operators like != and not in when you want to exclude rows. """ - file_cols_list = _get_csv_columns(conn, file_path) - external_cols_list = _get_csv_columns(conn, external_file) + file_columns = _get_csv_columns(conn, file_path) rules = rules or {} if not isinstance(rules, dict): raise ValueError("rules must be a dictionary or None") lookup_clause = _build_filter_clause( rules.get("lookup_rules"), - file_cols_list, + file_columns, "rules.lookup_rules", ) - # Validate and extract match_columns - file_cols = external_cols = None - match_columns = rules.get("match_columns") - if match_columns is not None: - if not isinstance(match_columns, dict): - raise ValueError("rules.match_columns must be a dictionary") - file_cols = match_columns.get("lookup") - external_cols = match_columns.get("range") - if file_cols is None or external_cols is None: - raise ValueError( - 'rules.match_columns must have keys "lookup" and "range" with column lists' - ) - if not file_cols or not external_cols: - raise ValueError( - 'rules.match_columns "lookup" and "range" lists must be non-empty' - ) - if len(file_cols) != len(external_cols): - raise ValueError( - 'rules.match_columns "lookup" and "range" lists must have the same length' - ) - for col in file_cols: - if col not in file_cols_list: - raise ValueError( - f"Column '{col}' not found in file. Available columns: {file_cols_list}" - ) - for col in external_cols: - if col not in external_cols_list: - raise ValueError( - f"Column '{col}' not found in external file. Available columns: {external_cols_list}" - ) - - # Simple range check without key matching - if match_columns is None: - result = conn.execute( - f""" - WITH ranges AS ( - SELECT - TRY_CAST("{min_field}" AS BIGINT) AS min_value, - TRY_CAST("{max_field}" AS BIGINT) AS max_value - FROM {_read_csv(external_file)} - WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL - AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL - ), - lookup_rows AS ( - SELECT - ROW_NUMBER() OVER () + 1 AS line_number, - TRY_CAST("{field}" AS BIGINT) AS value - FROM {_read_csv(file_path)} - WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause} - ) - SELECT line_number, value - FROM lookup_rows l - WHERE value IS NOT NULL - AND NOT EXISTS ( - SELECT 1 - FROM ranges r - WHERE l.value BETWEEN r.min_value AND r.max_value - ) - """ - ).fetchall() - out_of_range_rows = [{"line_number": row[0], "value": row[1]} for row in result] - else: - # Key-matched range check - range_key_select_sql, range_keys_non_empty_sql = _build_match_column_sql_parts( - external_cols, "range" + fields_to_validate = _normalize_fields_for_validation(field, file_columns) + validating_multiple_fields = len(fields_to_validate) > 1 + lookup_values_sql = ",\n ".join( + f"({i}, {_sql_string(field_name)}, TRY_CAST(src.{_sql_identifier(field_name)} AS BIGINT))" + for i, field_name in enumerate(fields_to_validate) + ) + + result = conn.execute( + f""" + WITH ranges AS ( + SELECT + TRY_CAST("{min_field}" AS BIGINT) AS min_value, + TRY_CAST("{max_field}" AS BIGINT) AS max_value + FROM {_read_csv(external_file)} + WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL + AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL + ), + source_rows AS ( + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + * + FROM {_read_csv(file_path)} + ), + lookup_rows AS ( + SELECT + src.line_number, + fields.field_order, + fields.field_name, + fields.value + FROM source_rows src + CROSS JOIN LATERAL ( + VALUES + {lookup_values_sql} + ) AS fields(field_order, field_name, value) + WHERE fields.value IS NOT NULL{lookup_clause} ) - lookup_key_select_sql, lookup_keys_non_empty_sql = _build_match_column_sql_parts( - file_cols, "lookup" + SELECT + line_number, + field_name, + value + FROM lookup_rows l + WHERE NOT EXISTS ( + SELECT 1 + FROM ranges r + WHERE l.value BETWEEN r.min_value AND r.max_value ) - key_match_condition_sql = "\n AND ".join( - f"l.lookup_key_{i} = r.range_key_{i}" - for i in range(len(file_cols)) + ORDER BY field_order, line_number + """ + ).fetchall() + + out_of_range_rows = _build_range_invalid_rows( + result=result, + validating_multiple_fields=validating_multiple_fields, + has_match_columns=False, + ) + + if len(out_of_range_rows) == 0: + passed = True + message = f"all values in '{field}' are within allowed ranges" + else: + passed = False + message = f"there were {len(out_of_range_rows)} out-of-range rows found" + + details = {"invalid_rows": out_of_range_rows} + return passed, message, details + + +def check_field_is_within_range_by_dataset_org( + conn, + file_path: Path, + field: str, + external_file: Path, + min_field: str, + max_field: str, + lookup_dataset_field: str, + range_dataset_field: str, + rules: dict = None, +): + """ + Check field values are within ranges matched by dataset field and organisation. + + Matching is fixed to two keys: + 1. lookup_dataset_field -> range_dataset_field + 2. organisation -> organisation + + Args: + conn: duckdb connection + file_path: path to the CSV file containing fields to validate + field: single column name to validate (for example: "entity"). + external_file: path to the CSV file containing valid ranges + min_field: the column name for the range minimum + max_field: the column name for the range maximum + lookup_dataset_field: dataset column name in file_path + range_dataset_field: dataset column name in external_file + rules: optional dict controlling subset selection on lookup rows. + Supported keys: + - lookup_rules: dict or list[dict] of structured conditions. + Fields in one dict are AND'ed; multiple dicts are OR'ed. + Examples: + {"lookup_rules": {"prefix": "conservationarea"}} + {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}} + Use operators like != and not in when you want to exclude rows. + """ + file_columns = _get_csv_columns(conn, file_path) + rules = rules or {} + if not isinstance(rules, dict): + raise ValueError("rules must be a dictionary or None") + + lookup_clause = _build_filter_clause( + rules.get("lookup_rules"), + file_columns, + "rules.lookup_rules", + ) + + fields_to_validate = _normalize_fields_for_validation(field, file_columns) + if len(fields_to_validate) != 1: + raise ValueError("field must be a single column name") + field_name = fields_to_validate[0] + + lookup_dataset_name = str(lookup_dataset_field).strip() + range_dataset_name = str(range_dataset_field).strip() + lookup_match_columns = [lookup_dataset_name, "organisation"] + + lookup_dataset_col = _sql_identifier(lookup_dataset_name) + lookup_org_col = _sql_identifier("organisation") + range_dataset_col = _sql_identifier(range_dataset_name) + range_org_col = _sql_identifier("organisation") + min_col = _sql_identifier(min_field) + max_col = _sql_identifier(max_field) + value_col = _sql_identifier(field_name) + + result = conn.execute( + f""" + WITH ranges AS ( + SELECT + TRY_CAST({min_col} AS BIGINT) AS min_value, + TRY_CAST({max_col} AS BIGINT) AS max_value, + TRIM(COALESCE({range_dataset_col}, '')) AS range_key_0, + TRIM(COALESCE({range_org_col}, '')) AS range_key_1 + FROM {_read_csv(external_file)} + WHERE TRY_CAST({min_col} AS BIGINT) IS NOT NULL + AND TRY_CAST({max_col} AS BIGINT) IS NOT NULL + AND TRIM(COALESCE({range_dataset_col}, '')) != '' + AND TRIM(COALESCE({range_org_col}, '')) != '' + ), + source_rows AS ( + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + * + FROM {_read_csv(file_path)} + ), + lookup_rows AS ( + SELECT + src.line_number, + TRY_CAST(src.{value_col} AS BIGINT) AS value, + TRIM(COALESCE(src.{lookup_dataset_col}, '')) AS lookup_key_0, + TRIM(COALESCE(src.{lookup_org_col}, '')) AS lookup_key_1 + FROM source_rows src + WHERE TRY_CAST(src.{value_col} AS BIGINT) IS NOT NULL + AND TRIM(COALESCE(src.{lookup_dataset_col}, '')) != '' + AND TRIM(COALESCE(src.{lookup_org_col}, '')) != ''{lookup_clause} ) - key_projection_sql = ", ".join( - f"lookup_key_{i}" for i in range(len(file_cols)) + SELECT + line_number, + value, + lookup_key_0, + lookup_key_1 + FROM lookup_rows l + WHERE NOT EXISTS ( + SELECT 1 + FROM ranges r + WHERE l.value BETWEEN r.min_value AND r.max_value + AND l.lookup_key_0 = r.range_key_0 + AND l.lookup_key_1 = r.range_key_1 ) + ORDER BY line_number + """ + ).fetchall() - result = conn.execute( - f""" - WITH ranges AS ( - SELECT - {range_key_select_sql}, - TRY_CAST("{min_field}" AS BIGINT) AS min_value, - TRY_CAST("{max_field}" AS BIGINT) AS max_value - FROM {_read_csv(external_file)} - WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL - AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL - AND {range_keys_non_empty_sql} - ), - lookup_rows AS ( - SELECT - ROW_NUMBER() OVER () + 1 AS line_number, - TRY_CAST("{field}" AS BIGINT) AS value, - {lookup_key_select_sql} - FROM {_read_csv(file_path)} - WHERE {lookup_keys_non_empty_sql} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause} - ) - SELECT line_number, value, {key_projection_sql} - FROM lookup_rows l - WHERE value IS NOT NULL - AND NOT EXISTS ( - SELECT 1 - FROM ranges r - WHERE {key_match_condition_sql} - AND l.value BETWEEN r.min_value AND r.max_value - ) - """ - ).fetchall() - - out_of_range_rows = [] - for row in result: - invalid_row = {"line_number": row[0], field: row[1]} - for i, col_name in enumerate(file_cols): - invalid_row[col_name] = row[i + 2] - out_of_range_rows.append(invalid_row) + out_of_range_rows = [] + for row in result: + invalid_row = {"line_number": row[0], field_name: row[1]} + for i, col_name in enumerate(lookup_match_columns): + invalid_row[col_name] = row[i + 2] + out_of_range_rows.append(invalid_row) if len(out_of_range_rows) == 0: passed = True diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 424fb096..176f02bc 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -8,7 +8,8 @@ check_no_shared_values, check_no_overlapping_ranges, check_allowed_values, - check_field_is_within_range, + check_fields_are_within_range, + check_field_is_within_range_by_dataset_org, ) @@ -221,7 +222,7 @@ def test_check_field_is_within_ranges_fails(tmp_path): writer.writerow(["300", "400"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=lookup_file, external_file=organisation_file, @@ -254,7 +255,7 @@ def test_check_field_is_within_ranges_ignores_org(tmp_path): conn = duckdb.connect() # Test without match_fields - simple range check - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=lookup_file, external_file=organisation_file, @@ -313,7 +314,7 @@ def test_check_allowed_values_passes_for_old_entity_status(tmp_path): assert details["invalid_rows"] == [] -def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_path): +def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(tmp_path): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) @@ -330,14 +331,15 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_ writer.writerow(["dataset-a", "org-2", "900", "1000"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_field_is_within_range_by_dataset_org( conn, file_path=file_path, external_file=external_file, min_field="entity-minimum", max_field="entity-maximum", field="entity", - rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}}, + lookup_dataset_field="prefix", + range_dataset_field="dataset", ) assert passed is False @@ -349,7 +351,7 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_ assert details["invalid_rows"][0]["organisation"] == "org-1" -def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp_path): +def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_passes(tmp_path): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) @@ -365,40 +367,42 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp writer.writerow(["dataset-a", "org-2", "900", "1000"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_field_is_within_range_by_dataset_org( conn, file_path=file_path, external_file=external_file, min_field="entity-minimum", max_field="entity-maximum", field="entity", - rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}}, + lookup_dataset_field="prefix", + range_dataset_field="dataset", ) -def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path): +def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_names(tmp_path): file_path = tmp_path / "lookup_custom.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity_value", "dataset_key", "org_key", "ref_code"]) + writer.writerow(["entity_value", "dataset_key", "organisation", "ref_code"]) writer.writerow(["55", "dataset-x", "org-a", "ok-ref"]) writer.writerow(["250", "dataset-x", "org-a", "bad-ref"]) external_file = tmp_path / "ranges_custom.csv" with open(external_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["dataset_name", "org_name", "entity-minimum", "entity-maximum"]) + writer.writerow(["dataset_name", "organisation", "entity-minimum", "entity-maximum"]) writer.writerow(["dataset-x", "org-a", "50", "100"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_field_is_within_range_by_dataset_org( conn, file_path=file_path, external_file=external_file, min_field="entity-minimum", max_field="entity-maximum", field="entity_value", - rules={"match_columns": {"lookup": ["dataset_key", "org_key"], "range": ["dataset_name", "org_name"]}}, + lookup_dataset_field="dataset_key", + range_dataset_field="dataset_name", ) assert passed is False @@ -406,7 +410,7 @@ def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path): assert details["invalid_rows"][0]["line_number"] == 3 assert details["invalid_rows"][0]["entity_value"] == 250 assert details["invalid_rows"][0]["dataset_key"] == "dataset-x" - assert details["invalid_rows"][0]["org_key"] == "org-a" + assert details["invalid_rows"][0]["organisation"] == "org-a" def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path): @@ -426,7 +430,7 @@ def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path): writer.writerow(["100", "200"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=file_path, external_file=external_file, @@ -456,7 +460,7 @@ def test_check_field_is_within_ranges_lookup_rules_operator_eq_shape(tmp_path): writer.writerow(["100", "200"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=file_path, external_file=external_file, @@ -485,7 +489,7 @@ def test_check_field_is_within_ranges_lookup_rules_exact_match(tmp_path): writer.writerow(["100", "200"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=file_path, external_file=external_file, @@ -515,7 +519,7 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path): writer.writerow(["100", "200"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=file_path, external_file=external_file, @@ -530,14 +534,49 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path): assert details["invalid_rows"][0]["line_number"] == 3 assert details["invalid_rows"][0]["value"] == 350 + +def test_check_field_is_within_ranges_comma_separated_fields(tmp_path): + file_path = tmp_path / "lookup.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "end-entity"]) + writer.writerow(["150", "160"]) + writer.writerow(["150", "350"]) + writer.writerow(["350", "150"]) + + external_file = tmp_path / "ranges.csv" + with open(external_file, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity-minimum", "entity-maximum"]) + writer.writerow(["100", "200"]) + + conn = duckdb.connect() + passed, message, details = check_fields_are_within_range( + conn, + file_path=file_path, + external_file=external_file, + min_field="entity-minimum", + max_field="entity-maximum", + field="entity, end-entity", + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + assert details["invalid_rows"][0]["line_number"] == 4 + assert details["invalid_rows"][0]["field"] == "entity" + assert details["invalid_rows"][0]["value"] == 350 + assert details["invalid_rows"][1]["line_number"] == 3 + assert details["invalid_rows"][1]["field"] == "end-entity" + assert details["invalid_rows"][1]["value"] == 350 + def test_check_field_is_within_ranges_for_only_staus_301(tmp_path): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity", "status"]) - writer.writerow(["150", "301"]) - writer.writerow(["250", "301"]) - writer.writerow(["350", "410"]) + writer.writerow(["entity", "status","old-entity"]) + writer.writerow(["150", "301", "140"]) + writer.writerow(["250", "301", "150"]) + writer.writerow(["350", "410", "340"]) external_file = tmp_path / "ranges.csv" with open(external_file, "w", newline="") as f: @@ -546,17 +585,18 @@ def test_check_field_is_within_ranges_for_only_staus_301(tmp_path): writer.writerow(["100", "200"]) conn = duckdb.connect() - passed, message, details = check_field_is_within_range( + passed, message, details = check_fields_are_within_range( conn, file_path=file_path, external_file=external_file, min_field="entity-minimum", max_field="entity-maximum", - field="entity", + field="entity,old-entity", rules={"lookup_rules": {"status": {"op": "=", "value": "301"}}}, ) assert passed is False assert len(details["invalid_rows"]) == 1 assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["field"] == "entity" assert details["invalid_rows"][0]["value"] == 250 From 70524874dd1e3ad4617f61032b00ec944836c6b7 Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Fri, 27 Mar 2026 17:05:15 +0000 Subject: [PATCH 05/12] add check_no_blank_rows function and corresponding tests --- digital_land/expectations/checkpoints/csv.py | 6 ++- digital_land/expectations/operations/csv.py | 50 +++++++++++++++++++ .../expectations/operations/test_csv.py | 36 +++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py index b2258cb9..a948f65e 100644 --- a/digital_land/expectations/checkpoints/csv.py +++ b/digital_land/expectations/checkpoints/csv.py @@ -11,7 +11,8 @@ check_no_overlapping_ranges, check_fields_are_within_range, check_field_is_within_range_by_dataset_org, - check_allowed_values + check_allowed_values, + check_no_blank_rows, ) @@ -29,7 +30,8 @@ def operation_factory(self, operation_string: str): "check_no_overlapping_ranges": check_no_overlapping_ranges, "check_fields_are_within_range": check_fields_are_within_range, "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org, - "check_allowed_values":check_allowed_values + "check_allowed_values": check_allowed_values, + "check_no_blank_rows": check_no_blank_rows, } if operation_string not in operation_map: raise ValueError( diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index dfb9d949..8840c414 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -348,6 +348,56 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list return passed, message, details +def check_no_blank_rows(conn, file_path: Path): + """ + Checks that the CSV does not contain fully blank rows. + + A row is considered blank when every column is empty after trimming whitespace. + + Args: + conn: duckdb connection + file_path: path to the CSV file + """ + file_columns = _get_csv_columns(conn, file_path) + if not file_columns: + return True, "no blank rows found", {"invalid_rows": []} + + blank_conditions = " AND ".join( + f"TRIM(COALESCE({_sql_identifier(column_name)}, '')) = ''" + for column_name in file_columns + ) + + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + * + FROM {_read_csv(file_path)} + ) + SELECT + line_number + FROM source_rows + WHERE {blank_conditions} + ORDER BY line_number + """ + ).fetchall() + + invalid_rows = [{"line_number": row[0]} for row in result] + + if len(invalid_rows) == 0: + passed = True + message = "no blank rows found" + else: + passed = False + message = f"there were {len(invalid_rows)} blank rows found" + + details = { + "invalid_rows": invalid_rows, + } + return passed, message, details + + def check_fields_are_within_range( conn, file_path: Path, diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 176f02bc..3d89837d 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -8,6 +8,7 @@ check_no_shared_values, check_no_overlapping_ranges, check_allowed_values, + check_no_blank_rows, check_fields_are_within_range, check_field_is_within_range_by_dataset_org, ) @@ -314,6 +315,41 @@ def test_check_allowed_values_passes_for_old_entity_status(tmp_path): assert details["invalid_rows"] == [] +def test_check_no_blank_rows_passes(tmp_path): + file_path = tmp_path / "no-blank-rows.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "name", "reference"]) + writer.writerow(["1", "foo", "ref1"]) + writer.writerow(["2", "bar", "ref2"]) + + conn = duckdb.connect() + passed, message, details = check_no_blank_rows(conn, file_path=file_path) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_no_blank_rows_fails(tmp_path): + file_path = tmp_path / "has-blank-rows.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "name", "reference"]) + writer.writerow(["1", "foo", "ref1"]) + writer.writerow(["", "", ""]) + writer.writerow([" ", "", " "]) + writer.writerow(["2", "bar", "ref2"]) + + conn = duckdb.connect() + passed, message, details = check_no_blank_rows(conn, file_path=file_path) + + assert passed is False + assert "blank rows" in message + assert len(details["invalid_rows"]) == 2 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][1]["line_number"] == 4 + + def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(tmp_path): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: From 2df8646592aef78334415d3e9b724d59a868fc09 Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Mon, 30 Mar 2026 09:29:57 +0100 Subject: [PATCH 06/12] add datatype validation for CSV values and corresponding tests --- digital_land/expectations/checkpoints/csv.py | 2 + digital_land/expectations/operations/csv.py | 78 +++++ .../operations/datatype_validators.py | 282 ++++++++++++++++++ .../expectations/checkpoints/test_csv.py | 34 +++ .../expectations/operations/test_csv.py | 129 ++++++++ 5 files changed, 525 insertions(+) create mode 100644 digital_land/expectations/operations/datatype_validators.py diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py index a948f65e..5b33a771 100644 --- a/digital_land/expectations/checkpoints/csv.py +++ b/digital_land/expectations/checkpoints/csv.py @@ -13,6 +13,7 @@ check_field_is_within_range_by_dataset_org, check_allowed_values, check_no_blank_rows, + check_values_have_the_correct_datatype, ) @@ -32,6 +33,7 @@ def operation_factory(self, operation_string: str): "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org, "check_allowed_values": check_allowed_values, "check_no_blank_rows": check_no_blank_rows, + "check_values_have_the_correct_datatype": check_values_have_the_correct_datatype, } if operation_string not in operation_map: raise ValueError( diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 8840c414..12622ac1 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -1,4 +1,7 @@ from pathlib import Path +import pandas as pd + +from digital_land.expectations.operations.datatype_validators import _is_valid_address_value, _is_valid_curie_list_value, _is_valid_curie_value, _is_valid_datetime_value, _is_valid_decimal_value, _is_valid_flag_value, _is_valid_hash_value, _is_valid_integer_value, _is_valid_json_value, _is_valid_latitude_value, _is_valid_longitude_value, _is_valid_multipolygon_value, _is_valid_pattern_value, _is_valid_point_value, _is_valid_reference_value, _is_valid_url_value def _read_csv(file_path: Path) -> str: @@ -635,3 +638,78 @@ def check_field_is_within_range_by_dataset_org( details = {"invalid_rows": out_of_range_rows} return passed, message, details + +def check_values_have_the_correct_datatype(conn,file_path, field_datatype): + """ + Validates that CSV column values have correct datatypes. + + This function uses pandas to read and validate the CSV using datatype validators. + The conn parameter is accepted for consistency with other operations but not used. + + Args: + file_path: path to the CSV file to validate + field_datatype: dict mapping column name to datatype string + """ + validators = { + "address": _is_valid_address_value, + "curie-list": _is_valid_curie_list_value, + "curie": _is_valid_curie_value, + "date": _is_valid_datetime_value, + "datetime": _is_valid_datetime_value, + "decimal": _is_valid_decimal_value, + "flag": _is_valid_flag_value, + "hash": _is_valid_hash_value, + "integer": _is_valid_integer_value, + "json": _is_valid_json_value, + "latitude": _is_valid_latitude_value, + "longitude": _is_valid_longitude_value, + "multipolygon": _is_valid_multipolygon_value, + "pattern": _is_valid_pattern_value, + "point": _is_valid_point_value, + "reference": _is_valid_reference_value, + "url": _is_valid_url_value, + } + + # Read CSV with pandas (keep_default_na=False preserves empty strings) + df = pd.read_csv(file_path, dtype=str, keep_default_na=False) + + if df.empty or len(df.columns) == 0: + return True, "no invalid values found", {"invalid_rows": []} + + # Identify applicable fields for validation + applicable_fields = [ + (field, field_datatype.get(field), validators[field_datatype.get(field)]) + for field in df.columns + if field in field_datatype and field_datatype.get(field) in validators + ] + + if not applicable_fields: + return True, "no invalid values found", {"invalid_rows": []} + + # Validate values + invalid_values = [] + for line_number, (idx, row) in enumerate(df.iterrows(), start=2): + for field, datatype, validator in applicable_fields: + value = str(row.get(field, "")).strip() + if not value: + continue + + if not validator(value): + invalid_values.append({ + "line_number": line_number, + "field": field, + "datatype": datatype, + "value": value, + }) + + if len(invalid_values) == 0: + passed = True + message = "all values have valid datatypes" + details = {"invalid_rows": []} + else: + passed = False + message = f"there were {len(invalid_values)} invalid datatype value(s) found" + details = {"invalid_rows": invalid_values} + + return passed, message, details + diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py new file mode 100644 index 00000000..7ef6bd80 --- /dev/null +++ b/digital_land/expectations/operations/datatype_validators.py @@ -0,0 +1,282 @@ +import json +import re +import urllib.parse +from datetime import datetime +from decimal import Decimal, InvalidOperation + +import shapely.errors +import shapely.wkt +from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon, shape + + +def _is_valid_datetime_value(value): + value = value.strip().strip('",').lower() + + # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/date.py#L22 + patterns = [ + # Date/date-like formats + "%Y-%m-%d", + "%Y%m%d", + "%Y/%m/%d", + "%Y %m %d", + "%Y.%m.%d", + "%Y-%d-%m", # risky! + "%Y-%m", + "%Y.%m", + "%Y/%m", + "%Y %m", + "%Y", + "%Y.0", + "%d/%m/%Y", + "%d/%m/%y", + "%d-%m-%Y", + "%d-%m-%y", + "%d.%m.%Y", + "%d.%m.%y", + "%d-%b-%Y", + "%d-%b-%y", + "%d %B %Y", + "%b %d, %Y", + "%b %d, %y", + "%b-%y", + "%B %Y", + "%m/%d/%Y", # risky! + # Datetime formats + "%Y-%m-%dT%H:%M:%S.000Z", + "%Y-%m-%dT%H:%M:%S.000", + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%dT%H:%M:%S.%f%z", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S", + "%Y/%m/%d %H:%M:%S%z", + "%Y/%m/%d %H:%M:%S+00", + "%Y/%m/%d %H:%M:%S", + "%Y/%m/%d %H:%M", + "%Y/%m/%dT%H:%M:%S", + "%Y/%m/%dT%H:%M:%S.000Z", + "%Y/%m/%dT%H:%M:%S.000", + "%Y/%m/%dT%H:%M:%S.%fZ", + "%Y/%m/%dT%H:%M:%S.%f%z", + "%Y/%m/%dT%H:%M:%S.%f", + "%Y/%m/%dT%H:%M:%SZ", + "%Y-%m-%d %H:%M:%S", + "%d/%m/%Y %H:%M:%S", + "%d/%m/%Y %H:%M", + ] + + # Handle fractional seconds with extra precision. + if "." in value and "Z" in value: + parts = value.replace("Z", "").split(".") + if len(parts) == 2 and len(parts[1]) > 6: + value = parts[0] + "." + parts[1][:6] + "Z" + elif "." in value and "+" in value: + parts = value.split("+") + base_part = parts[0] + tz_part = "+" + parts[1] + if "." in base_part: + date_time, frac = base_part.rsplit(".", 1) + if len(frac) > 6: + frac = frac[:6] + value = date_time + "." + frac + tz_part + + for pattern in patterns: + try: + datetime.strptime(value, pattern) + return True + except ValueError: + continue + + # Try unix timestamp + try: + float_val = float(value) + return -62135596800 < float_val < 253402300800 # Year 1 to 9999 + except ValueError: + pass + + return False + + +def _is_valid_integer_value(value): + try: + num = float(value) + return num == int(num) + except (ValueError, OverflowError): + return False + + +def _is_valid_decimal_value(value): + try: + Decimal(value) + return True + except (InvalidOperation, ValueError): + return False + + +def _is_valid_flag_value(value): + value = value.strip().lower() + + lookup = { + "y": "yes", + "n": "no", + "true": "yes", + "false": "no", + } + + normalized = lookup.get(value, value) + return normalized in {"", "yes", "no"} + + +def _is_valid_json_value(value): + try: + json.loads(value) + return True + except json.JSONDecodeError: + return False + + +def _is_valid_reference_value(value): + return bool(value.strip()) and not any(ch.isspace() for ch in value) + + +def _is_valid_curie_value(value): + return bool(re.fullmatch(r"[a-z0-9-]+:[^\s:][^\s]*", value)) + + +def _is_valid_curie_list_value(value): + text = (value or "").strip() + if not text: + return False + + parts = [part.strip() for part in text.split(";")] + if any(not part for part in parts): + return False + + curie_re = re.compile(r"[a-z0-9-]+:[^\s:][^\s]*") + return all(bool(curie_re.fullmatch(part)) for part in parts) + + +def _is_valid_address_value(value): + if not value or not value.strip(): + return False + + value = value.strip() + + # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/address.py#L10 + value = ", ".join(value.split("\n")) + value = value.replace(";", ",") + + comma_re = re.compile(r",\s*,+") + value = comma_re.sub(", ", value) + value = value.strip(", ") + + hyphen_re = re.compile(r"\s*-\s*") + value = hyphen_re.sub("-", value) + + value = " ".join(value.split()).replace('"', "") + + return bool(value.strip()) + + +def _is_valid_url_value(value): + candidate = (value or "").strip().strip("'") + parsed = urllib.parse.urlparse(candidate) + return bool(parsed.scheme and parsed.netloc) + + +def _is_valid_hash_value(value): + if ":" in value: + _, digest = value.split(":", 1) + else: + digest = value + return bool(re.fullmatch(r"[0-9a-fA-F]+", digest)) + + +def _is_valid_pattern_value(value): + try: + re.compile(value) + return True + except re.error: + return False + + +def _is_valid_latitude_value(value): + try: + numeric = float(value) + except ValueError: + return False + return -90 <= numeric <= 90 + + +def _is_valid_longitude_value(value): + try: + numeric = float(value) + except ValueError: + return False + return -180 <= numeric <= 180 + + +def _is_valid_multipolygon_value(value): + candidate = (value or "").strip() + if not candidate: + return False + + try: + geometry = shapely.wkt.loads(candidate) + except shapely.errors.WKTReadingError: + try: + geojson = json.loads(candidate) + geometry = shape(geojson) + except Exception: + return False + + if not isinstance(geometry, (Polygon, MultiPolygon, GeometryCollection)): + return False + + # Shapely normal validity check where available. + is_valid = getattr(geometry, "is_valid", True) + return bool(is_valid) + + +def _is_valid_point_value(value): + candidate = value + if candidate is None: + return False + + # Try WKT first. + try: + point = shapely.wkt.loads(candidate if isinstance(candidate, str) else str(candidate)) + if not isinstance(point, Point): + return False + return bool(getattr(point, "is_valid", True)) + except shapely.errors.WKTReadingError: + pass + except Exception: + return False + + # Fallback to coordinate pair. + try: + if isinstance(candidate, (list, tuple)) and len(candidate) == 2: + x_raw, y_raw = candidate[0], candidate[1] + elif isinstance(candidate, str): + text = candidate.strip() + if not text: + return False + + if text.startswith("["): + coords = json.loads(text) + if not isinstance(coords, list) or len(coords) != 2: + return False + x_raw, y_raw = coords[0], coords[1] + else: + parts = [p.strip() for p in text.split(",")] + if len(parts) != 2: + return False + x_raw, y_raw = parts[0], parts[1] + else: + return False + + point = Point(float(x_raw), float(y_raw)) + return bool(getattr(point, "is_valid", True)) + except Exception: + return False diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py index 3458fe84..44f4f1ba 100644 --- a/tests/integration/expectations/checkpoints/test_csv.py +++ b/tests/integration/expectations/checkpoints/test_csv.py @@ -1,4 +1,5 @@ import csv +import json import pytest from digital_land.expectations.checkpoints.csv import CsvCheckpoint @@ -73,3 +74,36 @@ def test_invalid_operation(self, csv_file): checkpoint.load( [{"operation": "nonexistent", "name": "test", "parameters": "{}"}] ) + + def test_check_values_have_the_correct_datatype_rule(self, tmp_path): + file_path = tmp_path / "test_datatypes.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "count"]) + writer.writerow(["entity-1", "100"]) + writer.writerow(["entity-2", "invalid_int"]) + + checkpoint = CsvCheckpoint("test-dataset", file_path) + rules = [ + { + "operation": "check_values_have_the_correct_datatype", + "name": "Datatype validation", + "parameters": { + "field_datatype": { + "entity": "reference", + "count": "integer", + } + }, + } + ] + + checkpoint.load(rules) + checkpoint.run() + + assert len(checkpoint.log.entries) == 1 + assert checkpoint.log.entries[0]["operation"] == "check_values_have_the_correct_datatype" + assert checkpoint.log.entries[0]["passed"] is False + details = json.loads(checkpoint.log.entries[0]["details"]) + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["field"] == "count" diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 3d89837d..a61b2180 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -11,6 +11,7 @@ check_no_blank_rows, check_fields_are_within_range, check_field_is_within_range_by_dataset_org, + check_values_have_the_correct_datatype, ) @@ -636,3 +637,131 @@ def test_check_field_is_within_ranges_for_only_staus_301(tmp_path): assert details["invalid_rows"][0]["line_number"] == 3 assert details["invalid_rows"][0]["field"] == "entity" assert details["invalid_rows"][0]["value"] == 250 + + +def test_check_values_have_the_correct_datatype_passes(tmp_path): + """Test datatype validation with all valid values.""" + file_path = tmp_path / "valid_datatypes.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "count", "enabled"]) + writer.writerow(["entity-1", "100", "true"]) + writer.writerow(["entity-2", "200", "false"]) + + field_datatype = { + "entity": "reference", + "count": "integer", + "enabled": "flag", + } + + passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_values_have_the_correct_datatype_fails(tmp_path): + """Test datatype validation with invalid values.""" + file_path = tmp_path / "invalid_datatypes.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "count", "enabled"]) + writer.writerow(["entity-1", "100", "true"]) + writer.writerow(["entity-2", "not_a_number", "false"]) + writer.writerow(["entity-3", "300", "maybe"]) + + field_datatype = { + "entity": "reference", + "count": "integer", + "enabled": "flag", + } + + passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + assert details["invalid_rows"][0]["line_number"] == 3 + assert details["invalid_rows"][0]["field"] == "count" + assert details["invalid_rows"][0]["value"] == "not_a_number" + assert details["invalid_rows"][0]["datatype"] == "integer" + assert details["invalid_rows"][1]["line_number"] == 4 + assert details["invalid_rows"][1]["field"] == "enabled" + assert details["invalid_rows"][1]["value"] == "maybe" + assert "invalid datatype value(s)" in message + + +def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path): + """Test that empty values are skipped during validation.""" + file_path = tmp_path / "with_empty_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "count"]) + writer.writerow(["entity-1", "100"]) + writer.writerow(["entity-2", ""]) + writer.writerow(["entity-3", "300"]) + + field_datatype = { + "entity": "reference", + "count": "integer", + } + + passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path): + """Test that fields not in field_datatype map are not validated.""" + file_path = tmp_path / "unmapped_fields.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "count", "description"]) + writer.writerow(["entity-1", "100", "invalid_but_ignored"]) + + field_datatype = { + "entity": "reference", + "count": "integer", + } + + passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_values_have_the_correct_datatype_empty_file(tmp_path): + """Test behavior with empty CSV file.""" + file_path = tmp_path / "empty.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["entity", "count"]) + + field_datatype = { + "entity": "reference", + "count": "integer", + } + + passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + + assert passed is True + assert details["invalid_rows"] == [] + + +def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path): + """Test when no fields have datatype validators.""" + file_path = tmp_path / "no_applicable.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["name", "description"]) + writer.writerow(["field1", "some value"]) + + field_datatype = { + "name": "string", + "description": "string", + } + + passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + + assert passed is True + assert details["invalid_rows"] == [] From 7be8ced2ab60bc6da1eb0bdd675c4ef19ff1701f Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Mon, 30 Mar 2026 12:27:33 +0100 Subject: [PATCH 07/12] remove unused test for datatype validation in CsvCheckpoint --- digital_land/expectations/operations/csv.py | 49 ++++++------------- .../expectations/checkpoints/test_csv.py | 35 +------------ 2 files changed, 17 insertions(+), 67 deletions(-) diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 12622ac1..c0514821 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -20,10 +20,6 @@ def _sql_string(value) -> str: return f"'{cleaned}'" -def _sql_identifier(name: str) -> str: - return '"' + str(name).replace('"', '""') + '"' - - def _normalize_condition_groups(conditions, name: str) -> list: if conditions is None: return [] @@ -118,8 +114,6 @@ def _normalize_fields_for_validation(field_spec, file_columns: list) -> list: def _build_range_invalid_rows( result: list, validating_multiple_fields: bool, - has_match_columns: bool, - lookup_match_columns: list = None, ) -> list: """Format query rows into expectation invalid_rows shape.""" out_of_range_rows = [] @@ -127,17 +121,9 @@ def _build_range_invalid_rows( for row in result: field_name = row[1] - if has_match_columns: - if validating_multiple_fields: - invalid_row = {"line_number": row[0], "field": field_name, "value": row[2]} - else: - invalid_row = {"line_number": row[0], field_name: row[2]} - for i, col_name in enumerate(lookup_match_columns): - invalid_row[col_name] = row[i + 3] - else: - invalid_row = {"line_number": row[0], "value": row[2]} - if validating_multiple_fields: - invalid_row["field"] = field_name + invalid_row = {"line_number": row[0], "value": row[2]} + if validating_multiple_fields: + invalid_row["field"] = field_name out_of_range_rows.append(invalid_row) @@ -366,7 +352,7 @@ def check_no_blank_rows(conn, file_path: Path): return True, "no blank rows found", {"invalid_rows": []} blank_conditions = " AND ".join( - f"TRIM(COALESCE({_sql_identifier(column_name)}, '')) = ''" + f'TRIM(COALESCE("{column_name}", \'\')) = \'\'' for column_name in file_columns ) @@ -445,7 +431,7 @@ def check_fields_are_within_range( fields_to_validate = _normalize_fields_for_validation(field, file_columns) validating_multiple_fields = len(fields_to_validate) > 1 lookup_values_sql = ",\n ".join( - f"({i}, {_sql_string(field_name)}, TRY_CAST(src.{_sql_identifier(field_name)} AS BIGINT))" + f'({i}, {_sql_string(field_name)}, TRY_CAST(src."{field_name}" AS BIGINT))' for i, field_name in enumerate(fields_to_validate) ) @@ -495,7 +481,6 @@ def check_fields_are_within_range( out_of_range_rows = _build_range_invalid_rows( result=result, validating_multiple_fields=validating_multiple_fields, - has_match_columns=False, ) if len(out_of_range_rows) == 0: @@ -565,13 +550,11 @@ def check_field_is_within_range_by_dataset_org( range_dataset_name = str(range_dataset_field).strip() lookup_match_columns = [lookup_dataset_name, "organisation"] - lookup_dataset_col = _sql_identifier(lookup_dataset_name) - lookup_org_col = _sql_identifier("organisation") - range_dataset_col = _sql_identifier(range_dataset_name) - range_org_col = _sql_identifier("organisation") - min_col = _sql_identifier(min_field) - max_col = _sql_identifier(max_field) - value_col = _sql_identifier(field_name) + lookup_dataset_col = f'"{lookup_dataset_name}"' + range_dataset_col = f'"{range_dataset_name}"' + min_col = f'"{min_field}"' + max_col = f'"{max_field}"' + value_col = f'"{field_name}"' result = conn.execute( f""" @@ -580,12 +563,12 @@ def check_field_is_within_range_by_dataset_org( TRY_CAST({min_col} AS BIGINT) AS min_value, TRY_CAST({max_col} AS BIGINT) AS max_value, TRIM(COALESCE({range_dataset_col}, '')) AS range_key_0, - TRIM(COALESCE({range_org_col}, '')) AS range_key_1 + TRIM(COALESCE("organisation", '')) AS range_key_1 FROM {_read_csv(external_file)} WHERE TRY_CAST({min_col} AS BIGINT) IS NOT NULL AND TRY_CAST({max_col} AS BIGINT) IS NOT NULL AND TRIM(COALESCE({range_dataset_col}, '')) != '' - AND TRIM(COALESCE({range_org_col}, '')) != '' + AND TRIM(COALESCE("organisation", '')) != '' ), source_rows AS ( SELECT @@ -598,11 +581,11 @@ def check_field_is_within_range_by_dataset_org( src.line_number, TRY_CAST(src.{value_col} AS BIGINT) AS value, TRIM(COALESCE(src.{lookup_dataset_col}, '')) AS lookup_key_0, - TRIM(COALESCE(src.{lookup_org_col}, '')) AS lookup_key_1 + TRIM(COALESCE(src."organisation", '')) AS lookup_key_1 FROM source_rows src WHERE TRY_CAST(src.{value_col} AS BIGINT) IS NOT NULL AND TRIM(COALESCE(src.{lookup_dataset_col}, '')) != '' - AND TRIM(COALESCE(src.{lookup_org_col}, '')) != ''{lookup_clause} + AND TRIM(COALESCE(src."organisation", '')) != ''{lookup_clause} ) SELECT line_number, @@ -639,7 +622,7 @@ def check_field_is_within_range_by_dataset_org( return passed, message, details -def check_values_have_the_correct_datatype(conn,file_path, field_datatype): +def check_values_have_the_correct_datatype(file_path, field_datatype): """ Validates that CSV column values have correct datatypes. @@ -688,7 +671,7 @@ def check_values_have_the_correct_datatype(conn,file_path, field_datatype): # Validate values invalid_values = [] - for line_number, (idx, row) in enumerate(df.iterrows(), start=2): + for line_number, (_, row) in enumerate(df.iterrows(), start=2): for field, datatype, validator in applicable_fields: value = str(row.get(field, "")).strip() if not value: diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py index 44f4f1ba..3bc89b00 100644 --- a/tests/integration/expectations/checkpoints/test_csv.py +++ b/tests/integration/expectations/checkpoints/test_csv.py @@ -73,37 +73,4 @@ def test_invalid_operation(self, csv_file): with pytest.raises(ValueError): checkpoint.load( [{"operation": "nonexistent", "name": "test", "parameters": "{}"}] - ) - - def test_check_values_have_the_correct_datatype_rule(self, tmp_path): - file_path = tmp_path / "test_datatypes.csv" - with open(file_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["entity", "count"]) - writer.writerow(["entity-1", "100"]) - writer.writerow(["entity-2", "invalid_int"]) - - checkpoint = CsvCheckpoint("test-dataset", file_path) - rules = [ - { - "operation": "check_values_have_the_correct_datatype", - "name": "Datatype validation", - "parameters": { - "field_datatype": { - "entity": "reference", - "count": "integer", - } - }, - } - ] - - checkpoint.load(rules) - checkpoint.run() - - assert len(checkpoint.log.entries) == 1 - assert checkpoint.log.entries[0]["operation"] == "check_values_have_the_correct_datatype" - assert checkpoint.log.entries[0]["passed"] is False - details = json.loads(checkpoint.log.entries[0]["details"]) - assert len(details["invalid_rows"]) == 1 - assert details["invalid_rows"][0]["line_number"] == 3 - assert details["invalid_rows"][0]["field"] == "count" + ) \ No newline at end of file From 1e1d979ca94bbd44f271117fd59de587c834eeeb Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Mon, 30 Mar 2026 12:29:39 +0100 Subject: [PATCH 08/12] added conn to test --- digital_land/expectations/operations/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index c0514821..ebdc31d5 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -622,7 +622,7 @@ def check_field_is_within_range_by_dataset_org( return passed, message, details -def check_values_have_the_correct_datatype(file_path, field_datatype): +def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None): """ Validates that CSV column values have correct datatypes. From acb1449e008196b9d978029a0d8f8c5e88079530 Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Mon, 30 Mar 2026 13:19:50 +0100 Subject: [PATCH 09/12] improve code formatting and readability in CSV operations and datatype validators --- digital_land/expectations/operations/csv.py | 60 ++++++++++----- .../operations/datatype_validators.py | 4 +- .../expectations/checkpoints/test_csv.py | 2 +- .../expectations/operations/test_csv.py | 73 ++++++++++++------- 4 files changed, 93 insertions(+), 46 deletions(-) diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index ebdc31d5..01d69d40 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -1,7 +1,24 @@ from pathlib import Path import pandas as pd -from digital_land.expectations.operations.datatype_validators import _is_valid_address_value, _is_valid_curie_list_value, _is_valid_curie_value, _is_valid_datetime_value, _is_valid_decimal_value, _is_valid_flag_value, _is_valid_hash_value, _is_valid_integer_value, _is_valid_json_value, _is_valid_latitude_value, _is_valid_longitude_value, _is_valid_multipolygon_value, _is_valid_pattern_value, _is_valid_point_value, _is_valid_reference_value, _is_valid_url_value +from digital_land.expectations.operations.datatype_validators import ( + _is_valid_address_value, + _is_valid_curie_list_value, + _is_valid_curie_value, + _is_valid_datetime_value, + _is_valid_decimal_value, + _is_valid_flag_value, + _is_valid_hash_value, + _is_valid_integer_value, + _is_valid_json_value, + _is_valid_latitude_value, + _is_valid_longitude_value, + _is_valid_multipolygon_value, + _is_valid_pattern_value, + _is_valid_point_value, + _is_valid_reference_value, + _is_valid_url_value, +) def _read_csv(file_path: Path) -> str: @@ -10,9 +27,12 @@ def _read_csv(file_path: Path) -> str: def _get_csv_columns(conn, file_path: Path) -> list: """Get column names from CSV file.""" - return [col[0] for col in conn.execute( - f"SELECT * FROM {_read_csv(file_path)} LIMIT 0" - ).description] + return [ + col[0] + for col in conn.execute( + f"SELECT * FROM {_read_csv(file_path)} LIMIT 0" + ).description + ] def _sql_string(value) -> str: @@ -90,7 +110,9 @@ def _normalize_fields_for_validation(field_spec, file_columns: list) -> list: elif isinstance(field_spec, (list, tuple, set)): fields = [str(item).strip() for item in field_spec if str(item).strip()] else: - raise ValueError("field must be a string, comma-separated string, or list of strings") + raise ValueError( + "field must be a string, comma-separated string, or list of strings" + ) if not fields: raise ValueError("field must include at least one column name") @@ -102,7 +124,9 @@ def _normalize_fields_for_validation(field_spec, file_columns: list) -> list: seen.add(field_name) normalized_fields.append(field_name) - missing_fields = [field_name for field_name in normalized_fields if field_name not in file_columns] + missing_fields = [ + field_name for field_name in normalized_fields if field_name not in file_columns + ] if missing_fields: raise ValueError( f"Column(s) {missing_fields} not found in file. Available columns: {file_columns}" @@ -323,9 +347,7 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list message = f"all values in '{field}' are allowed" else: passed = False - message = ( - f"there were {len(invalid_rows)} invalid values in '{field}'" - ) + message = f"there were {len(invalid_rows)} invalid values in '{field}'" details = { "field": field, @@ -352,8 +374,7 @@ def check_no_blank_rows(conn, file_path: Path): return True, "no blank rows found", {"invalid_rows": []} blank_conditions = " AND ".join( - f'TRIM(COALESCE("{column_name}", \'\')) = \'\'' - for column_name in file_columns + f"TRIM(COALESCE(\"{column_name}\", '')) = ''" for column_name in file_columns ) result = conn.execute( @@ -622,7 +643,7 @@ def check_field_is_within_range_by_dataset_org( return passed, message, details -def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None): +def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None): """ Validates that CSV column values have correct datatypes. @@ -678,12 +699,14 @@ def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None): continue if not validator(value): - invalid_values.append({ - "line_number": line_number, - "field": field, - "datatype": datatype, - "value": value, - }) + invalid_values.append( + { + "line_number": line_number, + "field": field, + "datatype": datatype, + "value": value, + } + ) if len(invalid_values) == 0: passed = True @@ -695,4 +718,3 @@ def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None): details = {"invalid_rows": invalid_values} return passed, message, details - diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py index 7ef6bd80..1911c3c6 100644 --- a/digital_land/expectations/operations/datatype_validators.py +++ b/digital_land/expectations/operations/datatype_validators.py @@ -245,7 +245,9 @@ def _is_valid_point_value(value): # Try WKT first. try: - point = shapely.wkt.loads(candidate if isinstance(candidate, str) else str(candidate)) + point = shapely.wkt.loads( + candidate if isinstance(candidate, str) else str(candidate) + ) if not isinstance(point, Point): return False return bool(getattr(point, "is_valid", True)) diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py index 3bc89b00..d3c0ee8d 100644 --- a/tests/integration/expectations/checkpoints/test_csv.py +++ b/tests/integration/expectations/checkpoints/test_csv.py @@ -73,4 +73,4 @@ def test_invalid_operation(self, csv_file): with pytest.raises(ValueError): checkpoint.load( [{"operation": "nonexistent", "name": "test", "parameters": "{}"}] - ) \ No newline at end of file + ) diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index a61b2180..2a9592f9 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -351,7 +351,9 @@ def test_check_no_blank_rows_fails(tmp_path): assert details["invalid_rows"][1]["line_number"] == 4 -def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(tmp_path): +def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails( + tmp_path, +): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) @@ -388,7 +390,9 @@ def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisa assert details["invalid_rows"][0]["organisation"] == "org-1" -def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_passes(tmp_path): +def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_passes( + tmp_path, +): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) @@ -416,7 +420,9 @@ def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisa ) -def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_names(tmp_path): +def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_names( + tmp_path, +): file_path = tmp_path / "lookup_custom.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) @@ -427,7 +433,9 @@ def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_name external_file = tmp_path / "ranges_custom.csv" with open(external_file, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["dataset_name", "organisation", "entity-minimum", "entity-maximum"]) + writer.writerow( + ["dataset_name", "organisation", "entity-minimum", "entity-maximum"] + ) writer.writerow(["dataset-x", "org-a", "50", "100"]) conn = duckdb.connect() @@ -457,7 +465,7 @@ def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path): writer = csv.writer(f) writer.writerow(["entity", "status"]) writer.writerow(["150", "active"]) - writer.writerow(["250", "active"]) + writer.writerow(["250", "active"]) writer.writerow(["350", "inactive"]) external_file = tmp_path / "ranges.csv" @@ -487,8 +495,8 @@ def test_check_field_is_within_ranges_lookup_rules_operator_eq_shape(tmp_path): with open(file_path, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["entity", "prefix"]) - writer.writerow(["150", "conservationarea"]) - writer.writerow(["350", "other"]) + writer.writerow(["150", "conservationarea"]) + writer.writerow(["350", "other"]) external_file = tmp_path / "ranges.csv" with open(external_file, "w", newline="") as f: @@ -516,8 +524,8 @@ def test_check_field_is_within_ranges_lookup_rules_exact_match(tmp_path): with open(file_path, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["entity", "prefix"]) - writer.writerow(["150", "conservationarea"]) - writer.writerow(["350", "other"]) + writer.writerow(["150", "conservationarea"]) + writer.writerow(["350", "other"]) external_file = tmp_path / "ranges.csv" with open(external_file, "w", newline="") as f: @@ -545,9 +553,9 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path): with open(file_path, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["entity", "organisation"]) - writer.writerow(["150", "org-a"]) - writer.writerow(["350", "org-b"]) - writer.writerow(["350", "org-c"]) + writer.writerow(["150", "org-a"]) + writer.writerow(["350", "org-b"]) + writer.writerow(["350", "org-c"]) external_file = tmp_path / "ranges.csv" with open(external_file, "w", newline="") as f: @@ -563,7 +571,9 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path): min_field="entity-minimum", max_field="entity-maximum", field="entity", - rules={"lookup_rules": {"organisation": {"op": "in", "value": ["org-a", "org-b"]}}}, + rules={ + "lookup_rules": {"organisation": {"op": "in", "value": ["org-a", "org-b"]}} + }, ) assert passed is False @@ -577,9 +587,9 @@ def test_check_field_is_within_ranges_comma_separated_fields(tmp_path): with open(file_path, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["entity", "end-entity"]) - writer.writerow(["150", "160"]) - writer.writerow(["150", "350"]) - writer.writerow(["350", "150"]) + writer.writerow(["150", "160"]) + writer.writerow(["150", "350"]) + writer.writerow(["350", "150"]) external_file = tmp_path / "ranges.csv" with open(external_file, "w", newline="") as f: @@ -606,13 +616,14 @@ def test_check_field_is_within_ranges_comma_separated_fields(tmp_path): assert details["invalid_rows"][1]["field"] == "end-entity" assert details["invalid_rows"][1]["value"] == 350 + def test_check_field_is_within_ranges_for_only_staus_301(tmp_path): file_path = tmp_path / "lookup.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity", "status","old-entity"]) - writer.writerow(["150", "301", "140"]) - writer.writerow(["250", "301", "150"]) + writer.writerow(["entity", "status", "old-entity"]) + writer.writerow(["150", "301", "140"]) + writer.writerow(["250", "301", "150"]) writer.writerow(["350", "410", "340"]) external_file = tmp_path / "ranges.csv" @@ -654,7 +665,9 @@ def test_check_values_have_the_correct_datatype_passes(tmp_path): "enabled": "flag", } - passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + passed, message, details = check_values_have_the_correct_datatype( + file_path, field_datatype + ) assert passed is True assert details["invalid_rows"] == [] @@ -676,7 +689,9 @@ def test_check_values_have_the_correct_datatype_fails(tmp_path): "enabled": "flag", } - passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + passed, message, details = check_values_have_the_correct_datatype( + file_path, field_datatype + ) assert passed is False assert len(details["invalid_rows"]) == 2 @@ -705,7 +720,9 @@ def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path): "count": "integer", } - passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + passed, message, details = check_values_have_the_correct_datatype( + file_path, field_datatype + ) assert passed is True assert details["invalid_rows"] == [] @@ -724,7 +741,9 @@ def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path): "count": "integer", } - passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + passed, message, details = check_values_have_the_correct_datatype( + file_path, field_datatype + ) assert passed is True assert details["invalid_rows"] == [] @@ -742,7 +761,9 @@ def test_check_values_have_the_correct_datatype_empty_file(tmp_path): "count": "integer", } - passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + passed, message, details = check_values_have_the_correct_datatype( + file_path, field_datatype + ) assert passed is True assert details["invalid_rows"] == [] @@ -761,7 +782,9 @@ def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path): "description": "string", } - passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype) + passed, message, details = check_values_have_the_correct_datatype( + file_path, field_datatype + ) assert passed is True assert details["invalid_rows"] == [] From 6818887a35a261127270fb9a294416bb624f177f Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Mon, 30 Mar 2026 13:31:19 +0100 Subject: [PATCH 10/12] fixed pipeline failure cleanup --- digital_land/expectations/operations/csv.py | 59 ++++++++----------- .../expectations/checkpoints/test_csv.py | 1 - 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 01d69d40..9bfa42aa 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -40,16 +40,6 @@ def _sql_string(value) -> str: return f"'{cleaned}'" -def _normalize_condition_groups(conditions, name: str) -> list: - if conditions is None: - return [] - if isinstance(conditions, dict): - return [conditions] - if isinstance(conditions, list): - return conditions - raise ValueError(f"{name} must be a dict, list of dicts, or None") - - def _build_field_condition(field_name: str, spec) -> str: if isinstance(spec, dict): op = str(spec.get("op", spec.get("operation", ""))).strip().lower() @@ -79,27 +69,35 @@ def _build_field_condition(field_name: str, spec) -> str: ) -def _build_condition_group(group: dict, file_columns: list) -> str: - if not isinstance(group, dict) or not group: - raise ValueError("Each condition group must be a non-empty dict") - - parts = [] - for field_name, spec in group.items(): - if field_name not in file_columns: - raise ValueError( - f"Column '{field_name}' not found in file. Available columns: {file_columns}" - ) - parts.append(_build_field_condition(field_name, spec)) - - return f"({' AND '.join(parts)})" - - def _build_filter_clause(filter_spec, file_columns: list, name: str) -> str: """Build SQL clause that keeps rows matching structured conditions.""" - groups = _normalize_condition_groups(filter_spec, name) + if filter_spec is None: + groups = [] + elif isinstance(filter_spec, dict): + groups = [filter_spec] + elif isinstance(filter_spec, list): + groups = filter_spec + else: + raise ValueError(f"{name} must be a dict, list of dicts, or None") + if not groups: return "" - clauses = [_build_condition_group(group, file_columns) for group in groups] + + clauses = [] + for group in groups: + if not isinstance(group, dict) or not group: + raise ValueError("Each condition group must be a non-empty dict") + + parts = [] + for field_name, spec in group.items(): + if field_name not in file_columns: + raise ValueError( + f"Column '{field_name}' not found in file. Available columns: {file_columns}" + ) + parts.append(_build_field_condition(field_name, spec)) + + clauses.append(f"({' AND '.join(parts)})") + return f" AND ({' OR '.join(clauses)})" @@ -647,12 +645,10 @@ def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None) """ Validates that CSV column values have correct datatypes. - This function uses pandas to read and validate the CSV using datatype validators. - The conn parameter is accepted for consistency with other operations but not used. - Args: file_path: path to the CSV file to validate field_datatype: dict mapping column name to datatype string + conn: duckdb connection not used but required by caller """ validators = { "address": _is_valid_address_value, @@ -674,13 +670,11 @@ def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None) "url": _is_valid_url_value, } - # Read CSV with pandas (keep_default_na=False preserves empty strings) df = pd.read_csv(file_path, dtype=str, keep_default_na=False) if df.empty or len(df.columns) == 0: return True, "no invalid values found", {"invalid_rows": []} - # Identify applicable fields for validation applicable_fields = [ (field, field_datatype.get(field), validators[field_datatype.get(field)]) for field in df.columns @@ -690,7 +684,6 @@ def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None) if not applicable_fields: return True, "no invalid values found", {"invalid_rows": []} - # Validate values invalid_values = [] for line_number, (_, row) in enumerate(df.iterrows(), start=2): for field, datatype, validator in applicable_fields: diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py index d3c0ee8d..3458fe84 100644 --- a/tests/integration/expectations/checkpoints/test_csv.py +++ b/tests/integration/expectations/checkpoints/test_csv.py @@ -1,5 +1,4 @@ import csv -import json import pytest from digital_land.expectations.checkpoints.csv import CsvCheckpoint From 4ed3035ad14db0fd4c79a7cef22590c7fb1bce2d Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Tue, 31 Mar 2026 14:12:24 +0100 Subject: [PATCH 11/12] used duckdb for datatype validation --- digital_land/expectations/operations/csv.py | 161 +++++--- .../operations/datatype_validators.py | 202 ---------- .../expectations/operations/test_csv.py | 353 +++++++++++++++++- 3 files changed, 444 insertions(+), 272 deletions(-) diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 9bfa42aa..30ea7a01 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -2,22 +2,9 @@ import pandas as pd from digital_land.expectations.operations.datatype_validators import ( - _is_valid_address_value, - _is_valid_curie_list_value, - _is_valid_curie_value, - _is_valid_datetime_value, - _is_valid_decimal_value, - _is_valid_flag_value, - _is_valid_hash_value, - _is_valid_integer_value, - _is_valid_json_value, - _is_valid_latitude_value, - _is_valid_longitude_value, _is_valid_multipolygon_value, _is_valid_pattern_value, _is_valid_point_value, - _is_valid_reference_value, - _is_valid_url_value, ) @@ -316,16 +303,8 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list field: the column name to validate allowed_values: allowed values for the field """ - cleaned_allowed_values = [ - str(value).strip().replace("'", "''") - for value in (allowed_values or []) - if str(value).strip() != "" - ] - - if not cleaned_allowed_values: - raise ValueError("allowed_values must contain at least one non-empty value") - allowed_values_sql = ",".join("'" + value + "'" for value in cleaned_allowed_values) + allowed_values_sql = ",".join("'" + value + "'" for value in allowed_values) result = conn.execute( f""" @@ -349,7 +328,7 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list details = { "field": field, - "allowed_values": sorted({value for value in cleaned_allowed_values}), + "allowed_values": sorted({value for value in allowed_values}), "invalid_values": invalid_values, "invalid_rows": invalid_rows, } @@ -641,66 +620,128 @@ def check_field_is_within_range_by_dataset_org( return passed, message, details -def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None): +def check_values_have_the_correct_datatype(conn, file_path, field_datatype): """ Validates that CSV column values have correct datatypes. + Uses DuckDB queries for datatypes: integer, decimal, flag, latitude, longitude, hash, curie, curie-list, json, url, date, datetime. + + Uses Python validators for complex datatypes: pattern, multipolygon, point. + Args: file_path: path to the CSV file to validate field_datatype: dict mapping column name to datatype string - conn: duckdb connection not used but required by caller """ - validators = { - "address": _is_valid_address_value, - "curie-list": _is_valid_curie_list_value, - "curie": _is_valid_curie_value, - "date": _is_valid_datetime_value, - "datetime": _is_valid_datetime_value, - "decimal": _is_valid_decimal_value, - "flag": _is_valid_flag_value, - "hash": _is_valid_hash_value, - "integer": _is_valid_integer_value, - "json": _is_valid_json_value, - "latitude": _is_valid_latitude_value, - "longitude": _is_valid_longitude_value, - "multipolygon": _is_valid_multipolygon_value, + + def _get_sql_validation_condition(datatype: str, field_name: str) -> str: + field_ref = f"TRIM(COALESCE(\"{field_name}\", ''))" + + conditions = { + "integer": f"{field_ref} != '' AND NOT (TRY_CAST({field_ref} AS DOUBLE) IS NOT NULL AND TRY_CAST({field_ref} AS DOUBLE) = TRY_CAST({field_ref} AS BIGINT))", + "decimal": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DECIMAL) IS NULL", + "flag": f"{field_ref} != '' AND LOWER({field_ref}) NOT IN ('yes', 'no', 'true', 'false')", + "latitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -90 OR TRY_CAST({field_ref} AS DOUBLE) > 90)", + "longitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -180 OR TRY_CAST({field_ref} AS DOUBLE) > 180)", + "hash": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z]+:)?[0-9a-fA-F]+$'))", + "curie": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-z0-9-]+:[^\\s:][^\\s]*$'))", + "curie-list": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z0-9-]+:[^\\s:][^\\s]*(;[a-z0-9-]+:[^\\s:][^\\s]*)*)?$'))", + "json": f"{field_ref} != '' AND TRY(json_extract({field_ref}, '$')) IS NULL", + "url": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-zA-Z][a-zA-Z0-9+.-]*://[^\\s/:?#]+(?::[0-9]+)?(?:[/?#][^\\s]*)?$'))", + "date": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DATE) IS NULL", + "datetime": f"{field_ref} != '' AND TRY_CAST({field_ref} AS TIMESTAMP) IS NULL", + } + + return conditions.get(datatype, "FALSE") + + # Python validators for complex datatypes that can't be easily expressed in SQL + python_validators = { "pattern": _is_valid_pattern_value, + "multipolygon": _is_valid_multipolygon_value, "point": _is_valid_point_value, - "reference": _is_valid_reference_value, - "url": _is_valid_url_value, } - df = pd.read_csv(file_path, dtype=str, keep_default_na=False) - - if df.empty or len(df.columns) == 0: - return True, "no invalid values found", {"invalid_rows": []} + sql_validators = { + "integer", + "decimal", + "flag", + "latitude", + "longitude", + "hash", + "curie", + "curie-list", + "json", + "url", + "date", + "datetime", + } - applicable_fields = [ - (field, field_datatype.get(field), validators[field_datatype.get(field)]) - for field in df.columns - if field in field_datatype and field_datatype.get(field) in validators - ] + fields_for_sql = [] + fields_for_python = [] - if not applicable_fields: - return True, "no invalid values found", {"invalid_rows": []} + for field in field_datatype: + datatype = field_datatype.get(field) + if datatype in sql_validators: + fields_for_sql.append((field, datatype)) + elif datatype in python_validators: + fields_for_python.append((field, datatype, python_validators[datatype])) invalid_values = [] - for line_number, (_, row) in enumerate(df.iterrows(), start=2): - for field, datatype, validator in applicable_fields: - value = str(row.get(field, "")).strip() - if not value: - continue - if not validator(value): + # SQL validation: query invalid rows for each field + if fields_for_sql: + for field, datatype in fields_for_sql: + condition = _get_sql_validation_condition(datatype, field) + + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT + ROW_NUMBER() OVER () + 1 AS line_number, + * + FROM {_read_csv(file_path)} + ) + SELECT + line_number, + TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE {condition} + """ + ).fetchall() + + for row in result: invalid_values.append( { - "line_number": line_number, + "line_number": row[0], "field": field, "datatype": datatype, - "value": value, + "value": row[1], } ) + if fields_for_python: + df = pd.read_csv(file_path, dtype=str, keep_default_na=False) + + if df.empty or len(df.columns) == 0: + pass + else: + for line_number, (_, row) in enumerate(df.iterrows(), start=2): + for field, datatype, validator in fields_for_python: + if field not in df.columns: + continue + value = str(row.get(field, "")).strip() + if not value: + continue + + if not validator(value): + invalid_values.append( + { + "line_number": line_number, + "field": field, + "datatype": datatype, + "value": value, + } + ) + if len(invalid_values) == 0: passed = True message = "all values have valid datatypes" diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py index 1911c3c6..3c04dc6e 100644 --- a/digital_land/expectations/operations/datatype_validators.py +++ b/digital_land/expectations/operations/datatype_validators.py @@ -1,197 +1,11 @@ import json import re -import urllib.parse -from datetime import datetime -from decimal import Decimal, InvalidOperation import shapely.errors import shapely.wkt from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon, shape -def _is_valid_datetime_value(value): - value = value.strip().strip('",').lower() - - # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/date.py#L22 - patterns = [ - # Date/date-like formats - "%Y-%m-%d", - "%Y%m%d", - "%Y/%m/%d", - "%Y %m %d", - "%Y.%m.%d", - "%Y-%d-%m", # risky! - "%Y-%m", - "%Y.%m", - "%Y/%m", - "%Y %m", - "%Y", - "%Y.0", - "%d/%m/%Y", - "%d/%m/%y", - "%d-%m-%Y", - "%d-%m-%y", - "%d.%m.%Y", - "%d.%m.%y", - "%d-%b-%Y", - "%d-%b-%y", - "%d %B %Y", - "%b %d, %Y", - "%b %d, %y", - "%b-%y", - "%B %Y", - "%m/%d/%Y", # risky! - # Datetime formats - "%Y-%m-%dT%H:%M:%S.000Z", - "%Y-%m-%dT%H:%M:%S.000", - "%Y-%m-%dT%H:%M:%S.%fZ", - "%Y-%m-%dT%H:%M:%S.%f%z", - "%Y-%m-%dT%H:%M:%S.%f", - "%Y-%m-%dT%H:%M:%SZ", - "%Y-%m-%dT%H:%M:%S", - "%Y/%m/%d %H:%M:%S%z", - "%Y/%m/%d %H:%M:%S+00", - "%Y/%m/%d %H:%M:%S", - "%Y/%m/%d %H:%M", - "%Y/%m/%dT%H:%M:%S", - "%Y/%m/%dT%H:%M:%S.000Z", - "%Y/%m/%dT%H:%M:%S.000", - "%Y/%m/%dT%H:%M:%S.%fZ", - "%Y/%m/%dT%H:%M:%S.%f%z", - "%Y/%m/%dT%H:%M:%S.%f", - "%Y/%m/%dT%H:%M:%SZ", - "%Y-%m-%d %H:%M:%S", - "%d/%m/%Y %H:%M:%S", - "%d/%m/%Y %H:%M", - ] - - # Handle fractional seconds with extra precision. - if "." in value and "Z" in value: - parts = value.replace("Z", "").split(".") - if len(parts) == 2 and len(parts[1]) > 6: - value = parts[0] + "." + parts[1][:6] + "Z" - elif "." in value and "+" in value: - parts = value.split("+") - base_part = parts[0] - tz_part = "+" + parts[1] - if "." in base_part: - date_time, frac = base_part.rsplit(".", 1) - if len(frac) > 6: - frac = frac[:6] - value = date_time + "." + frac + tz_part - - for pattern in patterns: - try: - datetime.strptime(value, pattern) - return True - except ValueError: - continue - - # Try unix timestamp - try: - float_val = float(value) - return -62135596800 < float_val < 253402300800 # Year 1 to 9999 - except ValueError: - pass - - return False - - -def _is_valid_integer_value(value): - try: - num = float(value) - return num == int(num) - except (ValueError, OverflowError): - return False - - -def _is_valid_decimal_value(value): - try: - Decimal(value) - return True - except (InvalidOperation, ValueError): - return False - - -def _is_valid_flag_value(value): - value = value.strip().lower() - - lookup = { - "y": "yes", - "n": "no", - "true": "yes", - "false": "no", - } - - normalized = lookup.get(value, value) - return normalized in {"", "yes", "no"} - - -def _is_valid_json_value(value): - try: - json.loads(value) - return True - except json.JSONDecodeError: - return False - - -def _is_valid_reference_value(value): - return bool(value.strip()) and not any(ch.isspace() for ch in value) - - -def _is_valid_curie_value(value): - return bool(re.fullmatch(r"[a-z0-9-]+:[^\s:][^\s]*", value)) - - -def _is_valid_curie_list_value(value): - text = (value or "").strip() - if not text: - return False - - parts = [part.strip() for part in text.split(";")] - if any(not part for part in parts): - return False - - curie_re = re.compile(r"[a-z0-9-]+:[^\s:][^\s]*") - return all(bool(curie_re.fullmatch(part)) for part in parts) - - -def _is_valid_address_value(value): - if not value or not value.strip(): - return False - - value = value.strip() - - # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/address.py#L10 - value = ", ".join(value.split("\n")) - value = value.replace(";", ",") - - comma_re = re.compile(r",\s*,+") - value = comma_re.sub(", ", value) - value = value.strip(", ") - - hyphen_re = re.compile(r"\s*-\s*") - value = hyphen_re.sub("-", value) - - value = " ".join(value.split()).replace('"', "") - - return bool(value.strip()) - - -def _is_valid_url_value(value): - candidate = (value or "").strip().strip("'") - parsed = urllib.parse.urlparse(candidate) - return bool(parsed.scheme and parsed.netloc) - - -def _is_valid_hash_value(value): - if ":" in value: - _, digest = value.split(":", 1) - else: - digest = value - return bool(re.fullmatch(r"[0-9a-fA-F]+", digest)) - - def _is_valid_pattern_value(value): try: re.compile(value) @@ -200,22 +14,6 @@ def _is_valid_pattern_value(value): return False -def _is_valid_latitude_value(value): - try: - numeric = float(value) - except ValueError: - return False - return -90 <= numeric <= 90 - - -def _is_valid_longitude_value(value): - try: - numeric = float(value) - except ValueError: - return False - return -180 <= numeric <= 180 - - def _is_valid_multipolygon_value(value): candidate = (value or "").strip() if not candidate: diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index 2a9592f9..ad064300 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -665,8 +665,9 @@ def test_check_values_have_the_correct_datatype_passes(tmp_path): "enabled": "flag", } + conn = duckdb.connect() passed, message, details = check_values_have_the_correct_datatype( - file_path, field_datatype + conn, file_path, field_datatype ) assert passed is True @@ -689,8 +690,10 @@ def test_check_values_have_the_correct_datatype_fails(tmp_path): "enabled": "flag", } + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( - file_path, field_datatype + conn, file_path, field_datatype ) assert passed is False @@ -719,9 +722,9 @@ def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path): "entity": "reference", "count": "integer", } - + conn = duckdb.connect() passed, message, details = check_values_have_the_correct_datatype( - file_path, field_datatype + conn, file_path, field_datatype ) assert passed is True @@ -740,9 +743,9 @@ def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path): "entity": "reference", "count": "integer", } - + conn = duckdb.connect() passed, message, details = check_values_have_the_correct_datatype( - file_path, field_datatype + conn, file_path, field_datatype ) assert passed is True @@ -760,9 +763,9 @@ def test_check_values_have_the_correct_datatype_empty_file(tmp_path): "entity": "reference", "count": "integer", } - + conn = duckdb.connect() passed, message, details = check_values_have_the_correct_datatype( - file_path, field_datatype + conn, file_path, field_datatype ) assert passed is True @@ -781,10 +784,340 @@ def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path): "name": "string", "description": "string", } - + conn = duckdb.connect() passed, message, details = check_values_have_the_correct_datatype( - file_path, field_datatype + conn, file_path, field_datatype ) assert passed is True assert details["invalid_rows"] == [] + + +def test_check_values_have_the_correct_datatype_decimal(tmp_path): + """Test decimal datatype validation with both valid and invalid values.""" + file_path = tmp_path / "decimal_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["price"]) + writer.writerow(["100.50"]) + writer.writerow(["0.99"]) + writer.writerow(["999.999"]) + writer.writerow(["not-a-decimal"]) + writer.writerow(["12abc"]) + + field_datatype = {"price": "decimal"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + assert any(r["value"] == "not-a-decimal" for r in details["invalid_rows"]) + assert any(r["value"] == "12abc" for r in details["invalid_rows"]) + + +def test_check_values_have_the_correct_datatype_latitude_longitude(tmp_path): + """Test latitude and longitude datatype validation with valid and invalid values.""" + file_path = tmp_path / "coordinates.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["latitude", "longitude"]) + writer.writerow(["0", "0"]) + writer.writerow(["51.5074", "-0.1278"]) + writer.writerow(["-33.8688", "151.2093"]) + writer.writerow(["90", "180"]) + writer.writerow(["91", "0"]) + writer.writerow(["0", "181"]) + + field_datatype = { + "latitude": "latitude", + "longitude": "longitude", + } + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_flag(tmp_path): + """Test flag datatype validation with valid and invalid values.""" + file_path = tmp_path / "flag_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["active"]) + writer.writerow(["true"]) + writer.writerow(["false"]) + writer.writerow(["y"]) + writer.writerow(["n"]) + writer.writerow(["yes"]) + writer.writerow(["no"]) + writer.writerow(["maybe"]) + writer.writerow(["1"]) + + field_datatype = {"active": "flag"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 4 + + +def test_check_values_have_the_correct_datatype_hash(tmp_path): + """Test hash datatype validation with valid and invalid values.""" + file_path = tmp_path / "hash_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["content_hash"]) + writer.writerow(["abcdef123456"]) + writer.writerow(["abc:1234567890abcdef"]) + writer.writerow(["sha:5d41402abc4b2a76b9719d911017c592"]) + writer.writerow(["not-a-hash"]) + writer.writerow(["xyz:notahex"]) + + field_datatype = {"content_hash": "hash"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_curie(tmp_path): + """Test curie datatype validation with valid and invalid values.""" + file_path = tmp_path / "curie_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["identifier"]) + writer.writerow(["prefix:value"]) + writer.writerow(["org:entity123"]) + writer.writerow(["schema:name"]) + writer.writerow(["prefix:"]) + writer.writerow(["no_colon"]) + writer.writerow(["prefix: space"]) + + field_datatype = {"identifier": "curie"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 3 + + +def test_check_values_have_the_correct_datatype_curie_list(tmp_path): + """Test curie-list datatype validation with valid and invalid values.""" + file_path = tmp_path / "curie_list_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["identifiers"]) + writer.writerow(["prefix:value1;org:value2"]) + writer.writerow(["schema:name"]) + writer.writerow([""]) + writer.writerow(["not-valid"]) + writer.writerow(["prefix: value"]) + + field_datatype = {"identifiers": "curie-list"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_json(tmp_path): + """Test json datatype validation with valid and invalid JSON.""" + file_path = tmp_path / "json_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["data"]) + writer.writerow(['{"key":"value"}']) + writer.writerow(['{"nested":{"field":"value"}}']) + writer.writerow(["not json"]) # Invalid + writer.writerow(['{"incomplete":']) # Invalid (malformed) + + field_datatype = {"data": "json"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_url(tmp_path): + """Test url datatype validation with valid and invalid URLs.""" + file_path = tmp_path / "url_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["website"]) + writer.writerow(["https://example.com"]) + writer.writerow(["http://test.org"]) + writer.writerow(["ftp://files.example.com"]) + writer.writerow(["not a url"]) # Invalid (no scheme) + writer.writerow(["example.com"]) # Invalid (no scheme) + + field_datatype = {"website": "url"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_date(tmp_path): + """Test date datatype validation with valid and invalid dates.""" + file_path = tmp_path / "date_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["start_date"]) + writer.writerow(["2024-01-15"]) + writer.writerow(["2023-12-31"]) + writer.writerow(["2022-06-30"]) + writer.writerow(["not-a-date"]) + writer.writerow(["2024-13-01"]) + + field_datatype = {"start_date": "date"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_datetime(tmp_path): + """Test datetime datatype validation with valid and invalid datetimes.""" + file_path = tmp_path / "datetime_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["timestamp"]) + writer.writerow(["2024-01-15T10:30:45"]) + writer.writerow(["2023-12-31T23:59:59Z"]) + writer.writerow(["2022-06-30T12:00:00+00:00"]) + writer.writerow(["not-a-datetime"]) + writer.writerow(["2024-13-01T10:00:00"]) + + field_datatype = {"timestamp": "datetime"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_pattern(tmp_path): + """Test pattern datatype validation with valid and invalid regex patterns.""" + file_path = tmp_path / "pattern_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["regex"]) + writer.writerow(["^[A-Z]+$"]) + writer.writerow(["\\d{3}-\\d{4}"]) + writer.writerow(["(foo|bar)"]) + writer.writerow(["["]) + writer.writerow(["(unclosed"]) + + field_datatype = {"regex": "pattern"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_point(tmp_path): + """Test point datatype validation (WKT format) with valid and invalid values.""" + file_path = tmp_path / "point_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["geometry"]) + writer.writerow(["POINT(0 0)"]) + writer.writerow(["POINT(51.5074 -0.1278)"]) + writer.writerow(["POINT(-33.8688 151.2093)"]) + writer.writerow(["not wkt"]) + writer.writerow(["POINT(0)"]) + + field_datatype = {"geometry": "point"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_multipolygon(tmp_path): + """Test multipolygon datatype validation (WKT format) with valid and invalid values.""" + file_path = tmp_path / "multipolygon_values.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["boundary"]) + writer.writerow(["POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))"]) + writer.writerow( + [ + "MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)), ((20 20, 30 20, 30 30, 20 30, 20 20)))" + ] + ) + writer.writerow(["not wkt"]) # Invalid + writer.writerow(["POINT(0 0)"]) # Invalid (not a polygon/multipolygon) + + field_datatype = {"boundary": "multipolygon"} + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 2 + + +def test_check_values_have_the_correct_datatype_mixed_types(tmp_path): + """Test validation with multiple different datatypes in one file.""" + file_path = tmp_path / "mixed_datatypes.csv" + with open(file_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["id", "price", "active", "latitude", "url", "date"]) + writer.writerow( + ["org-001", "99.99", "true", "51.5074", "https://example.com", "2024-01-15"] + ) + writer.writerow( + ["org-002", "150.50", "false", "-33.8688", "https://test.org", "2023-12-31"] + ) + writer.writerow( + ["org 003", "invalid", "maybe", "91", "not-a-url", "not-a-date"] + ) + + field_datatype = { + "price": "decimal", + "active": "flag", + "latitude": "latitude", + "url": "url", + "date": "date", + } + conn = duckdb.connect() + passed, message, details = check_values_have_the_correct_datatype( + conn, file_path, field_datatype + ) + + assert passed is False + assert len(details["invalid_rows"]) == 5 From f156b8152d93dd9227e2c97d902d1b8321fb9a39 Mon Sep 17 00:00:00 2001 From: Gibah Joseph Date: Wed, 1 Apr 2026 14:05:23 +0100 Subject: [PATCH 12/12] Refactor CSV datatype validation functions to individual expectations - Replaced the previous `check_values_have_the_correct_datatype` function with specific functions for each datatype (e.g., `expect_column_to_be_integer`, `expect_column_to_be_decimal`, etc.). - Each new function performs validation for a specific datatype and returns detailed results including invalid rows. - Updated integration tests to reflect the new validation functions and ensure they cover various scenarios for each datatype. - Removed unused pattern validation function from `datatype_validators.py`. --- digital_land/expectations/checkpoints/csv.py | 32 +- digital_land/expectations/operations/csv.py | 517 ++++++++++++++---- .../operations/datatype_validators.py | 9 - .../expectations/operations/test_csv.py | 476 +++++----------- 4 files changed, 580 insertions(+), 454 deletions(-) diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py index 5b33a771..b69ead33 100644 --- a/digital_land/expectations/checkpoints/csv.py +++ b/digital_land/expectations/checkpoints/csv.py @@ -13,7 +13,21 @@ check_field_is_within_range_by_dataset_org, check_allowed_values, check_no_blank_rows, - check_values_have_the_correct_datatype, + expect_column_to_be_integer, + expect_column_to_be_decimal, + expect_column_to_be_flag, + expect_column_to_be_latitude, + expect_column_to_be_longitude, + expect_column_to_be_hash, + expect_column_to_be_curie, + expect_column_to_be_curie_list, + expect_column_to_be_json, + expect_column_to_be_url, + expect_column_to_be_date, + expect_column_to_be_datetime, + expect_column_to_be_pattern, + expect_column_to_be_multipolygon, + expect_column_to_be_point, ) @@ -33,7 +47,21 @@ def operation_factory(self, operation_string: str): "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org, "check_allowed_values": check_allowed_values, "check_no_blank_rows": check_no_blank_rows, - "check_values_have_the_correct_datatype": check_values_have_the_correct_datatype, + "expect_column_to_be_integer": expect_column_to_be_integer, + "expect_column_to_be_decimal": expect_column_to_be_decimal, + "expect_column_to_be_flag": expect_column_to_be_flag, + "expect_column_to_be_latitude": expect_column_to_be_latitude, + "expect_column_to_be_longitude": expect_column_to_be_longitude, + "expect_column_to_be_hash": expect_column_to_be_hash, + "expect_column_to_be_curie": expect_column_to_be_curie, + "expect_column_to_be_curie_list": expect_column_to_be_curie_list, + "expect_column_to_be_json": expect_column_to_be_json, + "expect_column_to_be_url": expect_column_to_be_url, + "expect_column_to_be_date": expect_column_to_be_date, + "expect_column_to_be_datetime": expect_column_to_be_datetime, + "expect_column_to_be_pattern": expect_column_to_be_pattern, + "expect_column_to_be_multipolygon": expect_column_to_be_multipolygon, + "expect_column_to_be_point": expect_column_to_be_point, } if operation_string not in operation_map: raise ValueError( diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py index 30ea7a01..72f1d2e4 100644 --- a/digital_land/expectations/operations/csv.py +++ b/digital_land/expectations/operations/csv.py @@ -1,9 +1,9 @@ from pathlib import Path +import re import pandas as pd from digital_land.expectations.operations.datatype_validators import ( _is_valid_multipolygon_value, - _is_valid_pattern_value, _is_valid_point_value, ) @@ -620,135 +620,414 @@ def check_field_is_within_range_by_dataset_org( return passed, message, details -def check_values_have_the_correct_datatype(conn, file_path, field_datatype): - """ - Validates that CSV column values have correct datatypes. +def expect_column_to_be_integer(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND NOT ( + TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) IS NOT NULL + AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) = TRY_CAST(TRIM(COALESCE("{field}", '')) AS BIGINT) + ) + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "integer", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'integer'" + if passed + else f"there were {len(invalid_rows)} invalid 'integer' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} - Uses DuckDB queries for datatypes: integer, decimal, flag, latitude, longitude, hash, curie, curie-list, json, url, date, datetime. - Uses Python validators for complex datatypes: pattern, multipolygon, point. +def expect_column_to_be_decimal(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS DECIMAL) IS NULL + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "decimal", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'decimal'" + if passed + else f"there were {len(invalid_rows)} invalid 'decimal' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_flag(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND LOWER(TRIM(COALESCE("{field}", ''))) NOT IN ('yes', 'no', 'true', 'false') + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "flag", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'flag'" + if passed + else f"there were {len(invalid_rows)} invalid 'flag' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_latitude(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND ( + TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) IS NULL + OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) < -90 + OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) > 90 + ) + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "latitude", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'latitude'" + if passed + else f"there were {len(invalid_rows)} invalid 'latitude' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} - Args: - file_path: path to the CSV file to validate - field_datatype: dict mapping column name to datatype string - """ - def _get_sql_validation_condition(datatype: str, field_name: str) -> str: - field_ref = f"TRIM(COALESCE(\"{field_name}\", ''))" - - conditions = { - "integer": f"{field_ref} != '' AND NOT (TRY_CAST({field_ref} AS DOUBLE) IS NOT NULL AND TRY_CAST({field_ref} AS DOUBLE) = TRY_CAST({field_ref} AS BIGINT))", - "decimal": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DECIMAL) IS NULL", - "flag": f"{field_ref} != '' AND LOWER({field_ref}) NOT IN ('yes', 'no', 'true', 'false')", - "latitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -90 OR TRY_CAST({field_ref} AS DOUBLE) > 90)", - "longitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -180 OR TRY_CAST({field_ref} AS DOUBLE) > 180)", - "hash": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z]+:)?[0-9a-fA-F]+$'))", - "curie": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-z0-9-]+:[^\\s:][^\\s]*$'))", - "curie-list": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z0-9-]+:[^\\s:][^\\s]*(;[a-z0-9-]+:[^\\s:][^\\s]*)*)?$'))", - "json": f"{field_ref} != '' AND TRY(json_extract({field_ref}, '$')) IS NULL", - "url": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-zA-Z][a-zA-Z0-9+.-]*://[^\\s/:?#]+(?::[0-9]+)?(?:[/?#][^\\s]*)?$'))", - "date": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DATE) IS NULL", - "datetime": f"{field_ref} != '' AND TRY_CAST({field_ref} AS TIMESTAMP) IS NULL", +def expect_column_to_be_longitude(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND ( + TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) IS NULL + OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) < -180 + OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) > 180 + ) + """ + ).fetchall() + invalid_rows = [ + { + "line_number": row[0], + "field": field, + "datatype": "longitude", + "value": row[1], } + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'longitude'" + if passed + else f"there were {len(invalid_rows)} invalid 'longitude' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} - return conditions.get(datatype, "FALSE") - # Python validators for complex datatypes that can't be easily expressed in SQL - python_validators = { - "pattern": _is_valid_pattern_value, - "multipolygon": _is_valid_multipolygon_value, - "point": _is_valid_point_value, - } +def expect_column_to_be_hash(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^([a-z]+:)?[0-9a-fA-F]+$')) + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "hash", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'hash'" + if passed + else f"there were {len(invalid_rows)} invalid 'hash' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} - sql_validators = { - "integer", - "decimal", - "flag", - "latitude", - "longitude", - "hash", - "curie", - "curie-list", - "json", - "url", - "date", - "datetime", - } - fields_for_sql = [] - fields_for_python = [] - - for field in field_datatype: - datatype = field_datatype.get(field) - if datatype in sql_validators: - fields_for_sql.append((field, datatype)) - elif datatype in python_validators: - fields_for_python.append((field, datatype, python_validators[datatype])) - - invalid_values = [] - - # SQL validation: query invalid rows for each field - if fields_for_sql: - for field, datatype in fields_for_sql: - condition = _get_sql_validation_condition(datatype, field) - - result = conn.execute( - f""" - WITH source_rows AS ( - SELECT - ROW_NUMBER() OVER () + 1 AS line_number, - * - FROM {_read_csv(file_path)} +def expect_column_to_be_curie(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^[a-z0-9-]+:[^\\s:][^\\s]*$')) + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "curie", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'curie'" + if passed + else f"there were {len(invalid_rows)} invalid 'curie' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_curie_list(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^([a-z0-9-]+:[^\\s:][^\\s]*(;[a-z0-9-]+:[^\\s:][^\\s]*)*)?$')) + """ + ).fetchall() + invalid_rows = [ + { + "line_number": row[0], + "field": field, + "datatype": "curie-list", + "value": row[1], + } + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'curie-list'" + if passed + else f"there were {len(invalid_rows)} invalid 'curie-list' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_json(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND TRY(json_extract(TRIM(COALESCE("{field}", '')), '$')) IS NULL + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "json", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'json'" + if passed + else f"there were {len(invalid_rows)} invalid 'json' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_url(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^[a-zA-Z][a-zA-Z0-9+.-]*://[^\\s/:?#]+(?::[0-9]+)?(?:[/?#][^\\s]*)?$')) + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "url", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'url'" + if passed + else f"there were {len(invalid_rows)} invalid 'url' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_date(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS DATE) IS NULL + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "date", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'date'" + if passed + else f"there were {len(invalid_rows)} invalid 'date' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_datetime(conn, file_path: Path, field: str): + result = conn.execute( + f""" + WITH source_rows AS ( + SELECT ROW_NUMBER() OVER () + 1 AS line_number, * + FROM {_read_csv(file_path)} + ) + SELECT line_number, TRIM(COALESCE("{field}", '')) AS value + FROM source_rows + WHERE TRIM(COALESCE("{field}", '')) != '' + AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS TIMESTAMP) IS NULL + """ + ).fetchall() + invalid_rows = [ + {"line_number": row[0], "field": field, "datatype": "datetime", "value": row[1]} + for row in result + ] + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'datetime'" + if passed + else f"there were {len(invalid_rows)} invalid 'datetime' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_pattern(conn, file_path: Path, field: str): + invalid_rows = [] + df = pd.read_csv(file_path, dtype=str, keep_default_na=False) + if not df.empty and len(df.columns) > 0 and field in df.columns: + for line_number, (_, row) in enumerate(df.iterrows(), start=2): + value = str(row.get(field, "")).strip() + if not value: + continue + try: + re.compile(value) + except re.error: + invalid_rows.append( + { + "line_number": line_number, + "field": field, + "datatype": "pattern", + "value": value, + } ) - SELECT - line_number, - TRIM(COALESCE("{field}", '')) AS value - FROM source_rows - WHERE {condition} - """ - ).fetchall() - - for row in result: - invalid_values.append( + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'pattern'" + if passed + else f"there were {len(invalid_rows)} invalid 'pattern' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_multipolygon(conn, file_path: Path, field: str): + invalid_rows = [] + df = pd.read_csv(file_path, dtype=str, keep_default_na=False) + if not df.empty and len(df.columns) > 0 and field in df.columns: + for line_number, (_, row) in enumerate(df.iterrows(), start=2): + value = str(row.get(field, "")).strip() + if not value: + continue + if not _is_valid_multipolygon_value(value): + invalid_rows.append( { - "line_number": row[0], + "line_number": line_number, "field": field, - "datatype": datatype, - "value": row[1], + "datatype": "multipolygon", + "value": value, } ) - - if fields_for_python: - df = pd.read_csv(file_path, dtype=str, keep_default_na=False) - - if df.empty or len(df.columns) == 0: - pass - else: - for line_number, (_, row) in enumerate(df.iterrows(), start=2): - for field, datatype, validator in fields_for_python: - if field not in df.columns: - continue - value = str(row.get(field, "")).strip() - if not value: - continue - - if not validator(value): - invalid_values.append( - { - "line_number": line_number, - "field": field, - "datatype": datatype, - "value": value, - } - ) - - if len(invalid_values) == 0: - passed = True - message = "all values have valid datatypes" - details = {"invalid_rows": []} - else: - passed = False - message = f"there were {len(invalid_values)} invalid datatype value(s) found" - details = {"invalid_rows": invalid_values} - - return passed, message, details + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'multipolygon'" + if passed + else f"there were {len(invalid_rows)} invalid 'multipolygon' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} + + +def expect_column_to_be_point(conn, file_path: Path, field: str): + invalid_rows = [] + df = pd.read_csv(file_path, dtype=str, keep_default_na=False) + if not df.empty and len(df.columns) > 0 and field in df.columns: + for line_number, (_, row) in enumerate(df.iterrows(), start=2): + value = str(row.get(field, "")).strip() + if not value: + continue + if not _is_valid_point_value(value): + invalid_rows.append( + { + "line_number": line_number, + "field": field, + "datatype": "point", + "value": value, + } + ) + passed = len(invalid_rows) == 0 + message = ( + f"all values in '{field}' have datatype 'point'" + if passed + else f"there were {len(invalid_rows)} invalid 'point' value(s) in '{field}'" + ) + return passed, message, {"invalid_rows": invalid_rows} diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py index 3c04dc6e..ce9eedd2 100644 --- a/digital_land/expectations/operations/datatype_validators.py +++ b/digital_land/expectations/operations/datatype_validators.py @@ -1,19 +1,10 @@ import json -import re import shapely.errors import shapely.wkt from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon, shape -def _is_valid_pattern_value(value): - try: - re.compile(value) - return True - except re.error: - return False - - def _is_valid_multipolygon_value(value): candidate = (value or "").strip() if not candidate: diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py index ad064300..166b5140 100644 --- a/tests/integration/expectations/operations/test_csv.py +++ b/tests/integration/expectations/operations/test_csv.py @@ -11,7 +11,21 @@ check_no_blank_rows, check_fields_are_within_range, check_field_is_within_range_by_dataset_org, - check_values_have_the_correct_datatype, + expect_column_to_be_decimal, + expect_column_to_be_flag, + expect_column_to_be_latitude, + expect_column_to_be_longitude, + expect_column_to_be_hash, + expect_column_to_be_curie, + expect_column_to_be_curie_list, + expect_column_to_be_json, + expect_column_to_be_url, + expect_column_to_be_date, + expect_column_to_be_datetime, + expect_column_to_be_pattern, + expect_column_to_be_point, + expect_column_to_be_integer, + expect_column_to_be_multipolygon, ) @@ -650,474 +664,288 @@ def test_check_field_is_within_ranges_for_only_staus_301(tmp_path): assert details["invalid_rows"][0]["value"] == 250 -def test_check_values_have_the_correct_datatype_passes(tmp_path): - """Test datatype validation with all valid values.""" - file_path = tmp_path / "valid_datatypes.csv" +def test_expect_column_to_be_integer(tmp_path): + file_path = tmp_path / "expect_integer.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity", "count", "enabled"]) - writer.writerow(["entity-1", "100", "true"]) - writer.writerow(["entity-2", "200", "false"]) - - field_datatype = { - "entity": "reference", - "count": "integer", - "enabled": "flag", - } + writer.writerow(["count"]) + writer.writerow(["10"]) + writer.writerow(["abc"]) conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) - - assert passed is True - assert details["invalid_rows"] == [] - - -def test_check_values_have_the_correct_datatype_fails(tmp_path): - """Test datatype validation with invalid values.""" - file_path = tmp_path / "invalid_datatypes.csv" - with open(file_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["entity", "count", "enabled"]) - writer.writerow(["entity-1", "100", "true"]) - writer.writerow(["entity-2", "not_a_number", "false"]) - writer.writerow(["entity-3", "300", "maybe"]) - - field_datatype = { - "entity": "reference", - "count": "integer", - "enabled": "flag", - } - - conn = duckdb.connect() - - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, message, details = expect_column_to_be_integer( + conn, file_path=file_path, field="count" ) assert passed is False - assert len(details["invalid_rows"]) == 2 - assert details["invalid_rows"][0]["line_number"] == 3 - assert details["invalid_rows"][0]["field"] == "count" - assert details["invalid_rows"][0]["value"] == "not_a_number" + assert len(details["invalid_rows"]) == 1 assert details["invalid_rows"][0]["datatype"] == "integer" - assert details["invalid_rows"][1]["line_number"] == 4 - assert details["invalid_rows"][1]["field"] == "enabled" - assert details["invalid_rows"][1]["value"] == "maybe" - assert "invalid datatype value(s)" in message - - -def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path): - """Test that empty values are skipped during validation.""" - file_path = tmp_path / "with_empty_values.csv" - with open(file_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["entity", "count"]) - writer.writerow(["entity-1", "100"]) - writer.writerow(["entity-2", ""]) - writer.writerow(["entity-3", "300"]) - - field_datatype = { - "entity": "reference", - "count": "integer", - } - conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) - - assert passed is True - assert details["invalid_rows"] == [] - - -def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path): - """Test that fields not in field_datatype map are not validated.""" - file_path = tmp_path / "unmapped_fields.csv" - with open(file_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["entity", "count", "description"]) - writer.writerow(["entity-1", "100", "invalid_but_ignored"]) - - field_datatype = { - "entity": "reference", - "count": "integer", - } - conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) + assert "invalid 'integer'" in message - assert passed is True - assert details["invalid_rows"] == [] - -def test_check_values_have_the_correct_datatype_empty_file(tmp_path): - """Test behavior with empty CSV file.""" - file_path = tmp_path / "empty.csv" +def test_expect_column_to_be_multipolygon(tmp_path): + file_path = tmp_path / "expect_multipolygon.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["entity", "count"]) + writer.writerow(["boundary"]) + writer.writerow(["POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))"]) + writer.writerow(["POINT(0 0)"]) - field_datatype = { - "entity": "reference", - "count": "integer", - } conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_multipolygon( + conn, file_path=file_path, field="boundary" ) - assert passed is True - assert details["invalid_rows"] == [] + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "multipolygon" -def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path): - """Test when no fields have datatype validators.""" - file_path = tmp_path / "no_applicable.csv" +def test_expect_column_to_be_decimal(tmp_path): + file_path = tmp_path / "expect_decimal.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["name", "description"]) - writer.writerow(["field1", "some value"]) + writer.writerow(["price"]) + writer.writerow(["1.2"]) + writer.writerow(["10"]) + writer.writerow(["0.01"]) + writer.writerow(["bad"]) - field_datatype = { - "name": "string", - "description": "string", - } conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_decimal( + conn, file_path=file_path, field="price" ) - assert passed is True - assert details["invalid_rows"] == [] + assert passed is False + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "decimal" -def test_check_values_have_the_correct_datatype_decimal(tmp_path): - """Test decimal datatype validation with both valid and invalid values.""" - file_path = tmp_path / "decimal_values.csv" +def test_expect_column_to_be_flag(tmp_path): + file_path = tmp_path / "expect_flag.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["price"]) - writer.writerow(["100.50"]) - writer.writerow(["0.99"]) - writer.writerow(["999.999"]) - writer.writerow(["not-a-decimal"]) - writer.writerow(["12abc"]) + writer.writerow(["active"]) + writer.writerow(["true"]) + writer.writerow(["false"]) + writer.writerow(["yes"]) + writer.writerow(["no"]) + writer.writerow(["maybe"]) - field_datatype = {"price": "decimal"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_flag( + conn, file_path=file_path, field="active" ) assert passed is False - assert len(details["invalid_rows"]) == 2 - assert any(r["value"] == "not-a-decimal" for r in details["invalid_rows"]) - assert any(r["value"] == "12abc" for r in details["invalid_rows"]) + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "flag" -def test_check_values_have_the_correct_datatype_latitude_longitude(tmp_path): - """Test latitude and longitude datatype validation with valid and invalid values.""" - file_path = tmp_path / "coordinates.csv" +def test_expect_column_to_be_latitude(tmp_path): + file_path = tmp_path / "expect_latitude.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["latitude", "longitude"]) - writer.writerow(["0", "0"]) - writer.writerow(["51.5074", "-0.1278"]) - writer.writerow(["-33.8688", "151.2093"]) - writer.writerow(["90", "180"]) - writer.writerow(["91", "0"]) - writer.writerow(["0", "181"]) - - field_datatype = { - "latitude": "latitude", - "longitude": "longitude", - } + writer.writerow(["lat"]) + writer.writerow(["0"]) + writer.writerow(["51.5"]) + writer.writerow(["-90"]) + writer.writerow(["90"]) + writer.writerow(["91"]) + conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_latitude( + conn, file_path=file_path, field="lat" ) assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "latitude" -def test_check_values_have_the_correct_datatype_flag(tmp_path): - """Test flag datatype validation with valid and invalid values.""" - file_path = tmp_path / "flag_values.csv" +def test_expect_column_to_be_longitude(tmp_path): + file_path = tmp_path / "expect_longitude.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["active"]) - writer.writerow(["true"]) - writer.writerow(["false"]) - writer.writerow(["y"]) - writer.writerow(["n"]) - writer.writerow(["yes"]) - writer.writerow(["no"]) - writer.writerow(["maybe"]) - writer.writerow(["1"]) + writer.writerow(["lon"]) + writer.writerow(["0"]) + writer.writerow(["-0.1"]) + writer.writerow(["-180"]) + writer.writerow(["180"]) + writer.writerow(["181"]) - field_datatype = {"active": "flag"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_longitude( + conn, file_path=file_path, field="lon" ) assert passed is False - assert len(details["invalid_rows"]) == 4 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "longitude" -def test_check_values_have_the_correct_datatype_hash(tmp_path): - """Test hash datatype validation with valid and invalid values.""" - file_path = tmp_path / "hash_values.csv" +def test_expect_column_to_be_hash(tmp_path): + file_path = tmp_path / "expect_hash.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["content_hash"]) - writer.writerow(["abcdef123456"]) - writer.writerow(["abc:1234567890abcdef"]) + writer.writerow(["hash"]) + writer.writerow(["abcdef"]) writer.writerow(["sha:5d41402abc4b2a76b9719d911017c592"]) - writer.writerow(["not-a-hash"]) writer.writerow(["xyz:notahex"]) - field_datatype = {"content_hash": "hash"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_hash( + conn, file_path=file_path, field="hash" ) assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "hash" -def test_check_values_have_the_correct_datatype_curie(tmp_path): - """Test curie datatype validation with valid and invalid values.""" - file_path = tmp_path / "curie_values.csv" +def test_expect_column_to_be_curie(tmp_path): + file_path = tmp_path / "expect_curie.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["identifier"]) + writer.writerow(["id"]) writer.writerow(["prefix:value"]) writer.writerow(["org:entity123"]) - writer.writerow(["schema:name"]) - writer.writerow(["prefix:"]) writer.writerow(["no_colon"]) - writer.writerow(["prefix: space"]) - field_datatype = {"identifier": "curie"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_curie( + conn, file_path=file_path, field="id" ) assert passed is False - assert len(details["invalid_rows"]) == 3 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "curie" -def test_check_values_have_the_correct_datatype_curie_list(tmp_path): - """Test curie-list datatype validation with valid and invalid values.""" - file_path = tmp_path / "curie_list_values.csv" +def test_expect_column_to_be_curie_list(tmp_path): + file_path = tmp_path / "expect_curie_list.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["identifiers"]) - writer.writerow(["prefix:value1;org:value2"]) + writer.writerow(["ids"]) + writer.writerow(["prefix:a;org:b"]) writer.writerow(["schema:name"]) - writer.writerow([""]) writer.writerow(["not-valid"]) - writer.writerow(["prefix: value"]) - field_datatype = {"identifiers": "curie-list"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_curie_list( + conn, file_path=file_path, field="ids" ) assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "curie-list" -def test_check_values_have_the_correct_datatype_json(tmp_path): - """Test json datatype validation with valid and invalid JSON.""" - file_path = tmp_path / "json_values.csv" +def test_expect_column_to_be_json(tmp_path): + file_path = tmp_path / "expect_json.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["data"]) - writer.writerow(['{"key":"value"}']) - writer.writerow(['{"nested":{"field":"value"}}']) - writer.writerow(["not json"]) # Invalid - writer.writerow(['{"incomplete":']) # Invalid (malformed) + writer.writerow(["payload"]) + writer.writerow(['{"a":1}']) + writer.writerow(["[1,2,3]"]) + writer.writerow(["not json"]) - field_datatype = {"data": "json"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_json( + conn, file_path=file_path, field="payload" ) assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "json" -def test_check_values_have_the_correct_datatype_url(tmp_path): - """Test url datatype validation with valid and invalid URLs.""" - file_path = tmp_path / "url_values.csv" +def test_expect_column_to_be_url(tmp_path): + file_path = tmp_path / "expect_url.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["website"]) + writer.writerow(["url"]) writer.writerow(["https://example.com"]) writer.writerow(["http://test.org"]) - writer.writerow(["ftp://files.example.com"]) - writer.writerow(["not a url"]) # Invalid (no scheme) - writer.writerow(["example.com"]) # Invalid (no scheme) + writer.writerow(["example.com"]) - field_datatype = {"website": "url"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) + passed, _, details = expect_column_to_be_url(conn, file_path=file_path, field="url") assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "url" -def test_check_values_have_the_correct_datatype_date(tmp_path): - """Test date datatype validation with valid and invalid dates.""" - file_path = tmp_path / "date_values.csv" +def test_expect_column_to_be_date(tmp_path): + file_path = tmp_path / "expect_date.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["start_date"]) - writer.writerow(["2024-01-15"]) + writer.writerow(["d"]) + writer.writerow(["2024-01-01"]) writer.writerow(["2023-12-31"]) - writer.writerow(["2022-06-30"]) writer.writerow(["not-a-date"]) - writer.writerow(["2024-13-01"]) - field_datatype = {"start_date": "date"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) + passed, _, details = expect_column_to_be_date(conn, file_path=file_path, field="d") assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "date" -def test_check_values_have_the_correct_datatype_datetime(tmp_path): - """Test datetime datatype validation with valid and invalid datetimes.""" - file_path = tmp_path / "datetime_values.csv" +def test_expect_column_to_be_datetime(tmp_path): + file_path = tmp_path / "expect_datetime.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["timestamp"]) - writer.writerow(["2024-01-15T10:30:45"]) + writer.writerow(["dt"]) + writer.writerow(["2024-01-01T10:00:00"]) writer.writerow(["2023-12-31T23:59:59Z"]) - writer.writerow(["2022-06-30T12:00:00+00:00"]) writer.writerow(["not-a-datetime"]) - writer.writerow(["2024-13-01T10:00:00"]) - field_datatype = {"timestamp": "datetime"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_datetime( + conn, file_path=file_path, field="dt" ) assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "datetime" -def test_check_values_have_the_correct_datatype_pattern(tmp_path): - """Test pattern datatype validation with valid and invalid regex patterns.""" - file_path = tmp_path / "pattern_values.csv" +def test_expect_column_to_be_pattern(tmp_path): + file_path = tmp_path / "expect_pattern.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) writer.writerow(["regex"]) writer.writerow(["^[A-Z]+$"]) - writer.writerow(["\\d{3}-\\d{4}"]) writer.writerow(["(foo|bar)"]) writer.writerow(["["]) - writer.writerow(["(unclosed"]) - field_datatype = {"regex": "pattern"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_pattern( + conn, file_path=file_path, field="regex" ) assert passed is False - assert len(details["invalid_rows"]) == 2 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "pattern" -def test_check_values_have_the_correct_datatype_point(tmp_path): - """Test point datatype validation (WKT format) with valid and invalid values.""" - file_path = tmp_path / "point_values.csv" +def test_expect_column_to_be_point(tmp_path): + file_path = tmp_path / "expect_point.csv" with open(file_path, "w", newline="") as f: writer = csv.writer(f) - writer.writerow(["geometry"]) + writer.writerow(["geom"]) writer.writerow(["POINT(0 0)"]) - writer.writerow(["POINT(51.5074 -0.1278)"]) - writer.writerow(["POINT(-33.8688 151.2093)"]) - writer.writerow(["not wkt"]) + writer.writerow(["POINT(1 2)"]) writer.writerow(["POINT(0)"]) - field_datatype = {"geometry": "point"} - conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) - - assert passed is False - assert len(details["invalid_rows"]) == 2 - - -def test_check_values_have_the_correct_datatype_multipolygon(tmp_path): - """Test multipolygon datatype validation (WKT format) with valid and invalid values.""" - file_path = tmp_path / "multipolygon_values.csv" - with open(file_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["boundary"]) - writer.writerow(["POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))"]) - writer.writerow( - [ - "MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)), ((20 20, 30 20, 30 30, 20 30, 20 20)))" - ] - ) - writer.writerow(["not wkt"]) # Invalid - writer.writerow(["POINT(0 0)"]) # Invalid (not a polygon/multipolygon) - - field_datatype = {"boundary": "multipolygon"} conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype + passed, _, details = expect_column_to_be_point( + conn, file_path=file_path, field="geom" ) assert passed is False - assert len(details["invalid_rows"]) == 2 - - -def test_check_values_have_the_correct_datatype_mixed_types(tmp_path): - """Test validation with multiple different datatypes in one file.""" - file_path = tmp_path / "mixed_datatypes.csv" - with open(file_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["id", "price", "active", "latitude", "url", "date"]) - writer.writerow( - ["org-001", "99.99", "true", "51.5074", "https://example.com", "2024-01-15"] - ) - writer.writerow( - ["org-002", "150.50", "false", "-33.8688", "https://test.org", "2023-12-31"] - ) - writer.writerow( - ["org 003", "invalid", "maybe", "91", "not-a-url", "not-a-date"] - ) - - field_datatype = { - "price": "decimal", - "active": "flag", - "latitude": "latitude", - "url": "url", - "date": "date", - } - conn = duckdb.connect() - passed, message, details = check_values_have_the_correct_datatype( - conn, file_path, field_datatype - ) - - assert passed is False - assert len(details["invalid_rows"]) == 5 + assert len(details["invalid_rows"]) == 1 + assert details["invalid_rows"][0]["datatype"] == "point"