From 49374a9e8fdc42f54cdbba40b5e24608424b2559 Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Thu, 26 Mar 2026 16:46:07 +0000
Subject: [PATCH 01/12] feat: add CSV validation checks for allowed values and
 organisation ranges

---
 .gitignore                                    |   2 +-
 digital_land/expectations/checkpoints/csv.py  |   4 +
 digital_land/expectations/operations/csv.py   | 129 ++++++++++++++++++
 .../expectations/operations/test_csv.py       | 100 ++++++++++++++
 4 files changed, 234 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 9cb740a8..6dda51b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__/
 *.py[cod]
-
+.history/
 .cache
 .coverage
 build
diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py
index 08e47d2d..15ebebd0 100644
--- a/digital_land/expectations/checkpoints/csv.py
+++ b/digital_land/expectations/checkpoints/csv.py
@@ -9,6 +9,8 @@
     check_unique,
     check_no_shared_values,
     check_no_overlapping_ranges,
+    check_lookup_entities_are_within_organisation_ranges,
+    check_allowed_values
 )
 
 
@@ -24,6 +26,8 @@ def operation_factory(self, operation_string: str):
             "check_unique": check_unique,
             "check_no_shared_values": check_no_shared_values,
             "check_no_overlapping_ranges": check_no_overlapping_ranges,
+            "check_lookup_entities_are_within_organisation_ranges":check_lookup_entities_are_within_organisation_ranges,
+            "check_allowed_values":check_allowed_values
         }
         if operation_string not in operation_map:
             raise ValueError(
diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 5e6267b9..ddae091e 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -157,3 +157,132 @@ def check_no_overlapping_ranges(conn, file_path: Path, min_field: str, max_field
     }
 
     return passed, message, details
+
+
+def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list):
+    """
+    Checks that a field contains only values from an allowed set.
+
+    Args:
+        conn: duckdb connection
+        file_path: path to the CSV file
+        field: the column name to validate
+        allowed_values: allowed values for the field (blank entries are ignored)
+
+    Returns:
+        (passed, message, details); invalid rows carry their CSV line number
+
+    Raises:
+        ValueError: if allowed_values contains no non-empty value
+    """
+    stripped = (str(value).strip() for value in (allowed_values or []))
+    cleaned = [value for value in stripped if value != ""]
+
+    if not cleaned:
+        raise ValueError("allowed_values must contain at least one non-empty value")
+
+    # Escape quotes only for the SQL literal so details report the real values.
+    allowed_values_sql = ",".join("'" + v.replace("'", "''") + "'" for v in cleaned)
+
+    # Number rows in a subquery: a window function in the same SELECT as the
+    # WHERE would only number the rows that survive the filter.
+    result = conn.execute(
+        f"""
+        SELECT line_number, value FROM (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number,
+                   TRIM(COALESCE("{field}", '')) AS value
+            FROM {_read_csv(file_path)}
+        ) AS numbered WHERE value NOT IN ({allowed_values_sql})
+        """
+    ).fetchall()
+
+    invalid_rows = [{"line_number": row[0], "value": row[1]} for row in result]
+    passed = not invalid_rows
+    if passed:
+        message = f"all values in '{field}' are allowed"
+    else:
+        message = f"there were {len(invalid_rows)} invalid values in '{field}'"
+
+    details = {
+        "field": field,
+        "allowed_values": sorted(set(cleaned)),
+        "invalid_values": sorted({row["value"] for row in invalid_rows}),
+        "invalid_rows": invalid_rows,
+    }
+    return passed, message, details
+
+
+def check_lookup_entities_are_within_organisation_ranges(
+    conn, file_path: Path, organisation_file: Path, ignored_organisations: list = None
+):
+    """
+    Checks that lookup entities are within any valid range from an organisation file.
+
+    Args:
+        conn: duckdb connection
+        file_path: path to the lookup CSV file
+        organisation_file: path to the entity-organisation CSV file
+        ignored_organisations: list of organisations to ignore (i.e. not check that their entities are within a valid range)
+    """
+    ignored_values = [
+        org.replace("'", "''")
+        for org in (ignored_organisations or [])
+        if isinstance(org, str) and org.strip()
+    ]
+    ignored_clause = ""
+    if ignored_values:
+        ignored_values_sql = ",".join("'" + org + "'" for org in ignored_values)
+        ignored_clause = (
+            " AND TRIM(COALESCE(\"organisation\", '')) NOT IN "
+            + f"({ignored_values_sql})"
+        )
+
+    result = conn.execute(
+        f"""
+        WITH ranges AS (
+            SELECT
+                TRY_CAST("entity-minimum" AS BIGINT) AS min_entity,
+                TRY_CAST("entity-maximum" AS BIGINT) AS max_entity
+            FROM {_read_csv(organisation_file)}
+            WHERE TRY_CAST("entity-minimum" AS BIGINT) IS NOT NULL
+              AND TRY_CAST("entity-maximum" AS BIGINT) IS NOT NULL
+        ),
+        lookup_rows AS (
+            SELECT
+                TRY_CAST("entity" AS BIGINT) AS entity,
+                TRIM(COALESCE("organisation", '')) AS organisation,
+                COALESCE("reference", '') AS reference
+            FROM {_read_csv(file_path)}
+            WHERE TRIM(COALESCE("organisation", '')) != ''
+            {ignored_clause}
+        )
+        SELECT entity, organisation, reference
+        FROM lookup_rows l
+        WHERE organisation != ''
+          AND entity IS NOT NULL
+          AND NOT EXISTS (
+              SELECT 1
+              FROM ranges r
+              WHERE l.entity BETWEEN r.min_entity AND r.max_entity
+          )
+        """
+    ).fetchall()
+
+    out_of_range_rows = [
+        {"entity": row[0], "organisation": row[1], "reference": row[2]}
+        for row in result
+    ]
+
+    if len(out_of_range_rows) == 0:
+        passed = True
+        message = "all lookup entities are within allowed ranges"
+    else:
+        passed = False
+        message = f"there were {len(out_of_range_rows)} out-of-range rows found"
+
+    details = {
+        "invalid_rows": out_of_range_rows,
+    }
+
+    return passed, message, details
+
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 77a9677c..7cd57437 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -7,6 +7,8 @@
     check_unique,
     check_no_shared_values,
     check_no_overlapping_ranges,
+    check_allowed_values,
+    check_lookup_entities_are_within_organisation_ranges,
 )
 
 
@@ -201,3 +203,101 @@ def test_check_no_overlapping_ranges_adjacent_fails(tmp_path):
     )
     assert passed is False
     assert len(details["overlaps"]) == 1
+
+
+def test_check_lookup_entities_are_within_organisation_ranges_fails(tmp_path):
+    lookup_file = tmp_path / "lookup.csv"
+    with open(lookup_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "organisation", "reference"])
+        writer.writerow(["150", "org-1", "ok-ref"])
+        writer.writerow(["999", "org-2", "bad-ref"])
+
+    organisation_file = tmp_path / "entity-organisation.csv"
+    with open(organisation_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum", "organisation"])
+        writer.writerow(["100", "200", "org-1"])
+        writer.writerow(["300", "400", "org-2"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_lookup_entities_are_within_organisation_ranges(
+        conn, file_path=lookup_file, organisation_file=organisation_file
+    )
+
+    assert passed is False
+    assert "out-of-range" in message
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["entity"] == 999
+    assert details["invalid_rows"][0]["organisation"] == "org-2"
+
+
+def test_check_lookup_entities_are_within_organisation_ranges_ignores_org(tmp_path):
+    lookup_file = tmp_path / "lookup.csv"
+    with open(lookup_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "organisation", "reference"])
+        writer.writerow(["150", "org-1", "ok-ref"])
+        writer.writerow(["999", "org-2", "ignored-ref"])
+
+    organisation_file = tmp_path / "entity-organisation.csv"
+    with open(organisation_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum", "organisation"])
+        writer.writerow(["100", "200", "org-1"])
+        writer.writerow(["300", "400", "org-2"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_lookup_entities_are_within_organisation_ranges(
+        conn,
+        file_path=lookup_file,
+        organisation_file=organisation_file,
+        ignored_organisations=["org-2"],
+    )
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_allowed_values_fails_for_old_entity_status(tmp_path):
+    file_path = tmp_path / "old-entity.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["old-entity", "status", "entity"])
+        writer.writerow(["1001", "301", "2001"])
+        writer.writerow(["1002", "410", "2002"])
+        writer.writerow(["1003", "302", "2003"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_allowed_values(
+        conn,
+        file_path=file_path,
+        field="status",
+        allowed_values=["301", "410"],
+    )
+
+    assert passed is False
+    assert "invalid values" in message
+    assert details["invalid_values"] == ["302"]
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["value"] == "302"
+
+
+def test_check_allowed_values_passes_for_old_entity_status(tmp_path):
+    file_path = tmp_path / "old-entity.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["old-entity", "status", "entity"])
+        writer.writerow(["1001", "301", "2001"])
+        writer.writerow(["1002", "410", "2002"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_allowed_values(
+        conn,
+        file_path=file_path,
+        field="status",
+        allowed_values=["301", "410"],
+    )
+
+    assert passed is True
+    assert details["invalid_rows"] == []

From 38bcf9e43de754e10cd0b053d0f2378435a84702 Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Fri, 27 Mar 2026 11:41:55 +0000
Subject: [PATCH 02/12] refactor: replace
 check_lookup_entities_are_within_organisation_ranges with
 check_field_is_within_range in CSV operations and tests

---
 digital_land/expectations/checkpoints/csv.py  |   4 +-
 digital_land/expectations/operations/csv.py   | 224 +++++++++++++-----
 .../expectations/operations/test_csv.py       | 188 +++++++++++++--
 3 files changed, 332 insertions(+), 84 deletions(-)

diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py
index 15ebebd0..50394fc8 100644
--- a/digital_land/expectations/checkpoints/csv.py
+++ b/digital_land/expectations/checkpoints/csv.py
@@ -9,7 +9,7 @@
     check_unique,
     check_no_shared_values,
     check_no_overlapping_ranges,
-    check_lookup_entities_are_within_organisation_ranges,
+    check_field_is_within_range,
     check_allowed_values
 )
 
@@ -26,7 +26,7 @@ def operation_factory(self, operation_string: str):
             "check_unique": check_unique,
             "check_no_shared_values": check_no_shared_values,
             "check_no_overlapping_ranges": check_no_overlapping_ranges,
-            "check_lookup_entities_are_within_organisation_ranges":check_lookup_entities_are_within_organisation_ranges,
+            "check_field_is_within_range":check_field_is_within_range,
             "check_allowed_values":check_allowed_values
         }
         if operation_string not in operation_map:
diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index ddae091e..d3d2a693 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -5,6 +5,41 @@ def _read_csv(file_path: Path) -> str:
     return f"read_csv_auto('{str(file_path)}',all_varchar=true,delim=',',quote='\"',escape='\"')"
 
 
+def _get_csv_columns(conn, file_path: Path) -> list:
+    """Get column names from CSV file."""
+    return [col[0] for col in conn.execute(
+        f"SELECT * FROM {_read_csv(file_path)} LIMIT 0"
+    ).description]
+
+
+def _build_exclude_clause(exclude: list) -> str:
+    """Build SQL NOT clause from exclude conditions. Each dict is AND group; list is OR between groups."""
+    if not exclude:
+        return ""
+    exclude_conditions = []
+    for exclude_dict in exclude:
+        and_parts = []
+        for k, v in exclude_dict.items():
+            cleaned = str(v).strip().replace("'", "''")
+            and_parts.append(f'"{k}" = \'{cleaned}\'')
+        if and_parts:
+            exclude_conditions.append(f"({' AND '.join(and_parts)})")
+    return f" AND NOT ({' OR '.join(exclude_conditions)})" if exclude_conditions else ""
+
+
+def _build_key_sql(cols: list, prefix: str) -> tuple:
+    """Build SQL key SELECT and WHERE fragments. Returns (select, where_not_empty)."""
+    select = ",\n                ".join(
+        f'TRIM(COALESCE("{col}", \'\')) AS {prefix}_key_{i}'
+        for i, col in enumerate(cols)
+    )
+    where = "\n              AND ".join(
+        f'TRIM(COALESCE("{col}", \'\')) != \'\''
+        for col in cols
+    )
+    return select, where
+
+
 def count_rows(
     conn, file_path: Path, expected: int, comparison_rule: str = "greater_than"
 ):
@@ -212,77 +247,150 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list
     return passed, message, details
 
 
-def check_lookup_entities_are_within_organisation_ranges(
-    conn, file_path: Path, organisation_file: Path, ignored_organisations: list = None
+def check_field_is_within_range(
+    conn,
+    file_path: Path,
+    field: str,
+    external_file: Path,
+    min_field: str,
+    max_field: str,
+    join_on: dict = None,
+    exclude: list = None,
 ):
     """
-    Checks that lookup entities are within any valid range from an organisation file.
+    Checks that a field's values are within any valid range from an external file.
 
     Args:
         conn: duckdb connection
-        file_path: path to the lookup CSV file
-        organisation_file: path to the entity-organisation CSV file
-        ignored_organisations: list of organisations to ignore (i.e. not check that their entities are within a valid range)
+        file_path: path to the CSV file containing the field to validate
+        external_file: path to the CSV file containing the ranges
+        min_field: the column name for the range minimum
+        max_field: the column name for the range maximum
+        field: the column name to validate
+        join_on: optional dict with keys {"file": [...], "external": [...]} specifying columns to match for range validation
+        exclude: optional list of dicts specifying row conditions to exclude from validation. Each dict is an AND group; the list is OR between groups.
+                 Example: [{"prefix": "conservationarea", "organisation": "orgA"}, {"prefix": "conservationarea", "organisation": "orgB"}]
     """
-    ignored_values = [
-        org.replace("'", "''")
-        for org in (ignored_organisations or [])
-        if isinstance(org, str) and org.strip()
-    ]
-    ignored_clause = ""
-    if ignored_values:
-        ignored_values_sql = ",".join("'" + org + "'" for org in ignored_values)
-        ignored_clause = (
-            " AND TRIM(COALESCE(\"organisation\", '')) NOT IN "
-            + f"({ignored_values_sql})"
-        )
-
-    result = conn.execute(
-        f"""
-        WITH ranges AS (
-            SELECT
-                TRY_CAST("entity-minimum" AS BIGINT) AS min_entity,
-                TRY_CAST("entity-maximum" AS BIGINT) AS max_entity
-            FROM {_read_csv(organisation_file)}
-            WHERE TRY_CAST("entity-minimum" AS BIGINT) IS NOT NULL
-              AND TRY_CAST("entity-maximum" AS BIGINT) IS NOT NULL
-        ),
-        lookup_rows AS (
-            SELECT
-                TRY_CAST("entity" AS BIGINT) AS entity,
-                TRIM(COALESCE("organisation", '')) AS organisation,
-                COALESCE("reference", '') AS reference
-            FROM {_read_csv(file_path)}
-            WHERE TRIM(COALESCE("organisation", '')) != ''
-            {ignored_clause}
+    file_cols_list = _get_csv_columns(conn, file_path)
+    external_cols_list = _get_csv_columns(conn, external_file)
+    exclude_clause = _build_exclude_clause(exclude)
+
+    # Validate and extract join_on
+    file_cols = external_cols = None
+    if join_on is not None:
+        if not isinstance(join_on, dict):
+            raise ValueError("join_on must be a dictionary")
+        file_cols = join_on.get("file")
+        external_cols = join_on.get("external")
+        if file_cols is None or external_cols is None:
+            raise ValueError(
+                'join_on must have keys "file" and "external" with column lists'
+            )
+        if not file_cols or not external_cols:
+            raise ValueError(
+                'join_on "file" and "external" lists must be non-empty'
+            )
+        if len(file_cols) != len(external_cols):
+            raise ValueError(
+                'join_on "file" and "external" lists must have the same length'
+            )
+        for col in file_cols:
+            if col not in file_cols_list:
+                raise ValueError(
+                    f"Column '{col}' not found in file. Available columns: {file_cols_list}"
+                )
+        for col in external_cols:
+            if col not in external_cols_list:
+                raise ValueError(
+                    f"Column '{col}' not found in external file. Available columns: {external_cols_list}"
+                )
+
+    # Simple range check without key matching
+    if join_on is None:
+        result = conn.execute(
+            f"""
+            WITH ranges AS (
+                SELECT
+                    TRY_CAST("{min_field}" AS BIGINT) AS min_value,
+                    TRY_CAST("{max_field}" AS BIGINT) AS max_value
+                FROM {_read_csv(external_file)}
+                WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL
+                  AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
+            ),
+            lookup_rows AS (
+                SELECT
+                    ROW_NUMBER() OVER () + 1 AS line_number,
+                    TRY_CAST("{field}" AS BIGINT) AS value
+                FROM {_read_csv(file_path)}
+                WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause}
+            )
+            SELECT line_number, value
+            FROM lookup_rows l
+            WHERE value IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1
+                  FROM ranges r
+                  WHERE l.value BETWEEN r.min_value AND r.max_value
+              )
+            """
+        ).fetchall()
+        out_of_range_rows = [{"line_number": row[0], "value": row[1]} for row in result]
+    else:
+        # Key-matched range check
+        range_keys, range_empty = _build_key_sql(external_cols, "range")
+        lookup_keys, lookup_empty = _build_key_sql(file_cols, "lookup")
+        key_join = "\n                AND ".join(
+            f"l.lookup_key_{i} = r.range_key_{i}"
+            for i in range(len(file_cols))
         )
-        SELECT entity, organisation, reference
-        FROM lookup_rows l
-        WHERE organisation != ''
-          AND entity IS NOT NULL
-          AND NOT EXISTS (
-              SELECT 1
-              FROM ranges r
-              WHERE l.entity BETWEEN r.min_entity AND r.max_entity
-          )
-        """
-    ).fetchall()
-
-    out_of_range_rows = [
-        {"entity": row[0], "organisation": row[1], "reference": row[2]}
-        for row in result
-    ]
+        key_proj = ", ".join(f"lookup_key_{i}" for i in range(len(file_cols)))
+
+        result = conn.execute(
+            f"""
+            WITH ranges AS (
+                SELECT
+                    {range_keys},
+                    TRY_CAST("{min_field}" AS BIGINT) AS min_value,
+                    TRY_CAST("{max_field}" AS BIGINT) AS max_value
+                FROM {_read_csv(external_file)}
+                WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL
+                  AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
+                  AND {range_empty}
+            ),
+            lookup_rows AS (
+                SELECT
+                    ROW_NUMBER() OVER () + 1 AS line_number,
+                    TRY_CAST("{field}" AS BIGINT) AS value,
+                    {lookup_keys}
+                FROM {_read_csv(file_path)}
+                WHERE {lookup_empty} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause}
+            )
+            SELECT line_number, value, {key_proj}
+            FROM lookup_rows l
+            WHERE value IS NOT NULL
+              AND NOT EXISTS (
+                  SELECT 1
+                  FROM ranges r
+                  WHERE {key_join}
+                    AND l.value BETWEEN r.min_value AND r.max_value
+              )
+            """
+        ).fetchall()
+
+        out_of_range_rows = []
+        for row in result:
+            invalid_row = {"line_number": row[0], field: row[1]}
+            for i, col_name in enumerate(file_cols):
+                invalid_row[col_name] = row[i + 2]
+            out_of_range_rows.append(invalid_row)
 
     if len(out_of_range_rows) == 0:
         passed = True
-        message = "all lookup entities are within allowed ranges"
+        message = f"all values in '{field}' are within allowed ranges"
     else:
         passed = False
         message = f"there were {len(out_of_range_rows)} out-of-range rows found"
 
-    details = {
-        "invalid_rows": out_of_range_rows,
-    }
-
+    details = {"invalid_rows": out_of_range_rows}
     return passed, message, details
 
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 7cd57437..348b823c 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -8,7 +8,7 @@
     check_no_shared_values,
     check_no_overlapping_ranges,
     check_allowed_values,
-    check_lookup_entities_are_within_organisation_ranges,
+    check_field_is_within_range,
 )
 
 
@@ -205,58 +205,68 @@ def test_check_no_overlapping_ranges_adjacent_fails(tmp_path):
     assert len(details["overlaps"]) == 1
 
 
-def test_check_lookup_entities_are_within_organisation_ranges_fails(tmp_path):
+def test_check_field_is_within_ranges_fails(tmp_path):
     lookup_file = tmp_path / "lookup.csv"
     with open(lookup_file, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity", "organisation", "reference"])
-        writer.writerow(["150", "org-1", "ok-ref"])
-        writer.writerow(["999", "org-2", "bad-ref"])
+        writer.writerow(["entity"])
+        writer.writerow(["150"])
+        writer.writerow(["999"])
 
     organisation_file = tmp_path / "entity-organisation.csv"
     with open(organisation_file, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity-minimum", "entity-maximum", "organisation"])
-        writer.writerow(["100", "200", "org-1"])
-        writer.writerow(["300", "400", "org-2"])
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+        writer.writerow(["300", "400"])
 
     conn = duckdb.connect()
-    passed, message, details = check_lookup_entities_are_within_organisation_ranges(
-        conn, file_path=lookup_file, organisation_file=organisation_file
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=lookup_file,
+        external_file=organisation_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
     )
 
     assert passed is False
     assert "out-of-range" in message
     assert len(details["invalid_rows"]) == 1
-    assert details["invalid_rows"][0]["entity"] == 999
-    assert details["invalid_rows"][0]["organisation"] == "org-2"
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["value"] == 999
 
 
-def test_check_lookup_entities_are_within_organisation_ranges_ignores_org(tmp_path):
+def test_check_field_is_within_ranges_ignores_org(tmp_path):
     lookup_file = tmp_path / "lookup.csv"
     with open(lookup_file, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity", "organisation", "reference"])
-        writer.writerow(["150", "org-1", "ok-ref"])
-        writer.writerow(["999", "org-2", "ignored-ref"])
+        writer.writerow(["entity"])
+        writer.writerow(["150"])
+        writer.writerow(["250"])
 
     organisation_file = tmp_path / "entity-organisation.csv"
     with open(organisation_file, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity-minimum", "entity-maximum", "organisation"])
-        writer.writerow(["100", "200", "org-1"])
-        writer.writerow(["300", "400", "org-2"])
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+        writer.writerow(["300", "400"])
 
     conn = duckdb.connect()
-    passed, message, details = check_lookup_entities_are_within_organisation_ranges(
+    # Test without match_fields - simple range check
+    passed, message, details = check_field_is_within_range(
         conn,
         file_path=lookup_file,
-        organisation_file=organisation_file,
-        ignored_organisations=["org-2"],
+        external_file=organisation_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
     )
 
-    assert passed is True
-    assert details["invalid_rows"] == []
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["value"] == 250
 
 
 def test_check_allowed_values_fails_for_old_entity_status(tmp_path):
@@ -301,3 +311,133 @@ def test_check_allowed_values_passes_for_old_entity_status(tmp_path):
 
     assert passed is True
     assert details["invalid_rows"] == []
+
+
+def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "prefix", "organisation", "reference"])
+        writer.writerow(["150", "dataset-a", "org-1", "ok-ref"])
+        writer.writerow(["250", "dataset-a", "org-1", "bad-ref"])
+        writer.writerow(["999", "dataset-a", "org-2", "other-org-ref"])
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["dataset", "organisation", "entity-minimum", "entity-maximum"])
+        writer.writerow(["dataset-a", "org-1", "100", "200"])
+        writer.writerow(["dataset-a", "org-2", "900", "1000"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]},
+    )
+
+    assert passed is False
+    assert "out-of-range" in message
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["entity"] == 250
+    assert details["invalid_rows"][0]["prefix"] == "dataset-a"
+    assert details["invalid_rows"][0]["organisation"] == "org-1"
+
+
+def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "prefix", "organisation", "reference"])
+        writer.writerow(["150", "dataset-a", "org-1", "ok-ref"])
+        writer.writerow(["950", "dataset-a", "org-2", "ok-ref-2"])
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["dataset", "organisation", "entity-minimum", "entity-maximum"])
+        writer.writerow(["dataset-a", "org-1", "100", "200"])
+        writer.writerow(["dataset-a", "org-2", "900", "1000"])
+
+    passed, message, details = check_field_is_within_range(
+        duckdb.connect(),
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]},
+    )
+    assert passed is True
+    assert details["invalid_rows"] == []  # each entity is inside its own key's range
+
+
+def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path):
+    file_path = tmp_path / "lookup_custom.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity_value", "dataset_key", "org_key", "ref_code"])
+        writer.writerow(["55", "dataset-x", "org-a", "ok-ref"])
+        writer.writerow(["250", "dataset-x", "org-a", "bad-ref"])
+
+    external_file = tmp_path / "ranges_custom.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["dataset_name", "org_name", "entity-minimum", "entity-maximum"])
+        writer.writerow(["dataset-x", "org-a", "50", "100"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity_value",
+        join_on={"file": ["dataset_key", "org_key"], "external": ["dataset_name", "org_name"]},
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["entity_value"] == 250
+    assert details["invalid_rows"][0]["dataset_key"] == "dataset-x"
+    assert details["invalid_rows"][0]["org_key"] == "org-a"
+
+
+def test_check_field_is_within_ranges_excludes_rows(tmp_path):
+    """Test that exclude skips rows matching specified field conditions during validation."""
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "status"])
+        writer.writerow(["150", "active"])
+        writer.writerow(["250", "active"])  # out of range but not excluded
+        writer.writerow(["350", "inactive"])  # out of range but excluded
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        exclude=[{"status": "inactive"}],
+    )
+
+    # Should fail due to entity 250 which is out of range and not excluded
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["value"] == 250
+    assert details["invalid_rows"][0]["line_number"] == 3

From 3089bc8723075bc8db93b2a6d9fbc25aaf29dbca Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Fri, 27 Mar 2026 13:53:10 +0000
Subject: [PATCH 03/12] check_field_is_within_range to support structured rules
 for filtering and matching

---
 digital_land/expectations/operations/csv.py   | 183 +++++++++++++-----
 .../expectations/operations/test_csv.py       | 139 ++++++++++++-
 2 files changed, 261 insertions(+), 61 deletions(-)

diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index d3d2a693..13e88a67 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -12,32 +12,91 @@ def _get_csv_columns(conn, file_path: Path) -> list:
     ).description]
 
 
-def _build_exclude_clause(exclude: list) -> str:
-    """Build SQL NOT clause from exclude conditions. Each dict is AND group; list is OR between groups."""
-    if not exclude:
+def _sql_string(value) -> str:
+    cleaned = str(value).strip().replace("'", "''")
+    return f"'{cleaned}'"
+
+
+def _normalize_condition_groups(conditions, name: str) -> list:
+    if conditions is None:
+        return []
+    if isinstance(conditions, dict):
+        return [conditions]
+    if isinstance(conditions, list):
+        return conditions
+    raise ValueError(f"{name} must be a dict, list of dicts, or None")
+
+
+def _build_field_condition(field_name: str, spec) -> str:
+    """Build one SQL predicate from a bare value (equality) or an {"op", "value"} dict."""
+    if isinstance(spec, dict):
+        op = str(spec.get("op", spec.get("operation", ""))).strip().lower()
+        value = spec.get("value")
+        if not op:
+            raise ValueError(
+                f"Condition for '{field_name}' must include 'op' when using dict format"
+            )
+    else:
+        op, value = "=", spec
+    column_sql = '"' + str(field_name).replace('"', '""') + '"'  # escape embedded quotes
+
+    comparison = {"=": "=", "==": "=", "!=": "!=", "<>": "!="}.get(op)
+    if comparison:
+        return f"{column_sql} {comparison} {_sql_string(value)}"
+    if op in ("in", "not in"):
+        if not isinstance(value, (list, tuple, set)) or len(value) == 0:
+            raise ValueError(
+                f"Condition for '{field_name}' with op '{op}' must use a non-empty list"
+            )
+        values_sql = ", ".join(_sql_string(item) for item in value)
+        return f"{column_sql} {op.upper()} ({values_sql})"
+
+    raise ValueError(
+        f"Unsupported operator '{op}' for field '{field_name}'. Supported: =, !=, in, not in"
+    )
+
+
+def _build_condition_group(group: dict, file_columns: list) -> str:
+    """Return one parenthesised SQL group that ANDs a condition for every field in group."""
+    if not isinstance(group, dict) or not group:
+        raise ValueError("Each condition group must be a non-empty dict")
+
+    parts = []
+    for field_name, spec in group.items():
+        if field_name not in file_columns:
+            raise ValueError(
+                f"Column '{field_name}' not found in file. Available columns: {file_columns}"
+            )
+        parts.append(_build_field_condition(field_name, spec))
+    return f"({' AND '.join(parts)})"
+
+
+def _build_filter_clause(filter_spec, file_columns: list, name: str) -> str:
+    """Build SQL clause that keeps rows matching structured conditions."""
+    groups = _normalize_condition_groups(filter_spec, name)
+    if not groups:
         return ""
-    exclude_conditions = []
-    for exclude_dict in exclude:
-        and_parts = []
-        for k, v in exclude_dict.items():
-            cleaned = str(v).strip().replace("'", "''")
-            and_parts.append(f'"{k}" = \'{cleaned}\'')
-        if and_parts:
-            exclude_conditions.append(f"({' AND '.join(and_parts)})")
-    return f" AND NOT ({' OR '.join(exclude_conditions)})" if exclude_conditions else ""
-
-
-def _build_key_sql(cols: list, prefix: str) -> tuple:
-    """Build SQL key SELECT and WHERE fragments. Returns (select, where_not_empty)."""
-    select = ",\n                ".join(
-        f'TRIM(COALESCE("{col}", \'\')) AS {prefix}_key_{i}'
-        for i, col in enumerate(cols)
+    clauses = [_build_condition_group(group, file_columns) for group in groups]
+    return f" AND ({' OR '.join(clauses)})"
+
+
+def _build_match_column_sql_parts(columns: list, alias_prefix: str) -> tuple:
+    """Build SQL fragments for match-key columns.
+
+    Returns:
+        tuple[str, str]:
+            - SELECT projection fragment with normalized key aliases.
+            - WHERE fragment ensuring key columns are non-empty.
+    """
+    select_fragment = ",\n                ".join(
+        f'TRIM(COALESCE("{column}", \'\')) AS {alias_prefix}_key_{i}'
+        for i, column in enumerate(columns)
     )
-    where = "\n              AND ".join(
-        f'TRIM(COALESCE("{col}", \'\')) != \'\''
-        for col in cols
+    non_empty_filter_fragment = "\n              AND ".join(
+        f'TRIM(COALESCE("{column}", \'\')) != \'\''
+        for column in columns
     )
-    return select, where
+    return select_fragment, non_empty_filter_fragment
 
 
 def count_rows(
@@ -254,8 +313,7 @@ def check_field_is_within_range(
     external_file: Path,
     min_field: str,
     max_field: str,
-    join_on: dict = None,
-    exclude: list = None,
+    rules: dict = None,
 ):
     """
     Checks that a field's values are within any valid range from an external file.
@@ -267,32 +325,49 @@ def check_field_is_within_range(
         min_field: the column name for the range minimum
         max_field: the column name for the range maximum
         field: the column name to validate
-        join_on: optional dict with keys {"file": [...], "external": [...]} specifying columns to match for range validation
-        exclude: optional list of dicts specifying row conditions to exclude from validation. Each dict is an AND group; the list is OR between groups.
-                 Example: [{"prefix": "conservationarea", "organisation": "orgA"}, {"prefix": "conservationarea", "organisation": "orgB"}]
+        rules: optional dict that controls subset selection and key matching.
+                 Supported keys:
+                 - lookup_rules: dict or list[dict] of structured conditions for file_path rows.
+                 - match_columns: dict with keys {"lookup": [...], "range": [...]} specifying columns to match.
+                   lookup columns come from file_path (the rows being validated).
+                   range columns come from external_file (the rows providing valid ranges).
+                 Examples:
+                 {"lookup_rules": {"prefix": "conservationarea"}}
+                 {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}}
+                 {"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}}
+                 Use operators like != and not in when you want to exclude rows.
     """
     file_cols_list = _get_csv_columns(conn, file_path)
     external_cols_list = _get_csv_columns(conn, external_file)
-    exclude_clause = _build_exclude_clause(exclude)
+    rules = rules or {}
+    if not isinstance(rules, dict):
+        raise ValueError("rules must be a dictionary or None")
+
+    lookup_clause = _build_filter_clause(
+        rules.get("lookup_rules"),
+        file_cols_list,
+        "rules.lookup_rules",
+    )
 
-    # Validate and extract join_on
+    # Validate and extract match_columns
     file_cols = external_cols = None
-    if join_on is not None:
-        if not isinstance(join_on, dict):
-            raise ValueError("join_on must be a dictionary")
-        file_cols = join_on.get("file")
-        external_cols = join_on.get("external")
+    match_columns = rules.get("match_columns")
+    if match_columns is not None:
+        if not isinstance(match_columns, dict):
+            raise ValueError("rules.match_columns must be a dictionary")
+        file_cols = match_columns.get("lookup")
+        external_cols = match_columns.get("range")
         if file_cols is None or external_cols is None:
             raise ValueError(
-                'join_on must have keys "file" and "external" with column lists'
+                'rules.match_columns must have keys "lookup" and "range" with column lists'
             )
         if not file_cols or not external_cols:
             raise ValueError(
-                'join_on "file" and "external" lists must be non-empty'
+                'rules.match_columns "lookup" and "range" lists must be non-empty'
             )
         if len(file_cols) != len(external_cols):
             raise ValueError(
-                'join_on "file" and "external" lists must have the same length'
+                'rules.match_columns "lookup" and "range" lists must have the same length'
             )
         for col in file_cols:
             if col not in file_cols_list:
@@ -306,7 +381,7 @@ def check_field_is_within_range(
                 )
 
     # Simple range check without key matching
-    if join_on is None:
+    if match_columns is None:
         result = conn.execute(
             f"""
             WITH ranges AS (
@@ -322,7 +397,7 @@ def check_field_is_within_range(
                     ROW_NUMBER() OVER () + 1 AS line_number,
                     TRY_CAST("{field}" AS BIGINT) AS value
                 FROM {_read_csv(file_path)}
-                WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause}
+                WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause}
             )
             SELECT line_number, value
             FROM lookup_rows l
@@ -337,41 +412,47 @@ def check_field_is_within_range(
         out_of_range_rows = [{"line_number": row[0], "value": row[1]} for row in result]
     else:
         # Key-matched range check
-        range_keys, range_empty = _build_key_sql(external_cols, "range")
-        lookup_keys, lookup_empty = _build_key_sql(file_cols, "lookup")
-        key_join = "\n                AND ".join(
+        range_key_select_sql, range_keys_non_empty_sql = _build_match_column_sql_parts(
+            external_cols, "range"
+        )
+        lookup_key_select_sql, lookup_keys_non_empty_sql = _build_match_column_sql_parts(
+            file_cols, "lookup"
+        )
+        key_match_condition_sql = "\n                AND ".join(
             f"l.lookup_key_{i} = r.range_key_{i}"
             for i in range(len(file_cols))
         )
-        key_proj = ", ".join(f"lookup_key_{i}" for i in range(len(file_cols)))
+        key_projection_sql = ", ".join(
+            f"lookup_key_{i}" for i in range(len(file_cols))
+        )
 
         result = conn.execute(
             f"""
             WITH ranges AS (
                 SELECT
-                    {range_keys},
+                                        {range_key_select_sql},
                     TRY_CAST("{min_field}" AS BIGINT) AS min_value,
                     TRY_CAST("{max_field}" AS BIGINT) AS max_value
                 FROM {_read_csv(external_file)}
                 WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL
-                  AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
-                  AND {range_empty}
+                                    AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
+                                    AND {range_keys_non_empty_sql}
             ),
             lookup_rows AS (
                 SELECT
                     ROW_NUMBER() OVER () + 1 AS line_number,
                     TRY_CAST("{field}" AS BIGINT) AS value,
-                    {lookup_keys}
+                                        {lookup_key_select_sql}
                 FROM {_read_csv(file_path)}
-                WHERE {lookup_empty} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{exclude_clause}
+                                WHERE {lookup_keys_non_empty_sql} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause}
             )
-            SELECT line_number, value, {key_proj}
+                        SELECT line_number, value, {key_projection_sql}
             FROM lookup_rows l
             WHERE value IS NOT NULL
               AND NOT EXISTS (
                   SELECT 1
                   FROM ranges r
-                  WHERE {key_join}
+                                    WHERE {key_match_condition_sql}
                     AND l.value BETWEEN r.min_value AND r.max_value
               )
             """
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 348b823c..424fb096 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -337,7 +337,7 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity",
-        join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]},
+        rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}},
     )
 
     assert passed is False
@@ -372,7 +372,7 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity",
-        join_on={"file": ["prefix", "organisation"], "external": ["dataset", "organisation"]},
+        rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}},
     )
 
 
@@ -398,7 +398,7 @@ def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path):
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity_value",
-        join_on={"file": ["dataset_key", "org_key"], "external": ["dataset_name", "org_name"]},
+        rules={"match_columns": {"lookup": ["dataset_key", "org_key"], "range": ["dataset_name", "org_name"]}},
     )
 
     assert passed is False
@@ -409,15 +409,15 @@ def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path):
     assert details["invalid_rows"][0]["org_key"] == "org-a"
 
 
-def test_check_field_is_within_ranges_excludes_rows(tmp_path):
-    """Test that exclude skips rows matching specified field conditions during validation."""
+def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path):
+    """Test filtering rows with lookup_rules during validation."""
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
         writer.writerow(["entity", "status"])
         writer.writerow(["150", "active"])
-        writer.writerow(["250", "active"])  # out of range but not excluded
-        writer.writerow(["350", "inactive"])  # out of range but excluded
+        writer.writerow(["250", "active"])  
+        writer.writerow(["350", "inactive"])
 
     external_file = tmp_path / "ranges.csv"
     with open(external_file, "w", newline="") as f:
@@ -433,11 +433,130 @@ def test_check_field_is_within_ranges_excludes_rows(tmp_path):
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity",
-        exclude=[{"status": "inactive"}],
+        rules={"lookup_rules": {"status": "active"}},
     )
-
-    # Should fail due to entity 250 which is out of range and not excluded
     assert passed is False
     assert len(details["invalid_rows"]) == 1
     assert details["invalid_rows"][0]["value"] == 250
     assert details["invalid_rows"][0]["line_number"] == 3
+
+
+def test_check_field_is_within_ranges_lookup_rules_operator_eq_shape(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "prefix"])
+        writer.writerow(["150", "conservationarea"])  # selected by the rule, within 100-200
+        writer.writerow(["350", "other"])  # outside 100-200 but not selected by the rule
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        rules={"lookup_rules": {"prefix": {"op": "=", "value": "conservationarea"}}},
+    )
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_field_is_within_ranges_lookup_rules_exact_match(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "prefix"])
+        writer.writerow(["150", "conservationarea"])  # selected by the rule, within 100-200
+        writer.writerow(["350", "other"])  # outside 100-200 but not selected by the rule
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        rules={"lookup_rules": {"prefix": "conservationarea"}},
+    )
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "organisation"])
+        writer.writerow(["150", "org-a"])  # selected by the rule, within 100-200
+        writer.writerow(["350", "org-b"])  # selected by the rule, outside 100-200 -> reported
+        writer.writerow(["350", "org-c"])  # outside 100-200 but not selected by the rule
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        rules={"lookup_rules": {"organisation": {"op": "in", "value": ["org-a", "org-b"]}}},
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["value"] == 350
+
+
+def test_check_field_is_within_ranges_for_only_status_301(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "status"])
+        writer.writerow(["150", "301"])  # selected by the rule, within 100-200
+        writer.writerow(["250", "301"])  # selected by the rule, outside 100-200 -> reported
+        writer.writerow(["350", "410"])  # outside 100-200 but status is not 301
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_field_is_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity",
+        rules={"lookup_rules": {"status": {"op": "=", "value": "301"}}},
+    )
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["value"] == 250

From 3be0e3230d81aef7918834e2ba7e9b3f0d4e874d Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Fri, 27 Mar 2026 15:56:03 +0000
Subject: [PATCH 04/12] refactor: update CSV validation functions to improve
 range checks and add dataset organization matching

---
 digital_land/expectations/checkpoints/csv.py  |   6 +-
 digital_land/expectations/operations/csv.py   | 400 +++++++++++-------
 .../expectations/operations/test_csv.py       |  90 ++--
 3 files changed, 324 insertions(+), 172 deletions(-)

diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py
index 50394fc8..b2258cb9 100644
--- a/digital_land/expectations/checkpoints/csv.py
+++ b/digital_land/expectations/checkpoints/csv.py
@@ -9,7 +9,8 @@
     check_unique,
     check_no_shared_values,
     check_no_overlapping_ranges,
-    check_field_is_within_range,
+    check_fields_are_within_range,
+    check_field_is_within_range_by_dataset_org,
     check_allowed_values
 )
 
@@ -26,7 +27,8 @@ def operation_factory(self, operation_string: str):
             "check_unique": check_unique,
             "check_no_shared_values": check_no_shared_values,
             "check_no_overlapping_ranges": check_no_overlapping_ranges,
-            "check_field_is_within_range":check_field_is_within_range,
+            "check_fields_are_within_range": check_fields_are_within_range,
+            "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org,
             "check_allowed_values":check_allowed_values
         }
         if operation_string not in operation_map:
diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 13e88a67..dfb9d949 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -17,6 +17,10 @@ def _sql_string(value) -> str:
     return f"'{cleaned}'"
 
 
+def _sql_identifier(name: str) -> str:
+    return '"' + str(name).replace('"', '""') + '"'
+
+
 def _normalize_condition_groups(conditions, name: str) -> list:
     if conditions is None:
         return []
@@ -80,23 +84,61 @@ def _build_filter_clause(filter_spec, file_columns: list, name: str) -> str:
     return f" AND ({' OR '.join(clauses)})"
 
 
-def _build_match_column_sql_parts(columns: list, alias_prefix: str) -> tuple:
-    """Build SQL fragments for match-key columns.
+def _normalize_fields_for_validation(field_spec, file_columns: list) -> list:
+    """Normalize a field spec into an ordered, de-duplicated list of column names.
+
+    Accepts a comma-separated string or a list/tuple/set of names; raises
+    ValueError for any other type, an empty spec, or names missing from the file.
+    """
+    if isinstance(field_spec, str):
+        fields = [item.strip() for item in field_spec.split(",") if item.strip()]
+    elif isinstance(field_spec, (list, tuple, set)):
+        fields = [str(item).strip() for item in field_spec if str(item).strip()]
+    else:
+        raise ValueError("field must be a string, comma-separated string, or list of strings")
 
-    Returns:
-        tuple[str, str]:
-            - SELECT projection fragment with normalized key aliases.
-            - WHERE fragment ensuring key columns are non-empty.
-    """
-    select_fragment = ",\n                ".join(
-        f'TRIM(COALESCE("{column}", \'\')) AS {alias_prefix}_key_{i}'
-        for i, column in enumerate(columns)
-    )
-    non_empty_filter_fragment = "\n              AND ".join(
-        f'TRIM(COALESCE("{column}", \'\')) != \'\''
-        for column in columns
-    )
-    return select_fragment, non_empty_filter_fragment
+    if not fields:
+        raise ValueError("field must include at least one column name")
+
+    # dict.fromkeys drops repeated names while keeping first-seen order.
+    normalized_fields = list(dict.fromkeys(fields))
+
+    missing_fields = [name for name in normalized_fields if name not in file_columns]
+    if missing_fields:
+        raise ValueError(
+            f"Column(s) {missing_fields} not found in file. Available columns: {file_columns}"
+        )
+
+    return normalized_fields
+
+
+def _build_range_invalid_rows(
+    result: list,
+    validating_multiple_fields: bool,
+    has_match_columns: bool,
+    lookup_match_columns: list = None,
+) -> list:
+    """Format (line_number, field, value, *match_keys) rows as invalid_rows dicts.
+
+    Key-matched single-field rows store the value under the field's own name; all
+    other shapes use "value", adding "field" when several fields are validated.
+    """
+    match_columns = (lookup_match_columns or []) if has_match_columns else []
+    out_of_range_rows = []
+    for row in result:
+        line_number, field_name, value = row[:3]
+        if not has_match_columns:
+            invalid_row = {"line_number": line_number, "value": value}
+            if validating_multiple_fields:
+                invalid_row["field"] = field_name
+        elif validating_multiple_fields:
+            invalid_row = {"line_number": line_number, "field": field_name, "value": value}
+        else:
+            invalid_row = {"line_number": line_number, field_name: value}
+        for position, col_name in enumerate(match_columns, start=3):
+            invalid_row[col_name] = row[position]
+        out_of_range_rows.append(invalid_row)
+    return out_of_range_rows
 
 
 def count_rows(
@@ -306,7 +348,7 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list
     return passed, message, details
 
 
-def check_field_is_within_range(
+def check_fields_are_within_range(
     conn,
     file_path: Path,
     field: str,
@@ -316,154 +358,222 @@ def check_field_is_within_range(
     rules: dict = None,
 ):
     """
-    Checks that a field's values are within any valid range from an external file.
+    Check that one or more lookup fields are within ranges from an external file.
 
     Args:
         conn: duckdb connection
-        file_path: path to the CSV file containing the field to validate
-        external_file: path to the CSV file containing the ranges
+        file_path: path to the CSV file containing fields to validate
+        field: column name(s) to validate.
+               You can pass a single name ("entity") or a comma-separated list
+               ("entity, end-entity"). All specified fields must be within range.
+        external_file: path to the CSV file containing valid ranges
         min_field: the column name for the range minimum
         max_field: the column name for the range maximum
-        field: the column name to validate
-        rules: optional dict that controls subset selection and key matching.
-                 Supported keys:
-                 - lookup_rules: dict or list[dict] of structured conditions for file_path rows.
-                 - match_columns: dict with keys {"lookup": [...], "range": [...]} specifying columns to match.
-                   lookup columns come from file_path (the rows being validated).
-                   range columns come from external_file (the rows providing valid ranges).
-                 Examples:
-                 {"lookup_rules": {"prefix": "conservationarea"}}
-                 {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}}
-                 {"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}}
-                 Use operators like != and not in when you want to exclude rows.
+        rules: optional dict controlling subset selection on lookup rows.
+               Supported keys:
+               - lookup_rules: dict or list[dict] of structured conditions.
+                 Fields in one dict are AND'ed; multiple dicts are OR'ed.
+               Examples:
+               {"lookup_rules": {"prefix": "conservationarea"}}
+               {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}}
+               Use operators like != and not in when you want to exclude rows.
     """
-    file_cols_list = _get_csv_columns(conn, file_path)
-    external_cols_list = _get_csv_columns(conn, external_file)
+    file_columns = _get_csv_columns(conn, file_path)
     rules = rules or {}
     if not isinstance(rules, dict):
         raise ValueError("rules must be a dictionary or None")
 
     lookup_clause = _build_filter_clause(
         rules.get("lookup_rules"),
-        file_cols_list,
+        file_columns,
         "rules.lookup_rules",
     )
 
-    # Validate and extract match_columns
-    file_cols = external_cols = None
-    match_columns = rules.get("match_columns")
-    if match_columns is not None:
-        if not isinstance(match_columns, dict):
-            raise ValueError("rules.match_columns must be a dictionary")
-        file_cols = match_columns.get("lookup")
-        external_cols = match_columns.get("range")
-        if file_cols is None or external_cols is None:
-            raise ValueError(
-                'rules.match_columns must have keys "lookup" and "range" with column lists'
-            )
-        if not file_cols or not external_cols:
-            raise ValueError(
-                'rules.match_columns "lookup" and "range" lists must be non-empty'
-            )
-        if len(file_cols) != len(external_cols):
-            raise ValueError(
-                'rules.match_columns "lookup" and "range" lists must have the same length'
-            )
-        for col in file_cols:
-            if col not in file_cols_list:
-                raise ValueError(
-                    f"Column '{col}' not found in file. Available columns: {file_cols_list}"
-                )
-        for col in external_cols:
-            if col not in external_cols_list:
-                raise ValueError(
-                    f"Column '{col}' not found in external file. Available columns: {external_cols_list}"
-                )
-
-    # Simple range check without key matching
-    if match_columns is None:
-        result = conn.execute(
-            f"""
-            WITH ranges AS (
-                SELECT
-                    TRY_CAST("{min_field}" AS BIGINT) AS min_value,
-                    TRY_CAST("{max_field}" AS BIGINT) AS max_value
-                FROM {_read_csv(external_file)}
-                WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL
-                  AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
-            ),
-            lookup_rows AS (
-                SELECT
-                    ROW_NUMBER() OVER () + 1 AS line_number,
-                    TRY_CAST("{field}" AS BIGINT) AS value
-                FROM {_read_csv(file_path)}
-                WHERE TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause}
-            )
-            SELECT line_number, value
-            FROM lookup_rows l
-            WHERE value IS NOT NULL
-              AND NOT EXISTS (
-                  SELECT 1
-                  FROM ranges r
-                  WHERE l.value BETWEEN r.min_value AND r.max_value
-              )
-            """
-        ).fetchall()
-        out_of_range_rows = [{"line_number": row[0], "value": row[1]} for row in result]
-    else:
-        # Key-matched range check
-        range_key_select_sql, range_keys_non_empty_sql = _build_match_column_sql_parts(
-            external_cols, "range"
+    fields_to_validate = _normalize_fields_for_validation(field, file_columns)
+    validating_multiple_fields = len(fields_to_validate) > 1
+    lookup_values_sql = ",\n                    ".join(
+        f"({i}, {_sql_string(field_name)}, TRY_CAST(src.{_sql_identifier(field_name)} AS BIGINT))"
+        for i, field_name in enumerate(fields_to_validate)
+    )
+
+    result = conn.execute(
+        f"""
+        WITH ranges AS (
+            SELECT
+                TRY_CAST("{min_field}" AS BIGINT) AS min_value,
+                TRY_CAST("{max_field}" AS BIGINT) AS max_value
+            FROM {_read_csv(external_file)}
+            WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL
+              AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
+        ),
+        source_rows AS (
+            SELECT
+                ROW_NUMBER() OVER () + 1 AS line_number,
+                *
+            FROM {_read_csv(file_path)}
+        ),
+        lookup_rows AS (
+            SELECT
+                src.line_number,
+                fields.field_order,
+                fields.field_name,
+                fields.value
+            FROM source_rows src
+            CROSS JOIN LATERAL (
+                VALUES
+                    {lookup_values_sql}
+            ) AS fields(field_order, field_name, value)
+            WHERE fields.value IS NOT NULL{lookup_clause}
         )
-        lookup_key_select_sql, lookup_keys_non_empty_sql = _build_match_column_sql_parts(
-            file_cols, "lookup"
+        SELECT
+            line_number,
+            field_name,
+            value
+        FROM lookup_rows l
+        WHERE NOT EXISTS (
+            SELECT 1
+            FROM ranges r
+            WHERE l.value BETWEEN r.min_value AND r.max_value
         )
-        key_match_condition_sql = "\n                AND ".join(
-            f"l.lookup_key_{i} = r.range_key_{i}"
-            for i in range(len(file_cols))
+        ORDER BY field_order, line_number
+        """
+    ).fetchall()
+
+    out_of_range_rows = _build_range_invalid_rows(
+        result=result,
+        validating_multiple_fields=validating_multiple_fields,
+        has_match_columns=False,
+    )
+
+    if len(out_of_range_rows) == 0:
+        passed = True
+        message = f"all values in '{field}' are within allowed ranges"
+    else:
+        passed = False
+        message = f"there were {len(out_of_range_rows)} out-of-range rows found"
+
+    details = {"invalid_rows": out_of_range_rows}
+    return passed, message, details
+
+
+def check_field_is_within_range_by_dataset_org(
+    conn,
+    file_path: Path,
+    field: str,
+    external_file: Path,
+    min_field: str,
+    max_field: str,
+    lookup_dataset_field: str,
+    range_dataset_field: str,
+    rules: dict = None,
+):
+    """
+    Check field values are within ranges matched by dataset field and organisation.
+
+    Matching is fixed to two keys:
+    1. lookup_dataset_field -> range_dataset_field
+    2. organisation -> organisation
+
+    Args:
+        conn: duckdb connection
+        file_path: path to the CSV file containing fields to validate
+        field: single column name to validate (for example: "entity").
+        external_file: path to the CSV file containing valid ranges
+        min_field: the column name for the range minimum
+        max_field: the column name for the range maximum
+        lookup_dataset_field: dataset column name in file_path
+        range_dataset_field: dataset column name in external_file
+        rules: optional dict controlling subset selection on lookup rows.
+               Supported keys:
+               - lookup_rules: dict or list[dict] of structured conditions.
+                 Fields in one dict are AND'ed; multiple dicts are OR'ed.
+               Examples:
+               {"lookup_rules": {"prefix": "conservationarea"}}
+               {"lookup_rules": {"organisation": {"op": "in", "value": ["orgA", "orgB"]}}}
+               Use operators like != and not in when you want to exclude rows.
+    """
+    file_columns = _get_csv_columns(conn, file_path)
+    rules = rules or {}
+    if not isinstance(rules, dict):
+        raise ValueError("rules must be a dictionary or None")
+
+    lookup_clause = _build_filter_clause(
+        rules.get("lookup_rules"),
+        file_columns,
+        "rules.lookup_rules",
+    )
+
+    fields_to_validate = _normalize_fields_for_validation(field, file_columns)
+    if len(fields_to_validate) != 1:
+        raise ValueError("field must be a single column name")
+    field_name = fields_to_validate[0]
+
+    lookup_dataset_name = str(lookup_dataset_field).strip()
+    range_dataset_name = str(range_dataset_field).strip()
+    lookup_match_columns = [lookup_dataset_name, "organisation"]
+
+    lookup_dataset_col = _sql_identifier(lookup_dataset_name)
+    lookup_org_col = _sql_identifier("organisation")
+    range_dataset_col = _sql_identifier(range_dataset_name)
+    range_org_col = _sql_identifier("organisation")
+    min_col = _sql_identifier(min_field)
+    max_col = _sql_identifier(max_field)
+    value_col = _sql_identifier(field_name)
+
+    result = conn.execute(
+        f"""
+        WITH ranges AS (
+            SELECT
+                TRY_CAST({min_col} AS BIGINT) AS min_value,
+                TRY_CAST({max_col} AS BIGINT) AS max_value,
+                TRIM(COALESCE({range_dataset_col}, '')) AS range_key_0,
+                                TRIM(COALESCE({range_org_col}, '')) AS range_key_1
+            FROM {_read_csv(external_file)}
+            WHERE TRY_CAST({min_col} AS BIGINT) IS NOT NULL
+              AND TRY_CAST({max_col} AS BIGINT) IS NOT NULL
+              AND TRIM(COALESCE({range_dataset_col}, '')) != ''
+                            AND TRIM(COALESCE({range_org_col}, '')) != ''
+        ),
+        source_rows AS (
+            SELECT
+                ROW_NUMBER() OVER () + 1 AS line_number,
+                *
+            FROM {_read_csv(file_path)}
+        ),
+        lookup_rows AS (
+            SELECT
+                src.line_number,
+                TRY_CAST(src.{value_col} AS BIGINT) AS value,
+                TRIM(COALESCE(src.{lookup_dataset_col}, '')) AS lookup_key_0,
+                TRIM(COALESCE(src.{lookup_org_col}, '')) AS lookup_key_1
+            FROM source_rows src
+            WHERE TRY_CAST(src.{value_col} AS BIGINT) IS NOT NULL
+              AND TRIM(COALESCE(src.{lookup_dataset_col}, '')) != ''
+              AND TRIM(COALESCE(src.{lookup_org_col}, '')) != ''{lookup_clause}
         )
-        key_projection_sql = ", ".join(
-            f"lookup_key_{i}" for i in range(len(file_cols))
+        SELECT
+            line_number,
+            value,
+            lookup_key_0,
+            lookup_key_1
+        FROM lookup_rows l
+        WHERE NOT EXISTS (
+            SELECT 1
+            FROM ranges r
+            WHERE l.value BETWEEN r.min_value AND r.max_value
+                  AND l.lookup_key_0 = r.range_key_0
+                  AND l.lookup_key_1 = r.range_key_1
         )
+        ORDER BY line_number
+        """
+    ).fetchall()
 
-        result = conn.execute(
-            f"""
-            WITH ranges AS (
-                SELECT
-                                        {range_key_select_sql},
-                    TRY_CAST("{min_field}" AS BIGINT) AS min_value,
-                    TRY_CAST("{max_field}" AS BIGINT) AS max_value
-                FROM {_read_csv(external_file)}
-                WHERE TRY_CAST("{min_field}" AS BIGINT) IS NOT NULL
-                                    AND TRY_CAST("{max_field}" AS BIGINT) IS NOT NULL
-                                    AND {range_keys_non_empty_sql}
-            ),
-            lookup_rows AS (
-                SELECT
-                    ROW_NUMBER() OVER () + 1 AS line_number,
-                    TRY_CAST("{field}" AS BIGINT) AS value,
-                                        {lookup_key_select_sql}
-                FROM {_read_csv(file_path)}
-                                WHERE {lookup_keys_non_empty_sql} AND TRY_CAST("{field}" AS BIGINT) IS NOT NULL{lookup_clause}
-            )
-                        SELECT line_number, value, {key_projection_sql}
-            FROM lookup_rows l
-            WHERE value IS NOT NULL
-              AND NOT EXISTS (
-                  SELECT 1
-                  FROM ranges r
-                                    WHERE {key_match_condition_sql}
-                    AND l.value BETWEEN r.min_value AND r.max_value
-              )
-            """
-        ).fetchall()
-
-        out_of_range_rows = []
-        for row in result:
-            invalid_row = {"line_number": row[0], field: row[1]}
-            for i, col_name in enumerate(file_cols):
-                invalid_row[col_name] = row[i + 2]
-            out_of_range_rows.append(invalid_row)
+    out_of_range_rows = []
+    for row in result:
+        invalid_row = {"line_number": row[0], field_name: row[1]}
+        for i, col_name in enumerate(lookup_match_columns):
+            invalid_row[col_name] = row[i + 2]
+        out_of_range_rows.append(invalid_row)
 
     if len(out_of_range_rows) == 0:
         passed = True
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 424fb096..176f02bc 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -8,7 +8,8 @@
     check_no_shared_values,
     check_no_overlapping_ranges,
     check_allowed_values,
-    check_field_is_within_range,
+    check_fields_are_within_range,
+    check_field_is_within_range_by_dataset_org,
 )
 
 
@@ -221,7 +222,7 @@ def test_check_field_is_within_ranges_fails(tmp_path):
         writer.writerow(["300", "400"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=lookup_file,
         external_file=organisation_file,
@@ -254,7 +255,7 @@ def test_check_field_is_within_ranges_ignores_org(tmp_path):
 
     conn = duckdb.connect()
     # Test without match_fields - simple range check
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=lookup_file,
         external_file=organisation_file,
@@ -313,7 +314,7 @@ def test_check_allowed_values_passes_for_old_entity_status(tmp_path):
     assert details["invalid_rows"] == []
 
 
-def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_path):
+def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(tmp_path):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
@@ -330,14 +331,15 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_
         writer.writerow(["dataset-a", "org-2", "900", "1000"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_field_is_within_range_by_dataset_org(
         conn,
         file_path=file_path,
         external_file=external_file,
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity",
-        rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}},
+        lookup_dataset_field="prefix",
+        range_dataset_field="dataset",
     )
 
     assert passed is False
@@ -349,7 +351,7 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_fails(tmp_
     assert details["invalid_rows"][0]["organisation"] == "org-1"
 
 
-def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp_path):
+def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_passes(tmp_path):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
@@ -365,40 +367,42 @@ def test_check_field_is_within_ranges_matches_prefix_and_organisation_passes(tmp
         writer.writerow(["dataset-a", "org-2", "900", "1000"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_field_is_within_range_by_dataset_org(
         conn,
         file_path=file_path,
         external_file=external_file,
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity",
-        rules={"match_columns": {"lookup": ["prefix", "organisation"], "range": ["dataset", "organisation"]}},
+        lookup_dataset_field="prefix",
+        range_dataset_field="dataset",
     )
 
 
-def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path):
+def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_names(tmp_path):
     file_path = tmp_path / "lookup_custom.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity_value", "dataset_key", "org_key", "ref_code"])
+        writer.writerow(["entity_value", "dataset_key", "organisation", "ref_code"])
         writer.writerow(["55", "dataset-x", "org-a", "ok-ref"])
         writer.writerow(["250", "dataset-x", "org-a", "bad-ref"])
 
     external_file = tmp_path / "ranges_custom.csv"
     with open(external_file, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["dataset_name", "org_name", "entity-minimum", "entity-maximum"])
+        writer.writerow(["dataset_name", "organisation", "entity-minimum", "entity-maximum"])
         writer.writerow(["dataset-x", "org-a", "50", "100"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_field_is_within_range_by_dataset_org(
         conn,
         file_path=file_path,
         external_file=external_file,
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity_value",
-        rules={"match_columns": {"lookup": ["dataset_key", "org_key"], "range": ["dataset_name", "org_name"]}},
+        lookup_dataset_field="dataset_key",
+        range_dataset_field="dataset_name",
     )
 
     assert passed is False
@@ -406,7 +410,7 @@ def test_check_field_is_within_ranges_supports_custom_column_names(tmp_path):
     assert details["invalid_rows"][0]["line_number"] == 3
     assert details["invalid_rows"][0]["entity_value"] == 250
     assert details["invalid_rows"][0]["dataset_key"] == "dataset-x"
-    assert details["invalid_rows"][0]["org_key"] == "org-a"
+    assert details["invalid_rows"][0]["organisation"] == "org-a"
 
 
 def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path):
@@ -426,7 +430,7 @@ def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path):
         writer.writerow(["100", "200"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=file_path,
         external_file=external_file,
@@ -456,7 +460,7 @@ def test_check_field_is_within_ranges_lookup_rules_operator_eq_shape(tmp_path):
         writer.writerow(["100", "200"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=file_path,
         external_file=external_file,
@@ -485,7 +489,7 @@ def test_check_field_is_within_ranges_lookup_rules_exact_match(tmp_path):
         writer.writerow(["100", "200"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=file_path,
         external_file=external_file,
@@ -515,7 +519,7 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path):
         writer.writerow(["100", "200"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=file_path,
         external_file=external_file,
@@ -530,14 +534,49 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path):
     assert details["invalid_rows"][0]["line_number"] == 3
     assert details["invalid_rows"][0]["value"] == 350
 
+
+def test_check_field_is_within_ranges_comma_separated_fields(tmp_path):
+    file_path = tmp_path / "lookup.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "end-entity"])
+        writer.writerow(["150", "160"])  
+        writer.writerow(["150", "350"])  
+        writer.writerow(["350", "150"])  
+
+    external_file = tmp_path / "ranges.csv"
+    with open(external_file, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity-minimum", "entity-maximum"])
+        writer.writerow(["100", "200"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_fields_are_within_range(
+        conn,
+        file_path=file_path,
+        external_file=external_file,
+        min_field="entity-minimum",
+        max_field="entity-maximum",
+        field="entity, end-entity",
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+    assert details["invalid_rows"][0]["line_number"] == 4
+    assert details["invalid_rows"][0]["field"] == "entity"
+    assert details["invalid_rows"][0]["value"] == 350
+    assert details["invalid_rows"][1]["line_number"] == 3
+    assert details["invalid_rows"][1]["field"] == "end-entity"
+    assert details["invalid_rows"][1]["value"] == 350
+
 def test_check_field_is_within_ranges_for_only_staus_301(tmp_path):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity", "status"])
-        writer.writerow(["150", "301"]) 
-        writer.writerow(["250", "301"]) 
-        writer.writerow(["350", "410"])
+        writer.writerow(["entity", "status","old-entity"])
+        writer.writerow(["150", "301", "140"]) 
+        writer.writerow(["250", "301", "150"]) 
+        writer.writerow(["350", "410", "340"])
 
     external_file = tmp_path / "ranges.csv"
     with open(external_file, "w", newline="") as f:
@@ -546,17 +585,18 @@ def test_check_field_is_within_ranges_for_only_staus_301(tmp_path):
         writer.writerow(["100", "200"])
 
     conn = duckdb.connect()
-    passed, message, details = check_field_is_within_range(
+    passed, message, details = check_fields_are_within_range(
         conn,
         file_path=file_path,
         external_file=external_file,
         min_field="entity-minimum",
         max_field="entity-maximum",
-        field="entity",
+        field="entity,old-entity",
         rules={"lookup_rules": {"status": {"op": "=", "value": "301"}}},
     )
 
     assert passed is False
     assert len(details["invalid_rows"]) == 1
     assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["field"] == "entity"
     assert details["invalid_rows"][0]["value"] == 250

From 70524874dd1e3ad4617f61032b00ec944836c6b7 Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Fri, 27 Mar 2026 17:05:15 +0000
Subject: [PATCH 05/12] add check_no_blank_rows function and corresponding
 tests

---
 digital_land/expectations/checkpoints/csv.py  |  6 ++-
 digital_land/expectations/operations/csv.py   | 50 +++++++++++++++++++
 .../expectations/operations/test_csv.py       | 36 +++++++++++++
 3 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py
index b2258cb9..a948f65e 100644
--- a/digital_land/expectations/checkpoints/csv.py
+++ b/digital_land/expectations/checkpoints/csv.py
@@ -11,7 +11,8 @@
     check_no_overlapping_ranges,
     check_fields_are_within_range,
     check_field_is_within_range_by_dataset_org,
-    check_allowed_values
+    check_allowed_values,
+    check_no_blank_rows,
 )
 
 
@@ -29,7 +30,8 @@ def operation_factory(self, operation_string: str):
             "check_no_overlapping_ranges": check_no_overlapping_ranges,
             "check_fields_are_within_range": check_fields_are_within_range,
             "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org,
-            "check_allowed_values":check_allowed_values
+            "check_allowed_values": check_allowed_values,
+            "check_no_blank_rows": check_no_blank_rows,
         }
         if operation_string not in operation_map:
             raise ValueError(
diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index dfb9d949..8840c414 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -348,6 +348,56 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list
     return passed, message, details
 
 
+def check_no_blank_rows(conn, file_path: Path):
+    """
+    Checks that the CSV does not contain fully blank rows.
+
+    A row is considered blank when every column is empty after trimming whitespace.
+
+    Args:
+        conn: duckdb connection
+        file_path: path to the CSV file
+    """
+    file_columns = _get_csv_columns(conn, file_path)
+    if not file_columns:
+        return True, "no blank rows found", {"invalid_rows": []}
+
+    blank_conditions = " AND ".join(
+        f"TRIM(COALESCE({_sql_identifier(column_name)}, '')) = ''"
+        for column_name in file_columns
+    )
+
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT
+                ROW_NUMBER() OVER () + 1 AS line_number,
+                *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT
+            line_number
+        FROM source_rows
+        WHERE {blank_conditions}
+        ORDER BY line_number
+        """
+    ).fetchall()
+
+    invalid_rows = [{"line_number": row[0]} for row in result]
+
+    if len(invalid_rows) == 0:
+        passed = True
+        message = "no blank rows found"
+    else:
+        passed = False
+        message = f"there were {len(invalid_rows)} blank rows found"
+
+    details = {
+        "invalid_rows": invalid_rows,
+    }
+    return passed, message, details
+
+
 def check_fields_are_within_range(
     conn,
     file_path: Path,
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 176f02bc..3d89837d 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -8,6 +8,7 @@
     check_no_shared_values,
     check_no_overlapping_ranges,
     check_allowed_values,
+    check_no_blank_rows,
     check_fields_are_within_range,
     check_field_is_within_range_by_dataset_org,
 )
@@ -314,6 +315,41 @@ def test_check_allowed_values_passes_for_old_entity_status(tmp_path):
     assert details["invalid_rows"] == []
 
 
+def test_check_no_blank_rows_passes(tmp_path):
+    file_path = tmp_path / "no-blank-rows.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "name", "reference"])
+        writer.writerow(["1", "foo", "ref1"])
+        writer.writerow(["2", "bar", "ref2"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_no_blank_rows(conn, file_path=file_path)
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_no_blank_rows_fails(tmp_path):
+    file_path = tmp_path / "has-blank-rows.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "name", "reference"])
+        writer.writerow(["1", "foo", "ref1"])
+        writer.writerow(["", "", ""])
+        writer.writerow([" ", "", "   "])
+        writer.writerow(["2", "bar", "ref2"])
+
+    conn = duckdb.connect()
+    passed, message, details = check_no_blank_rows(conn, file_path=file_path)
+
+    assert passed is False
+    assert "blank rows" in message
+    assert len(details["invalid_rows"]) == 2
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][1]["line_number"] == 4
+
+
 def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(tmp_path):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:

From 2df8646592aef78334415d3e9b724d59a868fc09 Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Mon, 30 Mar 2026 09:29:57 +0100
Subject: [PATCH 06/12] add datatype validation for CSV values and
 corresponding tests

---
 digital_land/expectations/checkpoints/csv.py  |   2 +
 digital_land/expectations/operations/csv.py   |  78 +++++
 .../operations/datatype_validators.py         | 282 ++++++++++++++++++
 .../expectations/checkpoints/test_csv.py      |  34 +++
 .../expectations/operations/test_csv.py       | 129 ++++++++
 5 files changed, 525 insertions(+)
 create mode 100644 digital_land/expectations/operations/datatype_validators.py

diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py
index a948f65e..5b33a771 100644
--- a/digital_land/expectations/checkpoints/csv.py
+++ b/digital_land/expectations/checkpoints/csv.py
@@ -13,6 +13,7 @@
     check_field_is_within_range_by_dataset_org,
     check_allowed_values,
     check_no_blank_rows,
+    check_values_have_the_correct_datatype,
 )
 
 
@@ -32,6 +33,7 @@ def operation_factory(self, operation_string: str):
             "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org,
             "check_allowed_values": check_allowed_values,
             "check_no_blank_rows": check_no_blank_rows,
+            "check_values_have_the_correct_datatype": check_values_have_the_correct_datatype,
         }
         if operation_string not in operation_map:
             raise ValueError(
diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 8840c414..12622ac1 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -1,4 +1,7 @@
 from pathlib import Path
+import pandas as pd
+
+from digital_land.expectations.operations.datatype_validators import _is_valid_address_value, _is_valid_curie_list_value, _is_valid_curie_value, _is_valid_datetime_value, _is_valid_decimal_value, _is_valid_flag_value, _is_valid_hash_value, _is_valid_integer_value, _is_valid_json_value, _is_valid_latitude_value, _is_valid_longitude_value, _is_valid_multipolygon_value, _is_valid_pattern_value, _is_valid_point_value, _is_valid_reference_value, _is_valid_url_value
 
 
 def _read_csv(file_path: Path) -> str:
@@ -635,3 +638,78 @@ def check_field_is_within_range_by_dataset_org(
     details = {"invalid_rows": out_of_range_rows}
     return passed, message, details
 
+
+def check_values_have_the_correct_datatype(conn,file_path, field_datatype):
+    """
+    Validates that CSV column values have correct datatypes.
+
+    This function uses pandas to read and validate the CSV using datatype validators.
+    The conn parameter is accepted for consistency with other operations but not used.
+
+    Args:
+        file_path: path to the CSV file to validate
+        field_datatype: dict mapping column name to datatype string
+    """
+    validators = {
+        "address": _is_valid_address_value,
+        "curie-list": _is_valid_curie_list_value,
+        "curie": _is_valid_curie_value,
+        "date": _is_valid_datetime_value,
+        "datetime": _is_valid_datetime_value,
+        "decimal": _is_valid_decimal_value,
+        "flag": _is_valid_flag_value,
+        "hash": _is_valid_hash_value,
+        "integer": _is_valid_integer_value,
+        "json": _is_valid_json_value,
+        "latitude": _is_valid_latitude_value,
+        "longitude": _is_valid_longitude_value,
+        "multipolygon": _is_valid_multipolygon_value,
+        "pattern": _is_valid_pattern_value,
+        "point": _is_valid_point_value,
+        "reference": _is_valid_reference_value,
+        "url": _is_valid_url_value,
+    }
+
+    # Read CSV with pandas (keep_default_na=False preserves empty strings)
+    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
+
+    if df.empty or len(df.columns) == 0:
+        return True, "no invalid values found", {"invalid_rows": []}
+
+    # Identify applicable fields for validation
+    applicable_fields = [
+        (field, field_datatype.get(field), validators[field_datatype.get(field)])
+        for field in df.columns
+        if field in field_datatype and field_datatype.get(field) in validators
+    ]
+
+    if not applicable_fields:
+        return True, "no invalid values found", {"invalid_rows": []}
+
+    # Validate values
+    invalid_values = []
+    for line_number, (idx, row) in enumerate(df.iterrows(), start=2):
+        for field, datatype, validator in applicable_fields:
+            value = str(row.get(field, "")).strip()
+            if not value:
+                continue
+
+            if not validator(value):
+                invalid_values.append({
+                    "line_number": line_number,
+                    "field": field,
+                    "datatype": datatype,
+                    "value": value,
+                })
+
+    if len(invalid_values) == 0:
+        passed = True
+        message = "all values have valid datatypes"
+        details = {"invalid_rows": []}
+    else:
+        passed = False
+        message = f"there were {len(invalid_values)} invalid datatype value(s) found"
+        details = {"invalid_rows": invalid_values}
+
+    return passed, message, details
+
diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py
new file mode 100644
index 00000000..7ef6bd80
--- /dev/null
+++ b/digital_land/expectations/operations/datatype_validators.py
@@ -0,0 +1,282 @@
+import json
+import re
+import urllib.parse
+from datetime import datetime
+from decimal import Decimal, InvalidOperation
+
+import shapely.errors
+import shapely.wkt
+from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon, shape
+
+
+def _is_valid_datetime_value(value):
+    """Return True when value parses as a date, a datetime or a unix timestamp.
+
+    The value is trimmed and lower-cased first; each strptime pattern is
+    tried in turn before falling back to a numeric timestamp range check.
+    """
+    value = value.strip().strip('",').lower()
+
+    # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/date.py#L22
+    patterns = [
+        # Date/date-like formats
+        "%Y-%m-%d",
+        "%Y%m%d",
+        "%Y/%m/%d",
+        "%Y %m %d",
+        "%Y.%m.%d",
+        "%Y-%d-%m",  # risky!
+        "%Y-%m",
+        "%Y.%m",
+        "%Y/%m",
+        "%Y %m",
+        "%Y",
+        "%Y.0",
+        "%d/%m/%Y",
+        "%d/%m/%y",
+        "%d-%m-%Y",
+        "%d-%m-%y",
+        "%d.%m.%Y",
+        "%d.%m.%y",
+        "%d-%b-%Y",
+        "%d-%b-%y",
+        "%d %B %Y",
+        "%b %d, %Y",
+        "%b %d, %y",
+        "%b-%y",
+        "%B %Y",
+        "%m/%d/%Y",  # risky!
+        # Datetime formats
+        "%Y-%m-%dT%H:%M:%S.000Z",
+        "%Y-%m-%dT%H:%M:%S.000",
+        "%Y-%m-%dT%H:%M:%S.%fZ",
+        "%Y-%m-%dT%H:%M:%S.%f%z",
+        "%Y-%m-%dT%H:%M:%S.%f",
+        "%Y-%m-%dT%H:%M:%SZ",
+        "%Y-%m-%dT%H:%M:%S",
+        "%Y/%m/%d %H:%M:%S%z",
+        "%Y/%m/%d %H:%M:%S+00",
+        "%Y/%m/%d %H:%M:%S",
+        "%Y/%m/%d %H:%M",
+        "%Y/%m/%dT%H:%M:%S",
+        "%Y/%m/%dT%H:%M:%S.000Z",
+        "%Y/%m/%dT%H:%M:%S.000",
+        "%Y/%m/%dT%H:%M:%S.%fZ",
+        "%Y/%m/%dT%H:%M:%S.%f%z",
+        "%Y/%m/%dT%H:%M:%S.%f",
+        "%Y/%m/%dT%H:%M:%SZ",
+        "%Y-%m-%d %H:%M:%S",
+        "%d/%m/%Y %H:%M:%S",
+        "%d/%m/%Y %H:%M",
+    ]
+
+    # Trim over-long fractional seconds: %f accepts at most six digits.
+    # The text was lower-cased above, so the UTC marker to look for is "z".
+    if "." in value and "z" in value:
+        parts = value.replace("z", "").split(".")
+        if len(parts) == 2 and len(parts[1]) > 6:
+            value = parts[0] + "." + parts[1][:6] + "z"
+    elif "." in value and "+" in value:
+        base_part, tz_part = value.split("+")[:2]
+        if "." in base_part:
+            date_time, frac = base_part.rsplit(".", 1)
+            value = date_time + "." + frac[:6] + "+" + tz_part
+
+    for pattern in patterns:
+        try:
+            datetime.strptime(value, pattern)
+            return True
+        except ValueError:
+            continue
+
+    # Try unix timestamp
+    try:
+        float_val = float(value)
+        return -62135596800 < float_val < 253402300800  # Year 1 to 9999
+    except ValueError:
+        return False
+
+
+def _is_valid_integer_value(value):
+    """Return True when value parses as a number with no fractional part."""
+    try:
+        return float(value).is_integer()
+    except (ValueError, OverflowError):
+        return False
+
+
+def _is_valid_decimal_value(value):
+    try:
+        Decimal(value)
+        return True
+    except (InvalidOperation, ValueError):
+        return False
+
+
+def _is_valid_flag_value(value):
+    """Return True when value is blank or one of the recognised flag words.
+
+    Matching ignores case and surrounding whitespace; "y"/"true" are read as
+    "yes" and "n"/"false" as "no".
+    """
+    aliases = {"y": "yes", "n": "no", "true": "yes", "false": "no"}
+    token = value.strip().lower()
+    if token in aliases:
+        token = aliases[token]
+
+    return token in {"", "yes", "no"}
+
+
+def _is_valid_json_value(value):
+    try:
+        json.loads(value)
+        return True
+    except json.JSONDecodeError:
+        return False
+
+
+def _is_valid_reference_value(value):
+    return bool(value.strip()) and not any(ch.isspace() for ch in value)
+
+
+def _is_valid_curie_value(value):
+    return bool(re.fullmatch(r"[a-z0-9-]+:[^\s:][^\s]*", value))
+
+
+def _is_valid_curie_list_value(value):
+    """Return True when value is a ";"-separated list of one or more CURIEs.
+
+    Whitespace around each item is ignored; an empty item (for example from
+    a trailing ";") makes the whole list invalid.
+    """
+    curie_re = re.compile(r"[a-z0-9-]+:[^\s:][^\s]*")
+    for item in (value or "").strip().split(";"):
+        if not curie_re.fullmatch(item.strip()):
+            return False
+    return True
+
+
+def _is_valid_address_value(value):
+    """Return True when value still has content after address clean-up.
+
+    Newlines and ";" become commas, repeated commas are collapsed, spaces
+    around hyphens are removed and double quotes are dropped.
+    """
+    if not (value and value.strip()):
+        return False
+
+    # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/address.py#L10
+    text = value.strip().replace("\n", ", ").replace(";", ",")
+
+    # Collapse runs of commas, then trim commas and spaces from both ends.
+    text = re.sub(r",\s*,+", ", ", text).strip(", ")
+
+    # Remove the whitespace on either side of every hyphen.
+    text = re.sub(r"\s*-\s*", "-", text)
+
+    text = " ".join(text.split()).replace('"', "")
+    return bool(text.strip())
+
+
+def _is_valid_url_value(value):
+    """Return True when value parses as a URL with both a scheme and a netloc."""
+    try:
+        parsed = urllib.parse.urlparse((value or "").strip().strip("'"))
+    except ValueError:  # urlparse rejects malformed hosts, e.g. "http://[::1"
+        return False
+    return bool(parsed.scheme and parsed.netloc)
+
+
+def _is_valid_hash_value(value):
+    digest = value.split(":", 1)[-1]  # skip anything before the first ":"
+    return bool(re.fullmatch(r"[0-9a-fA-F]+", digest))
+
+
+def _is_valid_pattern_value(value):
+    try:
+        re.compile(value)
+        return True
+    except re.error:
+        return False
+
+
+def _is_valid_latitude_value(value):
+    """Return True when value is a number between -90 and 90 inclusive."""
+    try:
+        return -90 <= float(value) <= 90
+    except ValueError:
+        return False
+
+
+def _is_valid_longitude_value(value):
+    """Return True when value is a number between -180 and 180 inclusive."""
+    try:
+        return -180 <= float(value) <= 180
+    except ValueError:
+        return False
+
+
+def _is_valid_multipolygon_value(value):
+    candidate = (value or "").strip()
+    if not candidate:
+        return False
+
+    try:
+        geometry = shapely.wkt.loads(candidate)
+    except shapely.errors.WKTReadingError:
+        try:
+            geojson = json.loads(candidate)
+            geometry = shape(geojson)
+        except Exception:
+            return False
+
+    if not isinstance(geometry, (Polygon, MultiPolygon, GeometryCollection)):
+        return False
+
+    # Shapely normal validity check where available.
+    is_valid = getattr(geometry, "is_valid", True)
+    return bool(is_valid)
+
+
+def _is_valid_point_value(value):
+    candidate = value
+    if candidate is None:
+        return False
+
+    # Try WKT first.
+    try:
+        point = shapely.wkt.loads(candidate if isinstance(candidate, str) else str(candidate))
+        if not isinstance(point, Point):
+            return False
+        return bool(getattr(point, "is_valid", True))
+    except shapely.errors.WKTReadingError:
+        pass
+    except Exception:
+        return False
+
+    # Fallback to coordinate pair.
+    try:
+        if isinstance(candidate, (list, tuple)) and len(candidate) == 2:
+            x_raw, y_raw = candidate[0], candidate[1]
+        elif isinstance(candidate, str):
+            text = candidate.strip()
+            if not text:
+                return False
+
+            if text.startswith("["):
+                coords = json.loads(text)
+                if not isinstance(coords, list) or len(coords) != 2:
+                    return False
+                x_raw, y_raw = coords[0], coords[1]
+            else:
+                parts = [p.strip() for p in text.split(",")]
+                if len(parts) != 2:
+                    return False
+                x_raw, y_raw = parts[0], parts[1]
+        else:
+            return False
+
+        point = Point(float(x_raw), float(y_raw))
+        return bool(getattr(point, "is_valid", True))
+    except Exception:
+        return False
diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py
index 3458fe84..44f4f1ba 100644
--- a/tests/integration/expectations/checkpoints/test_csv.py
+++ b/tests/integration/expectations/checkpoints/test_csv.py
@@ -1,4 +1,5 @@
 import csv
+import json
 import pytest
 
 from digital_land.expectations.checkpoints.csv import CsvCheckpoint
@@ -73,3 +74,36 @@ def test_invalid_operation(self, csv_file):
             checkpoint.load(
                 [{"operation": "nonexistent", "name": "test", "parameters": "{}"}]
             )
+
+    def test_check_values_have_the_correct_datatype_rule(self, tmp_path):
+        file_path = tmp_path / "test_datatypes.csv"
+        with open(file_path, "w", newline="") as f:
+            writer = csv.writer(f)
+            writer.writerow(["entity", "count"])
+            writer.writerow(["entity-1", "100"])
+            writer.writerow(["entity-2", "invalid_int"])
+
+        checkpoint = CsvCheckpoint("test-dataset", file_path)
+        rules = [
+            {
+                "operation": "check_values_have_the_correct_datatype",
+                "name": "Datatype validation",
+                "parameters": {
+                    "field_datatype": {
+                        "entity": "reference",
+                        "count": "integer",
+                    }
+                },
+            }
+        ]
+
+        checkpoint.load(rules)
+        checkpoint.run()
+
+        assert len(checkpoint.log.entries) == 1
+        assert checkpoint.log.entries[0]["operation"] == "check_values_have_the_correct_datatype"
+        assert checkpoint.log.entries[0]["passed"] is False
+        details = json.loads(checkpoint.log.entries[0]["details"])
+        assert len(details["invalid_rows"]) == 1
+        assert details["invalid_rows"][0]["line_number"] == 3
+        assert details["invalid_rows"][0]["field"] == "count"
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 3d89837d..a61b2180 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -11,6 +11,7 @@
     check_no_blank_rows,
     check_fields_are_within_range,
     check_field_is_within_range_by_dataset_org,
+    check_values_have_the_correct_datatype,
 )
 
 
@@ -636,3 +637,131 @@ def test_check_field_is_within_ranges_for_only_staus_301(tmp_path):
     assert details["invalid_rows"][0]["line_number"] == 3
     assert details["invalid_rows"][0]["field"] == "entity"
     assert details["invalid_rows"][0]["value"] == 250
+
+
+def test_check_values_have_the_correct_datatype_passes(tmp_path):
+    """Test datatype validation with all valid values."""
+    file_path = tmp_path / "valid_datatypes.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "count", "enabled"])
+        writer.writerow(["entity-1", "100", "true"])
+        writer.writerow(["entity-2", "200", "false"])
+
+    field_datatype = {
+        "entity": "reference",
+        "count": "integer",
+        "enabled": "flag",
+    }
+
+    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_values_have_the_correct_datatype_fails(tmp_path):
+    """Test datatype validation with invalid values."""
+    file_path = tmp_path / "invalid_datatypes.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "count", "enabled"])
+        writer.writerow(["entity-1", "100", "true"])
+        writer.writerow(["entity-2", "not_a_number", "false"])
+        writer.writerow(["entity-3", "300", "maybe"])
+
+    field_datatype = {
+        "entity": "reference",
+        "count": "integer",
+        "enabled": "flag",
+    }
+
+    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+    assert details["invalid_rows"][0]["line_number"] == 3
+    assert details["invalid_rows"][0]["field"] == "count"
+    assert details["invalid_rows"][0]["value"] == "not_a_number"
+    assert details["invalid_rows"][0]["datatype"] == "integer"
+    assert details["invalid_rows"][1]["line_number"] == 4
+    assert details["invalid_rows"][1]["field"] == "enabled"
+    assert details["invalid_rows"][1]["value"] == "maybe"
+    assert "invalid datatype value(s)" in message
+
+
+def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path):
+    """Test that empty values are skipped during validation."""
+    file_path = tmp_path / "with_empty_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "count"])
+        writer.writerow(["entity-1", "100"])
+        writer.writerow(["entity-2", ""])
+        writer.writerow(["entity-3", "300"])
+
+    field_datatype = {
+        "entity": "reference",
+        "count": "integer",
+    }
+
+    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path):
+    """Test that fields not in field_datatype map are not validated."""
+    file_path = tmp_path / "unmapped_fields.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "count", "description"])
+        writer.writerow(["entity-1", "100", "invalid_but_ignored"])
+
+    field_datatype = {
+        "entity": "reference",
+        "count": "integer",
+    }
+
+    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_values_have_the_correct_datatype_empty_file(tmp_path):
+    """Test behavior with empty CSV file."""
+    file_path = tmp_path / "empty.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["entity", "count"])
+
+    field_datatype = {
+        "entity": "reference",
+        "count": "integer",
+    }
+
+    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+
+    assert passed is True
+    assert details["invalid_rows"] == []
+
+
+def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path):
+    """Test when no fields have datatype validators."""
+    file_path = tmp_path / "no_applicable.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["name", "description"])
+        writer.writerow(["field1", "some value"])
+
+    field_datatype = {
+        "name": "string",
+        "description": "string",
+    }
+
+    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+
+    assert passed is True
+    assert details["invalid_rows"] == []

From 7be8ced2ab60bc6da1eb0bdd675c4ef19ff1701f Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Mon, 30 Mar 2026 12:27:33 +0100
Subject: [PATCH 07/12] remove unused test for datatype validation in
 CsvCheckpoint

---
 digital_land/expectations/operations/csv.py   | 49 ++++++-------------
 .../expectations/checkpoints/test_csv.py      | 35 +------------
 2 files changed, 17 insertions(+), 67 deletions(-)

diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 12622ac1..c0514821 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -20,10 +20,6 @@ def _sql_string(value) -> str:
     return f"'{cleaned}'"
 
 
-def _sql_identifier(name: str) -> str:
-    return '"' + str(name).replace('"', '""') + '"'
-
-
 def _normalize_condition_groups(conditions, name: str) -> list:
     if conditions is None:
         return []
@@ -118,8 +114,6 @@ def _normalize_fields_for_validation(field_spec, file_columns: list) -> list:
 def _build_range_invalid_rows(
     result: list,
     validating_multiple_fields: bool,
-    has_match_columns: bool,
-    lookup_match_columns: list = None,
 ) -> list:
     """Format query rows into expectation invalid_rows shape."""
     out_of_range_rows = []
@@ -127,17 +121,9 @@ def _build_range_invalid_rows(
     for row in result:
         field_name = row[1]
 
-        if has_match_columns:
-            if validating_multiple_fields:
-                invalid_row = {"line_number": row[0], "field": field_name, "value": row[2]}
-            else:
-                invalid_row = {"line_number": row[0], field_name: row[2]}
-            for i, col_name in enumerate(lookup_match_columns):
-                invalid_row[col_name] = row[i + 3]
-        else:
-            invalid_row = {"line_number": row[0], "value": row[2]}
-            if validating_multiple_fields:
-                invalid_row["field"] = field_name
+        invalid_row = {"line_number": row[0], "value": row[2]}
+        if validating_multiple_fields:
+            invalid_row["field"] = field_name
 
         out_of_range_rows.append(invalid_row)
 
@@ -366,7 +352,7 @@ def check_no_blank_rows(conn, file_path: Path):
         return True, "no blank rows found", {"invalid_rows": []}
 
     blank_conditions = " AND ".join(
-        f"TRIM(COALESCE({_sql_identifier(column_name)}, '')) = ''"
+        f'TRIM(COALESCE("{column_name}", \'\')) = \'\''
         for column_name in file_columns
     )
 
@@ -445,7 +431,7 @@ def check_fields_are_within_range(
     fields_to_validate = _normalize_fields_for_validation(field, file_columns)
     validating_multiple_fields = len(fields_to_validate) > 1
     lookup_values_sql = ",\n                    ".join(
-        f"({i}, {_sql_string(field_name)}, TRY_CAST(src.{_sql_identifier(field_name)} AS BIGINT))"
+        f'({i}, {_sql_string(field_name)}, TRY_CAST(src."{field_name}" AS BIGINT))'
         for i, field_name in enumerate(fields_to_validate)
     )
 
@@ -495,7 +481,6 @@ def check_fields_are_within_range(
     out_of_range_rows = _build_range_invalid_rows(
         result=result,
         validating_multiple_fields=validating_multiple_fields,
-        has_match_columns=False,
     )
 
     if len(out_of_range_rows) == 0:
@@ -565,13 +550,11 @@ def check_field_is_within_range_by_dataset_org(
     range_dataset_name = str(range_dataset_field).strip()
     lookup_match_columns = [lookup_dataset_name, "organisation"]
 
-    lookup_dataset_col = _sql_identifier(lookup_dataset_name)
-    lookup_org_col = _sql_identifier("organisation")
-    range_dataset_col = _sql_identifier(range_dataset_name)
-    range_org_col = _sql_identifier("organisation")
-    min_col = _sql_identifier(min_field)
-    max_col = _sql_identifier(max_field)
-    value_col = _sql_identifier(field_name)
+    lookup_dataset_col = f'"{lookup_dataset_name}"'
+    range_dataset_col = f'"{range_dataset_name}"'
+    min_col = f'"{min_field}"'
+    max_col = f'"{max_field}"'
+    value_col = f'"{field_name}"'
 
     result = conn.execute(
         f"""
@@ -580,12 +563,12 @@ def check_field_is_within_range_by_dataset_org(
                 TRY_CAST({min_col} AS BIGINT) AS min_value,
                 TRY_CAST({max_col} AS BIGINT) AS max_value,
                 TRIM(COALESCE({range_dataset_col}, '')) AS range_key_0,
-                                TRIM(COALESCE({range_org_col}, '')) AS range_key_1
+                                TRIM(COALESCE("organisation", '')) AS range_key_1
             FROM {_read_csv(external_file)}
             WHERE TRY_CAST({min_col} AS BIGINT) IS NOT NULL
               AND TRY_CAST({max_col} AS BIGINT) IS NOT NULL
               AND TRIM(COALESCE({range_dataset_col}, '')) != ''
-                            AND TRIM(COALESCE({range_org_col}, '')) != ''
+                            AND TRIM(COALESCE("organisation", '')) != ''
         ),
         source_rows AS (
             SELECT
@@ -598,11 +581,11 @@ def check_field_is_within_range_by_dataset_org(
                 src.line_number,
                 TRY_CAST(src.{value_col} AS BIGINT) AS value,
                 TRIM(COALESCE(src.{lookup_dataset_col}, '')) AS lookup_key_0,
-                TRIM(COALESCE(src.{lookup_org_col}, '')) AS lookup_key_1
+                TRIM(COALESCE(src."organisation", '')) AS lookup_key_1
             FROM source_rows src
             WHERE TRY_CAST(src.{value_col} AS BIGINT) IS NOT NULL
               AND TRIM(COALESCE(src.{lookup_dataset_col}, '')) != ''
-              AND TRIM(COALESCE(src.{lookup_org_col}, '')) != ''{lookup_clause}
+              AND TRIM(COALESCE(src."organisation", '')) != ''{lookup_clause}
         )
         SELECT
             line_number,
@@ -639,7 +622,7 @@ def check_field_is_within_range_by_dataset_org(
     return passed, message, details
 
 
-def check_values_have_the_correct_datatype(conn,file_path, field_datatype):
+def check_values_have_the_correct_datatype(file_path, field_datatype):
     """
     Validates that CSV column values have correct datatypes.
 
@@ -688,7 +671,7 @@ def check_values_have_the_correct_datatype(conn,file_path, field_datatype):
 
     # Validate values
     invalid_values = []
-    for line_number, (idx, row) in enumerate(df.iterrows(), start=2):
+    for line_number, (_, row) in enumerate(df.iterrows(), start=2):
         for field, datatype, validator in applicable_fields:
             value = str(row.get(field, "")).strip()
             if not value:
diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py
index 44f4f1ba..3bc89b00 100644
--- a/tests/integration/expectations/checkpoints/test_csv.py
+++ b/tests/integration/expectations/checkpoints/test_csv.py
@@ -73,37 +73,4 @@ def test_invalid_operation(self, csv_file):
         with pytest.raises(ValueError):
             checkpoint.load(
                 [{"operation": "nonexistent", "name": "test", "parameters": "{}"}]
-            )
-
-    def test_check_values_have_the_correct_datatype_rule(self, tmp_path):
-        file_path = tmp_path / "test_datatypes.csv"
-        with open(file_path, "w", newline="") as f:
-            writer = csv.writer(f)
-            writer.writerow(["entity", "count"])
-            writer.writerow(["entity-1", "100"])
-            writer.writerow(["entity-2", "invalid_int"])
-
-        checkpoint = CsvCheckpoint("test-dataset", file_path)
-        rules = [
-            {
-                "operation": "check_values_have_the_correct_datatype",
-                "name": "Datatype validation",
-                "parameters": {
-                    "field_datatype": {
-                        "entity": "reference",
-                        "count": "integer",
-                    }
-                },
-            }
-        ]
-
-        checkpoint.load(rules)
-        checkpoint.run()
-
-        assert len(checkpoint.log.entries) == 1
-        assert checkpoint.log.entries[0]["operation"] == "check_values_have_the_correct_datatype"
-        assert checkpoint.log.entries[0]["passed"] is False
-        details = json.loads(checkpoint.log.entries[0]["details"])
-        assert len(details["invalid_rows"]) == 1
-        assert details["invalid_rows"][0]["line_number"] == 3
-        assert details["invalid_rows"][0]["field"] == "count"
+            )
\ No newline at end of file

From 1e1d979ca94bbd44f271117fd59de587c834eeeb Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Mon, 30 Mar 2026 12:29:39 +0100
Subject: [PATCH 08/12] added conn to test

---
 digital_land/expectations/operations/csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index c0514821..ebdc31d5 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -622,7 +622,7 @@ def check_field_is_within_range_by_dataset_org(
     return passed, message, details
 
 
-def check_values_have_the_correct_datatype(file_path, field_datatype):
+def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None):
     """
     Validates that CSV column values have correct datatypes.
 

From acb1449e008196b9d978029a0d8f8c5e88079530 Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Mon, 30 Mar 2026 13:19:50 +0100
Subject: [PATCH 09/12] improve code formatting and readability in CSV
 operations and datatype validators

---
 digital_land/expectations/operations/csv.py   | 60 ++++++++++-----
 .../operations/datatype_validators.py         |  4 +-
 .../expectations/checkpoints/test_csv.py      |  2 +-
 .../expectations/operations/test_csv.py       | 73 ++++++++++++-------
 4 files changed, 93 insertions(+), 46 deletions(-)

diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index ebdc31d5..01d69d40 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -1,7 +1,24 @@
 from pathlib import Path
 import pandas as pd
 
-from digital_land.expectations.operations.datatype_validators import _is_valid_address_value, _is_valid_curie_list_value, _is_valid_curie_value, _is_valid_datetime_value, _is_valid_decimal_value, _is_valid_flag_value, _is_valid_hash_value, _is_valid_integer_value, _is_valid_json_value, _is_valid_latitude_value, _is_valid_longitude_value, _is_valid_multipolygon_value, _is_valid_pattern_value, _is_valid_point_value, _is_valid_reference_value, _is_valid_url_value
+from digital_land.expectations.operations.datatype_validators import (
+    _is_valid_address_value,
+    _is_valid_curie_list_value,
+    _is_valid_curie_value,
+    _is_valid_datetime_value,
+    _is_valid_decimal_value,
+    _is_valid_flag_value,
+    _is_valid_hash_value,
+    _is_valid_integer_value,
+    _is_valid_json_value,
+    _is_valid_latitude_value,
+    _is_valid_longitude_value,
+    _is_valid_multipolygon_value,
+    _is_valid_pattern_value,
+    _is_valid_point_value,
+    _is_valid_reference_value,
+    _is_valid_url_value,
+)
 
 
 def _read_csv(file_path: Path) -> str:
@@ -10,9 +27,12 @@ def _read_csv(file_path: Path) -> str:
 
 def _get_csv_columns(conn, file_path: Path) -> list:
     """Get column names from CSV file."""
-    return [col[0] for col in conn.execute(
-        f"SELECT * FROM {_read_csv(file_path)} LIMIT 0"
-    ).description]
+    return [
+        col[0]
+        for col in conn.execute(
+            f"SELECT * FROM {_read_csv(file_path)} LIMIT 0"
+        ).description
+    ]
 
 
 def _sql_string(value) -> str:
@@ -90,7 +110,9 @@ def _normalize_fields_for_validation(field_spec, file_columns: list) -> list:
     elif isinstance(field_spec, (list, tuple, set)):
         fields = [str(item).strip() for item in field_spec if str(item).strip()]
     else:
-        raise ValueError("field must be a string, comma-separated string, or list of strings")
+        raise ValueError(
+            "field must be a string, comma-separated string, or list of strings"
+        )
 
     if not fields:
         raise ValueError("field must include at least one column name")
@@ -102,7 +124,9 @@ def _normalize_fields_for_validation(field_spec, file_columns: list) -> list:
             seen.add(field_name)
             normalized_fields.append(field_name)
 
-    missing_fields = [field_name for field_name in normalized_fields if field_name not in file_columns]
+    missing_fields = [
+        field_name for field_name in normalized_fields if field_name not in file_columns
+    ]
     if missing_fields:
         raise ValueError(
             f"Column(s) {missing_fields} not found in file. Available columns: {file_columns}"
@@ -323,9 +347,7 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list
         message = f"all values in '{field}' are allowed"
     else:
         passed = False
-        message = (
-            f"there were {len(invalid_rows)} invalid values in '{field}'"
-        )
+        message = f"there were {len(invalid_rows)} invalid values in '{field}'"
 
     details = {
         "field": field,
@@ -352,8 +374,7 @@ def check_no_blank_rows(conn, file_path: Path):
         return True, "no blank rows found", {"invalid_rows": []}
 
     blank_conditions = " AND ".join(
-        f'TRIM(COALESCE("{column_name}", \'\')) = \'\''
-        for column_name in file_columns
+        f"TRIM(COALESCE(\"{column_name}\", '')) = ''" for column_name in file_columns
     )
 
     result = conn.execute(
@@ -622,7 +643,7 @@ def check_field_is_within_range_by_dataset_org(
     return passed, message, details
 
 
-def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None):
+def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None):
     """
     Validates that CSV column values have correct datatypes.
 
@@ -678,12 +699,14 @@ def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None):
                 continue
 
             if not validator(value):
-                invalid_values.append({
-                    "line_number": line_number,
-                    "field": field,
-                    "datatype": datatype,
-                    "value": value,
-                })
+                invalid_values.append(
+                    {
+                        "line_number": line_number,
+                        "field": field,
+                        "datatype": datatype,
+                        "value": value,
+                    }
+                )
 
     if len(invalid_values) == 0:
         passed = True
@@ -695,4 +718,3 @@ def check_values_have_the_correct_datatype(file_path, field_datatype,conn=None):
         details = {"invalid_rows": invalid_values}
 
     return passed, message, details
-
diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py
index 7ef6bd80..1911c3c6 100644
--- a/digital_land/expectations/operations/datatype_validators.py
+++ b/digital_land/expectations/operations/datatype_validators.py
@@ -245,7 +245,9 @@ def _is_valid_point_value(value):
 
     # Try WKT first.
     try:
-        point = shapely.wkt.loads(candidate if isinstance(candidate, str) else str(candidate))
+        point = shapely.wkt.loads(
+            candidate if isinstance(candidate, str) else str(candidate)
+        )
         if not isinstance(point, Point):
             return False
         return bool(getattr(point, "is_valid", True))
diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py
index 3bc89b00..d3c0ee8d 100644
--- a/tests/integration/expectations/checkpoints/test_csv.py
+++ b/tests/integration/expectations/checkpoints/test_csv.py
@@ -73,4 +73,4 @@ def test_invalid_operation(self, csv_file):
         with pytest.raises(ValueError):
             checkpoint.load(
                 [{"operation": "nonexistent", "name": "test", "parameters": "{}"}]
-            )
\ No newline at end of file
+            )
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index a61b2180..2a9592f9 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -351,7 +351,9 @@ def test_check_no_blank_rows_fails(tmp_path):
     assert details["invalid_rows"][1]["line_number"] == 4
 
 
-def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(tmp_path):
+def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_fails(
+    tmp_path,
+):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
@@ -388,7 +390,9 @@ def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisa
     assert details["invalid_rows"][0]["organisation"] == "org-1"
 
 
-def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_passes(tmp_path):
+def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisation_passes(
+    tmp_path,
+):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
@@ -416,7 +420,9 @@ def test_check_field_is_within_ranges_by_dataset_org_matches_prefix_and_organisa
     )
 
 
-def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_names(tmp_path):
+def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_names(
+    tmp_path,
+):
     file_path = tmp_path / "lookup_custom.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
@@ -427,7 +433,9 @@ def test_check_field_is_within_ranges_by_dataset_org_supports_custom_column_name
     external_file = tmp_path / "ranges_custom.csv"
     with open(external_file, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["dataset_name", "organisation", "entity-minimum", "entity-maximum"])
+        writer.writerow(
+            ["dataset_name", "organisation", "entity-minimum", "entity-maximum"]
+        )
         writer.writerow(["dataset-x", "org-a", "50", "100"])
 
     conn = duckdb.connect()
@@ -457,7 +465,7 @@ def test_check_field_is_within_ranges_filters_rows_with_lookup_rules(tmp_path):
         writer = csv.writer(f)
         writer.writerow(["entity", "status"])
         writer.writerow(["150", "active"])
-        writer.writerow(["250", "active"])  
+        writer.writerow(["250", "active"])
         writer.writerow(["350", "inactive"])
 
     external_file = tmp_path / "ranges.csv"
@@ -487,8 +495,8 @@ def test_check_field_is_within_ranges_lookup_rules_operator_eq_shape(tmp_path):
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
         writer.writerow(["entity", "prefix"])
-        writer.writerow(["150", "conservationarea"]) 
-        writer.writerow(["350", "other"]) 
+        writer.writerow(["150", "conservationarea"])
+        writer.writerow(["350", "other"])
 
     external_file = tmp_path / "ranges.csv"
     with open(external_file, "w", newline="") as f:
@@ -516,8 +524,8 @@ def test_check_field_is_within_ranges_lookup_rules_exact_match(tmp_path):
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
         writer.writerow(["entity", "prefix"])
-        writer.writerow(["150", "conservationarea"])  
-        writer.writerow(["350", "other"]) 
+        writer.writerow(["150", "conservationarea"])
+        writer.writerow(["350", "other"])
 
     external_file = tmp_path / "ranges.csv"
     with open(external_file, "w", newline="") as f:
@@ -545,9 +553,9 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path):
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
         writer.writerow(["entity", "organisation"])
-        writer.writerow(["150", "org-a"])  
-        writer.writerow(["350", "org-b"])  
-        writer.writerow(["350", "org-c"])  
+        writer.writerow(["150", "org-a"])
+        writer.writerow(["350", "org-b"])
+        writer.writerow(["350", "org-c"])
 
     external_file = tmp_path / "ranges.csv"
     with open(external_file, "w", newline="") as f:
@@ -563,7 +571,9 @@ def test_check_field_is_within_ranges_lookup_rules_operator_in(tmp_path):
         min_field="entity-minimum",
         max_field="entity-maximum",
         field="entity",
-        rules={"lookup_rules": {"organisation": {"op": "in", "value": ["org-a", "org-b"]}}},
+        rules={
+            "lookup_rules": {"organisation": {"op": "in", "value": ["org-a", "org-b"]}}
+        },
     )
 
     assert passed is False
@@ -577,9 +587,9 @@ def test_check_field_is_within_ranges_comma_separated_fields(tmp_path):
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
         writer.writerow(["entity", "end-entity"])
-        writer.writerow(["150", "160"])  
-        writer.writerow(["150", "350"])  
-        writer.writerow(["350", "150"])  
+        writer.writerow(["150", "160"])
+        writer.writerow(["150", "350"])
+        writer.writerow(["350", "150"])
 
     external_file = tmp_path / "ranges.csv"
     with open(external_file, "w", newline="") as f:
@@ -606,13 +616,14 @@ def test_check_field_is_within_ranges_comma_separated_fields(tmp_path):
     assert details["invalid_rows"][1]["field"] == "end-entity"
     assert details["invalid_rows"][1]["value"] == 350
 
+
 def test_check_field_is_within_ranges_for_only_staus_301(tmp_path):
     file_path = tmp_path / "lookup.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity", "status","old-entity"])
-        writer.writerow(["150", "301", "140"]) 
-        writer.writerow(["250", "301", "150"]) 
+        writer.writerow(["entity", "status", "old-entity"])
+        writer.writerow(["150", "301", "140"])
+        writer.writerow(["250", "301", "150"])
         writer.writerow(["350", "410", "340"])
 
     external_file = tmp_path / "ranges.csv"
@@ -654,7 +665,9 @@ def test_check_values_have_the_correct_datatype_passes(tmp_path):
         "enabled": "flag",
     }
 
-    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+    passed, message, details = check_values_have_the_correct_datatype(
+        file_path, field_datatype
+    )
 
     assert passed is True
     assert details["invalid_rows"] == []
@@ -676,7 +689,9 @@ def test_check_values_have_the_correct_datatype_fails(tmp_path):
         "enabled": "flag",
     }
 
-    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+    passed, message, details = check_values_have_the_correct_datatype(
+        file_path, field_datatype
+    )
 
     assert passed is False
     assert len(details["invalid_rows"]) == 2
@@ -705,7 +720,9 @@ def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path):
         "count": "integer",
     }
 
-    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+    passed, message, details = check_values_have_the_correct_datatype(
+        file_path, field_datatype
+    )
 
     assert passed is True
     assert details["invalid_rows"] == []
@@ -724,7 +741,9 @@ def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path):
         "count": "integer",
     }
 
-    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+    passed, message, details = check_values_have_the_correct_datatype(
+        file_path, field_datatype
+    )
 
     assert passed is True
     assert details["invalid_rows"] == []
@@ -742,7 +761,9 @@ def test_check_values_have_the_correct_datatype_empty_file(tmp_path):
         "count": "integer",
     }
 
-    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+    passed, message, details = check_values_have_the_correct_datatype(
+        file_path, field_datatype
+    )
 
     assert passed is True
     assert details["invalid_rows"] == []
@@ -761,7 +782,9 @@ def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path):
         "description": "string",
     }
 
-    passed, message, details = check_values_have_the_correct_datatype(file_path, field_datatype)
+    passed, message, details = check_values_have_the_correct_datatype(
+        file_path, field_datatype
+    )
 
     assert passed is True
     assert details["invalid_rows"] == []

From 6818887a35a261127270fb9a294416bb624f177f Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Mon, 30 Mar 2026 13:31:19 +0100
Subject: [PATCH 10/12] fixed pipeline failure

cleanup
---
 digital_land/expectations/operations/csv.py   | 59 ++++++++-----------
 .../expectations/checkpoints/test_csv.py      |  1 -
 2 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 01d69d40..9bfa42aa 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -40,16 +40,6 @@ def _sql_string(value) -> str:
     return f"'{cleaned}'"
 
 
-def _normalize_condition_groups(conditions, name: str) -> list:
-    if conditions is None:
-        return []
-    if isinstance(conditions, dict):
-        return [conditions]
-    if isinstance(conditions, list):
-        return conditions
-    raise ValueError(f"{name} must be a dict, list of dicts, or None")
-
-
 def _build_field_condition(field_name: str, spec) -> str:
     if isinstance(spec, dict):
         op = str(spec.get("op", spec.get("operation", ""))).strip().lower()
@@ -79,27 +69,35 @@ def _build_field_condition(field_name: str, spec) -> str:
     )
 
 
-def _build_condition_group(group: dict, file_columns: list) -> str:
-    if not isinstance(group, dict) or not group:
-        raise ValueError("Each condition group must be a non-empty dict")
-
-    parts = []
-    for field_name, spec in group.items():
-        if field_name not in file_columns:
-            raise ValueError(
-                f"Column '{field_name}' not found in file. Available columns: {file_columns}"
-            )
-        parts.append(_build_field_condition(field_name, spec))
-
-    return f"({' AND '.join(parts)})"
-
-
 def _build_filter_clause(filter_spec, file_columns: list, name: str) -> str:
     """Build SQL clause that keeps rows matching structured conditions."""
-    groups = _normalize_condition_groups(filter_spec, name)
+    if filter_spec is None:
+        groups = []
+    elif isinstance(filter_spec, dict):
+        groups = [filter_spec]
+    elif isinstance(filter_spec, list):
+        groups = filter_spec
+    else:
+        raise ValueError(f"{name} must be a dict, list of dicts, or None")
+
     if not groups:
         return ""
-    clauses = [_build_condition_group(group, file_columns) for group in groups]
+
+    clauses = []
+    for group in groups:
+        if not isinstance(group, dict) or not group:
+            raise ValueError("Each condition group must be a non-empty dict")
+
+        parts = []
+        for field_name, spec in group.items():
+            if field_name not in file_columns:
+                raise ValueError(
+                    f"Column '{field_name}' not found in file. Available columns: {file_columns}"
+                )
+            parts.append(_build_field_condition(field_name, spec))
+
+        clauses.append(f"({' AND '.join(parts)})")
+
     return f" AND ({' OR '.join(clauses)})"
 
 
@@ -647,12 +645,10 @@ def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None)
     """
     Validates that CSV column values have correct datatypes.
 
-    This function uses pandas to read and validate the CSV using datatype validators.
-    The conn parameter is accepted for consistency with other operations but not used.
-
     Args:
         file_path: path to the CSV file to validate
         field_datatype: dict mapping column name to datatype string
+        conn: duckdb connection not used but required by caller
     """
     validators = {
         "address": _is_valid_address_value,
@@ -674,13 +670,11 @@ def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None)
         "url": _is_valid_url_value,
     }
 
-    # Read CSV with pandas (keep_default_na=False preserves empty strings)
     df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
 
     if df.empty or len(df.columns) == 0:
         return True, "no invalid values found", {"invalid_rows": []}
 
-    # Identify applicable fields for validation
     applicable_fields = [
         (field, field_datatype.get(field), validators[field_datatype.get(field)])
         for field in df.columns
@@ -690,7 +684,6 @@ def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None)
     if not applicable_fields:
         return True, "no invalid values found", {"invalid_rows": []}
 
-    # Validate values
     invalid_values = []
     for line_number, (_, row) in enumerate(df.iterrows(), start=2):
         for field, datatype, validator in applicable_fields:
diff --git a/tests/integration/expectations/checkpoints/test_csv.py b/tests/integration/expectations/checkpoints/test_csv.py
index d3c0ee8d..3458fe84 100644
--- a/tests/integration/expectations/checkpoints/test_csv.py
+++ b/tests/integration/expectations/checkpoints/test_csv.py
@@ -1,5 +1,4 @@
 import csv
-import json
 import pytest
 
 from digital_land.expectations.checkpoints.csv import CsvCheckpoint

From 4ed3035ad14db0fd4c79a7cef22590c7fb1bce2d Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Tue, 31 Mar 2026 14:12:24 +0100
Subject: [PATCH 11/12] used duckdb for datatype validation

---
 digital_land/expectations/operations/csv.py   | 161 +++++---
 .../operations/datatype_validators.py         | 202 ----------
 .../expectations/operations/test_csv.py       | 353 +++++++++++++++++-
 3 files changed, 444 insertions(+), 272 deletions(-)

diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 9bfa42aa..30ea7a01 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -2,22 +2,9 @@
 import pandas as pd
 
 from digital_land.expectations.operations.datatype_validators import (
-    _is_valid_address_value,
-    _is_valid_curie_list_value,
-    _is_valid_curie_value,
-    _is_valid_datetime_value,
-    _is_valid_decimal_value,
-    _is_valid_flag_value,
-    _is_valid_hash_value,
-    _is_valid_integer_value,
-    _is_valid_json_value,
-    _is_valid_latitude_value,
-    _is_valid_longitude_value,
     _is_valid_multipolygon_value,
     _is_valid_pattern_value,
     _is_valid_point_value,
-    _is_valid_reference_value,
-    _is_valid_url_value,
 )
 
 
@@ -316,16 +303,8 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list
         field: the column name to validate
         allowed_values: allowed values for the field
     """
-    cleaned_allowed_values = [
-        str(value).strip().replace("'", "''")
-        for value in (allowed_values or [])
-        if str(value).strip() != ""
-    ]
-
-    if not cleaned_allowed_values:
-        raise ValueError("allowed_values must contain at least one non-empty value")
 
-    allowed_values_sql = ",".join("'" + value + "'" for value in cleaned_allowed_values)
+    allowed_values_sql = ",".join("'" + value + "'" for value in allowed_values)
 
     result = conn.execute(
         f"""
@@ -349,7 +328,7 @@ def check_allowed_values(conn, file_path: Path, field: str, allowed_values: list
 
     details = {
         "field": field,
-        "allowed_values": sorted({value for value in cleaned_allowed_values}),
+        "allowed_values": sorted({value for value in allowed_values}),
         "invalid_values": invalid_values,
         "invalid_rows": invalid_rows,
     }
@@ -641,66 +620,128 @@ def check_field_is_within_range_by_dataset_org(
     return passed, message, details
 
 
-def check_values_have_the_correct_datatype(file_path, field_datatype, conn=None):
+def check_values_have_the_correct_datatype(conn, file_path, field_datatype):
     """
     Validates that CSV column values have correct datatypes.
 
+    Uses DuckDB queries for datatypes: integer, decimal, flag, latitude, longitude, hash, curie, curie-list, json, url, date, datetime.
+
+    Uses Python validators for complex datatypes: pattern, multipolygon, point.
+
     Args:
         file_path: path to the CSV file to validate
         field_datatype: dict mapping column name to datatype string
-        conn: duckdb connection not used but required by caller
     """
-    validators = {
-        "address": _is_valid_address_value,
-        "curie-list": _is_valid_curie_list_value,
-        "curie": _is_valid_curie_value,
-        "date": _is_valid_datetime_value,
-        "datetime": _is_valid_datetime_value,
-        "decimal": _is_valid_decimal_value,
-        "flag": _is_valid_flag_value,
-        "hash": _is_valid_hash_value,
-        "integer": _is_valid_integer_value,
-        "json": _is_valid_json_value,
-        "latitude": _is_valid_latitude_value,
-        "longitude": _is_valid_longitude_value,
-        "multipolygon": _is_valid_multipolygon_value,
+
+    def _get_sql_validation_condition(datatype: str, field_name: str) -> str:
+        field_ref = f"TRIM(COALESCE(\"{field_name}\", ''))"
+
+        conditions = {
+            "integer": f"{field_ref} != '' AND NOT (TRY_CAST({field_ref} AS DOUBLE) IS NOT NULL AND TRY_CAST({field_ref} AS DOUBLE) = TRY_CAST({field_ref} AS BIGINT))",
+            "decimal": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DECIMAL) IS NULL",
+            "flag": f"{field_ref} != '' AND LOWER({field_ref}) NOT IN ('yes', 'no', 'true', 'false')",
+            "latitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -90 OR TRY_CAST({field_ref} AS DOUBLE) > 90)",
+            "longitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -180 OR TRY_CAST({field_ref} AS DOUBLE) > 180)",
+            "hash": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z]+:)?[0-9a-fA-F]+$'))",
+            "curie": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-z0-9-]+:[^\\s:][^\\s]*$'))",
+            "curie-list": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z0-9-]+:[^\\s:][^\\s]*(;[a-z0-9-]+:[^\\s:][^\\s]*)*)?$'))",
+            "json": f"{field_ref} != '' AND TRY(json_extract({field_ref}, '$')) IS NULL",
+            "url": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-zA-Z][a-zA-Z0-9+.-]*://[^\\s/:?#]+(?::[0-9]+)?(?:[/?#][^\\s]*)?$'))",
+            "date": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DATE) IS NULL",
+            "datetime": f"{field_ref} != '' AND TRY_CAST({field_ref} AS TIMESTAMP) IS NULL",
+        }
+
+        return conditions.get(datatype, "FALSE")
+
+    # Python validators for complex datatypes that can't be easily expressed in SQL
+    python_validators = {
         "pattern": _is_valid_pattern_value,
+        "multipolygon": _is_valid_multipolygon_value,
         "point": _is_valid_point_value,
-        "reference": _is_valid_reference_value,
-        "url": _is_valid_url_value,
     }
 
-    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
-
-    if df.empty or len(df.columns) == 0:
-        return True, "no invalid values found", {"invalid_rows": []}
+    sql_validators = {
+        "integer",
+        "decimal",
+        "flag",
+        "latitude",
+        "longitude",
+        "hash",
+        "curie",
+        "curie-list",
+        "json",
+        "url",
+        "date",
+        "datetime",
+    }
 
-    applicable_fields = [
-        (field, field_datatype.get(field), validators[field_datatype.get(field)])
-        for field in df.columns
-        if field in field_datatype and field_datatype.get(field) in validators
-    ]
+    fields_for_sql = []
+    fields_for_python = []
 
-    if not applicable_fields:
-        return True, "no invalid values found", {"invalid_rows": []}
+    for field in field_datatype:
+        datatype = field_datatype.get(field)
+        if datatype in sql_validators:
+            fields_for_sql.append((field, datatype))
+        elif datatype in python_validators:
+            fields_for_python.append((field, datatype, python_validators[datatype]))
 
     invalid_values = []
-    for line_number, (_, row) in enumerate(df.iterrows(), start=2):
-        for field, datatype, validator in applicable_fields:
-            value = str(row.get(field, "")).strip()
-            if not value:
-                continue
 
-            if not validator(value):
+    # SQL validation: query invalid rows for each field
+    if fields_for_sql:
+        for field, datatype in fields_for_sql:
+            condition = _get_sql_validation_condition(datatype, field)
+
+            result = conn.execute(
+                f"""
+                WITH source_rows AS (
+                    SELECT
+                        ROW_NUMBER() OVER () + 1 AS line_number,
+                        *
+                    FROM {_read_csv(file_path)}
+                )
+                SELECT
+                    line_number,
+                    TRIM(COALESCE("{field}", '')) AS value
+                FROM source_rows
+                WHERE {condition}
+                """
+            ).fetchall()
+
+            for row in result:
                 invalid_values.append(
                     {
-                        "line_number": line_number,
+                        "line_number": row[0],
                         "field": field,
                         "datatype": datatype,
-                        "value": value,
+                        "value": row[1],
                     }
                 )
 
+    if fields_for_python:
+        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
+
+        if df.empty or len(df.columns) == 0:
+            pass
+        else:
+            for line_number, (_, row) in enumerate(df.iterrows(), start=2):
+                for field, datatype, validator in fields_for_python:
+                    if field not in df.columns:
+                        continue
+                    value = str(row.get(field, "")).strip()
+                    if not value:
+                        continue
+
+                    if not validator(value):
+                        invalid_values.append(
+                            {
+                                "line_number": line_number,
+                                "field": field,
+                                "datatype": datatype,
+                                "value": value,
+                            }
+                        )
+
     if len(invalid_values) == 0:
         passed = True
         message = "all values have valid datatypes"
diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py
index 1911c3c6..3c04dc6e 100644
--- a/digital_land/expectations/operations/datatype_validators.py
+++ b/digital_land/expectations/operations/datatype_validators.py
@@ -1,197 +1,11 @@
 import json
 import re
-import urllib.parse
-from datetime import datetime
-from decimal import Decimal, InvalidOperation
 
 import shapely.errors
 import shapely.wkt
 from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon, shape
 
 
-def _is_valid_datetime_value(value):
-    value = value.strip().strip('",').lower()
-
-    # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/date.py#L22
-    patterns = [
-        # Date/date-like formats
-        "%Y-%m-%d",
-        "%Y%m%d",
-        "%Y/%m/%d",
-        "%Y %m %d",
-        "%Y.%m.%d",
-        "%Y-%d-%m",  # risky!
-        "%Y-%m",
-        "%Y.%m",
-        "%Y/%m",
-        "%Y %m",
-        "%Y",
-        "%Y.0",
-        "%d/%m/%Y",
-        "%d/%m/%y",
-        "%d-%m-%Y",
-        "%d-%m-%y",
-        "%d.%m.%Y",
-        "%d.%m.%y",
-        "%d-%b-%Y",
-        "%d-%b-%y",
-        "%d %B %Y",
-        "%b %d, %Y",
-        "%b %d, %y",
-        "%b-%y",
-        "%B %Y",
-        "%m/%d/%Y",  # risky!
-        # Datetime formats
-        "%Y-%m-%dT%H:%M:%S.000Z",
-        "%Y-%m-%dT%H:%M:%S.000",
-        "%Y-%m-%dT%H:%M:%S.%fZ",
-        "%Y-%m-%dT%H:%M:%S.%f%z",
-        "%Y-%m-%dT%H:%M:%S.%f",
-        "%Y-%m-%dT%H:%M:%SZ",
-        "%Y-%m-%dT%H:%M:%S",
-        "%Y/%m/%d %H:%M:%S%z",
-        "%Y/%m/%d %H:%M:%S+00",
-        "%Y/%m/%d %H:%M:%S",
-        "%Y/%m/%d %H:%M",
-        "%Y/%m/%dT%H:%M:%S",
-        "%Y/%m/%dT%H:%M:%S.000Z",
-        "%Y/%m/%dT%H:%M:%S.000",
-        "%Y/%m/%dT%H:%M:%S.%fZ",
-        "%Y/%m/%dT%H:%M:%S.%f%z",
-        "%Y/%m/%dT%H:%M:%S.%f",
-        "%Y/%m/%dT%H:%M:%SZ",
-        "%Y-%m-%d %H:%M:%S",
-        "%d/%m/%Y %H:%M:%S",
-        "%d/%m/%Y %H:%M",
-    ]
-
-    # Handle fractional seconds with extra precision.
-    if "." in value and "Z" in value:
-        parts = value.replace("Z", "").split(".")
-        if len(parts) == 2 and len(parts[1]) > 6:
-            value = parts[0] + "." + parts[1][:6] + "Z"
-    elif "." in value and "+" in value:
-        parts = value.split("+")
-        base_part = parts[0]
-        tz_part = "+" + parts[1]
-        if "." in base_part:
-            date_time, frac = base_part.rsplit(".", 1)
-            if len(frac) > 6:
-                frac = frac[:6]
-            value = date_time + "." + frac + tz_part
-
-    for pattern in patterns:
-        try:
-            datetime.strptime(value, pattern)
-            return True
-        except ValueError:
-            continue
-
-    # Try unix timestamp
-    try:
-        float_val = float(value)
-        return -62135596800 < float_val < 253402300800  # Year 1 to 9999
-    except ValueError:
-        pass
-
-    return False
-
-
-def _is_valid_integer_value(value):
-    try:
-        num = float(value)
-        return num == int(num)
-    except (ValueError, OverflowError):
-        return False
-
-
-def _is_valid_decimal_value(value):
-    try:
-        Decimal(value)
-        return True
-    except (InvalidOperation, ValueError):
-        return False
-
-
-def _is_valid_flag_value(value):
-    value = value.strip().lower()
-
-    lookup = {
-        "y": "yes",
-        "n": "no",
-        "true": "yes",
-        "false": "no",
-    }
-
-    normalized = lookup.get(value, value)
-    return normalized in {"", "yes", "no"}
-
-
-def _is_valid_json_value(value):
-    try:
-        json.loads(value)
-        return True
-    except json.JSONDecodeError:
-        return False
-
-
-def _is_valid_reference_value(value):
-    return bool(value.strip()) and not any(ch.isspace() for ch in value)
-
-
-def _is_valid_curie_value(value):
-    return bool(re.fullmatch(r"[a-z0-9-]+:[^\s:][^\s]*", value))
-
-
-def _is_valid_curie_list_value(value):
-    text = (value or "").strip()
-    if not text:
-        return False
-
-    parts = [part.strip() for part in text.split(";")]
-    if any(not part for part in parts):
-        return False
-
-    curie_re = re.compile(r"[a-z0-9-]+:[^\s:][^\s]*")
-    return all(bool(curie_re.fullmatch(part)) for part in parts)
-
-
-def _is_valid_address_value(value):
-    if not value or not value.strip():
-        return False
-
-    value = value.strip()
-
-    # https://github.com/digital-land/digital-land-python/blob/1dbbad99e0c5939d87d5a8a8ece372e4c43eba77/digital_land/datatype/address.py#L10
-    value = ", ".join(value.split("\n"))
-    value = value.replace(";", ",")
-
-    comma_re = re.compile(r",\s*,+")
-    value = comma_re.sub(", ", value)
-    value = value.strip(", ")
-
-    hyphen_re = re.compile(r"\s*-\s*")
-    value = hyphen_re.sub("-", value)
-
-    value = " ".join(value.split()).replace('"', "")
-
-    return bool(value.strip())
-
-
-def _is_valid_url_value(value):
-    candidate = (value or "").strip().strip("'")
-    parsed = urllib.parse.urlparse(candidate)
-    return bool(parsed.scheme and parsed.netloc)
-
-
-def _is_valid_hash_value(value):
-    if ":" in value:
-        _, digest = value.split(":", 1)
-    else:
-        digest = value
-    return bool(re.fullmatch(r"[0-9a-fA-F]+", digest))
-
-
 def _is_valid_pattern_value(value):
     try:
         re.compile(value)
@@ -200,22 +14,6 @@ def _is_valid_pattern_value(value):
         return False
 
 
-def _is_valid_latitude_value(value):
-    try:
-        numeric = float(value)
-    except ValueError:
-        return False
-    return -90 <= numeric <= 90
-
-
-def _is_valid_longitude_value(value):
-    try:
-        numeric = float(value)
-    except ValueError:
-        return False
-    return -180 <= numeric <= 180
-
-
 def _is_valid_multipolygon_value(value):
     candidate = (value or "").strip()
     if not candidate:
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index 2a9592f9..ad064300 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -665,8 +665,9 @@ def test_check_values_have_the_correct_datatype_passes(tmp_path):
         "enabled": "flag",
     }
 
+    conn = duckdb.connect()
     passed, message, details = check_values_have_the_correct_datatype(
-        file_path, field_datatype
+        conn, file_path, field_datatype
     )
 
     assert passed is True
@@ -689,8 +690,10 @@ def test_check_values_have_the_correct_datatype_fails(tmp_path):
         "enabled": "flag",
     }
 
+    conn = duckdb.connect()
+
     passed, message, details = check_values_have_the_correct_datatype(
-        file_path, field_datatype
+        conn, file_path, field_datatype
     )
 
     assert passed is False
@@ -719,9 +722,9 @@ def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path):
         "entity": "reference",
         "count": "integer",
     }
-
+    conn = duckdb.connect()
     passed, message, details = check_values_have_the_correct_datatype(
-        file_path, field_datatype
+        conn, file_path, field_datatype
     )
 
     assert passed is True
@@ -740,9 +743,9 @@ def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path):
         "entity": "reference",
         "count": "integer",
     }
-
+    conn = duckdb.connect()
     passed, message, details = check_values_have_the_correct_datatype(
-        file_path, field_datatype
+        conn, file_path, field_datatype
     )
 
     assert passed is True
@@ -760,9 +763,9 @@ def test_check_values_have_the_correct_datatype_empty_file(tmp_path):
         "entity": "reference",
         "count": "integer",
     }
-
+    conn = duckdb.connect()
     passed, message, details = check_values_have_the_correct_datatype(
-        file_path, field_datatype
+        conn, file_path, field_datatype
     )
 
     assert passed is True
@@ -781,10 +784,340 @@ def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path):
         "name": "string",
         "description": "string",
     }
-
+    conn = duckdb.connect()
     passed, message, details = check_values_have_the_correct_datatype(
-        file_path, field_datatype
+        conn, file_path, field_datatype
     )
 
     assert passed is True
     assert details["invalid_rows"] == []
+
+
+def test_check_values_have_the_correct_datatype_decimal(tmp_path):
+    """Test decimal datatype validation with both valid and invalid values."""
+    file_path = tmp_path / "decimal_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["price"])
+        writer.writerow(["100.50"])
+        writer.writerow(["0.99"])
+        writer.writerow(["999.999"])
+        writer.writerow(["not-a-decimal"])
+        writer.writerow(["12abc"])
+
+    field_datatype = {"price": "decimal"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+    assert any(r["value"] == "not-a-decimal" for r in details["invalid_rows"])
+    assert any(r["value"] == "12abc" for r in details["invalid_rows"])
+
+
+def test_check_values_have_the_correct_datatype_latitude_longitude(tmp_path):
+    """Test latitude and longitude datatype validation with valid and invalid values."""
+    file_path = tmp_path / "coordinates.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["latitude", "longitude"])
+        writer.writerow(["0", "0"])
+        writer.writerow(["51.5074", "-0.1278"])
+        writer.writerow(["-33.8688", "151.2093"])
+        writer.writerow(["90", "180"])
+        writer.writerow(["91", "0"])
+        writer.writerow(["0", "181"])
+
+    field_datatype = {
+        "latitude": "latitude",
+        "longitude": "longitude",
+    }
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_flag(tmp_path):
+    """Test flag datatype validation with valid and invalid values."""
+    file_path = tmp_path / "flag_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["active"])
+        writer.writerow(["true"])
+        writer.writerow(["false"])
+        writer.writerow(["y"])
+        writer.writerow(["n"])
+        writer.writerow(["yes"])
+        writer.writerow(["no"])
+        writer.writerow(["maybe"])
+        writer.writerow(["1"])
+
+    field_datatype = {"active": "flag"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 4
+
+
+def test_check_values_have_the_correct_datatype_hash(tmp_path):
+    """Test hash datatype validation with valid and invalid values."""
+    file_path = tmp_path / "hash_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["content_hash"])
+        writer.writerow(["abcdef123456"])
+        writer.writerow(["abc:1234567890abcdef"])
+        writer.writerow(["sha:5d41402abc4b2a76b9719d911017c592"])
+        writer.writerow(["not-a-hash"])
+        writer.writerow(["xyz:notahex"])
+
+    field_datatype = {"content_hash": "hash"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_curie(tmp_path):
+    """Test curie datatype validation with valid and invalid values."""
+    file_path = tmp_path / "curie_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["identifier"])
+        writer.writerow(["prefix:value"])
+        writer.writerow(["org:entity123"])
+        writer.writerow(["schema:name"])
+        writer.writerow(["prefix:"])
+        writer.writerow(["no_colon"])
+        writer.writerow(["prefix: space"])
+
+    field_datatype = {"identifier": "curie"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 3
+
+
+def test_check_values_have_the_correct_datatype_curie_list(tmp_path):
+    """Test curie-list datatype validation with valid and invalid values."""
+    file_path = tmp_path / "curie_list_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["identifiers"])
+        writer.writerow(["prefix:value1;org:value2"])
+        writer.writerow(["schema:name"])
+        writer.writerow([""])
+        writer.writerow(["not-valid"])
+        writer.writerow(["prefix: value"])
+
+    field_datatype = {"identifiers": "curie-list"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_json(tmp_path):
+    """Test json datatype validation with valid and invalid JSON."""
+    file_path = tmp_path / "json_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["data"])
+        writer.writerow(['{"key":"value"}'])
+        writer.writerow(['{"nested":{"field":"value"}}'])
+        writer.writerow(["not json"])  # Invalid
+        writer.writerow(['{"incomplete":'])  # Invalid (malformed)
+
+    field_datatype = {"data": "json"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_url(tmp_path):
+    """Test url datatype validation with valid and invalid URLs."""
+    file_path = tmp_path / "url_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["website"])
+        writer.writerow(["https://example.com"])
+        writer.writerow(["http://test.org"])
+        writer.writerow(["ftp://files.example.com"])
+        writer.writerow(["not a url"])  # Invalid (no scheme)
+        writer.writerow(["example.com"])  # Invalid (no scheme)
+
+    field_datatype = {"website": "url"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_date(tmp_path):
+    """Test date datatype validation with valid and invalid dates."""
+    file_path = tmp_path / "date_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["start_date"])
+        writer.writerow(["2024-01-15"])
+        writer.writerow(["2023-12-31"])
+        writer.writerow(["2022-06-30"])
+        writer.writerow(["not-a-date"])
+        writer.writerow(["2024-13-01"])
+
+    field_datatype = {"start_date": "date"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_datetime(tmp_path):
+    """Test datetime datatype validation with valid and invalid datetimes."""
+    file_path = tmp_path / "datetime_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["timestamp"])
+        writer.writerow(["2024-01-15T10:30:45"])
+        writer.writerow(["2023-12-31T23:59:59Z"])
+        writer.writerow(["2022-06-30T12:00:00+00:00"])
+        writer.writerow(["not-a-datetime"])
+        writer.writerow(["2024-13-01T10:00:00"])
+
+    field_datatype = {"timestamp": "datetime"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_pattern(tmp_path):
+    """Test pattern datatype validation with valid and invalid regex patterns."""
+    file_path = tmp_path / "pattern_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["regex"])
+        writer.writerow(["^[A-Z]+$"])
+        writer.writerow(["\\d{3}-\\d{4}"])
+        writer.writerow(["(foo|bar)"])
+        writer.writerow(["["])
+        writer.writerow(["(unclosed"])
+
+    field_datatype = {"regex": "pattern"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_point(tmp_path):
+    """Test point datatype validation (WKT format) with valid and invalid values."""
+    file_path = tmp_path / "point_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["geometry"])
+        writer.writerow(["POINT(0 0)"])
+        writer.writerow(["POINT(51.5074 -0.1278)"])
+        writer.writerow(["POINT(-33.8688 151.2093)"])
+        writer.writerow(["not wkt"])
+        writer.writerow(["POINT(0)"])
+
+    field_datatype = {"geometry": "point"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_multipolygon(tmp_path):
+    """Test multipolygon datatype validation (WKT format) with valid and invalid values."""
+    file_path = tmp_path / "multipolygon_values.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["boundary"])
+        writer.writerow(["POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))"])
+        writer.writerow(
+            [
+                "MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)), ((20 20, 30 20, 30 30, 20 30, 20 20)))"
+            ]
+        )
+        writer.writerow(["not wkt"])  # Invalid
+        writer.writerow(["POINT(0 0)"])  # Invalid (not a polygon/multipolygon)
+
+    field_datatype = {"boundary": "multipolygon"}
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 2
+
+
+def test_check_values_have_the_correct_datatype_mixed_types(tmp_path):
+    """Test validation with multiple different datatypes in one file."""
+    file_path = tmp_path / "mixed_datatypes.csv"
+    with open(file_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["id", "price", "active", "latitude", "url", "date"])
+        writer.writerow(
+            ["org-001", "99.99", "true", "51.5074", "https://example.com", "2024-01-15"]
+        )
+        writer.writerow(
+            ["org-002", "150.50", "false", "-33.8688", "https://test.org", "2023-12-31"]
+        )
+        writer.writerow(
+            ["org 003", "invalid", "maybe", "91", "not-a-url", "not-a-date"]
+        )
+
+    field_datatype = {
+        "price": "decimal",
+        "active": "flag",
+        "latitude": "latitude",
+        "url": "url",
+        "date": "date",
+    }
+    conn = duckdb.connect()
+    passed, message, details = check_values_have_the_correct_datatype(
+        conn, file_path, field_datatype
+    )
+
+    assert passed is False
+    assert len(details["invalid_rows"]) == 5

From f156b8152d93dd9227e2c97d902d1b8321fb9a39 Mon Sep 17 00:00:00 2001
From: Gibah Joseph <gibahjoe@gmail.com>
Date: Wed, 1 Apr 2026 14:05:23 +0100
Subject: [PATCH 12/12] Refactor CSV datatype validation functions to
 individual expectations

- Replaced the single `check_values_have_the_correct_datatype` operation with one expectation function per datatype (`expect_column_to_be_integer`, `expect_column_to_be_decimal`, `expect_column_to_be_flag`, and so on).
- Each new function validates one column against one datatype and returns `(passed, message, details)`, with the offending rows listed under `details["invalid_rows"]`.
- Registered the new expectations in the CSV checkpoint's `operation_map` in place of the removed operation.
- Updated integration tests to reflect the new validation functions and ensure they cover various scenarios for each datatype.
- Removed the now-unused `_is_valid_pattern_value` helper from `datatype_validators.py`.
---
 digital_land/expectations/checkpoints/csv.py  |  32 +-
 digital_land/expectations/operations/csv.py   | 517 ++++++++++++++----
 .../operations/datatype_validators.py         |   9 -
 .../expectations/operations/test_csv.py       | 476 +++++-----------
 4 files changed, 580 insertions(+), 454 deletions(-)

diff --git a/digital_land/expectations/checkpoints/csv.py b/digital_land/expectations/checkpoints/csv.py
index 5b33a771..b69ead33 100644
--- a/digital_land/expectations/checkpoints/csv.py
+++ b/digital_land/expectations/checkpoints/csv.py
@@ -13,7 +13,21 @@
     check_field_is_within_range_by_dataset_org,
     check_allowed_values,
     check_no_blank_rows,
-    check_values_have_the_correct_datatype,
+    expect_column_to_be_integer,
+    expect_column_to_be_decimal,
+    expect_column_to_be_flag,
+    expect_column_to_be_latitude,
+    expect_column_to_be_longitude,
+    expect_column_to_be_hash,
+    expect_column_to_be_curie,
+    expect_column_to_be_curie_list,
+    expect_column_to_be_json,
+    expect_column_to_be_url,
+    expect_column_to_be_date,
+    expect_column_to_be_datetime,
+    expect_column_to_be_pattern,
+    expect_column_to_be_multipolygon,
+    expect_column_to_be_point,
 )
 
 
@@ -33,7 +47,21 @@ def operation_factory(self, operation_string: str):
             "check_field_is_within_range_by_dataset_org": check_field_is_within_range_by_dataset_org,
             "check_allowed_values": check_allowed_values,
             "check_no_blank_rows": check_no_blank_rows,
-            "check_values_have_the_correct_datatype": check_values_have_the_correct_datatype,
+            "expect_column_to_be_integer": expect_column_to_be_integer,
+            "expect_column_to_be_decimal": expect_column_to_be_decimal,
+            "expect_column_to_be_flag": expect_column_to_be_flag,
+            "expect_column_to_be_latitude": expect_column_to_be_latitude,
+            "expect_column_to_be_longitude": expect_column_to_be_longitude,
+            "expect_column_to_be_hash": expect_column_to_be_hash,
+            "expect_column_to_be_curie": expect_column_to_be_curie,
+            "expect_column_to_be_curie_list": expect_column_to_be_curie_list,
+            "expect_column_to_be_json": expect_column_to_be_json,
+            "expect_column_to_be_url": expect_column_to_be_url,
+            "expect_column_to_be_date": expect_column_to_be_date,
+            "expect_column_to_be_datetime": expect_column_to_be_datetime,
+            "expect_column_to_be_pattern": expect_column_to_be_pattern,
+            "expect_column_to_be_multipolygon": expect_column_to_be_multipolygon,
+            "expect_column_to_be_point": expect_column_to_be_point,
         }
         if operation_string not in operation_map:
             raise ValueError(
diff --git a/digital_land/expectations/operations/csv.py b/digital_land/expectations/operations/csv.py
index 30ea7a01..72f1d2e4 100644
--- a/digital_land/expectations/operations/csv.py
+++ b/digital_land/expectations/operations/csv.py
@@ -1,9 +1,9 @@
 from pathlib import Path
+import re
 import pandas as pd
 
 from digital_land.expectations.operations.datatype_validators import (
     _is_valid_multipolygon_value,
-    _is_valid_pattern_value,
     _is_valid_point_value,
 )
 
@@ -620,135 +620,414 @@ def check_field_is_within_range_by_dataset_org(
     return passed, message, details
 
 
-def check_values_have_the_correct_datatype(conn, file_path, field_datatype):
-    """
-    Validates that CSV column values have correct datatypes.
+def expect_column_to_be_integer(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND NOT (
+              TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) IS NOT NULL
+              AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) = TRY_CAST(TRIM(COALESCE("{field}", '')) AS BIGINT)
+          )
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "integer", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'integer'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'integer' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
 
-    Uses DuckDB queries for datatypes: integer, decimal, flag, latitude, longitude, hash, curie, curie-list, json, url, date, datetime.
 
-    Uses Python validators for complex datatypes: pattern, multipolygon, point.
+def expect_column_to_be_decimal(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS DECIMAL) IS NULL
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "decimal", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'decimal'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'decimal' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_flag(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND LOWER(TRIM(COALESCE("{field}", ''))) NOT IN ('yes', 'no', 'true', 'false')
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "flag", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'flag'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'flag' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_latitude(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND (
+              TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) IS NULL
+              OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) < -90
+              OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) > 90
+          )
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "latitude", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'latitude'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'latitude' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
 
-    Args:
-        file_path: path to the CSV file to validate
-        field_datatype: dict mapping column name to datatype string
-    """
 
-    def _get_sql_validation_condition(datatype: str, field_name: str) -> str:
-        field_ref = f"TRIM(COALESCE(\"{field_name}\", ''))"
-
-        conditions = {
-            "integer": f"{field_ref} != '' AND NOT (TRY_CAST({field_ref} AS DOUBLE) IS NOT NULL AND TRY_CAST({field_ref} AS DOUBLE) = TRY_CAST({field_ref} AS BIGINT))",
-            "decimal": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DECIMAL) IS NULL",
-            "flag": f"{field_ref} != '' AND LOWER({field_ref}) NOT IN ('yes', 'no', 'true', 'false')",
-            "latitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -90 OR TRY_CAST({field_ref} AS DOUBLE) > 90)",
-            "longitude": f"{field_ref} != '' AND (TRY_CAST({field_ref} AS DOUBLE) IS NULL OR TRY_CAST({field_ref} AS DOUBLE) < -180 OR TRY_CAST({field_ref} AS DOUBLE) > 180)",
-            "hash": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z]+:)?[0-9a-fA-F]+$'))",
-            "curie": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-z0-9-]+:[^\\s:][^\\s]*$'))",
-            "curie-list": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^([a-z0-9-]+:[^\\s:][^\\s]*(;[a-z0-9-]+:[^\\s:][^\\s]*)*)?$'))",
-            "json": f"{field_ref} != '' AND TRY(json_extract({field_ref}, '$')) IS NULL",
-            "url": f"{field_ref} != '' AND NOT (REGEXP_MATCHES({field_ref}, '^[a-zA-Z][a-zA-Z0-9+.-]*://[^\\s/:?#]+(?::[0-9]+)?(?:[/?#][^\\s]*)?$'))",
-            "date": f"{field_ref} != '' AND TRY_CAST({field_ref} AS DATE) IS NULL",
-            "datetime": f"{field_ref} != '' AND TRY_CAST({field_ref} AS TIMESTAMP) IS NULL",
+def expect_column_to_be_longitude(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND (
+              TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) IS NULL
+              OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) < -180
+              OR TRY_CAST(TRIM(COALESCE("{field}", '')) AS DOUBLE) > 180
+          )
+        """
+    ).fetchall()
+    invalid_rows = [
+        {
+            "line_number": row[0],
+            "field": field,
+            "datatype": "longitude",
+            "value": row[1],
         }
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'longitude'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'longitude' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
 
-        return conditions.get(datatype, "FALSE")
 
-    # Python validators for complex datatypes that can't be easily expressed in SQL
-    python_validators = {
-        "pattern": _is_valid_pattern_value,
-        "multipolygon": _is_valid_multipolygon_value,
-        "point": _is_valid_point_value,
-    }
+def expect_column_to_be_hash(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^([a-z]+:)?[0-9a-fA-F]+$'))
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "hash", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'hash'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'hash' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
 
-    sql_validators = {
-        "integer",
-        "decimal",
-        "flag",
-        "latitude",
-        "longitude",
-        "hash",
-        "curie",
-        "curie-list",
-        "json",
-        "url",
-        "date",
-        "datetime",
-    }
 
-    fields_for_sql = []
-    fields_for_python = []
-
-    for field in field_datatype:
-        datatype = field_datatype.get(field)
-        if datatype in sql_validators:
-            fields_for_sql.append((field, datatype))
-        elif datatype in python_validators:
-            fields_for_python.append((field, datatype, python_validators[datatype]))
-
-    invalid_values = []
-
-    # SQL validation: query invalid rows for each field
-    if fields_for_sql:
-        for field, datatype in fields_for_sql:
-            condition = _get_sql_validation_condition(datatype, field)
-
-            result = conn.execute(
-                f"""
-                WITH source_rows AS (
-                    SELECT
-                        ROW_NUMBER() OVER () + 1 AS line_number,
-                        *
-                    FROM {_read_csv(file_path)}
+def expect_column_to_be_curie(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^[a-z0-9-]+:[^\\s:][^\\s]*$'))
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "curie", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'curie'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'curie' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_curie_list(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^([a-z0-9-]+:[^\\s:][^\\s]*(;[a-z0-9-]+:[^\\s:][^\\s]*)*)?$'))
+        """
+    ).fetchall()
+    invalid_rows = [
+        {
+            "line_number": row[0],
+            "field": field,
+            "datatype": "curie-list",
+            "value": row[1],
+        }
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'curie-list'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'curie-list' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_json(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND TRY(json_extract(TRIM(COALESCE("{field}", '')), '$')) IS NULL
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "json", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'json'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'json' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_url(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND NOT (REGEXP_MATCHES(TRIM(COALESCE("{field}", '')), '^[a-zA-Z][a-zA-Z0-9+.-]*://[^\\s/:?#]+(?::[0-9]+)?(?:[/?#][^\\s]*)?$'))
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "url", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'url'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'url' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_date(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS DATE) IS NULL
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "date", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'date'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'date' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_datetime(conn, file_path: Path, field: str):
+    result = conn.execute(
+        f"""
+        WITH source_rows AS (
+            SELECT ROW_NUMBER() OVER () + 1 AS line_number, *
+            FROM {_read_csv(file_path)}
+        )
+        SELECT line_number, TRIM(COALESCE("{field}", '')) AS value
+        FROM source_rows
+        WHERE TRIM(COALESCE("{field}", '')) != ''
+          AND TRY_CAST(TRIM(COALESCE("{field}", '')) AS TIMESTAMP) IS NULL
+        """
+    ).fetchall()
+    invalid_rows = [
+        {"line_number": row[0], "field": field, "datatype": "datetime", "value": row[1]}
+        for row in result
+    ]
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'datetime'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'datetime' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_pattern(conn, file_path: Path, field: str):
+    invalid_rows = []
+    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
+    if not df.empty and len(df.columns) > 0 and field in df.columns:
+        for line_number, (_, row) in enumerate(df.iterrows(), start=2):
+            value = str(row.get(field, "")).strip()
+            if not value:
+                continue
+            try:
+                re.compile(value)
+            except re.error:
+                invalid_rows.append(
+                    {
+                        "line_number": line_number,
+                        "field": field,
+                        "datatype": "pattern",
+                        "value": value,
+                    }
                 )
-                SELECT
-                    line_number,
-                    TRIM(COALESCE("{field}", '')) AS value
-                FROM source_rows
-                WHERE {condition}
-                """
-            ).fetchall()
-
-            for row in result:
-                invalid_values.append(
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'pattern'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'pattern' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_multipolygon(conn, file_path: Path, field: str):
+    invalid_rows = []
+    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
+    if not df.empty and len(df.columns) > 0 and field in df.columns:
+        for line_number, (_, row) in enumerate(df.iterrows(), start=2):
+            value = str(row.get(field, "")).strip()
+            if not value:
+                continue
+            if not _is_valid_multipolygon_value(value):
+                invalid_rows.append(
                     {
-                        "line_number": row[0],
+                        "line_number": line_number,
                         "field": field,
-                        "datatype": datatype,
-                        "value": row[1],
+                        "datatype": "multipolygon",
+                        "value": value,
                     }
                 )
-
-    if fields_for_python:
-        df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
-
-        if df.empty or len(df.columns) == 0:
-            pass
-        else:
-            for line_number, (_, row) in enumerate(df.iterrows(), start=2):
-                for field, datatype, validator in fields_for_python:
-                    if field not in df.columns:
-                        continue
-                    value = str(row.get(field, "")).strip()
-                    if not value:
-                        continue
-
-                    if not validator(value):
-                        invalid_values.append(
-                            {
-                                "line_number": line_number,
-                                "field": field,
-                                "datatype": datatype,
-                                "value": value,
-                            }
-                        )
-
-    if len(invalid_values) == 0:
-        passed = True
-        message = "all values have valid datatypes"
-        details = {"invalid_rows": []}
-    else:
-        passed = False
-        message = f"there were {len(invalid_values)} invalid datatype value(s) found"
-        details = {"invalid_rows": invalid_values}
-
-    return passed, message, details
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'multipolygon'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'multipolygon' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
+
+
+def expect_column_to_be_point(conn, file_path: Path, field: str):
+    invalid_rows = []
+    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
+    if not df.empty and len(df.columns) > 0 and field in df.columns:
+        for line_number, (_, row) in enumerate(df.iterrows(), start=2):
+            value = str(row.get(field, "")).strip()
+            if not value:
+                continue
+            if not _is_valid_point_value(value):
+                invalid_rows.append(
+                    {
+                        "line_number": line_number,
+                        "field": field,
+                        "datatype": "point",
+                        "value": value,
+                    }
+                )
+    passed = len(invalid_rows) == 0
+    message = (
+        f"all values in '{field}' have datatype 'point'"
+        if passed
+        else f"there were {len(invalid_rows)} invalid 'point' value(s) in '{field}'"
+    )
+    return passed, message, {"invalid_rows": invalid_rows}
diff --git a/digital_land/expectations/operations/datatype_validators.py b/digital_land/expectations/operations/datatype_validators.py
index 3c04dc6e..ce9eedd2 100644
--- a/digital_land/expectations/operations/datatype_validators.py
+++ b/digital_land/expectations/operations/datatype_validators.py
@@ -1,19 +1,10 @@
 import json
-import re
 
 import shapely.errors
 import shapely.wkt
 from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon, shape
 
 
-def _is_valid_pattern_value(value):
-    try:
-        re.compile(value)
-        return True
-    except re.error:
-        return False
-
-
 def _is_valid_multipolygon_value(value):
     candidate = (value or "").strip()
     if not candidate:
diff --git a/tests/integration/expectations/operations/test_csv.py b/tests/integration/expectations/operations/test_csv.py
index ad064300..166b5140 100644
--- a/tests/integration/expectations/operations/test_csv.py
+++ b/tests/integration/expectations/operations/test_csv.py
@@ -11,7 +11,21 @@
     check_no_blank_rows,
     check_fields_are_within_range,
     check_field_is_within_range_by_dataset_org,
-    check_values_have_the_correct_datatype,
+    expect_column_to_be_decimal,
+    expect_column_to_be_flag,
+    expect_column_to_be_latitude,
+    expect_column_to_be_longitude,
+    expect_column_to_be_hash,
+    expect_column_to_be_curie,
+    expect_column_to_be_curie_list,
+    expect_column_to_be_json,
+    expect_column_to_be_url,
+    expect_column_to_be_date,
+    expect_column_to_be_datetime,
+    expect_column_to_be_pattern,
+    expect_column_to_be_point,
+    expect_column_to_be_integer,
+    expect_column_to_be_multipolygon,
 )
 
 
@@ -650,474 +664,288 @@ def test_check_field_is_within_ranges_for_only_staus_301(tmp_path):
     assert details["invalid_rows"][0]["value"] == 250
 
 
-def test_check_values_have_the_correct_datatype_passes(tmp_path):
-    """Test datatype validation with all valid values."""
-    file_path = tmp_path / "valid_datatypes.csv"
+def test_expect_column_to_be_integer(tmp_path):
+    file_path = tmp_path / "expect_integer.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity", "count", "enabled"])
-        writer.writerow(["entity-1", "100", "true"])
-        writer.writerow(["entity-2", "200", "false"])
-
-    field_datatype = {
-        "entity": "reference",
-        "count": "integer",
-        "enabled": "flag",
-    }
+        writer.writerow(["count"])
+        writer.writerow(["10"])
+        writer.writerow(["abc"])
 
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
-
-    assert passed is True
-    assert details["invalid_rows"] == []
-
-
-def test_check_values_have_the_correct_datatype_fails(tmp_path):
-    """Test datatype validation with invalid values."""
-    file_path = tmp_path / "invalid_datatypes.csv"
-    with open(file_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["entity", "count", "enabled"])
-        writer.writerow(["entity-1", "100", "true"])
-        writer.writerow(["entity-2", "not_a_number", "false"])
-        writer.writerow(["entity-3", "300", "maybe"])
-
-    field_datatype = {
-        "entity": "reference",
-        "count": "integer",
-        "enabled": "flag",
-    }
-
-    conn = duckdb.connect()
-
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, message, details = expect_column_to_be_integer(
+        conn, file_path=file_path, field="count"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
-    assert details["invalid_rows"][0]["line_number"] == 3
-    assert details["invalid_rows"][0]["field"] == "count"
-    assert details["invalid_rows"][0]["value"] == "not_a_number"
+    assert len(details["invalid_rows"]) == 1
     assert details["invalid_rows"][0]["datatype"] == "integer"
-    assert details["invalid_rows"][1]["line_number"] == 4
-    assert details["invalid_rows"][1]["field"] == "enabled"
-    assert details["invalid_rows"][1]["value"] == "maybe"
-    assert "invalid datatype value(s)" in message
-
-
-def test_check_values_have_the_correct_datatype_ignores_empty_values(tmp_path):
-    """Test that empty values are skipped during validation."""
-    file_path = tmp_path / "with_empty_values.csv"
-    with open(file_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["entity", "count"])
-        writer.writerow(["entity-1", "100"])
-        writer.writerow(["entity-2", ""])
-        writer.writerow(["entity-3", "300"])
-
-    field_datatype = {
-        "entity": "reference",
-        "count": "integer",
-    }
-    conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
-
-    assert passed is True
-    assert details["invalid_rows"] == []
-
-
-def test_check_values_have_the_correct_datatype_skips_unmapped_fields(tmp_path):
-    """Test that fields not in field_datatype map are not validated."""
-    file_path = tmp_path / "unmapped_fields.csv"
-    with open(file_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["entity", "count", "description"])
-        writer.writerow(["entity-1", "100", "invalid_but_ignored"])
-
-    field_datatype = {
-        "entity": "reference",
-        "count": "integer",
-    }
-    conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
+    assert "invalid 'integer'" in message
 
-    assert passed is True
-    assert details["invalid_rows"] == []
 
-
-def test_check_values_have_the_correct_datatype_empty_file(tmp_path):
-    """Test behavior with empty CSV file."""
-    file_path = tmp_path / "empty.csv"
+def test_expect_column_to_be_multipolygon(tmp_path):
+    file_path = tmp_path / "expect_multipolygon.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["entity", "count"])
+        writer.writerow(["boundary"])
+        writer.writerow(["POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))"])
+        writer.writerow(["POINT(0 0)"])
 
-    field_datatype = {
-        "entity": "reference",
-        "count": "integer",
-    }
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_multipolygon(
+        conn, file_path=file_path, field="boundary"
     )
 
-    assert passed is True
-    assert details["invalid_rows"] == []
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "multipolygon"
 
 
-def test_check_values_have_the_correct_datatype_no_applicable_fields(tmp_path):
-    """Test when no fields have datatype validators."""
-    file_path = tmp_path / "no_applicable.csv"
+def test_expect_column_to_be_decimal(tmp_path):
+    file_path = tmp_path / "expect_decimal.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["name", "description"])
-        writer.writerow(["field1", "some value"])
+        writer.writerow(["price"])
+        writer.writerow(["1.2"])
+        writer.writerow(["10"])
+        writer.writerow(["0.01"])
+        writer.writerow(["bad"])
 
-    field_datatype = {
-        "name": "string",
-        "description": "string",
-    }
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_decimal(
+        conn, file_path=file_path, field="price"
     )
 
-    assert passed is True
-    assert details["invalid_rows"] == []
+    assert passed is False
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "decimal"
 
 
-def test_check_values_have_the_correct_datatype_decimal(tmp_path):
-    """Test decimal datatype validation with both valid and invalid values."""
-    file_path = tmp_path / "decimal_values.csv"
+def test_expect_column_to_be_flag(tmp_path):
+    file_path = tmp_path / "expect_flag.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["price"])
-        writer.writerow(["100.50"])
-        writer.writerow(["0.99"])
-        writer.writerow(["999.999"])
-        writer.writerow(["not-a-decimal"])
-        writer.writerow(["12abc"])
+        writer.writerow(["active"])
+        writer.writerow(["true"])
+        writer.writerow(["false"])
+        writer.writerow(["yes"])
+        writer.writerow(["no"])
+        writer.writerow(["maybe"])
 
-    field_datatype = {"price": "decimal"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_flag(
+        conn, file_path=file_path, field="active"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
-    assert any(r["value"] == "not-a-decimal" for r in details["invalid_rows"])
-    assert any(r["value"] == "12abc" for r in details["invalid_rows"])
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "flag"
 
 
-def test_check_values_have_the_correct_datatype_latitude_longitude(tmp_path):
-    """Test latitude and longitude datatype validation with valid and invalid values."""
-    file_path = tmp_path / "coordinates.csv"
+def test_expect_column_to_be_latitude(tmp_path):
+    file_path = tmp_path / "expect_latitude.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["latitude", "longitude"])
-        writer.writerow(["0", "0"])
-        writer.writerow(["51.5074", "-0.1278"])
-        writer.writerow(["-33.8688", "151.2093"])
-        writer.writerow(["90", "180"])
-        writer.writerow(["91", "0"])
-        writer.writerow(["0", "181"])
-
-    field_datatype = {
-        "latitude": "latitude",
-        "longitude": "longitude",
-    }
+        writer.writerow(["lat"])
+        writer.writerow(["0"])
+        writer.writerow(["51.5"])
+        writer.writerow(["-90"])
+        writer.writerow(["90"])
+        writer.writerow(["91"])
+
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_latitude(
+        conn, file_path=file_path, field="lat"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "latitude"
 
 
-def test_check_values_have_the_correct_datatype_flag(tmp_path):
-    """Test flag datatype validation with valid and invalid values."""
-    file_path = tmp_path / "flag_values.csv"
+def test_expect_column_to_be_longitude(tmp_path):
+    file_path = tmp_path / "expect_longitude.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["active"])
-        writer.writerow(["true"])
-        writer.writerow(["false"])
-        writer.writerow(["y"])
-        writer.writerow(["n"])
-        writer.writerow(["yes"])
-        writer.writerow(["no"])
-        writer.writerow(["maybe"])
-        writer.writerow(["1"])
+        writer.writerow(["lon"])
+        writer.writerow(["0"])
+        writer.writerow(["-0.1"])
+        writer.writerow(["-180"])
+        writer.writerow(["180"])
+        writer.writerow(["181"])
 
-    field_datatype = {"active": "flag"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_longitude(
+        conn, file_path=file_path, field="lon"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 4
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "longitude"
 
 
-def test_check_values_have_the_correct_datatype_hash(tmp_path):
-    """Test hash datatype validation with valid and invalid values."""
-    file_path = tmp_path / "hash_values.csv"
+def test_expect_column_to_be_hash(tmp_path):
+    file_path = tmp_path / "expect_hash.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["content_hash"])
-        writer.writerow(["abcdef123456"])
-        writer.writerow(["abc:1234567890abcdef"])
+        writer.writerow(["hash"])
+        writer.writerow(["abcdef"])
         writer.writerow(["sha:5d41402abc4b2a76b9719d911017c592"])
-        writer.writerow(["not-a-hash"])
         writer.writerow(["xyz:notahex"])
 
-    field_datatype = {"content_hash": "hash"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_hash(
+        conn, file_path=file_path, field="hash"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "hash"
 
 
-def test_check_values_have_the_correct_datatype_curie(tmp_path):
-    """Test curie datatype validation with valid and invalid values."""
-    file_path = tmp_path / "curie_values.csv"
+def test_expect_column_to_be_curie(tmp_path):
+    file_path = tmp_path / "expect_curie.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["identifier"])
+        writer.writerow(["id"])
         writer.writerow(["prefix:value"])
         writer.writerow(["org:entity123"])
-        writer.writerow(["schema:name"])
-        writer.writerow(["prefix:"])
         writer.writerow(["no_colon"])
-        writer.writerow(["prefix: space"])
 
-    field_datatype = {"identifier": "curie"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_curie(
+        conn, file_path=file_path, field="id"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 3
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "curie"
 
 
-def test_check_values_have_the_correct_datatype_curie_list(tmp_path):
-    """Test curie-list datatype validation with valid and invalid values."""
-    file_path = tmp_path / "curie_list_values.csv"
+def test_expect_column_to_be_curie_list(tmp_path):
+    file_path = tmp_path / "expect_curie_list.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["identifiers"])
-        writer.writerow(["prefix:value1;org:value2"])
+        writer.writerow(["ids"])
+        writer.writerow(["prefix:a;org:b"])
         writer.writerow(["schema:name"])
-        writer.writerow([""])
         writer.writerow(["not-valid"])
-        writer.writerow(["prefix: value"])
 
-    field_datatype = {"identifiers": "curie-list"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_curie_list(
+        conn, file_path=file_path, field="ids"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "curie-list"
 
 
-def test_check_values_have_the_correct_datatype_json(tmp_path):
-    """Test json datatype validation with valid and invalid JSON."""
-    file_path = tmp_path / "json_values.csv"
+def test_expect_column_to_be_json(tmp_path):
+    file_path = tmp_path / "expect_json.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["data"])
-        writer.writerow(['{"key":"value"}'])
-        writer.writerow(['{"nested":{"field":"value"}}'])
-        writer.writerow(["not json"])  # Invalid
-        writer.writerow(['{"incomplete":'])  # Invalid (malformed)
+        writer.writerow(["payload"])
+        writer.writerow(['{"a":1}'])
+        writer.writerow(["[1,2,3]"])
+        writer.writerow(["not json"])
 
-    field_datatype = {"data": "json"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_json(
+        conn, file_path=file_path, field="payload"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "json"
 
 
-def test_check_values_have_the_correct_datatype_url(tmp_path):
-    """Test url datatype validation with valid and invalid URLs."""
-    file_path = tmp_path / "url_values.csv"
+def test_expect_column_to_be_url(tmp_path):
+    file_path = tmp_path / "expect_url.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["website"])
+        writer.writerow(["url"])
         writer.writerow(["https://example.com"])
         writer.writerow(["http://test.org"])
-        writer.writerow(["ftp://files.example.com"])
-        writer.writerow(["not a url"])  # Invalid (no scheme)
-        writer.writerow(["example.com"])  # Invalid (no scheme)
+        writer.writerow(["example.com"])
 
-    field_datatype = {"website": "url"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
+    passed, _, details = expect_column_to_be_url(conn, file_path=file_path, field="url")
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "url"
 
 
-def test_check_values_have_the_correct_datatype_date(tmp_path):
-    """Test date datatype validation with valid and invalid dates."""
-    file_path = tmp_path / "date_values.csv"
+def test_expect_column_to_be_date(tmp_path):
+    file_path = tmp_path / "expect_date.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["start_date"])
-        writer.writerow(["2024-01-15"])
+        writer.writerow(["d"])
+        writer.writerow(["2024-01-01"])
         writer.writerow(["2023-12-31"])
-        writer.writerow(["2022-06-30"])
         writer.writerow(["not-a-date"])
-        writer.writerow(["2024-13-01"])
 
-    field_datatype = {"start_date": "date"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
+    passed, _, details = expect_column_to_be_date(conn, file_path=file_path, field="d")
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "date"
 
 
-def test_check_values_have_the_correct_datatype_datetime(tmp_path):
-    """Test datetime datatype validation with valid and invalid datetimes."""
-    file_path = tmp_path / "datetime_values.csv"
+def test_expect_column_to_be_datetime(tmp_path):
+    file_path = tmp_path / "expect_datetime.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["timestamp"])
-        writer.writerow(["2024-01-15T10:30:45"])
+        writer.writerow(["dt"])
+        writer.writerow(["2024-01-01T10:00:00"])
         writer.writerow(["2023-12-31T23:59:59Z"])
-        writer.writerow(["2022-06-30T12:00:00+00:00"])
         writer.writerow(["not-a-datetime"])
-        writer.writerow(["2024-13-01T10:00:00"])
 
-    field_datatype = {"timestamp": "datetime"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_datetime(
+        conn, file_path=file_path, field="dt"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "datetime"
 
 
-def test_check_values_have_the_correct_datatype_pattern(tmp_path):
-    """Test pattern datatype validation with valid and invalid regex patterns."""
-    file_path = tmp_path / "pattern_values.csv"
+def test_expect_column_to_be_pattern(tmp_path):
+    file_path = tmp_path / "expect_pattern.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
         writer.writerow(["regex"])
         writer.writerow(["^[A-Z]+$"])
-        writer.writerow(["\\d{3}-\\d{4}"])
         writer.writerow(["(foo|bar)"])
         writer.writerow(["["])
-        writer.writerow(["(unclosed"])
 
-    field_datatype = {"regex": "pattern"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_pattern(
+        conn, file_path=file_path, field="regex"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "pattern"
 
 
-def test_check_values_have_the_correct_datatype_point(tmp_path):
-    """Test point datatype validation (WKT format) with valid and invalid values."""
-    file_path = tmp_path / "point_values.csv"
+def test_expect_column_to_be_point(tmp_path):
+    file_path = tmp_path / "expect_point.csv"
     with open(file_path, "w", newline="") as f:
         writer = csv.writer(f)
-        writer.writerow(["geometry"])
+        writer.writerow(["geom"])
         writer.writerow(["POINT(0 0)"])
-        writer.writerow(["POINT(51.5074 -0.1278)"])
-        writer.writerow(["POINT(-33.8688 151.2093)"])
-        writer.writerow(["not wkt"])
+        writer.writerow(["POINT(1 2)"])
         writer.writerow(["POINT(0)"])
 
-    field_datatype = {"geometry": "point"}
-    conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
-
-    assert passed is False
-    assert len(details["invalid_rows"]) == 2
-
-
-def test_check_values_have_the_correct_datatype_multipolygon(tmp_path):
-    """Test multipolygon datatype validation (WKT format) with valid and invalid values."""
-    file_path = tmp_path / "multipolygon_values.csv"
-    with open(file_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["boundary"])
-        writer.writerow(["POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))"])
-        writer.writerow(
-            [
-                "MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)), ((20 20, 30 20, 30 30, 20 30, 20 20)))"
-            ]
-        )
-        writer.writerow(["not wkt"])  # Invalid
-        writer.writerow(["POINT(0 0)"])  # Invalid (not a polygon/multipolygon)
-
-    field_datatype = {"boundary": "multipolygon"}
     conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
+    passed, _, details = expect_column_to_be_point(
+        conn, file_path=file_path, field="geom"
     )
 
     assert passed is False
-    assert len(details["invalid_rows"]) == 2
-
-
-def test_check_values_have_the_correct_datatype_mixed_types(tmp_path):
-    """Test validation with multiple different datatypes in one file."""
-    file_path = tmp_path / "mixed_datatypes.csv"
-    with open(file_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["id", "price", "active", "latitude", "url", "date"])
-        writer.writerow(
-            ["org-001", "99.99", "true", "51.5074", "https://example.com", "2024-01-15"]
-        )
-        writer.writerow(
-            ["org-002", "150.50", "false", "-33.8688", "https://test.org", "2023-12-31"]
-        )
-        writer.writerow(
-            ["org 003", "invalid", "maybe", "91", "not-a-url", "not-a-date"]
-        )
-
-    field_datatype = {
-        "price": "decimal",
-        "active": "flag",
-        "latitude": "latitude",
-        "url": "url",
-        "date": "date",
-    }
-    conn = duckdb.connect()
-    passed, message, details = check_values_have_the_correct_datatype(
-        conn, file_path, field_datatype
-    )
-
-    assert passed is False
-    assert len(details["invalid_rows"]) == 5
+    assert len(details["invalid_rows"]) == 1
+    assert details["invalid_rows"][0]["datatype"] == "point"