Skip to content

Commit 1dbbad9

Browse files
authored
Feat/csv expectations (#478)
* add new CSV checkpoint for generic use on a csv * alter how csv tests run * read as all strings from csv for some operations * add extra test * make a generic function for building the read csv string * fix old operation tests * add new test for overlapping ranges * update to BIGINT * add extra BIGINT test cases
1 parent 57fbacd commit 1dbbad9

12 files changed

Lines changed: 583 additions & 2 deletions

File tree

digital_land/cli.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,49 @@ def expectations_run_dataset_checkpoint(
462462
run_dataset_checkpoint(dataset, file_path, output_dir, config, organisations)
463463

464464

465+
@cli.command(
    "expectations-csv-checkpoint",
    short_help="runs data quality expectations against a CSV file using duckdb",
)
@click.option(
    "--dataset",
    type=click.STRING,
    help="the dataset name for logging purposes",
    required=True,
)
@click.option(
    "--file-path",
    type=click.Path(),
    help="path to the CSV file to run expectations against",
    required=True,
)
@click.option(
    "--log-dir",
    type=click.Path(),
    help="directory to store expectation logs",
    required=True,
)
@click.option(
    "--rules",
    type=click.STRING,
    help="JSON string containing the list of expectation rules",
    required=True,
)
def expectations_run_csv_checkpoint(
    dataset,
    file_path,
    log_dir,
    rules,
):
    """Run expectation rules against a CSV file; logs go to <log-dir>/expectation."""
    import json

    from digital_land.expectations.commands import run_csv_checkpoint

    # Parse the rules up front so a malformed --rules value produces a clear
    # CLI error instead of an unhandled JSONDecodeError traceback.
    try:
        parsed_rules = json.loads(rules)
    except json.JSONDecodeError as err:
        raise click.BadParameter(
            f"--rules is not valid JSON: {err}", param_hint="--rules"
        )

    output_dir = Path(log_dir) / "expectation"
    run_csv_checkpoint(dataset, file_path, output_dir, parsed_rules)
506+
507+
465508
@cli.command("retire-endpoints-and-sources")
466509
@config_collections_dir
467510
@click.argument("csv-path", nargs=1, type=click.Path())
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import json
2+
import duckdb
3+
from pathlib import Path
4+
5+
from .base import BaseCheckpoint
6+
from ..log import ExpectationLog
7+
from ..operations.csv import (
8+
count_rows,
9+
check_unique,
10+
check_no_shared_values,
11+
check_no_overlapping_ranges,
12+
)
13+
14+
15+
class CsvCheckpoint(BaseCheckpoint):
    """Runs a set of expectation operations against a single CSV file using duckdb."""

    def __init__(self, dataset, file_path):
        self.dataset = dataset
        self.file_path = Path(file_path)
        self.log = ExpectationLog(dataset=dataset)
        # Populated by load(); initialised here so calling run() before
        # load() is a harmless no-op rather than an AttributeError.
        self.expectations = []

    def operation_factory(self, operation_string: str):
        """Map an operation name from a rule onto its callable.

        Raises:
            ValueError: if operation_string is not a known operation.
        """
        operation_map = {
            "count_rows": count_rows,
            "check_unique": check_unique,
            "check_no_shared_values": check_no_shared_values,
            "check_no_overlapping_ranges": check_no_overlapping_ranges,
        }
        if operation_string not in operation_map:
            raise ValueError(
                f"Unknown operation: '{operation_string}'. Must be one of {list(operation_map.keys())}."
            )
        return operation_map[operation_string]

    def load(self, rules):
        """Translate raw rule dicts into expectation dicts ready to run.

        "parameters" may be given as a JSON string or an already-parsed
        mapping; it defaults to {} when absent so parameterless operations
        don't need an explicit empty value.
        """
        self.expectations = []
        for rule in rules:
            parameters = rule.get("parameters", {})
            if isinstance(parameters, str):
                parameters = json.loads(parameters)
            expectation = {
                "operation": self.operation_factory(rule["operation"]),
                "name": rule["name"],
                "description": rule.get("description", ""),
                "dataset": self.dataset,
                "severity": rule.get("severity", ""),
                "responsibility": rule.get("responsibility", ""),
                "parameters": parameters,
            }
            self.expectations.append(expectation)

    def run_expectation(self, conn, expectation) -> tuple:
        """Execute one expectation, returning (passed, message, details)."""
        params = expectation["parameters"]
        passed, msg, details = expectation["operation"](
            conn=conn, file_path=self.file_path, **params
        )
        return passed, msg, details

    def run(self):
        """Run every loaded expectation against the CSV and record each result."""
        with duckdb.connect() as conn:
            for expectation in self.expectations:
                passed, message, details = self.run_expectation(conn, expectation)
                self.log.add(
                    {
                        "organisation": "",
                        "name": expectation["name"],
                        "passed": passed,
                        "message": message,
                        "details": details,
                        "description": expectation["description"],
                        "severity": expectation["severity"],
                        "responsibility": expectation["responsibility"],
                        "operation": expectation["operation"].__name__,
                        "parameters": expectation["parameters"],
                    }
                )

    def save(self, output_dir: Path):
        """Persist the accumulated expectation log as parquet under output_dir."""
        self.log.save_parquet(output_dir)

digital_land/expectations/checkpoints/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from .base import BaseCheckpoint
99
from ..log import ExpectationLog
10-
from ..operation import (
10+
from ..operations.dataset import (
1111
count_lpa_boundary,
1212
count_deleted_entities,
1313
duplicate_geometry_check,

digital_land/expectations/commands.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .checkpoints.dataset import DatasetCheckpoint
2+
from .checkpoints.csv import CsvCheckpoint
23

34
from digital_land.configuration.main import Config
45
from digital_land.organisation import Organisation
@@ -29,3 +30,18 @@ def run_dataset_checkpoint(
2930
# TODO add failure on critical error back in
3031
if act_on_critical_error:
3132
checkpoint.act_on_critical_error()
33+
34+
35+
def run_csv_checkpoint(
    dataset,
    file_path,
    output_dir,
    rules,
):
    """
    Execute a set of expectation rules over a CSV file (via duckdb) and
    write the resulting expectation log to output_dir.
    """
    # Build the checkpoint, load its rules, execute them, then persist the log.
    csv_checkpoint = CsvCheckpoint(dataset, file_path)
    csv_checkpoint.load(rules)
    csv_checkpoint.run()
    csv_checkpoint.save(output_dir)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from .dataset import ( # noqa: F401
2+
count_lpa_boundary,
3+
count_deleted_entities,
4+
check_columns,
5+
duplicate_geometry_check,
6+
)
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
from pathlib import Path
2+
3+
4+
def _read_csv(file_path: Path) -> str:
5+
return f"read_csv_auto('{str(file_path)}',all_varchar=true,delim=',',quote='\"',escape='\"')"
6+
7+
8+
def count_rows(
    conn, file_path: Path, expected: int, comparison_rule: str = "greater_than"
):
    """
    Counts the number of rows in the CSV and compares against an expected value.

    Args:
        conn: duckdb connection
        file_path: path to the CSV file
        expected: the expected row count
        comparison_rule: how to compare actual vs expected; one of
            equals_to, not_equal_to, greater_than, greater_than_or_equal_to,
            less_than, less_than_or_equal_to

    Returns:
        (passed, message, details) tuple.

    Raises:
        ValueError: if comparison_rule is not recognised (raised before the
            CSV is scanned).
    """
    # Comparators are lazy callables so an invalid comparison_rule fails
    # fast, before the (potentially expensive) CSV scan runs.
    comparison_rules = {
        "equals_to": lambda actual: actual == expected,
        "not_equal_to": lambda actual: actual != expected,
        "greater_than": lambda actual: actual > expected,
        "greater_than_or_equal_to": lambda actual: actual >= expected,
        "less_than": lambda actual: actual < expected,
        "less_than_or_equal_to": lambda actual: actual <= expected,
    }
    if comparison_rule not in comparison_rules:
        raise ValueError(
            f"Invalid comparison_rule: '{comparison_rule}'. Must be one of {list(comparison_rules.keys())}."
        )

    result = conn.execute(f"SELECT COUNT(*) FROM {_read_csv(file_path)}").fetchone()
    actual = result[0]

    passed = comparison_rules[comparison_rule](actual)
    message = f"there were {actual} rows found"
    details = {
        "actual": actual,
        "expected": expected,
    }

    return passed, message, details
46+
47+
def check_unique(conn, file_path: Path, field: str):
    """
    Checks that all values in a given field are unique.

    Args:
        conn: duckdb connection
        file_path: path to the CSV file
        field: the column name to check for uniqueness

    Returns:
        (passed, message, details) tuple; details lists each duplicated
        value with its occurrence count.
    """
    # Double any embedded double-quotes so the column name is safe inside
    # a quoted SQL identifier.
    ident = field.replace('"', '""')
    result = conn.execute(
        f'SELECT "{ident}", COUNT(*) as cnt FROM {_read_csv(file_path)} GROUP BY "{ident}" HAVING cnt > 1'
    ).fetchall()

    duplicates = [{"value": row[0], "count": row[1]} for row in result]

    if len(duplicates) == 0:
        passed = True
        message = f"all values in '{field}' are unique"
    else:
        passed = False
        message = f"there were {len(duplicates)} duplicate values in '{field}'"

    details = {
        "field": field,
        "duplicates": duplicates,
    }

    return passed, message, details
76+
77+
def check_no_shared_values(conn, file_path: Path, field_1: str, field_2: str):
    """
    Checks that no value appears in both field_1 and field_2.

    Empty strings and NULLs are ignored (an empty cell in both columns
    does not count as a shared value).

    Args:
        conn: duckdb connection
        file_path: path to the CSV file
        field_1: the first column name
        field_2: the second column name

    Returns:
        (passed, message, details) tuple; details lists the shared values.
    """
    # Double any embedded double-quotes so the column names are safe
    # inside quoted SQL identifiers.
    ident_1 = field_1.replace('"', '""')
    ident_2 = field_2.replace('"', '""')
    result = conn.execute(
        f"""
        SELECT DISTINCT a."{ident_1}" as value
        FROM {_read_csv(file_path)} a
        WHERE a."{ident_1}" IN (SELECT "{ident_2}" FROM {_read_csv(file_path)})
        AND a."{ident_1}" IS NOT NULL AND a."{ident_1}" != ''
        """
    ).fetchall()

    shared_values = [row[0] for row in result]

    if len(shared_values) == 0:
        passed = True
        message = f"no shared values between '{field_1}' and '{field_2}'"
    else:
        passed = False
        message = f"there were {len(shared_values)} shared values between '{field_1}' and '{field_2}'"

    details = {
        "field_1": field_1,
        "field_2": field_2,
        "shared_values": shared_values,
    }

    return passed, message, details
113+
114+
def check_no_overlapping_ranges(conn, file_path: Path, min_field: str, max_field: str):
    """
    Checks that no ranges overlap between rows.

    Two ranges [a_min, a_max] and [b_min, b_max] overlap if:
        a_min <= b_max AND a_max >= b_min

    Range bounds are compared numerically (CAST to BIGINT); the raw
    (string) cell values are reported in details.

    Args:
        conn: duckdb connection
        file_path: path to the CSV file
        min_field: the column name for the range minimum
        max_field: the column name for the range maximum

    Returns:
        (passed, message, details) tuple; details lists each overlapping
        pair of ranges.
    """
    # Double any embedded double-quotes so the column names are safe
    # inside quoted SQL identifiers.
    min_ident = min_field.replace('"', '""')
    max_ident = max_field.replace('"', '""')
    # Pair rows on a synthetic row number rather than on
    # a.min < b.min: joining on the minimum value would silently skip
    # pairs of rows whose minimums are EQUAL, which always overlap.
    result = conn.execute(
        f"""
        WITH ranges AS (
            SELECT
                "{min_ident}" AS range_min,
                "{max_ident}" AS range_max,
                row_number() OVER () AS rn
            FROM {_read_csv(file_path)}
        )
        SELECT
            a.range_min as a_min,
            a.range_max as a_max,
            b.range_min as b_min,
            b.range_max as b_max
        FROM ranges a
        JOIN ranges b
        ON a.rn < b.rn
        WHERE CAST(a.range_min AS BIGINT) <= CAST(b.range_max AS BIGINT)
        AND CAST(a.range_max AS BIGINT) >= CAST(b.range_min AS BIGINT)
        """
    ).fetchall()

    overlaps = [
        {"range_1": [row[0], row[1]], "range_2": [row[2], row[3]]} for row in result
    ]

    if len(overlaps) == 0:
        passed = True
        message = f"no overlapping ranges found between '{min_field}' and '{max_field}'"
    else:
        passed = False
        message = f"there were {len(overlaps)} overlapping ranges found"

    details = {
        "min_field": min_field,
        "max_field": max_field,
        "overlaps": overlaps,
    }

    return passed, message, details
File renamed without changes.

0 commit comments

Comments
 (0)