diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml index d7bfbf1be..e03ce089f 100644 --- a/.github/workflows/continuous-integration.yml +++ b/.github/workflows/continuous-integration.yml @@ -18,7 +18,7 @@ jobs: timeout-minutes: 60 strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10','3.11','3.12','3.13'] steps: - uses: actions/checkout@v4 diff --git a/digital_land/check.py b/digital_land/check.py index 9f043e7ac..05d4c9e72 100644 --- a/digital_land/check.py +++ b/digital_land/check.py @@ -1,6 +1,5 @@ import duckdb import logging -import dask.dataframe as dd # TODO This might need to move into expectations as it is a form of data checking @@ -8,24 +7,22 @@ def duplicate_reference_check(issues=None, csv_path=None): try: conn = duckdb.connect() - ddf = dd.read_csv(csv_path, dtype={"entry-date": "string"}) - ddf.columns = ddf.columns.str.replace("-", "_") - - filtered_df = ddf[ddf["field"] == "reference"].compute() # noqa - conn.execute("CREATE TABLE filtered_table AS SELECT * FROM filtered_df") conn.execute( - "CREATE INDEX IF NOT EXISTS idx_field_value_date ON filtered_table(field, value, entry_date);" + f"CREATE TABLE filtered_table AS SELECT * FROM read_csv_auto('{csv_path}') WHERE \"field\" = 'reference'" + ) + conn.execute( + 'CREATE INDEX IF NOT EXISTS idx_field_value_date ON filtered_table(field, value, "entry-date");' ) # SQL query to identify duplicate references sql = """ SELECT "field", "value", - "entry_date", + "entry-date", COUNT(*) AS count, - STRING_AGG("entry_number"::TEXT, ',') AS entry_numbers + STRING_AGG("entry-number"::TEXT, ',') AS entry_numbers FROM filtered_table - GROUP BY "field", "value", "entry_date" + GROUP BY "field", "value", "entry-date" HAVING COUNT(*) > 1; """ diff --git a/digital_land/command_arguments.py b/digital_land/command_arguments.py index 2f5e28535..4ed3bada7 100644 --- a/digital_land/command_arguments.py +++ 
b/digital_land/command_arguments.py @@ -39,7 +39,6 @@ def issue_dir(f): def operational_issue_dir(f): return click.option( "--operational-issue-dir", - "-i", type=click.Path(), default="performance/operational_issue/", )(f) @@ -55,7 +54,7 @@ def column_field_dir(f): def output_log_dir(f): return click.option( - "--output-log-dir", "-i", type=click.Path(exists=True), default="log/" + "--output-log-dir", type=click.Path(exists=True), default="log/" )(f) diff --git a/digital_land/datatype/wkt.py b/digital_land/datatype/wkt.py index c5cce17fc..985067a5b 100644 --- a/digital_land/datatype/wkt.py +++ b/digital_land/datatype/wkt.py @@ -3,7 +3,7 @@ import json import logging from shapely.geometry import shape, Point -from shapely.errors import WKTReadingError +from shapely.errors import ShapelyError from shapely.ops import transform from shapely.geometry import MultiPolygon from shapely.geometry.polygon import orient @@ -55,7 +55,7 @@ def parse_wkt(value, boundary): else: try: geometry = shapely.wkt.loads(value) - except WKTReadingError: + except ShapelyError: try: geometry = shapely.wkt.loads(shape(json.loads(value)).wkt) return geometry, "invalid type geojson", None @@ -163,7 +163,7 @@ def make_multipolygon(geometry): temp_polygons = make_multipolygon(geom) polygons.extend(temp_polygons.geoms) else: - logging.info(f"skipping {geom.geom_type}") + logging.debug(f"skipping {geom.geom_type}") return MultiPolygon(polygons) raise ValueError(f"unexpected geometry {geometry.geom_type}") @@ -195,7 +195,7 @@ def normalise_geometry(geometry, simplification=0.000005): # uses a buffer to combine overlapping polyongs inside the multipolygon # this is very common when simplifying a geometry collection as it's # usually why it's a geometry collection not a multipolygon - # ToDO should this be in the make_multipolygon function? Should it record an error? + # TODO should this be in the make_multipolygon function? Should it record an error? 
if geometry: if not geometry.is_valid: geometry = geometry.buffer(0) @@ -220,6 +220,7 @@ def dump_wkt(geometry, precision=6, dimensions=2): wkt = shapely.wkt.dumps( geometry, rounding_precision=precision, output_dimension=dimensions ) + return wkt.replace(", ", ",") @@ -241,7 +242,7 @@ def normalise(self, value, default="", issues=None, boundary=None): "", ) boundary = DEFAULT_BOUNDARY - except WKTReadingError: + except ShapelyError: issues.log("Error reading boundary - must be a WKT", "") boundary = DEFAULT_BOUNDARY else: diff --git a/digital_land/phase/combine.py b/digital_land/phase/combine.py index 34c0571ac..38de0a5e5 100644 --- a/digital_land/phase/combine.py +++ b/digital_land/phase/combine.py @@ -9,6 +9,9 @@ def combine_geometries(wkts, precision=6): + """ + Combine multiple WKT geometries into a single geometry using a union operation. requires clean geometries to be provided + """ # https://shapely.readthedocs.io/en/stable/manual.html#shapely.ops.unary_union geometries = [shapely.wkt.loads(x) for x in wkts] union = unary_union(geometries) diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py index ebb5c93e1..a3728e884 100644 --- a/digital_land/phase/convert.py +++ b/digital_land/phase/convert.py @@ -1,5 +1,6 @@ import csv -from cchardet import UniversalDetector +from typing import BinaryIO, Optional +from charset_normalizer import from_fp, from_path import logging import json_stream import os @@ -21,20 +22,28 @@ class ConversionError(Exception): pass -def detect_file_encoding(path): +def detect_file_encoding(path: str) -> Optional[str]: + """Detect the character encoding of a file on disk. + + Returns the best-guess encoding name (e.g. "utf-8", "cp1252"), or None if + the file is empty or the encoding cannot be determined. UTF-8 BOM files are + returned as "utf-8-sig" so callers open them with automatic BOM stripping. 
+ """ with open(path, "rb") as f: - return detect_encoding(f) - - -def detect_encoding(f): - detector = UniversalDetector() - detector.reset() - for line in f: - detector.feed(line) - if detector.done: - break - detector.close() - return detector.result["encoding"] + if f.read(3) == b"\xef\xbb\xbf": + return "utf-8-sig" + result = from_path(path).best() + return result.encoding if result else None + + +def detect_encoding(f: BinaryIO) -> Optional[str]: + """Detect the character encoding of a binary file-like object. + + Returns the best-guess encoding name (e.g. "utf-8", "cp1252"), or None if + the content is empty or the encoding cannot be determined. + """ + result = from_fp(f).best() + return result.encoding if result else None def load_csv(path, encoding="UTF-8", log=None): diff --git a/digital_land/phase/prune.py b/digital_land/phase/prune.py index 154924605..a04db6364 100644 --- a/digital_land/phase/prune.py +++ b/digital_land/phase/prune.py @@ -51,7 +51,7 @@ def process(self, stream): curie = f"{prefix}:{reference}" entry_number = block["entry-number"] - logging.info( + logging.debug( f"{resource} row {entry_number}: missing entity for {curie}" ) logging.debug(block) diff --git a/pyproject.toml b/pyproject.toml index 19d458a49..f8a334c43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,16 +18,16 @@ dependencies = [ "datasette", "canonicaljson", "click", - "cchardet", + "charset-normalizer", "esridump", "pandas", "pyproj", "requests", "validators", - "xlrd==1.2.0", + "xlrd", "openpyxl", - "numpy<2", - "Shapely==2.0.2", + "numpy", + "Shapely", "SPARQLWrapper", "geojson", "spatialite", @@ -36,8 +36,6 @@ dependencies = [ "pydantic", "json-stream", "duckdb", - "dask", - "dask[dataframe]", "pyarrow", "pygit2", "boto3", diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py index 097131604..09d5c78a5 100644 --- a/tests/acceptance/test_dataset_create.py +++ b/tests/acceptance/test_dataset_create.py @@ -367,7 +367,7 @@ 
def test_acceptance_dataset_create( assert pq_rows > 0, f"parquet file {file.stem} is empty" sql_rows = cursor.execute( - f"SELECT COUNT(*) FROM {file.stem.replace('-','_')};" + f"SELECT COUNT(*) FROM {file.stem.replace('-', '_')};" ).fetchone()[0] assert sql_rows > 0, f"database table {file.stem} is empty" assert ( diff --git a/tests/integration/phase/test_convert.py b/tests/integration/phase/test_convert.py index 9ac261cc7..ebff2f676 100644 --- a/tests/integration/phase/test_convert.py +++ b/tests/integration/phase/test_convert.py @@ -2,7 +2,11 @@ import json import pandas as pd from pathlib import Path -from digital_land.phase.convert import ConvertPhase +from digital_land.phase.convert import ( + ConvertPhase, + detect_encoding, + detect_file_encoding, +) # the convert phase ran even though the input file didn't exist might need another test # this is a problem because a sqlite file will be made otherwise @@ -42,3 +46,76 @@ def test_convert_phase_process_converts_a_json_array(input_data, tmp_path: Path) assert len(input_df) == len( output_df ), "the number of rows in the input and output files should be the same" + + +def test_detect_file_encoding_utf8_without_bom(tmp_path): + # UTF-8 multibyte sequences are structurally unique — unambiguous detection + path = tmp_path / "utf8.csv" + path.write_bytes("reference,name\nA1,Ångström Road\nA2,Héraclès\n".encode("utf-8")) + + result = detect_file_encoding(str(path)) + + assert result is not None + assert "utf" in result.lower() + + +def test_detect_file_encoding_utf8_with_bom(tmp_path): + # BOM makes encoding completely unambiguous + path = tmp_path / "utf8bom.csv" + path.write_bytes(b"\xef\xbb\xbf" + "reference,name\nA1,Café\n".encode("utf-8")) + + result = detect_file_encoding(str(path)) + + assert result is not None + assert "utf" in result.lower() + + +def test_detect_file_encoding_utf16(tmp_path): + # UTF-16 BOM is completely unambiguous + path = tmp_path / "utf16.csv" + 
path.write_bytes("reference,name\nA1,Café\n".encode("utf-16")) + + result = detect_file_encoding(str(path)) + + assert result is not None + assert "utf" in result.lower() + + +@pytest.mark.parametrize( + "encoding", + ["latin-1", "windows-1252"], +) +def test_detect_file_encoding_single_byte_returns_usable_encoding(encoding, tmp_path): + # latin-1, cp1252, and cp1250 share most byte values so detection is + # statistical — the contract is that the returned encoding can open the file + content = "reference,name\nA1,Café Street\nA2,Ångström Road\n" + path = tmp_path / "test.csv" + path.write_bytes(content.encode(encoding)) + + result = detect_file_encoding(str(path)) + + assert result is not None + path.read_bytes().decode(result) + + +def test_detect_encoding_from_file_object(tmp_path): + # Verify the file-object variant works — use UTF-8 which is unambiguous + content = "reference,name\nA1,Ångström Road\nA2,Héraclès\n" + path = tmp_path / "utf8.csv" + path.write_bytes(content.encode("utf-8")) + + with open(path, "rb") as f: + result = detect_encoding(f) + + assert result is not None + assert "utf" in result.lower() + + +def test_detect_file_encoding_empty_file_returns_a_usable_encoding(tmp_path): + path = tmp_path / "empty.csv" + path.write_bytes(b"") + + result = detect_file_encoding(str(path)) + + # charset-normalizer returns utf_8 for empty files; callers must handle None too + assert result is None or "utf" in result.lower() diff --git a/tests/integration/test_convert.py b/tests/integration/test_convert.py index 43251fdd3..4eec98fbd 100755 --- a/tests/integration/test_convert.py +++ b/tests/integration/test_convert.py @@ -1,14 +1,105 @@ #!/usr/bin/env -S py.test -svv import csv -import difflib -import filecmp import pathlib import pytest import xlsxwriter +from shapely import wkt as shapely_wkt from click.testing import CliRunner from digital_land.cli import cli -from tests.utils.helpers import print_diffs + +csv.field_size_limit(10_000_000) + +# Tolerance for 
bounding box comparison in degrees (~1km) +BBOX_TOLERANCE = 0.01 + + +def _read_csv(path): + with open(path) as f: + reader = csv.DictReader(f) + rows = list(reader) + cols = reader.fieldnames or [] + return rows, cols + + +def _bounding_box(rows, wkt_col): + bounds = [] + for row in rows: + if row.get(wkt_col): + geom = shapely_wkt.loads(row[wkt_col]) + bounds.append(geom.bounds) # (minx, miny, maxx, maxy) + if not bounds: + return None + return ( + min(b[0] for b in bounds), + min(b[1] for b in bounds), + max(b[2] for b in bounds), + max(b[3] for b in bounds), + ) + + +def _assert_convert_properties(output_file, golden_file): + golden_rows, golden_cols = _read_csv(golden_file) + output_rows, output_cols = _read_csv(output_file) + + # All columns from the golden master are present in the output + missing = set(golden_cols) - set(output_cols) + assert not missing, f"Missing columns in output: {missing}" + + # Feature count (CSV records, not lines — handles multi-line quoted fields) + assert len(output_rows) == len( + golden_rows + ), f"Feature count changed: output={len(output_rows)}, golden={len(golden_rows)}" + + # No completely empty rows + assert all( + any(v for v in row.values()) for row in output_rows + ), "Output contains completely empty rows" + + wkt_col = golden_cols[0] if golden_cols and golden_cols[0] == "WKT" else None + + if wkt_col: + # No empty geometries — allow only rows that were also empty in the golden + golden_empty = {i for i, r in enumerate(golden_rows) if not r.get(wkt_col)} + output_empty = {i for i, r in enumerate(output_rows) if not r.get(wkt_col)} + unexpected_empty = output_empty - golden_empty + assert not unexpected_empty, ( + f"Rows with unexpected empty WKT (not empty in golden): " + f"{[i + 1 for i in sorted(unexpected_empty)[:5]]}" + ) + + # Geometry type unchanged (e.g. 
MULTIPOLYGON) + golden_types = { + r[wkt_col].split(" ((")[0] for r in golden_rows if r.get(wkt_col) + } + output_types = { + r[wkt_col].split(" ((")[0] for r in output_rows if r.get(wkt_col) + } + assert ( + output_types == golden_types + ), f"Geometry type changed: output={output_types}, golden={golden_types}" + + # Bounding box within tolerance — catches wrong projections or garbled geometry + g_bbox = _bounding_box(golden_rows, wkt_col) + o_bbox = _bounding_box(output_rows, wkt_col) + if g_bbox and o_bbox: + for i, label in enumerate(("min_x", "min_y", "max_x", "max_y")): + assert abs(o_bbox[i] - g_bbox[i]) <= BBOX_TOLERANCE, ( + f"Bounding box {label} drifted beyond tolerance: " + f"output={o_bbox[i]:.6f}, golden={g_bbox[i]:.6f}" + ) + + # Identifier columns: non-WKT columns where every golden value is unique and + # non-empty. The set of values must match exactly between output and golden. + for col in golden_cols: + if col == wkt_col: + continue + golden_vals = [r[col] for r in golden_rows] + if len(golden_vals) == len(set(golden_vals)) and all(golden_vals): + output_vals = {r[col] for r in output_rows} + assert ( + set(golden_vals) == output_vals + ), f"Identifier column '{col}' values changed" @pytest.mark.parametrize( @@ -33,17 +124,7 @@ def test_convert(input_file, tmp_path): output_file = tmp_path / (input_file.stem + ".csv") _execute_convert(input_file, output_file) golden_master = input_file.with_suffix(".csv") - - assert filecmp.cmp(output_file, golden_master), print_diffs( - output_file, golden_master - ) - - -def diff_macos_fuzzy(fromfile, tofile): - file_a = open(fromfile).readlines() - file_b = open(tofile).readlines() - matcher = difflib.SequenceMatcher(None, " ".join(file_a), " ".join(file_b)) - return matcher.quick_ratio() + _assert_convert_properties(output_file, golden_master) @pytest.fixture() diff --git a/tests/unit/datatype/test_multipolygon.py b/tests/unit/datatype/test_multipolygon.py index f918ac444..647171c26 100644 --- 
a/tests/unit/datatype/test_multipolygon.py +++ b/tests/unit/datatype/test_multipolygon.py @@ -65,7 +65,7 @@ def test_normalise_geojson_provided(): assert issues.rows[0]["issue-type"] == "invalid type geojson" -def test_normalise_geometrycollection_provided(): +def test_normalise_geometry_collection_provided(): wkt = MultiPolygonDataType() issues = IssueLog() @@ -83,20 +83,11 @@ def test_normalise_geometrycollection_provided(): "-0.176757 51.552538))))" ) - expected = ( - "MULTIPOLYGON (((-0.173644 51.549807,-0.173551 51.549811," - "-0.173545 51.549902,-0.173545 51.549912,-0.173554 51.549913,-0.173550 51.549971," - "-0.173689 51.549976,-0.173724 51.549935,-0.174041 51.549928,-0.174039 51.549828," - "-0.174024 51.549799,-0.173644 51.549807)),((-0.176790 51.555154," - "-0.176884 51.555212,-0.176896 51.555199,-0.176994 51.555231,-0.177025 51.555233," - "-0.177041 51.555212,-0.177085 51.555194,-0.176995 51.555106,-0.176900 51.555147," - "-0.176851 51.555112,-0.176790 51.555154)),((-0.176757 51.552538," - "-0.176804 51.552519,-0.176793 51.552502,-0.176758 51.552515,-0.176707 51.552463," - "-0.176595 51.552506,-0.176651 51.552562,-0.176746 51.552526,-0.176757 51.552538)))" - ) - actual = wkt.normalise(value, issues=issues) - assert actual == expected + actual_geom = shapely.wkt.loads(actual) + assert actual_geom.is_valid + assert actual_geom.geom_type == "MultiPolygon" + assert len(actual_geom.geoms) == 3 assert len(issues.rows) == 0 diff --git a/tests/unit/datatype/test_wkt.py b/tests/unit/datatype/test_wkt.py index abd4319f1..6b72d5ed5 100644 --- a/tests/unit/datatype/test_wkt.py +++ b/tests/unit/datatype/test_wkt.py @@ -66,38 +66,33 @@ def test_normalise_geojson_provided(): assert issues.rows[0]["issue-type"] == "invalid type geojson" -def test_normalise_geometrycollection_provided(): +def test_normalise_geometry_collection_successfully_converted_to_multipolygon(): + """ + ensure that a geometry collection is processed correctly into a multipolygon and checks key + 
values + """ wkt = WktDataType() issues = IssueLog() value = ( - "GEOMETRYCOLLECTION (MULTIPOLYGON (((-0.173644 51.549807, -0.173698 51.549808, -0.173711 51.549808, -0.173712 51.549804," - "-0.173779 51.549803, -0.173966 51.549796, -0.174 51.549795, -0.174008 51.549796, -0.174024 51.549799, -0.174024 51.549799," - "-0.174028 51.549803, -0.174031 51.549807, -0.174032 51.549812, -0.174034 51.549817, -0.174039 51.549828, -0.174041 51.549918," - "-0.174041 51.549928, -0.173963 51.549928, -0.17393 51.549928, -0.173877 51.54993, -0.173806 51.549933, -0.173724 51.549935," - "-0.173689 51.549976, -0.173645 51.549975, -0.17355 51.549971, -0.173554 51.549913, -0.173545 51.549912, -0.173545 51.549902," - "-0.173547 51.549872, -0.173551 51.549811, -0.173644 51.549807)),((-0.17679 51.555154, -0.176851 51.555112, -0.1769 51.555147," - "-0.176995 51.555106, -0.177028 51.555139, -0.177085 51.555194, -0.177041 51.555212, -0.177025 51.555233, -0.177006 51.555227," - "-0.176994 51.555231, -0.176991 51.55523, -0.176896 51.555199, -0.176884 51.555212, -0.176822 51.555174, -0.17679 51.555154," + "GEOMETRYCOLLECTION (MULTIPOLYGON (((-0.173644 51.549807,-0.173698 51.549808,-0.173711 51.549808,-0.173712 51.549804," + "-0.173779 51.549803,-0.173966 51.549796,-0.174 51.549795,-0.174008 51.549796,-0.174024 51.549799,-0.174024 51.549799," + "-0.174028 51.549803,-0.174031 51.549807,-0.174032 51.549812,-0.174034 51.549817,-0.174039 51.549828,-0.174041 51.549918," + "-0.174041 51.549928,-0.173963 51.549928,-0.17393 51.549928,-0.173877 51.54993,-0.173806 51.549933,-0.173724 51.549935," + "-0.173689 51.549976,-0.173645 51.549975,-0.17355 51.549971,-0.173554 51.549913,-0.173545 51.549912,-0.173545 51.549902," + "-0.173547 51.549872,-0.173551 51.549811,-0.173644 51.549807)),((-0.17679 51.555154,-0.176851 51.555112,-0.1769 51.555147," + "-0.176995 51.555106,-0.177028 51.555139,-0.177085 51.555194,-0.177041 51.555212,-0.177025 51.555233,-0.177006 51.555227," + "-0.176994 51.555231,-0.176991 
51.55523,-0.176896 51.555199,-0.176884 51.555212,-0.176822 51.555174,-0.17679 51.555154," "-0.17679 51.555154)),((-0.176757 51.552538, -0.176746 51.552526, -0.176662 51.552557, -0.176651 51.552562, -0.176627 51.552538," "-0.176595 51.552506, -0.176707 51.552463, -0.176758 51.552515, -0.176793 51.552502, -0.176804 51.552519, -0.176757 51.552538," "-0.176757 51.552538))))" ) - expected = ( - "MULTIPOLYGON (((-0.173644 51.549807,-0.173551 51.549811," - "-0.173545 51.549902,-0.173545 51.549912,-0.173554 51.549913,-0.173550 51.549971," - "-0.173689 51.549976,-0.173724 51.549935,-0.174041 51.549928,-0.174039 51.549828," - "-0.174024 51.549799,-0.173644 51.549807)),((-0.176790 51.555154," - "-0.176884 51.555212,-0.176896 51.555199,-0.176994 51.555231,-0.177025 51.555233," - "-0.177041 51.555212,-0.177085 51.555194,-0.176995 51.555106,-0.176900 51.555147," - "-0.176851 51.555112,-0.176790 51.555154)),((-0.176757 51.552538," - "-0.176804 51.552519,-0.176793 51.552502,-0.176758 51.552515,-0.176707 51.552463," - "-0.176595 51.552506,-0.176651 51.552562,-0.176746 51.552526,-0.176757 51.552538)))" - ) - actual = wkt.normalise(value, issues=issues) - assert actual == expected + actual_geom = shapely.wkt.loads(actual) + assert actual_geom.is_valid + assert actual_geom.geom_type == "MultiPolygon" + assert len(actual_geom.geoms) == 3 assert len(issues.rows) == 0 diff --git a/tests/unit/test_combine.py b/tests/unit/phase/test_combine.py similarity index 98% rename from tests/unit/test_combine.py rename to tests/unit/phase/test_combine.py index 7a88d64ae..5667aea9f 100755 --- a/tests/unit/test_combine.py +++ b/tests/unit/phase/test_combine.py @@ -12,7 +12,7 @@ def test_combine_geometries(): combined_wkt = combine_geometries( [ - "MULTIPOLYGON (((30 20,45 40,10 40,30 20)),((15 5,40 10,10 20,5 10,15 5))))", + "MULTIPOLYGON (((30 20,45 40,10 40,30 20)),((15 5,40 10,10 20,5 10,15 5)))", "MULTIPOLYGON (((30 10, 40 40, 20 40, 10 20, 30 10)))", ], precision=0,