digital-land · eveleighoj · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
diff --git a/.github/workflows/continuous-integration.yml b/.github/workflows/continuous-integration.yml
@@ -18,7 +18,7 @@ jobs:
     timeout-minutes: 60
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9', '3.10','3.11','3.12','3.13']
 
     steps:
     - uses: actions/checkout@v4

diff --git a/digital_land/check.py b/digital_land/check.py
@@ -1,31 +1,28 @@
 import duckdb
 import logging
-import dask.dataframe as dd
 
 
 # TODO This might need to move into expectations as it is a form of data checking
 def duplicate_reference_check(issues=None, csv_path=None):
     try:
         conn = duckdb.connect()
 
-        ddf = dd.read_csv(csv_path, dtype={"entry-date": "string"})
-        ddf.columns = ddf.columns.str.replace("-", "_")
-
-        filtered_df = ddf[ddf["field"] == "reference"].compute()  # noqa
-        conn.execute("CREATE TABLE filtered_table AS SELECT * FROM filtered_df")
         conn.execute(
-            "CREATE INDEX IF NOT EXISTS idx_field_value_date ON filtered_table(field, value, entry_date);"
+            f"CREATE TABLE filtered_table AS SELECT * FROM read_csv_auto('{csv_path}') WHERE \"field\" = 'reference'"
+        )
+        conn.execute(
+            'CREATE INDEX IF NOT EXISTS idx_field_value_date ON filtered_table(field, value, "entry-date");'
         )
         # SQL query to identify duplicate references
         sql = """
         SELECT
             "field",
             "value",
-            "entry_date",
+            "entry-date",
             COUNT(*) AS count,
-            STRING_AGG("entry_number"::TEXT, ',') AS entry_numbers
+            STRING_AGG("entry-number"::TEXT, ',') AS entry_numbers
         FROM filtered_table
-        GROUP BY "field", "value", "entry_date"
+        GROUP BY "field", "value", "entry-date"
         HAVING COUNT(*) > 1;
         """
 

diff --git a/digital_land/command_arguments.py b/digital_land/command_arguments.py
@@ -39,7 +39,6 @@ def issue_dir(f):
 def operational_issue_dir(f):
     return click.option(
         "--operational-issue-dir",
-        "-i",
         type=click.Path(),
         default="performance/operational_issue/",
     )(f)
@@ -55,7 +54,7 @@ def column_field_dir(f):
 
 def output_log_dir(f):
     return click.option(
-        "--output-log-dir", "-i", type=click.Path(exists=True), default="log/"
+        "--output-log-dir", type=click.Path(exists=True), default="log/"
     )(f)
 
 

diff --git a/digital_land/datatype/wkt.py b/digital_land/datatype/wkt.py
@@ -3,7 +3,7 @@
 import json
 import logging
 from shapely.geometry import shape, Point
-from shapely.errors import WKTReadingError
+from shapely.errors import ShapelyError
 from shapely.ops import transform
 from shapely.geometry import MultiPolygon
 from shapely.geometry.polygon import orient
@@ -55,7 +55,7 @@ def parse_wkt(value, boundary):
     else:
         try:
             geometry = shapely.wkt.loads(value)
-        except WKTReadingError:
+        except ShapelyError:
             try:
                 geometry = shapely.wkt.loads(shape(json.loads(value)).wkt)
                 return geometry, "invalid type geojson", None
@@ -163,7 +163,7 @@ def make_multipolygon(geometry):
                 temp_polygons = make_multipolygon(geom)
                 polygons.extend(temp_polygons.geoms)
             else:
-                logging.info(f"skipping {geom.geom_type}")
+                logging.debug(f"skipping {geom.geom_type}")
         return MultiPolygon(polygons)
 
     raise ValueError(f"unexpected geometry {geometry.geom_type}")
@@ -195,7 +195,7 @@ def normalise_geometry(geometry, simplification=0.000005):
     # uses a buffer to combine overlapping polyongs inside the multipolygon
     # this is very common when simplifying a geometry collection as it's
     # usually why it's a geometry collection not a multipolygon
-    # ToDO should this be in the make_multipolygon function? Should it record an error?
+    # TODO should this be in the make_multipolygon function? Should it record an error?
     if geometry:
         if not geometry.is_valid:
             geometry = geometry.buffer(0)
@@ -220,6 +220,7 @@ def dump_wkt(geometry, precision=6, dimensions=2):
     wkt = shapely.wkt.dumps(
         geometry, rounding_precision=precision, output_dimension=dimensions
     )
+
     return wkt.replace(", ", ",")
 
 
@@ -241,7 +242,7 @@ def normalise(self, value, default="", issues=None, boundary=None):
                         "",
                     )
                     boundary = DEFAULT_BOUNDARY
-            except WKTReadingError:
+            except ShapelyError:
                 issues.log("Error reading boundary - must be a WKT", "")
                 boundary = DEFAULT_BOUNDARY
         else:

diff --git a/digital_land/phase/combine.py b/digital_land/phase/combine.py
@@ -9,6 +9,9 @@
 
 
 def combine_geometries(wkts, precision=6):
+    """
+    Combine multiple WKT geometries into a single geometry using a union operation. Requires clean geometries to be provided.
+    """
     # https://shapely.readthedocs.io/en/stable/manual.html#shapely.ops.unary_union
     geometries = [shapely.wkt.loads(x) for x in wkts]
     union = unary_union(geometries)

diff --git a/digital_land/phase/convert.py b/digital_land/phase/convert.py
@@ -1,5 +1,6 @@
 import csv
-from cchardet import UniversalDetector
+from typing import BinaryIO, Optional
+from charset_normalizer import from_fp, from_path
 import logging
 import json_stream
 import os
@@ -21,20 +22,28 @@ class ConversionError(Exception):
     pass
 
 
-def detect_file_encoding(path):
+def detect_file_encoding(path: str) -> Optional[str]:
+    """Detect the character encoding of a file on disk.
+
+    Returns the best-guess encoding name (e.g. "utf-8", "cp1252"), or None if
+    the file is empty or the encoding cannot be determined. UTF-8 BOM files are
+    returned as "utf-8-sig" so callers open them with automatic BOM stripping.
+    """
     with open(path, "rb") as f:
-        return detect_encoding(f)
-
-
-def detect_encoding(f):
-    detector = UniversalDetector()
-    detector.reset()
-    for line in f:
-        detector.feed(line)
-        if detector.done:
-            break
-    detector.close()
-    return detector.result["encoding"]
+        if f.read(3) == b"\xef\xbb\xbf":
+            return "utf-8-sig"
+    result = from_path(path).best()
+    return result.encoding if result else None
+
+
+def detect_encoding(f: BinaryIO) -> Optional[str]:
+    """Detect the character encoding of a binary file-like object.
+
+    Returns the best-guess encoding name (e.g. "utf-8", "cp1252"), or None if
+    the content is empty or the encoding cannot be determined.
+    """
+    result = from_fp(f).best()
+    return result.encoding if result else None
 
 
 def load_csv(path, encoding="UTF-8", log=None):

diff --git a/digital_land/phase/prune.py b/digital_land/phase/prune.py
@@ -51,7 +51,7 @@ def process(self, stream):
                 curie = f"{prefix}:{reference}"
                 entry_number = block["entry-number"]
 
-                logging.info(
+                logging.debug(
                     f"{resource} row {entry_number}: missing entity for {curie}"
                 )
                 logging.debug(block)

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,16 +18,16 @@ dependencies = [
     "datasette",
     "canonicaljson",
     "click",
-    "cchardet",
+    "charset-normalizer",
     "esridump",
     "pandas",
     "pyproj",
     "requests",
     "validators",
-    "xlrd==1.2.0",
+    "xlrd",
     "openpyxl",
-    "numpy<2",
-    "Shapely==2.0.2",
+    "numpy",
+    "Shapely",
     "SPARQLWrapper",
     "geojson",
     "spatialite",
@@ -36,8 +36,6 @@ dependencies = [
     "pydantic",
     "json-stream",
     "duckdb",
-    "dask",
-    "dask[dataframe]",
     "pyarrow",
     "pygit2",
     "boto3",

diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py
@@ -367,7 +367,7 @@ def test_acceptance_dataset_create(
 
         assert pq_rows > 0, f"parquet file {file.stem} is empty"
         sql_rows = cursor.execute(
-            f"SELECT COUNT(*) FROM {file.stem.replace('-','_')};"
+            f"SELECT COUNT(*) FROM {file.stem.replace('-', '_')};"
         ).fetchone()[0]
         assert sql_rows > 0, f"database table {file.stem} is empty"
         assert (

diff --git a/tests/integration/phase/test_convert.py b/tests/integration/phase/test_convert.py
@@ -2,7 +2,11 @@
 import json
 import pandas as pd
 from pathlib import Path
-from digital_land.phase.convert import ConvertPhase
+from digital_land.phase.convert import (
+    ConvertPhase,
+    detect_encoding,
+    detect_file_encoding,
+)
 
 # the convert phase ran even though the input file didn't exist might need another test
 # this is a problem because a sqlite file will be made otherwise
@@ -42,3 +46,76 @@ def test_convert_phase_process_converts_a_json_array(input_data, tmp_path: Path)
     assert len(input_df) == len(
         output_df
     ), "the number of rows in the input and output files should be the same"
+
+
+def test_detect_file_encoding_utf8_without_bom(tmp_path):
+    # Multibyte UTF-8 sequences are structurally distinctive — detection is unambiguous
+    csv_file = tmp_path / "utf8.csv"
+    payload = "reference,name\nA1,Ångström Road\nA2,Héraclès\n".encode("utf-8")
+    csv_file.write_bytes(payload)
+    detected = detect_file_encoding(str(csv_file))
+
+    assert detected is not None
+    assert "utf" in detected.lower()
+
+
+def test_detect_file_encoding_utf8_with_bom(tmp_path):
+    # A UTF-8 BOM must be reported as "utf-8-sig" so callers strip it when decoding
+    path = tmp_path / "utf8bom.csv"
+    path.write_bytes(b"\xef\xbb\xbf" + "reference,name\nA1,Café\n".encode("utf-8"))
+
+    result = detect_file_encoding(str(path))
+
+    assert result == "utf-8-sig"
+    assert path.read_bytes().decode(result) == "reference,name\nA1,Café\n"
+
+
+def test_detect_file_encoding_utf16(tmp_path):
+    # The BOM emitted by the "utf-16" codec leaves no ambiguity for the detector
+    csv_file = tmp_path / "utf16.csv"
+    payload = "reference,name\nA1,Café\n".encode("utf-16")
+    csv_file.write_bytes(payload)
+    detected = detect_file_encoding(str(csv_file))
+
+    assert detected is not None
+    assert "utf" in detected.lower()
+
+
+@pytest.mark.parametrize(
+    "encoding",
+    ["latin-1", "windows-1252"],
+)
+def test_detect_file_encoding_single_byte_returns_usable_encoding(encoding, tmp_path):
+    # latin-1 and windows-1252 (like cp1250) share most byte values, so detection
+    # is statistical — the contract is only that the result can decode the file
+    content = "reference,name\nA1,Café Street\nA2,Ångström Road\n"
+    path = tmp_path / "test.csv"
+    path.write_bytes(content.encode(encoding))
+
+    result = detect_file_encoding(str(path))
+
+    assert result is not None
+    path.read_bytes().decode(result)  # must not raise
+
+
+def test_detect_encoding_from_file_object(tmp_path):
+    # Exercise the binary-stream variant with UTF-8 input, which is unambiguous
+    csv_file = tmp_path / "utf8.csv"
+    text = "reference,name\nA1,Ångström Road\nA2,Héraclès\n"
+    csv_file.write_bytes(text.encode("utf-8"))
+
+    with csv_file.open("rb") as stream:
+        detected = detect_encoding(stream)
+
+    assert detected is not None
+    assert "utf" in detected.lower()
+
+
+def test_detect_file_encoding_empty_file_returns_a_usable_encoding(tmp_path):
+    path = tmp_path / "empty.csv"
+    path.write_bytes(b"")
+
+    result = detect_file_encoding(str(path))
+
+    # charset-normalizer reports utf_8 for empty input; None is also within the contract
+    assert result is None or "utf" in result.lower()