Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/continuous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
timeout-minutes: 60
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10']
python-version: ['3.8', '3.9', '3.10','3.11','3.12','3.13']

steps:
- uses: actions/checkout@v4
Expand Down
17 changes: 7 additions & 10 deletions digital_land/check.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,28 @@
import duckdb
import logging
import dask.dataframe as dd


# TODO This might need to move into expectations as it is a form of data checking
def duplicate_reference_check(issues=None, csv_path=None):
try:
conn = duckdb.connect()

ddf = dd.read_csv(csv_path, dtype={"entry-date": "string"})
ddf.columns = ddf.columns.str.replace("-", "_")

filtered_df = ddf[ddf["field"] == "reference"].compute() # noqa
conn.execute("CREATE TABLE filtered_table AS SELECT * FROM filtered_df")
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_field_value_date ON filtered_table(field, value, entry_date);"
f"CREATE TABLE filtered_table AS SELECT * FROM read_csv_auto('{csv_path}') WHERE \"field\" = 'reference'"
)
conn.execute(
'CREATE INDEX IF NOT EXISTS idx_field_value_date ON filtered_table(field, value, "entry-date");'
)
# SQL query to identify duplicate references
sql = """
SELECT
"field",
"value",
"entry_date",
"entry-date",
COUNT(*) AS count,
STRING_AGG("entry_number"::TEXT, ',') AS entry_numbers
STRING_AGG("entry-number"::TEXT, ',') AS entry_numbers
FROM filtered_table
GROUP BY "field", "value", "entry_date"
GROUP BY "field", "value", "entry-date"
HAVING COUNT(*) > 1;
"""

Expand Down
3 changes: 1 addition & 2 deletions digital_land/command_arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def issue_dir(f):
def operational_issue_dir(f):
return click.option(
"--operational-issue-dir",
"-i",
type=click.Path(),
default="performance/operational_issue/",
)(f)
Expand All @@ -55,7 +54,7 @@ def column_field_dir(f):

def output_log_dir(f):
return click.option(
"--output-log-dir", "-i", type=click.Path(exists=True), default="log/"
"--output-log-dir", type=click.Path(exists=True), default="log/"
)(f)


Expand Down
11 changes: 6 additions & 5 deletions digital_land/datatype/wkt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
import logging
from shapely.geometry import shape, Point
from shapely.errors import WKTReadingError
from shapely.errors import ShapelyError
from shapely.ops import transform
from shapely.geometry import MultiPolygon
from shapely.geometry.polygon import orient
Expand Down Expand Up @@ -55,7 +55,7 @@ def parse_wkt(value, boundary):
else:
try:
geometry = shapely.wkt.loads(value)
except WKTReadingError:
except ShapelyError:
try:
geometry = shapely.wkt.loads(shape(json.loads(value)).wkt)
return geometry, "invalid type geojson", None
Expand Down Expand Up @@ -163,7 +163,7 @@ def make_multipolygon(geometry):
temp_polygons = make_multipolygon(geom)
polygons.extend(temp_polygons.geoms)
else:
logging.info(f"skipping {geom.geom_type}")
logging.debug(f"skipping {geom.geom_type}")
return MultiPolygon(polygons)

raise ValueError(f"unexpected geometry {geometry.geom_type}")
Expand Down Expand Up @@ -195,7 +195,7 @@ def normalise_geometry(geometry, simplification=0.000005):
# uses a buffer to combine overlapping polygons inside the multipolygon
# this is very common when simplifying a geometry collection as it's
# usually why it's a geometry collection not a multipolygon
# ToDO should this be in the make_multipolygon function? Should it record an error?
# TODO should this be in the make_multipolygon function? Should it record an error?
if geometry:
if not geometry.is_valid:
geometry = geometry.buffer(0)
Expand All @@ -220,6 +220,7 @@ def dump_wkt(geometry, precision=6, dimensions=2):
wkt = shapely.wkt.dumps(
geometry, rounding_precision=precision, output_dimension=dimensions
)

return wkt.replace(", ", ",")


Expand All @@ -241,7 +242,7 @@ def normalise(self, value, default="", issues=None, boundary=None):
"",
)
boundary = DEFAULT_BOUNDARY
except WKTReadingError:
except ShapelyError:
issues.log("Error reading boundary - must be a WKT", "")
boundary = DEFAULT_BOUNDARY
else:
Expand Down
3 changes: 3 additions & 0 deletions digital_land/phase/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@


def combine_geometries(wkts, precision=6):
"""
Combine multiple WKT geometries into a single geometry using a union operation. Requires clean geometries to be provided.
"""
# https://shapely.readthedocs.io/en/stable/manual.html#shapely.ops.unary_union
geometries = [shapely.wkt.loads(x) for x in wkts]
union = unary_union(geometries)
Expand Down
37 changes: 23 additions & 14 deletions digital_land/phase/convert.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
from cchardet import UniversalDetector
from typing import BinaryIO, Optional
from charset_normalizer import from_fp, from_path
import logging
import json_stream
import os
Expand All @@ -21,20 +22,28 @@ class ConversionError(Exception):
pass


def detect_file_encoding(path):
def detect_file_encoding(path: str) -> Optional[str]:
"""Detect the character encoding of a file on disk.

Returns the best-guess encoding name (e.g. "utf-8", "cp1252"), or None if
the file is empty or the encoding cannot be determined. UTF-8 BOM files are
returned as "utf-8-sig" so callers open them with automatic BOM stripping.
"""
with open(path, "rb") as f:
return detect_encoding(f)


def detect_encoding(f):
detector = UniversalDetector()
detector.reset()
for line in f:
detector.feed(line)
if detector.done:
break
detector.close()
return detector.result["encoding"]
if f.read(3) == b"\xef\xbb\xbf":
return "utf-8-sig"
result = from_path(path).best()
return result.encoding if result else None


def detect_encoding(f: BinaryIO) -> Optional[str]:
"""Detect the character encoding of a binary file-like object.

Returns the best-guess encoding name (e.g. "utf-8", "cp1252"), or None if
the content is empty or the encoding cannot be determined.
"""
result = from_fp(f).best()
return result.encoding if result else None


def load_csv(path, encoding="UTF-8", log=None):
Expand Down
2 changes: 1 addition & 1 deletion digital_land/phase/prune.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def process(self, stream):
curie = f"{prefix}:{reference}"
entry_number = block["entry-number"]

logging.info(
logging.debug(
f"{resource} row {entry_number}: missing entity for {curie}"
)
logging.debug(block)
Expand Down
10 changes: 4 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@ dependencies = [
"datasette",
"canonicaljson",
"click",
"cchardet",
"charset-normalizer",
"esridump",
"pandas",
"pyproj",
"requests",
"validators",
"xlrd==1.2.0",
"xlrd",
"openpyxl",
"numpy<2",
"Shapely==2.0.2",
"numpy",
"Shapely",
"SPARQLWrapper",
"geojson",
"spatialite",
Expand All @@ -36,8 +36,6 @@ dependencies = [
"pydantic",
"json-stream",
"duckdb",
"dask",
"dask[dataframe]",
"pyarrow",
"pygit2",
"boto3",
Expand Down
2 changes: 1 addition & 1 deletion tests/acceptance/test_dataset_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,7 @@ def test_acceptance_dataset_create(

assert pq_rows > 0, f"parquet file {file.stem} is empty"
sql_rows = cursor.execute(
f"SELECT COUNT(*) FROM {file.stem.replace('-','_')};"
f"SELECT COUNT(*) FROM {file.stem.replace('-', '_')};"
).fetchone()[0]
assert sql_rows > 0, f"database table {file.stem} is empty"
assert (
Expand Down
79 changes: 78 additions & 1 deletion tests/integration/phase/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
import json
import pandas as pd
from pathlib import Path
from digital_land.phase.convert import ConvertPhase
from digital_land.phase.convert import (
ConvertPhase,
detect_encoding,
detect_file_encoding,
)

# the convert phase ran even though the input file didn't exist; this might need another test
# this is a problem because a sqlite file will be made otherwise
Expand Down Expand Up @@ -42,3 +46,76 @@ def test_convert_phase_process_converts_a_json_array(input_data, tmp_path: Path)
assert len(input_df) == len(
output_df
), "the number of rows in the input and output files should be the same"


def test_detect_file_encoding_utf8_without_bom(tmp_path):
    """A BOM-less UTF-8 file containing multibyte characters is detected as UTF."""
    # Multibyte UTF-8 sequences are structurally distinctive, so the detector
    # has no ambiguity to resolve here.
    csv_file = tmp_path / "utf8.csv"
    payload = "reference,name\nA1,Ångström Road\nA2,Héraclès\n"
    csv_file.write_bytes(payload.encode("utf-8"))

    detected = detect_file_encoding(str(csv_file))

    assert detected is not None
    assert "utf" in detected.lower()


def test_detect_file_encoding_utf8_with_bom(tmp_path):
    """A UTF-8 file that starts with a byte order mark is reported as "utf-8-sig".

    detect_file_encoding special-cases the three-byte UTF-8 BOM and returns
    "utf-8-sig" so that callers opening the file get the BOM stripped. Pinning
    the exact name (rather than accepting any "utf*" result) makes this test
    fail if that special case ever regresses to plain "utf-8".
    """
    path = tmp_path / "utf8bom.csv"
    path.write_bytes(b"\xef\xbb\xbf" + "reference,name\nA1,Café\n".encode("utf-8"))

    result = detect_file_encoding(str(path))

    assert result == "utf-8-sig"


def test_detect_file_encoding_utf16(tmp_path):
    """A UTF-16 file (written with its BOM) is detected as a UTF encoding."""
    # Python's "utf-16" codec prepends a BOM, which removes any ambiguity.
    csv_file = tmp_path / "utf16.csv"
    payload = "reference,name\nA1,Café\n"
    csv_file.write_bytes(payload.encode("utf-16"))

    detected = detect_file_encoding(str(csv_file))

    assert detected is not None
    assert "utf" in detected.lower()


@pytest.mark.parametrize("encoding", ["latin-1", "windows-1252"])
def test_detect_file_encoding_single_byte_returns_usable_encoding(encoding, tmp_path):
    """Whatever encoding is detected for a single-byte file must decode that file."""
    # Single-byte code pages such as latin-1, cp1252 and cp1250 overlap on most
    # byte values, so detection is statistical. The test therefore checks only
    # that the reported encoding can decode the bytes, not which name is chosen.
    csv_file = tmp_path / "test.csv"
    text = "reference,name\nA1,Café Street\nA2,Ångström Road\n"
    csv_file.write_bytes(text.encode(encoding))

    detected = detect_file_encoding(str(csv_file))

    assert detected is not None
    # Raises if the detected encoding cannot decode the file contents.
    csv_file.read_bytes().decode(detected)


def test_detect_encoding_from_file_object(tmp_path):
    """detect_encoding accepts an open binary file object and detects UTF-8."""
    # UTF-8 with multibyte characters is used because its detection is unambiguous.
    csv_file = tmp_path / "utf8.csv"
    text = "reference,name\nA1,Ångström Road\nA2,Héraclès\n"
    csv_file.write_bytes(text.encode("utf-8"))

    with open(csv_file, "rb") as handle:
        detected = detect_encoding(handle)

    assert detected is not None
    assert "utf" in detected.lower()


def test_detect_file_encoding_empty_file_returns_a_usable_encoding(tmp_path):
    """An empty file yields either None or a UTF encoding name."""
    empty_file = tmp_path / "empty.csv"
    empty_file.write_bytes(b"")

    detected = detect_file_encoding(str(empty_file))

    # charset-normalizer returns utf_8 for empty files; callers must handle None too
    if detected is not None:
        assert "utf" in detected.lower()
Loading
Loading