diff --git a/Makefile b/Makefile index 22037acb8..0479332b3 100644 --- a/Makefile +++ b/Makefile @@ -35,9 +35,9 @@ endif endif ifndef SPATIAL ifeq ($(UNAME),Darwin) - $(error GDAL tools not found in PATH) +# $(error GDAL tools not found in PATH) endif - sudo apt-get install libsqlite3-mod-spatialite -y +# sudo apt-get install libsqlite3-mod-spatialite -y endif pyproj sync --file uk_os_OSTN15_NTv2_OSGBtoETRS.tif -v # install pre-commits diff --git a/digital_land/commands.py b/digital_land/commands.py index 37166354a..683749da3 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -9,7 +9,8 @@ from packaging.version import Version import pandas as pd from pathlib import Path -from datetime import datetime + +# from datetime import datetime from distutils.dir_util import copy_tree import geojson from requests import HTTPError @@ -18,7 +19,7 @@ from digital_land.package.organisation import OrganisationPackage from digital_land.specification import Specification -from digital_land.collect import Collector +from digital_land.collect import Collector, FetchStatus from digital_land.collection import Collection, resource_path from digital_land.log import ( DatasetResourceLog, @@ -852,31 +853,36 @@ def validate_and_add_data_input( ) endpoint_resource_info = {} for endpoint in endpoints: - status, log = collector.fetch( + fetch_status, log = collector.fetch( url=endpoint["endpoint-url"], endpoint=endpoint["endpoint"], end_date=endpoint["end-date"], plugin=endpoint["plugin"], refill_todays_logs=True, ) - try: - # log is already returned from fetch, but read from file if needed for verification - log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) - if os.path.isfile(log_path): - with open(log_path, "r") as f: - log = json.load(f) - except Exception as e: - print( - f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" - ) - break - status = log.get("status", None) + # try: + # # log is already returned from 
fetch, but read from file if needed for verification + # log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) + # if os.path.isfile(log_path): + # with open(log_path, "r") as f: + # log = json.load(f) + # except Exception as e: + # print( + # f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" + # ) + # break + + log_status = log.get("status", None) + exception = log.get("exception", None) + if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: + raise HTTPError( + f"Failed to collect from URL. fetch status: {fetch_status}, log status: {log_status}, exception: {exception}" + ) # Raise exception if status is not 200 - if not status or status != "200": - exception = log.get("exception", None) + if not log_status or log_status != "200": raise HTTPError( - f"Failed to collect from URL with status: {status if status else exception}" + f"Failed to collect from URL with status: {log_status if log_status else exception}" ) # Resource and path will only be printed if downloaded successfully but should only happen if status is 200 @@ -892,7 +898,7 @@ def validate_and_add_data_input( resource_path, ) - print(f"Log Status for {endpoint['endpoint']}: The status is {status}") + print(f"Log Status for {endpoint['endpoint']}: The status is {log_status}") endpoint_resource_info.update( { "endpoint": endpoint["endpoint"], @@ -902,6 +908,9 @@ def validate_and_add_data_input( "organisation": row["organisation"], "entry-date": row["entry-date"], } + # elif: + # raise Error( + # f"No resource avaible: {log_status if log_status else exception}" ) return collection, endpoint_resource_info diff --git a/digital_land/plugins/arcgis.py b/digital_land/plugins/arcgis.py index 09a614b19..5f64efb59 100644 --- a/digital_land/plugins/arcgis.py +++ b/digital_land/plugins/arcgis.py @@ -11,7 +11,7 @@ def get(collector, url, log={}, plugin="arcgis"): response = dumper._request("GET", url) dumper.get_metadata() - log["status"] = 
str(response.status_code) + response_status = str(response.status_code) content = '{"type":"FeatureCollection","features":[' sep = "\n" @@ -23,6 +23,7 @@ def get(collector, url, log={}, plugin="arcgis"): content += "]}" content = str.encode(content) + log["status"] = response_status except Exception as exception: logging.warning(exception) diff --git a/tests/unit/test_commands.py b/tests/unit/test_commands.py new file mode 100644 index 000000000..a938e8405 --- /dev/null +++ b/tests/unit/test_commands.py @@ -0,0 +1,199 @@ +import csv +import logging +import os +import tempfile +from unittest.mock import Mock +import pytest +from requests import HTTPError + +from digital_land.commands import validate_and_add_data_input +from tests.acceptance.conftest import copy_latest_specification_files_to + +# import from digital_land validate_and_add_data_input_error_thrown_when_no_resource_downloaded + + +@pytest.fixture(scope="module") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + copy_latest_specification_files_to(specification_dir) + return specification_dir + + +@pytest.fixture(scope="function") +def collection_dir(tmp_path_factory): + collection_dir = tmp_path_factory.mktemp("collection") + + # create source csv + source_fieldnames = [ + "attribution", + "collection", + "documentation-url", + "endpoint", + "licence", + "organisation", + "pipelines", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "source.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=source_fieldnames) + dictwriter.writeheader() + + # create endpoint csv + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "endpoint.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + dictwriter.writeheader() + return collection_dir + + 
+@pytest.fixture(scope="module") +def organisation_csv(): + organisation_path = tempfile.NamedTemporaryFile().name + organisation_fieldnames = [ + "dataset", + "end-date", + "entity", + "entry-date", + "name", + "organisation", + "prefix", + "reference", + "start-date", + ] + organisation_row = { + "dataset": "local-authority", + "end-date": "", + "entity": 314, + "entry-date": "2023-11-19", + "name": "South Staffordshire Council", + "organisation": "local-authority:SST", + "prefix": "local-authority", + "reference": "SST", + "start-date": "", + } + + with open(organisation_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=organisation_fieldnames) + writer.writeheader() + writer.writerow(organisation_row) + + return organisation_path + + +@pytest.fixture +def mock_request_get(mocker): + data = {"reference": "1", "value": "test"} + csv_content = str(data).encode("utf-8") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = csv_content + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + + +def create_input_csv( + data, + fieldnames=[ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + "licence", + ], +): + tmp_input_path = tempfile.NamedTemporaryFile().name + + with open(tmp_input_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(data) + + return tmp_input_path + + +def test_validate_and_add_data_input_no_error( + collection_dir, + specification_dir, + organisation_csv, + caplog, + mock_request_get, +): + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": 
"https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with caplog.at_level(logging.ERROR): + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert len(caplog.text) == 0 + + +def test_validate_and_add_data_input_error_thrown_when_no_resource_downloaded( + collection_dir, specification_dir, organisation_csv, mocker +): + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = "" + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.westoxon.gov.uk/planning-and-building/digital-planning-data/", + "endpoint-url": "https://services5.arcgis.com/z8GJkxrWic0alJoM/arcgis/rest/services/WODC_Conservation_Areas_WGS/FeatureServer", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with pytest.raises(HTTPError) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + + assert "Failed to collect from URL" in str(error.value)