From 3f0765984eba39e54541d2dc23a5c03bbda1bae0 Mon Sep 17 00:00:00 2001 From: eveleighoj <35256612+eveleighoj@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:20:52 +0000 Subject: [PATCH 1/3] correct how arcgis status is reflected --- digital_land/commands.py | 19 +++++++++++-------- digital_land/plugins/arcgis.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/digital_land/commands.py b/digital_land/commands.py index 463f0f452..9c819d87b 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -19,7 +19,7 @@ from digital_land.package.organisation import OrganisationPackage from digital_land.check import duplicate_reference_check from digital_land.specification import Specification -from digital_land.collect import Collector +from digital_land.collect import Collector, FetchStatus from digital_land.collection import Collection, resource_path from digital_land.log import ( DatasetResourceLog, @@ -954,7 +954,7 @@ def validate_and_add_data_input( ) endpoint_resource_info = {} for endpoint in endpoints: - status, log = collector.fetch( + fetch_status, log = collector.fetch( url=endpoint["endpoint-url"], endpoint=endpoint["endpoint"], end_date=endpoint["end-date"], @@ -972,13 +972,16 @@ def validate_and_add_data_input( f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" ) break - - status = log.get("status", None) + log_status = log.get("status", None) + exception = log.get("exception", None) + if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: + raise HTTPError( + f"Failed to collect from URL. 
fetch status: {fetch_status}, log status: {log_status}, exception: {exception}" + ) # Raise exception if status is not 200 - if not status or status != "200": - exception = log.get("exception", None) + if not log_status or log_status != "200": raise HTTPError( - f"Failed to collect from URL with status: {status if status else exception}" + f"Failed to collect from URL with status: {log_status if log_status else exception}" ) # Resource and path will only be printed if downloaded successfully but should only happen if status is 200 @@ -994,7 +997,7 @@ def validate_and_add_data_input( resource_path, ) - print(f"Log Status for {endpoint['endpoint']}: The status is {status}") + print(f"Log Status for {endpoint['endpoint']}: The status is {log_status}") endpoint_resource_info.update( { "endpoint": endpoint["endpoint"], diff --git a/digital_land/plugins/arcgis.py b/digital_land/plugins/arcgis.py index 09a614b19..5f64efb59 100644 --- a/digital_land/plugins/arcgis.py +++ b/digital_land/plugins/arcgis.py @@ -11,7 +11,7 @@ def get(collector, url, log={}, plugin="arcgis"): response = dumper._request("GET", url) dumper.get_metadata() - log["status"] = str(response.status_code) + response_status = str(response.status_code) content = '{"type":"FeatureCollection","features":[' sep = "\n" @@ -23,6 +23,7 @@ def get(collector, url, log={}, plugin="arcgis"): content += "]}" content = str.encode(content) + log["status"] = response_status except Exception as exception: logging.warning(exception) From 6a84c76f6ec3289efb333b922615abcfd9f382ac Mon Sep 17 00:00:00 2001 From: jandums Date: Thu, 26 Mar 2026 18:33:39 +0000 Subject: [PATCH 2/3] w.i.p --- digital_land/commands.py | 2 + tests/unit/test_commands.py | 342 ++++++++++++++++++++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 tests/unit/test_commands.py diff --git a/digital_land/commands.py b/digital_land/commands.py index 5700a9d55..6773b1a5c 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py 
@@ -859,6 +859,7 @@ def validate_and_add_data_input( plugin=endpoint["plugin"], refill_todays_logs=True, ) + ''' try: # log is already returned from fetch, but read from file if needed for verification log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) @@ -870,6 +871,7 @@ def validate_and_add_data_input( f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" ) break + ''' log_status = log.get("status", None) exception = log.get("exception", None) if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: diff --git a/tests/unit/test_commands.py b/tests/unit/test_commands.py new file mode 100644 index 000000000..a2b07711f --- /dev/null +++ b/tests/unit/test_commands.py @@ -0,0 +1,342 @@ +def test_validate_and_add_data_input_error_thrown_whhen_no_resource_downloaded( + csv_file_path, collection_name, collection_dir, specification_dir, organisation_path +): + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + specification = Specification(specification_dir) + organisation = Organisation(organisation_path=organisation_path) + + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + # ===== FIRST VALIDATION BASED ON IMPORT.CSV INFO + # - Check licence, url, date, organisation + + # read and process each record of the new endpoints csv at csv_file_path i.e import.csv + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns in input .csv + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + for row in reader: + # validate licence + if row["licence"] == "": + raise ValueError("Licence is blank") + elif not specification.licence.get(row["licence"], None): + raise ValueError( + f"Licence '{row['licence']}' 
is not a valid licence according to the specification." + ) + # check if urls are not blank and valid urls + is_endpoint_valid, endpoint_valid_error = is_url_valid( + row["endpoint-url"], "endpoint_url" + ) + is_documentation_valid, documentation_valid_error = is_url_valid( + row["documentation-url"], "documentation_url" + ) + if not is_endpoint_valid or not is_documentation_valid: + raise ValueError( + f"{endpoint_valid_error} \n {documentation_valid_error}" + ) + + # if there is no start-date, do we want to populate it with today's date? + if row["start-date"]: + valid_date, error = is_date_valid(row["start-date"], "start-date") + if not valid_date: + raise ValueError(error) + + # validate organisation + if row["organisation"] == "": + raise ValueError("The organisation must not be blank") + elif not organisation.lookup(row["organisation"]): + raise ValueError( + f"The given organisation '{row['organisation']}' is not in our valid organisations" + ) + + # validate pipeline(s) - do they exist and are they in the collection + pipelines = row["pipelines"].split(";") + for pipeline in pipelines: + if not specification.dataset.get(pipeline, None): + raise ValueError( + f"'{pipeline}' is not a valid dataset in the specification" + ) + collection_in_specification = specification.dataset.get( + pipeline, None + ).get("collection") + if collection_name != collection_in_specification: + raise ValueError( + f"'{pipeline}' does not belong to provided collection {collection_name}" + ) + + # VALIDATION DONE, NOW ADD TO COLLECTION + print("======================================================================") + print("Endpoint and source details") + print("======================================================================") + print("Endpoint URL: ", row["endpoint-url"]) + print("Endpoint Hash:", hash_value(row["endpoint-url"])) + print("Documentation URL: ", row["documentation-url"]) + print() + + endpoints = [] + # if endpoint already exists, it will indicate it and quit 
function here + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + else: + # We rely on the add_source_endpoint function to log why it couldn't be added + raise Exception( + "Endpoint and source could not be added - is this a duplicate endpoint?" + ) + + # if successfully added we can now attempt to fetch from endpoint + collector = Collector( + resource_dir=str(Path(collection_dir) / "resource"), + log_dir=str(Path(collection_dir) / "log"), + ) + endpoint_resource_info = {} + for endpoint in endpoints: + fetch_status, log = collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + refill_todays_logs=True, + ) + try: + # log is already returned from fetch, but read from file if needed for verification + log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) + if os.path.isfile(log_path): + with open(log_path, "r") as f: + log = json.load(f) + except Exception as e: + print( + f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" + ) + break + log_status = log.get("status", None) + exception = log.get("exception", None) + if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: + raise HTTPError( + f"Failed to collect from URL. 
fetch status: {fetch_status}, log status: {log_status}, exception: {exception}" + ) + # Raise exception if status is not 200 + if not log_status or log_status != "200": + raise HTTPError( + f"Failed to collect from URL with status: {log_status if log_status else exception}" + ) + + # Resource and path will only be printed if downloaded successfully but should only happen if status is 200 + resource = log.get("resource", None) + if resource: + resource_path = Path(collection_dir) / "resource" / resource + print( + "Resource collected: ", + resource, + ) + print( + "Resource Path is: ", + resource_path, + ) + + print(f"Log Status for {endpoint['endpoint']}: The status is {log_status}") + endpoint_resource_info.update( + { + "endpoint": endpoint["endpoint"], + "resource": log.get("resource"), + "resource_path": resource_path, + "pipelines": row["pipelines"].split(";"), + "organisation": row["organisation"], + "entry-date": row["entry-date"], + } + ) + + return collection, endpoint_resource_info + + + + +def validate_and_add_data_input_non_200( + csv_file_path, collection_name, collection_dir, specification_dir, organisation_path +): + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + specification = Specification(specification_dir) + organisation = Organisation(organisation_path=organisation_path) + + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + # ===== FIRST VALIDATION BASED ON IMPORT.CSV INFO + # - Check licence, url, date, organisation + + # read and process each record of the new endpoints csv at csv_file_path i.e import.csv + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns in input .csv + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") 
+ + for row in reader: + # validate licence + if row["licence"] == "": + raise ValueError("Licence is blank") + elif not specification.licence.get(row["licence"], None): + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + # check if urls are not blank and valid urls + is_endpoint_valid, endpoint_valid_error = is_url_valid( + row["endpoint-url"], "endpoint_url" + ) + is_documentation_valid, documentation_valid_error = is_url_valid( + row["documentation-url"], "documentation_url" + ) + if not is_endpoint_valid or not is_documentation_valid: + raise ValueError( + f"{endpoint_valid_error} \n {documentation_valid_error}" + ) + + # if there is no start-date, do we want to populate it with today's date? + if row["start-date"]: + valid_date, error = is_date_valid(row["start-date"], "start-date") + if not valid_date: + raise ValueError(error) + + # validate organisation + if row["organisation"] == "": + raise ValueError("The organisation must not be blank") + elif not organisation.lookup(row["organisation"]): + raise ValueError( + f"The given organisation '{row['organisation']}' is not in our valid organisations" + ) + + # validate pipeline(s) - do they exist and are they in the collection + pipelines = row["pipelines"].split(";") + for pipeline in pipelines: + if not specification.dataset.get(pipeline, None): + raise ValueError( + f"'{pipeline}' is not a valid dataset in the specification" + ) + collection_in_specification = specification.dataset.get( + pipeline, None + ).get("collection") + if collection_name != collection_in_specification: + raise ValueError( + f"'{pipeline}' does not belong to provided collection {collection_name}" + ) + + # VALIDATION DONE, NOW ADD TO COLLECTION + print("======================================================================") + print("Endpoint and source details") + print("======================================================================") + print("Endpoint URL: ", 
row["endpoint-url"]) + print("Endpoint Hash:", hash_value(row["endpoint-url"])) + print("Documentation URL: ", row["documentation-url"]) + print() + + endpoints = [] + # if endpoint already exists, it will indicate it and quit function here + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + else: + # We rely on the add_source_endpoint function to log why it couldn't be added + raise Exception( + "Endpoint and source could not be added - is this a duplicate endpoint?" + ) + + # if successfully added we can now attempt to fetch from endpoint + collector = Collector( + resource_dir=str(Path(collection_dir) / "resource"), + log_dir=str(Path(collection_dir) / "log"), + ) + endpoint_resource_info = {} + for endpoint in endpoints: + fetch_status, log = collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + refill_todays_logs=True, + ) + try: + # log is already returned from fetch, but read from file if needed for verification + log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) + if os.path.isfile(log_path): + with open(log_path, "r") as f: + log = json.load(f) + except Exception as e: + print( + f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" + ) + break + log_status = log.get("status", None) + exception = log.get("exception", None) + if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: + raise HTTPError( + f"Failed to collect from URL. 
fetch status: {fetch_status}, log status: {log_status}, exception: {exception}" + ) + # Raise exception if status is not 200 + if not log_status or log_status != "200": + raise HTTPError( + f"Failed to collect from URL with status: {log_status if log_status else exception}" + ) + + # Resource and path will only be printed if downloaded successfully but should only happen if status is 200 + resource = log.get("resource", None) + if resource: + resource_path = Path(collection_dir) / "resource" / resource + print( + "Resource collected: ", + resource, + ) + print( + "Resource Path is: ", + resource_path, + ) + + print(f"Log Status for {endpoint['endpoint']}: The status is {log_status}") + endpoint_resource_info.update( + { + "endpoint": endpoint["endpoint"], + "resource": log.get("resource"), + "resource_path": resource_path, + "pipelines": row["pipelines"].split(";"), + "organisation": row["organisation"], + "entry-date": row["entry-date"], + } + ) + + return collection, endpoint_resource_info \ No newline at end of file From 67cd393ce386bd0c1506ae6b906586be57cb55b8 Mon Sep 17 00:00:00 2001 From: jandums Date: Tue, 7 Apr 2026 16:24:35 +0100 Subject: [PATCH 3/3] W.I.P3 --- Makefile | 4 +- digital_land/commands.py | 32 +-- tests/unit/test_commands.py | 485 +++++++++++++----------------------- 3 files changed, 191 insertions(+), 330 deletions(-) diff --git a/Makefile b/Makefile index 22037acb8..0479332b3 100644 --- a/Makefile +++ b/Makefile @@ -35,9 +35,9 @@ endif endif ifndef SPATIAL ifeq ($(UNAME),Darwin) - $(error GDAL tools not found in PATH) +# $(error GDAL tools not found in PATH) endif - sudo apt-get install libsqlite3-mod-spatialite -y +# sudo apt-get install libsqlite3-mod-spatialite -y endif pyproj sync --file uk_os_OSTN15_NTv2_OSGBtoETRS.tif -v # install pre-commits diff --git a/digital_land/commands.py b/digital_land/commands.py index 6773b1a5c..683749da3 100644 --- a/digital_land/commands.py +++ b/digital_land/commands.py @@ -9,7 +9,8 @@ from 
packaging.version import Version import pandas as pd from pathlib import Path -from datetime import datetime + +# from datetime import datetime from distutils.dir_util import copy_tree import geojson from requests import HTTPError @@ -859,19 +860,19 @@ def validate_and_add_data_input( plugin=endpoint["plugin"], refill_todays_logs=True, ) - ''' - try: - # log is already returned from fetch, but read from file if needed for verification - log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) - if os.path.isfile(log_path): - with open(log_path, "r") as f: - log = json.load(f) - except Exception as e: - print( - f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" - ) - break - ''' + + # try: + # # log is already returned from fetch, but read from file if needed for verification + # log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) + # if os.path.isfile(log_path): + # with open(log_path, "r") as f: + # log = json.load(f) + # except Exception as e: + # print( + # f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" + # ) + # break + log_status = log.get("status", None) exception = log.get("exception", None) if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: @@ -907,6 +908,9 @@ def validate_and_add_data_input( "organisation": row["organisation"], "entry-date": row["entry-date"], } + # elif: + # raise Error( + # f"No resource avaible: {log_status if log_status else exception}" ) return collection, endpoint_resource_info diff --git a/tests/unit/test_commands.py b/tests/unit/test_commands.py index a2b07711f..a938e8405 100644 --- a/tests/unit/test_commands.py +++ b/tests/unit/test_commands.py @@ -1,342 +1,199 @@ -def test_validate_and_add_data_input_error_thrown_whhen_no_resource_downloaded( - csv_file_path, collection_name, collection_dir, specification_dir, organisation_path -): - expected_cols = [ - "pipelines", - "organisation", - "documentation-url", - 
"endpoint-url", - "start-date", - "licence", - ] - - specification = Specification(specification_dir) - organisation = Organisation(organisation_path=organisation_path) +import csv +import logging +import os +import tempfile +from unittest.mock import Mock +import pytest +from requests import HTTPError - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - # ===== FIRST VALIDATION BASED ON IMPORT.CSV INFO - # - Check licence, url, date, organisation +from digital_land.commands import validate_and_add_data_input +from tests.acceptance.conftest import copy_latest_specification_files_to - # read and process each record of the new endpoints csv at csv_file_path i.e import.csv +# import from digital_land validate_and_add_data_input_error_thrown_when_no_resource_downloaded - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - # validate the columns in input .csv - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise Exception(f"required column ({expected_col}) not found in csv") +@pytest.fixture(scope="module") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + copy_latest_specification_files_to(specification_dir) + return specification_dir - for row in reader: - # validate licence - if row["licence"] == "": - raise ValueError("Licence is blank") - elif not specification.licence.get(row["licence"], None): - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." 
- ) - # check if urls are not blank and valid urls - is_endpoint_valid, endpoint_valid_error = is_url_valid( - row["endpoint-url"], "endpoint_url" - ) - is_documentation_valid, documentation_valid_error = is_url_valid( - row["documentation-url"], "documentation_url" - ) - if not is_endpoint_valid or not is_documentation_valid: - raise ValueError( - f"{endpoint_valid_error} \n {documentation_valid_error}" - ) - # if there is no start-date, do we want to populate it with today's date? - if row["start-date"]: - valid_date, error = is_date_valid(row["start-date"], "start-date") - if not valid_date: - raise ValueError(error) +@pytest.fixture(scope="function") +def collection_dir(tmp_path_factory): + collection_dir = tmp_path_factory.mktemp("collection") - # validate organisation - if row["organisation"] == "": - raise ValueError("The organisation must not be blank") - elif not organisation.lookup(row["organisation"]): - raise ValueError( - f"The given organisation '{row['organisation']}' is not in our valid organisations" - ) - - # validate pipeline(s) - do they exist and are they in the collection - pipelines = row["pipelines"].split(";") - for pipeline in pipelines: - if not specification.dataset.get(pipeline, None): - raise ValueError( - f"'{pipeline}' is not a valid dataset in the specification" - ) - collection_in_specification = specification.dataset.get( - pipeline, None - ).get("collection") - if collection_name != collection_in_specification: - raise ValueError( - f"'{pipeline}' does not belong to provided collection {collection_name}" - ) + # create source csv + source_fieldnames = [ + "attribution", + "collection", + "documentation-url", + "endpoint", + "licence", + "organisation", + "pipelines", + "entry-date", + "start-date", + "end-date", + ] - # VALIDATION DONE, NOW ADD TO COLLECTION - print("======================================================================") - print("Endpoint and source details") - 
print("======================================================================") - print("Endpoint URL: ", row["endpoint-url"]) - print("Endpoint Hash:", hash_value(row["endpoint-url"])) - print("Documentation URL: ", row["documentation-url"]) - print() + with open(os.path.join(collection_dir, "source.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=source_fieldnames) + dictwriter.writeheader() - endpoints = [] - # if endpoint already exists, it will indicate it and quit function here - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - else: - # We rely on the add_source_endpoint function to log why it couldn't be added - raise Exception( - "Endpoint and source could not be added - is this a duplicate endpoint?" - ) + # create endpoint csv + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] - # if successfully added we can now attempt to fetch from endpoint - collector = Collector( - resource_dir=str(Path(collection_dir) / "resource"), - log_dir=str(Path(collection_dir) / "log"), + with open(os.path.join(collection_dir, "endpoint.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + dictwriter.writeheader() + return collection_dir + + +@pytest.fixture(scope="module") +def organisation_csv(): + organisation_path = tempfile.NamedTemporaryFile().name + organisation_fieldnames = [ + "dataset", + "end-date", + "entity", + "entry-date", + "name", + "organisation", + "prefix", + "reference", + "start-date", + ] + organisation_row = { + "dataset": "local-authority", + "end-date": "", + "entity": 314, + "entry-date": "2023-11-19", + "name": "South Staffordshire Council", + "organisation": "local-authority:SST", + "prefix": 
"local-authority", + "reference": "SST", + "start-date": "", + } + + with open(organisation_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=organisation_fieldnames) + writer.writeheader() + writer.writerow(organisation_row) + + return organisation_path + + +@pytest.fixture +def mock_request_get(mocker): + data = {"reference": "1", "value": "test"} + csv_content = str(data).encode("utf-8") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = csv_content + mocker.patch( + "requests.Session.get", + return_value=mock_response, ) - endpoint_resource_info = {} - for endpoint in endpoints: - fetch_status, log = collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - refill_todays_logs=True, - ) - try: - # log is already returned from fetch, but read from file if needed for verification - log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) - if os.path.isfile(log_path): - with open(log_path, "r") as f: - log = json.load(f) - except Exception as e: - print( - f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" - ) - break - log_status = log.get("status", None) - exception = log.get("exception", None) - if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: - raise HTTPError( - f"Failed to collect from URL. 
fetch status: {fetch_status}, log status: {log_status}, exception: {exception}" - ) - # Raise exception if status is not 200 - if not log_status or log_status != "200": - raise HTTPError( - f"Failed to collect from URL with status: {log_status if log_status else exception}" - ) - # Resource and path will only be printed if downloaded successfully but should only happen if status is 200 - resource = log.get("resource", None) - if resource: - resource_path = Path(collection_dir) / "resource" / resource - print( - "Resource collected: ", - resource, - ) - print( - "Resource Path is: ", - resource_path, - ) - print(f"Log Status for {endpoint['endpoint']}: The status is {log_status}") - endpoint_resource_info.update( - { - "endpoint": endpoint["endpoint"], - "resource": log.get("resource"), - "resource_path": resource_path, - "pipelines": row["pipelines"].split(";"), - "organisation": row["organisation"], - "entry-date": row["entry-date"], - } - ) - - return collection, endpoint_resource_info - - - - -def validate_and_add_data_input_non_200( - csv_file_path, collection_name, collection_dir, specification_dir, organisation_path -): - expected_cols = [ - "pipelines", +def create_input_csv( + data, + fieldnames=[ "organisation", "documentation-url", "endpoint-url", "start-date", + "pipelines", + "plugin", "licence", - ] - - specification = Specification(specification_dir) - organisation = Organisation(organisation_path=organisation_path) - - collection = Collection(name=collection_name, directory=collection_dir) - collection.load() - # ===== FIRST VALIDATION BASED ON IMPORT.CSV INFO - # - Check licence, url, date, organisation - - # read and process each record of the new endpoints csv at csv_file_path i.e import.csv - - with open(csv_file_path) as new_endpoints_file: - reader = csv.DictReader(new_endpoints_file) - csv_columns = reader.fieldnames - - # validate the columns in input .csv - for expected_col in expected_cols: - if expected_col not in csv_columns: - raise 
Exception(f"required column ({expected_col}) not found in csv") - - for row in reader: - # validate licence - if row["licence"] == "": - raise ValueError("Licence is blank") - elif not specification.licence.get(row["licence"], None): - raise ValueError( - f"Licence '{row['licence']}' is not a valid licence according to the specification." - ) - # check if urls are not blank and valid urls - is_endpoint_valid, endpoint_valid_error = is_url_valid( - row["endpoint-url"], "endpoint_url" - ) - is_documentation_valid, documentation_valid_error = is_url_valid( - row["documentation-url"], "documentation_url" - ) - if not is_endpoint_valid or not is_documentation_valid: - raise ValueError( - f"{endpoint_valid_error} \n {documentation_valid_error}" - ) - - # if there is no start-date, do we want to populate it with today's date? - if row["start-date"]: - valid_date, error = is_date_valid(row["start-date"], "start-date") - if not valid_date: - raise ValueError(error) + ], +): + tmp_input_path = tempfile.NamedTemporaryFile().name - # validate organisation - if row["organisation"] == "": - raise ValueError("The organisation must not be blank") - elif not organisation.lookup(row["organisation"]): - raise ValueError( - f"The given organisation '{row['organisation']}' is not in our valid organisations" - ) + with open(tmp_input_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(data) - # validate pipeline(s) - do they exist and are they in the collection - pipelines = row["pipelines"].split(";") - for pipeline in pipelines: - if not specification.dataset.get(pipeline, None): - raise ValueError( - f"'{pipeline}' is not a valid dataset in the specification" - ) - collection_in_specification = specification.dataset.get( - pipeline, None - ).get("collection") - if collection_name != collection_in_specification: - raise ValueError( - f"'{pipeline}' does not belong to provided collection {collection_name}" - ) + return 
tmp_input_path - # VALIDATION DONE, NOW ADD TO COLLECTION - print("======================================================================") - print("Endpoint and source details") - print("======================================================================") - print("Endpoint URL: ", row["endpoint-url"]) - print("Endpoint Hash:", hash_value(row["endpoint-url"])) - print("Documentation URL: ", row["documentation-url"]) - print() - endpoints = [] - # if endpoint already exists, it will indicate it and quit function here - if collection.add_source_endpoint(row): - endpoint = { - "endpoint-url": row["endpoint-url"], - "endpoint": hash_value(row["endpoint-url"]), - "end-date": row.get("end-date", ""), - "plugin": row.get("plugin"), - "licence": row["licence"], - } - endpoints.append(endpoint) - else: - # We rely on the add_source_endpoint function to log why it couldn't be added - raise Exception( - "Endpoint and source could not be added - is this a duplicate endpoint?" +def test_validate_and_add_data_input_no_error( + collection_dir, + specification_dir, + organisation_csv, + caplog, + mock_request_get, +): + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with caplog.at_level(logging.ERROR): + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, ) + assert len(caplog.text) == 0 - # if successfully added we can now attempt to fetch from endpoint - collector = Collector( - resource_dir=str(Path(collection_dir) / "resource"), - 
log_dir=str(Path(collection_dir) / "log"), - ) - endpoint_resource_info = {} - for endpoint in endpoints: - fetch_status, log = collector.fetch( - url=endpoint["endpoint-url"], - endpoint=endpoint["endpoint"], - end_date=endpoint["end-date"], - plugin=endpoint["plugin"], - refill_todays_logs=True, - ) - try: - # log is already returned from fetch, but read from file if needed for verification - log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) - if os.path.isfile(log_path): - with open(log_path, "r") as f: - log = json.load(f) - except Exception as e: - print( - f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" - ) - break - log_status = log.get("status", None) - exception = log.get("exception", None) - if fetch_status not in [FetchStatus.OK, FetchStatus.ALREADY_FETCHED]: - raise HTTPError( - f"Failed to collect from URL. fetch status: {fetch_status}, log status: {log_status}, exception: {exception}" - ) - # Raise exception if status is not 200 - if not log_status or log_status != "200": - raise HTTPError( - f"Failed to collect from URL with status: {log_status if log_status else exception}" - ) - # Resource and path will only be printed if downloaded successfully but should only happen if status is 200 - resource = log.get("resource", None) - if resource: - resource_path = Path(collection_dir) / "resource" / resource - print( - "Resource collected: ", - resource, - ) - print( - "Resource Path is: ", - resource_path, - ) +def test_validate_and_add_data_input_error_thrown_when_no_resource_downloaded( + collection_dir, specification_dir, organisation_csv, mocker +): - print(f"Log Status for {endpoint['endpoint']}: The status is {log_status}") - endpoint_resource_info.update( - { - "endpoint": endpoint["endpoint"], - "resource": log.get("resource"), - "resource_path": resource_path, - "pipelines": row["pipelines"].split(";"), - "organisation": row["organisation"], - "entry-date": row["entry-date"], - } + 
mock_response = Mock()
+    mock_response.status_code = 200
+    mock_response.request.headers = {"test": "test"}
+    mock_response.headers = {"test": "test"}
+    mock_response.content = ""
+    mocker.patch(
+        "requests.Session.get",
+        return_value=mock_response,
+    )
+    collection_name = "conservation-area"
+    no_error_input_data = {
+        "organisation": "local-authority:SST",
+        "documentation-url": "https://www.westoxon.gov.uk/planning-and-building/digital-planning-data/",
+        "endpoint-url": "https://services5.arcgis.com/z8GJkxrWic0alJoM/arcgis/rest/services/WODC_Conservation_Areas_WGS/FeatureServer",
+        "start-date": "",
+        "pipelines": "conservation-area",
+        "plugin": "",
+        "licence": "ogl3",
+    }
+
+    tmp_input_path = create_input_csv(no_error_input_data)
+
+    with pytest.raises(HTTPError) as error:
+        validate_and_add_data_input(
+            tmp_input_path,
+            collection_name,
+            collection_dir,
+            specification_dir,
+            organisation_csv,
+        )
+    assert "Failed to collect from URL" in str(error)