From 554d759f7231970bad4fed68f31982861f956d95 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Fri, 23 Jan 2026 17:55:53 +0000
Subject: [PATCH 01/12] add data now uses pipeline transfrom and assign entries
 to detect new entries

---
 request-processor/makerules/makerules.mk      |   3 +-
 .../src/application/configurations/config.py  |   2 +-
 .../src/application/core/pipeline.py          | 276 ++++++++----------
 .../src/application/core/workflow.py          |  53 ++--
 4 files changed, 163 insertions(+), 171 deletions(-)

diff --git a/request-processor/makerules/makerules.mk b/request-processor/makerules/makerules.mk
index c2868586..16a486d2 100644
--- a/request-processor/makerules/makerules.mk
+++ b/request-processor/makerules/makerules.mk
@@ -70,6 +70,7 @@ endif
 config:
 # local copy of organsiation datapackage
 	@mkdir -p $(CACHE_DIR)
-	curl -qfs "https://raw.githubusercontent.com/digital-land/organisation-dataset/main/collection/organisation.csv" > $(CACHE_DIR)organisation.csv
+	curl -qfs "https://files.planning.data.gov.uk/organisation-collection/dataset/organisation.csv" > $(CACHE_DIR)organisation.csv
+	
 
 init:: config
diff --git a/request-processor/src/application/configurations/config.py b/request-processor/src/application/configurations/config.py
index 08ae1891..9fe21070 100644
--- a/request-processor/src/application/configurations/config.py
+++ b/request-processor/src/application/configurations/config.py
@@ -2,7 +2,7 @@
 
 source_url = "https://raw.githubusercontent.com/digital-land/"
 DATASTORE_URL = os.getenv("DATASTORE_URL", "https://files.planning.data.gov.uk/")
-CONFIG_URL = f"{DATASTORE_URL}config/"
+CONFIG_URL = f"{source_url}config/refs/heads/main/"
 
 
 class Directories:
diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 71171be7..30b50d55 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -3,6 +3,8 @@
 from application.logging.logger import get_logger
 from digital_land.specification import Specification
 from digital_land.organisation import Organisation
+from digital_land.api import API
+from collections import defaultdict
 
 from digital_land.pipeline import Pipeline, Lookups
 from digital_land.commands import get_resource_unidentified_lookups
@@ -110,7 +112,7 @@ def default_output_path(command, input_path):
 
 
 def assign_entries(
-    resource_path, dataset, organisation, pipeline_dir, specification, cache_dir
+    resource_path, dataset, organisation, pipeline_dir, specification, cache_dir, endpoints=None
 ):
     pipeline = Pipeline(pipeline_dir, dataset)
     resource_lookups = get_resource_unidentified_lookups(
@@ -120,6 +122,7 @@ def assign_entries(
         pipeline=pipeline,
         specification=specification,
         org_csv_path=f"{cache_dir}/organisation.csv",
+        endpoints=endpoints,
     )
 
     unassigned_entries = []
@@ -135,9 +138,13 @@ def assign_entries(
             )
 
     lookups.load_csv()
+    
+    # Track which entries are new by checking before adding
+    new_entries_added = []
     for new_lookup in unassigned_entries:
         for idx, entry in enumerate(new_lookup):
             lookups.add_entry(entry[0])
+            new_entries_added.append(entry[0])
 
     # save edited csvs
     max_entity_num = lookups.get_max_entity(pipeline.name, specification)
@@ -149,66 +156,97 @@ def assign_entries(
         dataset
     )
 
-    lookups.save_csv()
+    newly_assigned = lookups.save_csv()
+    
+    # Filter to return only the entries we just added
+    if newly_assigned:
+        new_lookups = [
+            lookup for lookup in newly_assigned
+            if any(
+                lookup.get("reference") == entry.get("reference") 
+                and lookup.get("organisation") == entry.get("organisation")
+                for entry in new_entries_added
+            )
+        ]
+        return new_lookups
+    
+    return []
 
 
 def fetch_add_data_response(
     collection,
     dataset,
-    organisation,
+    organisation_provider,
     pipeline_dir,
-    input_path,
+    input_dir,
+    output_path,
     specification_dir,
     cache_dir,
     url,
     documentation_url,
 ):
-    try:
+    try:            
         specification = Specification(specification_dir)
+        pipeline = Pipeline(pipeline_dir, dataset, specification=specification)
+        organisation = Organisation(os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path))
+        api = API(specification=specification)
 
-        if not os.path.exists(input_path):
-            error_msg = f"Input path does not exist: {input_path}"
-            logger.error(f"ERROR: {error_msg}")
-            raise FileNotFoundError(error_msg)
+        # TODO: Need to load config class
+        valid_category_values = api.get_valid_category_values(dataset, pipeline)   
 
-        files_in_resource = os.listdir(input_path)
-        logger.info(f"Total files: {len(files_in_resource)}")
+        files_in_resource = os.listdir(input_dir)
 
-        new_entities = []
         existing_entities = []
-        lookup_path = os.path.join(pipeline_dir, "lookup.csv")
+        new_entities = []
 
         for idx, resource_file in enumerate(files_in_resource):
-            resource_file_path = os.path.join(input_path, resource_file)
+            resource_file_path = os.path.join(input_dir, resource_file)
             logger.info(
                 f"Processing file {idx + 1}/{len(files_in_resource)}: {resource_file}"
             )
             try:
-                unidentified_lookups = _add_data_read_entities(
-                    resource_file_path, dataset, organisation, specification
+                # Try add data with pipeline transform to see if no entities found
+                issues_log = pipeline.transform(
+                    input_path=resource_file_path,
+                    output_path=output_path,
+                    organisation=organisation,
+                    organisations=[organisation_provider],
+                    resource=resource_from_path(resource_file_path),
+                    valid_category_values = valid_category_values,
+                    disable_lookups=False,
                 )
 
-                if not unidentified_lookups:
-                    logger.info(f"No references found in {resource_file}")
-                    continue
-
-                new_lookups, existing_lookups = _check_existing_entities(
-                    unidentified_lookups, lookup_path
+                existing_entities.extend(
+                    _map_existing_entities_from_transformed_csv(output_path, pipeline_dir)
                 )
-                existing_entities.extend(existing_lookups)
-
-                if not new_lookups:
-                    logger.info(f"All lookups already exist for {resource_file}")
-                    continue
 
-                newly_assigned = _assign_entity_numbers(
-                    new_lookups, pipeline_dir, dataset, specification
+                # Check if there are unknown entity issues in the log
+                unknown_issue_types = {'unknown entity', 'unknown entity - missing reference'}
+                has_unknown = any(
+                    row.get('issue-type') in unknown_issue_types
+                    for row in issues_log.rows
+                    if isinstance(row, dict)
                 )
 
-                new_entities.extend(newly_assigned)
-                logger.info(
-                    f"Assigned {len(newly_assigned)} new entities for {resource_file}"
-                )
+                if has_unknown:
+                    new_lookups = assign_entries(
+                        resource_path=resource_file_path,
+                        dataset=dataset,
+                        organisation=organisation_provider,
+                        pipeline_dir=pipeline_dir,
+                        specification=specification,
+                        cache_dir=cache_dir,
+                        endpoints=[url] if url else None,
+                    )
+                    logger.info(
+                        f"Found {len(new_lookups)} unidentified lookups in {resource_file}"
+                    )
+                    new_entities.extend(new_lookups)
+                else:
+                    logger.info(f"No unidentified lookups found in {resource_file}")
+
+                
+                # TODO: Re-run to see if no unidentified lookups remain
 
             except Exception as err:
                 logger.error(f"Error processing {resource_file}: {err}")
@@ -227,7 +265,7 @@ def fetch_add_data_response(
             documentation_url,
             pipeline_dir,
             collection,
-            organisation,
+            organisation_provider,
             dataset,
             endpoint_summary,
         )
@@ -255,123 +293,6 @@ def fetch_add_data_response(
         raise
 
 
-def _add_data_read_entities(resource_path, dataset, organisation, specification):
-    unidentified_lookups = []
-    dataset_prefix = specification.dataset_prefix(dataset)
-
-    try:
-        with open(resource_path, "r", encoding="utf-8") as f:
-            reader = csv.DictReader(f)
-
-            for idx, row in enumerate(reader, start=1):
-                reference = row.get("reference", "").strip()
-
-                if not reference:
-                    logger.warning(f"Row {idx} has no reference, skipping")
-                    continue
-
-                lookup_entry = {
-                    "prefix": dataset_prefix,
-                    "organisation": organisation,
-                    "reference": reference,
-                    "resource": Path(resource_path).stem,
-                    "entity": "",
-                }
-
-                unidentified_lookups.append(lookup_entry)
-
-        logger.info(f"Found {len(unidentified_lookups)} references")
-
-    except Exception as e:
-        logger.error(f"Error reading resource: {e}")
-        raise
-
-    return unidentified_lookups
-
-
-def _check_existing_entities(unidentified_lookups, lookup_path):
-    """
-    Check which lookups already exist in lookup file
-    """
-    existing_lookup_map = {}
-
-    if os.path.exists(lookup_path):
-        try:
-            with open(lookup_path, "r", encoding="utf-8") as f:
-                reader = csv.DictReader(f)
-
-                for row in reader:
-                    prefix = row.get("prefix", "").strip()
-                    org = row.get("organisation", "").strip()
-                    ref = row.get("reference", "").strip()
-                    entity = row.get("entity", "").strip()
-
-                    if prefix and org and ref and entity:
-                        key = f"{prefix},{org},{ref}"
-                        existing_lookup_map[key] = {"entity": entity, "reference": ref}
-
-        except Exception as e:
-            logger.error(f"Error reading lookup file: {e}")
-    else:
-        logger.info("lookup file does not exist yet")
-
-    new_lookups = []
-    existing_lookups = []
-    for lookup in unidentified_lookups:
-        key = f"{lookup['prefix']},{lookup['organisation']},{lookup['reference']}"
-
-        if key in existing_lookup_map:
-            existing_lookups.append(existing_lookup_map[key])
-        else:
-            new_lookups.append(lookup)
-
-    logger.info(
-        f"Found {len(new_lookups)} new lookups and {len(existing_lookups)} existing lookups"
-    )
-
-    return new_lookups, existing_lookups
-
-
-def _assign_entity_numbers(new_lookups, pipeline_dir, dataset, specification):
-    """
-    Assign entity numbers to new lookup entries and save to lookup file
-    """
-
-    lookups = Lookups(pipeline_dir)
-
-    if not os.path.exists(lookups.lookups_path):
-        os.makedirs(os.path.dirname(lookups.lookups_path), exist_ok=True)
-        with open(lookups.lookups_path, "w", newline="", encoding="utf-8") as f:
-            writer = csv.writer(f)
-            writer.writerow(
-                ["prefix", "resource", "organisation", "reference", "entity"]
-            )
-
-    lookups.load_csv()
-
-    for lookup in new_lookups:
-        lookups.add_entry(lookup)
-
-    max_entity_num = lookups.get_max_entity(dataset, specification)
-    logger.info(f"Max existing entity: {max_entity_num}")
-
-    lookups.entity_num_gen.state["current"] = max_entity_num
-    lookups.entity_num_gen.state["range_max"] = specification.get_dataset_entity_max(
-        dataset
-    )
-    lookups.entity_num_gen.state["range_min"] = specification.get_dataset_entity_min(
-        dataset
-    )
-
-    logger.info(
-        f"Entity range: {lookups.entity_num_gen.state['range_min']} - {lookups.entity_num_gen.state['range_max']}"
-    )
-
-    newly_assigned = lookups.save_csv()
-
-    return newly_assigned
-
-
 def _get_entities_breakdown(new_entities):
     """
     Convert newly assigned entities to the breakdown format for response.
@@ -418,6 +339,59 @@ def _get_existing_entities_breakdown(existing_entities):
     return breakdown
 
 
+def _map_existing_entities_from_transformed_csv(transformed_csv_path, pipeline_dir):
+    """Extract unique entities from transformed CSV and lookup their details in lookup.csv."""
+
+    mapped_entities = []
+    
+    if not os.path.exists(transformed_csv_path):
+        logger.warning(f"Transformed CSV not found: {transformed_csv_path}")
+        return mapped_entities
+
+    # Extract unique entity values from transformed CSV
+    unique_entities = set()
+    try:
+        with open(transformed_csv_path, "r", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                entity_val = row.get("entity", "").strip()
+                if entity_val:  # Skip empty entities
+                    unique_entities.add(entity_val)
+    except Exception as e:
+        logger.error(f"Error reading transformed CSV: {e}")
+        return mapped_entities
+
+    if not unique_entities:
+        return mapped_entities
+
+    # Load lookup.csv to get entity details
+    lookup_path = os.path.join(pipeline_dir, "lookup.csv")
+    if not os.path.exists(lookup_path):
+        logger.warning(f"Lookup CSV not found: {lookup_path}")
+        return mapped_entities
+
+    entity_lookup_map = {}
+    with open(lookup_path, "r", encoding="utf-8") as f:
+        for row in csv.DictReader(f):
+            entity_lookup_map[str(row.get("entity", ""))] = row
+
+    # Map entities to their full details
+    for entity_id in unique_entities:
+        row = entity_lookup_map.get(entity_id, {})
+        if row:  # Only add if found in lookup
+            mapped_entities.append(
+                {
+                    "entity": entity_id,
+                    "reference": row.get("reference", ""),
+                    "prefix": row.get("prefix", ""),
+                    "resource": row.get("resource", ""),
+                    "organisation": row.get("organisation", ""),
+                }
+            )
+
+    return mapped_entities
+
+
 def _validate_endpoint(url, pipeline_dir):
     endpoint_csv_path = os.path.join(pipeline_dir, "endpoint.csv")
     if not url:
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index 001dc487..45616931 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -412,37 +412,54 @@ def add_data_workflow(
     request_id,
     collection,
     dataset,
-    organisation,
+    organisation_provider,
     url,
     documentation_url,
     directories,
 ):
+    """
+    Setup directories and download required CSVs to manage add-data pipeline, then invoke fetch_add_data_response.
+    
+    Args:
+        file_name (str): Collection resource file name
+        request_id (str): Unique request identifier
+        collection (str): Collection name (e.g. 'article-4-direction')
+        dataset (str): Dataset name (e.g. 'article-4-direction-area')
+        organisation_provider (str): Organisation code providing the data
+        url (str): Endpoint URL to fetch data from
+        documentation_url (str): Documentation URL for the dataset
+        directories (Directories): Directories object with required paths
+    """
+
     pipeline_dir = os.path.join(directories.PIPELINE_DIR, collection, request_id)
-    logger.info(f"pipeline_dir is : {pipeline_dir}")
-    input_path = os.path.join(directories.COLLECTION_DIR, "resource", request_id)
-    file_path = os.path.join(input_path, file_name)
-    resource = resource_from_path(file_path)
-    logger.info(f"resource is : {resource}")
-    fetch_csv = fetch_add_data_csvs(collection, pipeline_dir)
-    logger.info(f"files fetched are : {fetch_csv}")
+    input_dir = os.path.join(directories.COLLECTION_DIR, "resource", request_id)
+    output_path = os.path.join(directories.TRANSFORMED_DIR, request_id, file_name)
+    if not os.path.exists(output_path):
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    
+    fetch_add_data_csvs(collection, pipeline_dir)
 
     response_data = fetch_add_data_response(
-        collection,
-        dataset,
-        organisation,
-        pipeline_dir,
-        input_path,
-        directories.SPECIFICATION_DIR,
-        directories.CACHE_DIR,
-        url,
-        documentation_url,
+        collection=collection,
+        dataset=dataset,
+        organisation_provider=organisation_provider,
+        pipeline_dir=pipeline_dir,
+        input_dir=input_dir,
+        output_path=output_path,
+        specification_dir=directories.SPECIFICATION_DIR,
+        cache_dir=directories.CACHE_DIR,
+        url=url,
+        documentation_url=documentation_url,
     )
-    logger.info(f"add data response is : {response_data}")
+    logger.info(f"add data response is for id {request_id} : {response_data}")
+
+    # TODO: Clean up directories if needed
 
     return response_data
 
 
 def fetch_add_data_csvs(collection, pipeline_dir):
+    """Download add-data pipeline CSVs (lookup, endpoint, source) into pipeline_dir and ensure organisation """
     os.makedirs(pipeline_dir, exist_ok=True)
     add_data_csvs = ["lookup.csv", "endpoint.csv", "source.csv"]
     fetched_files = []

From b12214004ad45bdfb3dfd5137771453ac8a81046 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Sun, 25 Jan 2026 14:25:45 +0000
Subject: [PATCH 02/12] test update and file clean up

---
 .vscode/launch.json                           |  24 +-
 .../src/application/core/pipeline.py          |   5 +-
 .../src/application/core/workflow.py          |  60 ++--
 .../src/application/core/test_pipeline.py     | 326 ++----------------
 .../src/application/core/test_workflow.py     |  47 +--
 scripts/debug_trigger_add_data.py             | 131 +++++++
 6 files changed, 257 insertions(+), 336 deletions(-)
 create mode 100644 scripts/debug_trigger_add_data.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index b70d11a9..120f4424 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -3,7 +3,7 @@
     "python": "${workspaceFolder}/request-processor/.venv/bin/python",
     "configurations": [
         {
-            "name": "Debug Request Processor - Manual Task Trigger",
+            "name": "Debug Request Processor - Manual Check URL Task Trigger",
             "type": "python",
             "request": "launch",
             "python": "${workspaceFolder}/request-processor/.venv/bin/python",
@@ -24,6 +24,28 @@
             "args": [],
             "cwd": "${workspaceFolder}/request-processor"
         },
+        {
+            "name": "Debug Request Processor - Manual Add Data Task",
+            "type": "python",
+            "request": "launch",
+            "python": "${workspaceFolder}/request-processor/.venv/bin/python",
+            "program": "${workspaceFolder}/scripts/debug_trigger_add_data.py",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}:${workspaceFolder}/request-processor:${workspaceFolder}/request-processor/.venv/src/digital-land",
+                "DATABASE_URL": "postgresql://postgres:password@localhost:54320/request_database",
+                "CELERY_BROKER_URL": "redis://localhost:6379/0",
+                "AWS_ENDPOINT_URL": "http://localhost:4566",
+                "AWS_DEFAULT_REGION": "eu-west-2",
+                "AWS_ACCESS_KEY_ID": "example",
+                "AWS_SECRET_ACCESS_KEY": "example",
+                "REQUEST_FILES_BUCKET_NAME": "dluhc-data-platform-request-files-local",
+                "SENTRY_ENABLED": "false"
+            },
+            "args": [],
+            "cwd": "${workspaceFolder}/request-processor"
+        },
         {
             "name": "Debug Request Processor - Celery Worker",
             "type": "python",
diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 30b50d55..b4582396 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -191,7 +191,8 @@ def fetch_add_data_response(
         organisation = Organisation(os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path))
         api = API(specification=specification)
 
-        # TODO: Need to load config class
+        # TODO: Need to load config class for correct transform?
+        # TODO: Handling of column mapping?
         valid_category_values = api.get_valid_category_values(dataset, pipeline)   
 
         files_in_resource = os.listdir(input_dir)
@@ -246,7 +247,7 @@ def fetch_add_data_response(
                     logger.info(f"No unidentified lookups found in {resource_file}")
 
                 
-                # TODO: Re-run to see if no unidentified lookups remain
+                # TODO: Re-run to see if no unidentified lookups remain and if so create an error summary for add data command
 
             except Exception as err:
                 logger.error(f"Error processing {resource_file}: {err}")
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index 45616931..bdaa556f 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -418,7 +418,7 @@ def add_data_workflow(
     directories,
 ):
     """
-    Setup directories and download required CSVs to manage add-data pipeline, then invoke fetch_add_data_response.
+    Setup directories and download required CSVs to manage add-data pipeline, then invoke fetch_add_data_response, also clean up.
     
     Args:
         file_name (str): Collection resource file name
@@ -430,30 +430,44 @@ def add_data_workflow(
         documentation_url (str): Documentation URL for the dataset
         directories (Directories): Directories object with required paths
     """
+    try:
+        pipeline_dir = os.path.join(directories.PIPELINE_DIR, collection, request_id)
+        input_dir = os.path.join(directories.COLLECTION_DIR, "resource", request_id)
+        output_path = os.path.join(directories.TRANSFORMED_DIR, request_id, file_name)
+        if not os.path.exists(output_path):
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+        
+        # TODO: Can this use fetch_pipeline_csvs function instead?, do seem to need main config source (GitHub) for real time data
+        fetch_add_data_csvs(collection, pipeline_dir)
+
+        response_data = fetch_add_data_response(
+            collection=collection,
+            dataset=dataset,
+            organisation_provider=organisation_provider,
+            pipeline_dir=pipeline_dir,
+            input_dir=input_dir,
+            output_path=output_path,
+            specification_dir=directories.SPECIFICATION_DIR,
+            cache_dir=directories.CACHE_DIR,
+            url=url,
+            documentation_url=documentation_url,
+        )
+        logger.info(f"add data response is for id {request_id} : {response_data}")
 
-    pipeline_dir = os.path.join(directories.PIPELINE_DIR, collection, request_id)
-    input_dir = os.path.join(directories.COLLECTION_DIR, "resource", request_id)
-    output_path = os.path.join(directories.TRANSFORMED_DIR, request_id, file_name)
-    if not os.path.exists(output_path):
-        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    
-    fetch_add_data_csvs(collection, pipeline_dir)
-
-    response_data = fetch_add_data_response(
-        collection=collection,
-        dataset=dataset,
-        organisation_provider=organisation_provider,
-        pipeline_dir=pipeline_dir,
-        input_dir=input_dir,
-        output_path=output_path,
-        specification_dir=directories.SPECIFICATION_DIR,
-        cache_dir=directories.CACHE_DIR,
-        url=url,
-        documentation_url=documentation_url,
-    )
-    logger.info(f"add data response is for id {request_id} : {response_data}")
+    except Exception as e:
+        logger.exception(f"An error occurred in add_data_workflow")
+        response_data = None
 
-    # TODO: Clean up directories if needed
+    finally:
+        clean_up(
+            request_id,
+            os.path.join(directories.COLLECTION_DIR, "resource", request_id),
+            directories.COLLECTION_DIR,
+            os.path.join(directories.TRANSFORMED_DIR, request_id),
+            directories.TRANSFORMED_DIR,
+            os.path.join(directories.PIPELINE_DIR, collection),
+            directories.PIPELINE_DIR,
+        )
 
     return response_data
 
diff --git a/request-processor/tests/unit/src/application/core/test_pipeline.py b/request-processor/tests/unit/src/application/core/test_pipeline.py
index 43257222..66f366ac 100644
--- a/request-processor/tests/unit/src/application/core/test_pipeline.py
+++ b/request-processor/tests/unit/src/application/core/test_pipeline.py
@@ -4,9 +4,6 @@
 from unittest.mock import MagicMock
 from src.application.core.pipeline import (
     fetch_add_data_response,
-    _add_data_read_entities,
-    _check_existing_entities,
-    _assign_entity_numbers,
     _get_entities_breakdown,
     _get_existing_entities_breakdown,
     _validate_endpoint,
@@ -64,6 +61,12 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
     monkeypatch.setattr(
         "src.application.core.pipeline.Lookups", lambda x: mock_lookups_instance
     )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Pipeline", MagicMock()
+    )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Organisation", MagicMock()
+    )
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
         lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
@@ -76,9 +79,10 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
     result = fetch_add_data_response(
         collection=collection,
         dataset=dataset,
-        organisation=organisation,
+        organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
-        input_path=str(input_path),
+        input_dir=str(input_path),
+        output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
         cache_dir=str(cache_dir),
         url=url,
@@ -109,6 +113,12 @@ def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
     monkeypatch.setattr(
         "src.application.core.pipeline.Specification", lambda x: mock_spec
     )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Pipeline", MagicMock()
+    )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Organisation", MagicMock()
+    )
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
         lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
@@ -121,9 +131,10 @@ def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
     result = fetch_add_data_response(
         collection=collection,
         dataset=dataset,
-        organisation=organisation,
+        organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
-        input_path=str(input_path),
+        input_dir=str(input_path),
+        output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
         cache_dir=str(cache_dir),
         url=url,
@@ -153,14 +164,21 @@ def test_fetch_add_data_response_file_not_found(monkeypatch, tmp_path):
     monkeypatch.setattr(
         "src.application.core.pipeline.Specification", lambda x: mock_spec
     )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Pipeline", MagicMock()
+    )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Organisation", MagicMock()
+    )
 
     with pytest.raises(FileNotFoundError):
         fetch_add_data_response(
             collection=collection,
             dataset=dataset,
-            organisation=organisation,
+            organisation_provider=organisation,
             pipeline_dir=str(pipeline_dir),
-            input_path=str(input_path),
+            input_dir=str(input_path),
+            output_path=str(input_path / "output.csv"),
             specification_dir=str(specification_dir),
             cache_dir=str(cache_dir),
             url=url,
@@ -168,90 +186,6 @@ def test_fetch_add_data_response_file_not_found(monkeypatch, tmp_path):
         )
 
 
-def test_fetch_add_data_response_with_existing_entities(monkeypatch, tmp_path):
-    """Test processing with mix of new and existing entities"""
-    dataset = "test-dataset"
-    organisation = "test-org"
-    collection = "test-collection"
-    request_id = "req-005"
-    pipeline_dir = tmp_path / "pipeline"
-    input_path = tmp_path / "resource"
-    specification_dir = tmp_path / "specification"
-    cache_dir = tmp_path / "cache"
-    url = "http://example.com/endpoint"
-    documentation_url = "http://example.com/doc"
-
-    input_path.mkdir(parents=True)
-    pipeline_dir.mkdir(parents=True)
-
-    test_file = input_path / "test.csv"
-    test_file.write_text("reference\nREF001\nREF002\nREF003")
-
-    lookup_file = pipeline_dir / "lookup.csv"
-    lookup_file.write_text(
-        "prefix,resource,organisation,reference,entity\n"
-        "test-prefix,test,test-org,REF001,1000001\n"
-    )
-
-    mock_spec = MagicMock()
-    mock_spec.dataset_prefix.return_value = "test-prefix"
-    mock_spec.get_dataset_entity_min.return_value = 1000000
-    mock_spec.get_dataset_entity_max.return_value = 9999999
-
-    mock_lookups_instance = MagicMock()
-    mock_lookups_instance.lookups_path = str(lookup_file)
-    mock_lookups_instance.get_max_entity.return_value = 1000001
-    mock_lookups_instance.save_csv.return_value = [
-        {
-            "prefix": "test-prefix",
-            "organisation": "test-org",
-            "reference": "REF002",
-            "entity": "1000002",
-            "resource": "test",
-        },
-        {
-            "prefix": "test-prefix",
-            "organisation": "test-org",
-            "reference": "REF003",
-            "entity": "1000003",
-            "resource": "test",
-        },
-    ]
-    mock_lookups_instance.entity_num_gen = MagicMock()
-    mock_lookups_instance.entity_num_gen.state = {}
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Specification", lambda x: mock_spec
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Lookups", lambda x: mock_lookups_instance
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_source",
-        lambda *a, **k: {"documentation_url_in_source_csv": True},
-    )
-
-    result = fetch_add_data_response(
-        collection=collection,
-        dataset=dataset,
-        organisation=organisation,
-        pipeline_dir=str(pipeline_dir),
-        input_path=str(input_path),
-        specification_dir=str(specification_dir),
-        cache_dir=str(cache_dir),
-        url=url,
-        documentation_url=documentation_url,
-    )
-
-    assert "entity-summary" in result
-    assert result["entity-summary"]["new-in-resource"] == 2
-    assert result["entity-summary"]["existing-in-resource"] == 1
-
-
 def test_fetch_add_data_response_handles_processing_error(monkeypatch, tmp_path):
     """Test handling of errors during file processing"""
     dataset = "test-dataset"
@@ -277,13 +211,16 @@ def test_fetch_add_data_response_handles_processing_error(monkeypatch, tmp_path)
     monkeypatch.setattr(
         "src.application.core.pipeline.Specification", lambda x: mock_spec
     )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Pipeline", MagicMock()
+    )
+    monkeypatch.setattr(
+        "src.application.core.pipeline.Organisation", MagicMock()
+    )
 
     def raise_exception(*args, **kwargs):
         raise Exception("Processing error")
 
-    monkeypatch.setattr(
-        "src.application.core.pipeline._add_data_read_entities", raise_exception
-    )
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
         lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
@@ -296,9 +233,10 @@ def raise_exception(*args, **kwargs):
     result = fetch_add_data_response(
         collection=collection,
         dataset=dataset,
-        organisation=organisation,
+        organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
-        input_path=str(input_path),
+        input_dir=str(input_path),
+        output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
         cache_dir=str(cache_dir),
         url=url,
@@ -309,198 +247,6 @@ def raise_exception(*args, **kwargs):
     assert result["entity-summary"]["new-in-resource"] == 0
 
 
-def test_add_data_read_entities_success(tmp_path):
-    """Test reading entities from resource file"""
-    resource_file = tmp_path / "test.csv"
-    resource_file.write_text("reference,name\nREF001,Test1\nREF002,Test2")
-
-    mock_spec = MagicMock()
-    mock_spec.dataset_prefix.return_value = "test-prefix"
-
-    result = _add_data_read_entities(
-        str(resource_file), "test-dataset", "test-org", mock_spec
-    )
-
-    assert len(result) == 2
-    assert result[0]["reference"] == "REF001"
-    assert result[0]["prefix"] == "test-prefix"
-    assert result[0]["organisation"] == "test-org"
-    assert result[1]["reference"] == "REF002"
-
-
-def test_add_data_read_entities_skip_empty_reference(tmp_path):
-    """Test skipping rows with empty reference"""
-    resource_file = tmp_path / "test.csv"
-    resource_file.write_text("reference,name\nREF001,Test1\n,Test2\nREF003,Test3")
-
-    mock_spec = MagicMock()
-    mock_spec.dataset_prefix.return_value = "test-prefix"
-
-    result = _add_data_read_entities(
-        str(resource_file), "test-dataset", "test-org", mock_spec
-    )
-
-    assert len(result) == 2
-    assert result[0]["reference"] == "REF001"
-    assert result[1]["reference"] == "REF003"
-
-
-def test_add_data_read_entities_file_error():
-    """Test handling of file read errors"""
-    mock_spec = MagicMock()
-
-    with pytest.raises(Exception):
-        _add_data_read_entities(
-            "/nonexistent/file.csv", "test-dataset", "test-org", mock_spec
-        )
-
-
-def test_check_existing_entities_all_new(tmp_path):
-    """Test when all lookups are new"""
-    unidentified_lookups = [
-        {"prefix": "p1", "organisation": "org1", "reference": "REF001", "entity": ""},
-        {"prefix": "p1", "organisation": "org1", "reference": "REF002", "entity": ""},
-    ]
-
-    lookup_file = tmp_path / "lookup.csv"
-    lookup_file.write_text("prefix,organisation,reference,entity\n")
-
-    new_lookups, existing_lookups = _check_existing_entities(
-        unidentified_lookups, str(lookup_file)
-    )
-
-    assert len(new_lookups) == 2
-    assert len(existing_lookups) == 0
-
-
-def test_check_existing_entities_some_existing(tmp_path):
-    """Test when some lookups already exist"""
-    unidentified_lookups = [
-        {"prefix": "p1", "organisation": "org1", "reference": "REF001", "entity": ""},
-        {"prefix": "p1", "organisation": "org1", "reference": "REF002", "entity": ""},
-    ]
-
-    lookup_file = tmp_path / "lookup.csv"
-    lookup_file.write_text(
-        "prefix,organisation,reference,entity\n" "p1,org1,REF001,1000001\n"
-    )
-
-    new_lookups, existing_lookups = _check_existing_entities(
-        unidentified_lookups, str(lookup_file)
-    )
-
-    assert len(new_lookups) == 1
-    assert new_lookups[0]["reference"] == "REF002"
-    assert len(existing_lookups) == 1
-    assert existing_lookups[0]["entity"] == "1000001"
-
-
-def test_check_existing_entities_no_lookup_file():
-    """Test when lookup file doesn't exist"""
-    unidentified_lookups = [
-        {"prefix": "p1", "organisation": "org1", "reference": "REF001", "entity": ""}
-    ]
-
-    new_lookups, existing_lookups = _check_existing_entities(
-        unidentified_lookups, "/nonexistent/lookup.csv"
-    )
-
-    assert len(new_lookups) == 1
-    assert len(existing_lookups) == 0
-
-
-def test_assign_entity_numbers_creates_lookup_file(monkeypatch, tmp_path):
-    """Test creating new lookup file for assigning entities"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-
-    new_lookups = [
-        {
-            "prefix": "p1",
-            "organisation": "org1",
-            "reference": "REF001",
-            "resource": "res1",
-            "entity": "",
-        }
-    ]
-
-    mock_lookups = MagicMock()
-    mock_lookups.lookups_path = str(pipeline_dir / "lookup.csv")
-    mock_lookups.get_max_entity.return_value = 1000000
-    mock_lookups.save_csv.return_value = [
-        {
-            "prefix": "p1",
-            "organisation": "org1",
-            "reference": "REF001",
-            "entity": "1000001",
-            "resource": "res1",
-        }
-    ]
-    mock_lookups.entity_num_gen = MagicMock()
-    mock_lookups.entity_num_gen.state = {}
-
-    mock_spec = MagicMock()
-    mock_spec.get_dataset_entity_min.return_value = 1000000
-    mock_spec.get_dataset_entity_max.return_value = 9999999
-
-    monkeypatch.setattr("src.application.core.pipeline.Lookups", lambda x: mock_lookups)
-
-    result = _assign_entity_numbers(
-        new_lookups, str(pipeline_dir), "test-dataset", mock_spec
-    )
-
-    assert len(result) == 1
-    assert result[0]["entity"] == "1000001"
-    mock_lookups.load_csv.assert_called_once()
-    mock_lookups.add_entry.assert_called_once()
-    mock_lookups.save_csv.assert_called_once()
-
-
-def test_assign_entity_numbers_updates_existing_file(monkeypatch, tmp_path):
-    """Test updating existing lookup file"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    lookup_file = pipeline_dir / "lookup.csv"
-    lookup_file.write_text("prefix,resource,organisation,reference,entity\n")
-
-    new_lookups = [
-        {
-            "prefix": "p1",
-            "organisation": "org1",
-            "reference": "REF001",
-            "resource": "res1",
-        }
-    ]
-
-    mock_lookups = MagicMock()
-    mock_lookups.lookups_path = str(lookup_file)
-    mock_lookups.get_max_entity.return_value = 1000005
-    mock_lookups.save_csv.return_value = [
-        {
-            "prefix": "p1",
-            "organisation": "org1",
-            "reference": "REF001",
-            "entity": "1000006",
-            "resource": "res1",
-        }
-    ]
-    mock_lookups.entity_num_gen = MagicMock()
-    mock_lookups.entity_num_gen.state = {}
-
-    mock_spec = MagicMock()
-    mock_spec.get_dataset_entity_min.return_value = 1000000
-    mock_spec.get_dataset_entity_max.return_value = 9999999
-
-    monkeypatch.setattr("src.application.core.pipeline.Lookups", lambda x: mock_lookups)
-
-    result = _assign_entity_numbers(
-        new_lookups, str(pipeline_dir), "test-dataset", mock_spec
-    )
-
-    assert len(result) == 1
-    assert mock_lookups.entity_num_gen.state["current"] == 1000005
-
-
 def test_get_entities_breakdown_success():
     """Test converting entities to breakdown format"""
     new_entities = [
diff --git a/request-processor/tests/unit/src/application/core/test_workflow.py b/request-processor/tests/unit/src/application/core/test_workflow.py
index 9fdb4022..63085964 100644
--- a/request-processor/tests/unit/src/application/core/test_workflow.py
+++ b/request-processor/tests/unit/src/application/core/test_workflow.py
@@ -321,6 +321,7 @@ def test_add_data_workflow(monkeypatch):
     class DummyDirectories:
         PIPELINE_DIR = "/tmp/pipeline"
         COLLECTION_DIR = "/tmp/collection"
+        TRANSFORMED_DIR = "/tmp/transformed"
         SPECIFICATION_DIR = "/tmp/specification"
         CACHE_DIR = "/tmp/cache"
 
@@ -366,6 +367,7 @@ def test_add_data_workflow_calls(monkeypatch):
     class DummyDirectories:
         PIPELINE_DIR = "/tmp/pipeline"
         COLLECTION_DIR = "/tmp/collection"
+        TRANSFORMED_DIR = "/tmp/transformed"
         SPECIFICATION_DIR = "/tmp/specification"
         CACHE_DIR = "/tmp/cache"
 
@@ -373,33 +375,36 @@ class DummyDirectories:
 
     called = {}
 
-    def fake_resource_from_path(path):
-        called["resource_from_path"] = path
-        return "resource-hash"
-
     def fake_fetch_add_data_csvs(col, pdir):
         called["fetch_add_data_csvs"] = (col, pdir)
         return ["/tmp/pipeline/lookup.csv"]
 
     def fake_fetch_add_data_response(
-        col, ds, org, pdir, ipath, spec_dir, cache_dir, e_url, d_url
+        collection,
+        dataset,
+        organisation_provider,
+        pipeline_dir,
+        input_dir,
+        output_path,
+        specification_dir,
+        cache_dir,
+        url,
+        documentation_url,
     ):
         called["fetch_add_data_response"] = {
-            "collection": col,
-            "dataset": ds,
-            "organisation": org,
-            "pipeline_dir": pdir,
-            "input_path": ipath,
-            "specification_dir": spec_dir,
+            "collection": collection,
+            "dataset": dataset,
+            "organisation": organisation_provider,
+            "pipeline_dir": pipeline_dir,
+            "input_dir": input_dir,
+            "output_path": output_path,
+            "specification_dir": specification_dir,
             "cache_dir": cache_dir,
-            "url": e_url,
-            "documentation_url": d_url,
+            "url": url,
+            "documentation_url": documentation_url,
         }
         return {"result": "ok"}
 
-    monkeypatch.setattr(
-        "src.application.core.workflow.resource_from_path", fake_resource_from_path
-    )
     monkeypatch.setattr(
         "src.application.core.workflow.fetch_add_data_csvs", fake_fetch_add_data_csvs
     )
@@ -422,17 +427,19 @@ def fake_fetch_add_data_response(
     expected_pipeline_dir = os.path.join(
         directories.PIPELINE_DIR, collection, request_id
     )
-    expected_input_path = os.path.join(
+    expected_input_dir = os.path.join(
         directories.COLLECTION_DIR, "resource", request_id
     )
-    expected_file_path = os.path.join(expected_input_path, file_name)
+    expected_output_path = os.path.join(
+        directories.TRANSFORMED_DIR, request_id, file_name
+    )
 
-    assert called["resource_from_path"] == expected_file_path
     assert called["fetch_add_data_csvs"] == (collection, expected_pipeline_dir)
     assert called["fetch_add_data_response"]["dataset"] == dataset
     assert called["fetch_add_data_response"]["organisation"] == organisation
     assert called["fetch_add_data_response"]["pipeline_dir"] == expected_pipeline_dir
-    assert called["fetch_add_data_response"]["input_path"] == expected_input_path
+    assert called["fetch_add_data_response"]["input_dir"] == expected_input_dir
+    assert called["fetch_add_data_response"]["output_path"] == expected_output_path
     assert (
         called["fetch_add_data_response"]["specification_dir"]
         == directories.SPECIFICATION_DIR
diff --git a/scripts/debug_trigger_add_data.py b/scripts/debug_trigger_add_data.py
new file mode 100644
index 00000000..b1b5459e
--- /dev/null
+++ b/scripts/debug_trigger_add_data.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+"""
+Manual debug trigger for add_data_task.
+
+Invokes the add_data task directly without Celery so breakpoints hit reliably in VS Code.
+"""
+
+import os
+import sys
+import json
+import datetime
+from pathlib import Path
+
+# Set up paths
+workspace_root = Path(__file__).parent.parent
+request_processor_src = workspace_root / "request-processor" / "src"
+request_processor_root = workspace_root / "request-processor"
+request_model = workspace_root / "request_model"
+task_interface = workspace_root / "task_interface"
+
+sys.path.insert(0, str(request_processor_src))
+sys.path.insert(0, str(request_model))
+sys.path.insert(0, str(task_interface))
+
+import crud  # noqa: E402
+import database  # noqa: E402
+import request_model.models as request_models  # noqa: E402
+from tasks import add_data_task  # noqa: E402
+
+# Change to request-processor so relative paths (e.g. specification/, var/) resolve correctly
+os.chdir(request_processor_root)
+
+
+def ensure_request_exists(request_payload: dict) -> None:
+    """Upsert a row into the request table so the processor can write responses."""
+
+    database_url = os.environ.get("DATABASE_URL")
+    if not database_url:
+        raise RuntimeError(
+            "DATABASE_URL is not set. Start the stack (or export DATABASE_URL) before running debug_trigger_add_data."
+        )
+
+    request_id = request_payload["id"]
+    db_session = database.session_maker()
+    with db_session() as session:
+        existing = crud.get_request(session, request_id)
+        if existing:
+            existing.status = request_payload.get("status", existing.status)
+            existing.type = request_payload.get("type", existing.type)
+            existing.params = request_payload.get("params", existing.params)
+        else:
+            session.add(
+                request_models.Request(
+                    id=request_id,
+                    status=request_payload.get("status", "PENDING"),
+                    type=request_payload.get("type"),
+                    params=request_payload.get("params"),
+                )
+            )
+        session.commit()
+
+
+# Override directories to local workspace paths (avoids relying on /opt/* when not in Docker)
+workspace_volume = workspace_root / "request-processor" / "docker_volume"
+directories_override = {
+    "COLLECTION_DIR": str(workspace_volume / "collection"),
+    "ISSUE_DIR": str(workspace_volume / "issue"),
+    "COLUMN_FIELD_DIR": str(workspace_volume / "column-field"),
+    "TRANSFORMED_DIR": str(workspace_volume / "transformed"),
+    "CONVERTED_DIR": str(workspace_volume / "converted"),
+    "PIPELINE_DIR": str(workspace_volume / "pipeline"),
+    # Leave relative defaults for var/ and specification/
+}
+
+# Ensure base directories exist
+for d in directories_override.values():
+    Path(d).mkdir(parents=True, exist_ok=True)
+
+# Request payload - edit the params block if you need to test different datasets or URLs
+now = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+request_payload = {
+    "id": "debug-add-data-001",
+    "type": "add_data",
+    "status": "PENDING",
+    "created": now,
+    "modified": now,
+    "response": None,
+    "params": {
+        "type": "add_data",
+        "organisationName": "Stockport Metropolitan Borough Council",
+        "organisation": "local-authority:SKP",
+        "dataset": "article-4-direction",
+        "collection": "article-4-direction",
+        "column_mapping": None,
+        "geom_type": None,
+        "url": "https://smbc-opendata.s3.eu-west-1.amazonaws.com/Article4/Article4_Dataset_Stockport.csv",
+        "documentation_url": "https://example.com/article-4-direction/documentation",
+        "licence": "ogl3",
+        "start_date": "2020-01-01",
+        "plugin": None,
+    },
+}
+
+print("=" * 80)
+print("DEBUG TRIGGER: Invoking add_data_task directly")
+print("=" * 80)
+print(f"\nRequest ID: {request_payload['id']}")
+print(f"Dataset: {request_payload['params']['dataset']}")
+print(f"Collection: {request_payload['params']['collection']}")
+print(f"URL: {request_payload['params']['url'][:80]}...")
+print("\n" + "=" * 80)
+print("Set a breakpoint inside add_data_workflow if you want to step through it.")
+print("=" * 80 + "\n")
+
+try:
+    ensure_request_exists(request_payload)
+    result = add_data_task(request_payload, directories=json.dumps(directories_override))
+    print("\n" + "=" * 80)
+    print("TASK COMPLETED SUCCESSFULLY")
+    print("=" * 80)
+    print(f"\nResult: {json.dumps(result, indent=2, default=str)}")
+except Exception as e:
+    print("\n" + "=" * 80)
+    print("TASK FAILED WITH EXCEPTION")
+    print("=" * 80)
+    print(f"\nException Type: {type(e).__name__}")
+    print(f"Exception Message: {str(e)}")
+    import traceback
+
+    traceback.print_exc()
+    sys.exit(1)

From be46a4f082cf9ad08d9e30927828df33cd6bb536 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Sun, 25 Jan 2026 14:34:31 +0000
Subject: [PATCH 03/12] lint

---
 .../src/application/core/pipeline.py          | 69 ++++++++++++-------
 .../src/application/core/workflow.py          |  6 +-
 .../src/application/core/test_pipeline.py     | 32 +++------
 3 files changed, 55 insertions(+), 52 deletions(-)

diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index b4582396..5526c7c3 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -4,11 +4,9 @@
 from digital_land.specification import Specification
 from digital_land.organisation import Organisation
 from digital_land.api import API
-from collections import defaultdict
 
 from digital_land.pipeline import Pipeline, Lookups
 from digital_land.commands import get_resource_unidentified_lookups
-from digital_land.api import API
 from application.core.utils import append_endpoint, append_source
 from datetime import datetime
 from pathlib import Path
@@ -83,17 +81,27 @@ def fetch_response_data(
                 output_path=os.path.join(
                     transformed_dir, dataset, request_id, f"{resource}.csv"
                 ),
-                organisation=Organisation(os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path)),
+                organisation=Organisation(
+                    os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path)
+                ),
                 resource=resource,
-                valid_category_values = api.get_valid_category_values(dataset, pipeline),
-                converted_path=os.path.join(converted_dir, request_id, f"{resource}.csv"),
+                valid_category_values=api.get_valid_category_values(dataset, pipeline),
+                converted_path=os.path.join(
+                    converted_dir, request_id, f"{resource}.csv"
+                ),
                 disable_lookups=True,
             )
             # Issue log needs severity column added, so manually added and saved here
-            issue_log.add_severity_column(os.path.join(specification_dir, "issue-type.csv"))
-            issue_log.save(os.path.join(issue_dir, dataset, request_id, resource + ".csv"))
+            issue_log.add_severity_column(
+                os.path.join(specification_dir, "issue-type.csv")
+            )
+            issue_log.save(
+                os.path.join(issue_dir, dataset, request_id, resource + ".csv")
+            )
             pipeline.save_logs(
-                column_field_path=os.path.join(column_field_dir, dataset, request_id, resource + ".csv"),
+                column_field_path=os.path.join(
+                    column_field_dir, dataset, request_id, resource + ".csv"
+                ),
                 dataset_resource_path=os.path.join(
                     dataset_resource_dir, dataset, request_id, resource + ".csv"
                 ),
@@ -112,7 +120,13 @@ def default_output_path(command, input_path):
 
 
 def assign_entries(
-    resource_path, dataset, organisation, pipeline_dir, specification, cache_dir, endpoints=None
+    resource_path,
+    dataset,
+    organisation,
+    pipeline_dir,
+    specification,
+    cache_dir,
+    endpoints=None,
 ):
     pipeline = Pipeline(pipeline_dir, dataset)
     resource_lookups = get_resource_unidentified_lookups(
@@ -138,7 +152,7 @@ def assign_entries(
             )
 
     lookups.load_csv()
-    
+
     # Track which entries are new by checking before adding
     new_entries_added = []
     for new_lookup in unassigned_entries:
@@ -157,19 +171,20 @@ def assign_entries(
     )
 
     newly_assigned = lookups.save_csv()
-    
+
     # Filter to return only the entries we just added
     if newly_assigned:
         new_lookups = [
-            lookup for lookup in newly_assigned
+            lookup
+            for lookup in newly_assigned
             if any(
-                lookup.get("reference") == entry.get("reference") 
+                lookup.get("reference") == entry.get("reference")
                 and lookup.get("organisation") == entry.get("organisation")
                 for entry in new_entries_added
             )
         ]
         return new_lookups
-    
+
     return []
 
 
@@ -185,15 +200,17 @@ def fetch_add_data_response(
     url,
     documentation_url,
 ):
-    try:            
+    try:
         specification = Specification(specification_dir)
         pipeline = Pipeline(pipeline_dir, dataset, specification=specification)
-        organisation = Organisation(os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path))
+        organisation = Organisation(
+            os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path)
+        )
         api = API(specification=specification)
 
         # TODO: Need to load config class for correct transform?
         # TODO: Handling of column mapping?
-        valid_category_values = api.get_valid_category_values(dataset, pipeline)   
+        valid_category_values = api.get_valid_category_values(dataset, pipeline)
 
         files_in_resource = os.listdir(input_dir)
 
@@ -213,18 +230,21 @@ def fetch_add_data_response(
                     organisation=organisation,
                     organisations=[organisation_provider],
                     resource=resource_from_path(resource_file_path),
-                    valid_category_values = valid_category_values,
+                    valid_category_values=valid_category_values,
                     disable_lookups=False,
                 )
 
                 existing_entities.extend(
-                    _map_existing_entities_from_transformed_csv(output_path, pipeline_dir)
+                    _map_transformed_entities(output_path, pipeline_dir)
                 )
 
                 # Check if there are unknown entity issues in the log
-                unknown_issue_types = {'unknown entity', 'unknown entity - missing reference'}
+                unknown_issue_types = {
+                    "unknown entity",
+                    "unknown entity - missing reference",
+                }
                 has_unknown = any(
-                    row.get('issue-type') in unknown_issue_types
+                    row.get("issue-type") in unknown_issue_types
                     for row in issues_log.rows
                     if isinstance(row, dict)
                 )
@@ -246,8 +266,7 @@ def fetch_add_data_response(
                 else:
                     logger.info(f"No unidentified lookups found in {resource_file}")
 
-                
-                # TODO: Re-run to see if no unidentified lookups remain and if so create an error summary for add data command
+                # TODO: Re-run to see if no unidentified remain, if so new add data error summary
 
             except Exception as err:
                 logger.error(f"Error processing {resource_file}: {err}")
@@ -340,11 +359,11 @@ def _get_existing_entities_breakdown(existing_entities):
     return breakdown
 
 
-def _map_existing_entities_from_transformed_csv(transformed_csv_path, pipeline_dir):
+def _map_transformed_entities(transformed_csv_path, pipeline_dir):  # noqa: C901
     """Extract unique entities from transformed CSV and lookup their details in lookup.csv."""
 
     mapped_entities = []
-    
+
     if not os.path.exists(transformed_csv_path):
         logger.warning(f"Transformed CSV not found: {transformed_csv_path}")
         return mapped_entities
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index bdaa556f..3215346e 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -419,7 +419,7 @@ def add_data_workflow(
 ):
     """
     Setup directories and download required CSVs to manage add-data pipeline, then invoke fetch_add_data_response, also clean up.
-    
+
     Args:
         file_name (str): Collection resource file name
         request_id (str): Unique request identifier
@@ -436,7 +436,7 @@ def add_data_workflow(
         output_path = os.path.join(directories.TRANSFORMED_DIR, request_id, file_name)
         if not os.path.exists(output_path):
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        
+
         # TODO: Can this use fetch_pipeline_csvs function instead?, do seem to need main config source (GitHub) for real time data
         fetch_add_data_csvs(collection, pipeline_dir)
 
@@ -473,7 +473,7 @@ def add_data_workflow(
 
 
 def fetch_add_data_csvs(collection, pipeline_dir):
-    """Download add-data pipeline CSVs (lookup, endpoint, source) into pipeline_dir and ensure organisation """
+    """Download add-data pipeline CSVs (lookup, endpoint, source) into pipeline_dir and ensure organisation"""
     os.makedirs(pipeline_dir, exist_ok=True)
     add_data_csvs = ["lookup.csv", "endpoint.csv", "source.csv"]
     fetched_files = []
diff --git a/request-processor/tests/unit/src/application/core/test_pipeline.py b/request-processor/tests/unit/src/application/core/test_pipeline.py
index 66f366ac..b5b56106 100644
--- a/request-processor/tests/unit/src/application/core/test_pipeline.py
+++ b/request-processor/tests/unit/src/application/core/test_pipeline.py
@@ -61,12 +61,8 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
     monkeypatch.setattr(
         "src.application.core.pipeline.Lookups", lambda x: mock_lookups_instance
     )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Pipeline", MagicMock()
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Organisation", MagicMock()
-    )
+    monkeypatch.setattr("src.application.core.pipeline.Pipeline", MagicMock())
+    monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
         lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
@@ -113,12 +109,8 @@ def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
     monkeypatch.setattr(
         "src.application.core.pipeline.Specification", lambda x: mock_spec
     )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Pipeline", MagicMock()
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Organisation", MagicMock()
-    )
+    monkeypatch.setattr("src.application.core.pipeline.Pipeline", MagicMock())
+    monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
         lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
@@ -164,12 +156,8 @@ def test_fetch_add_data_response_file_not_found(monkeypatch, tmp_path):
     monkeypatch.setattr(
         "src.application.core.pipeline.Specification", lambda x: mock_spec
     )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Pipeline", MagicMock()
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Organisation", MagicMock()
-    )
+    monkeypatch.setattr("src.application.core.pipeline.Pipeline", MagicMock())
+    monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
 
     with pytest.raises(FileNotFoundError):
         fetch_add_data_response(
@@ -211,12 +199,8 @@ def test_fetch_add_data_response_handles_processing_error(monkeypatch, tmp_path)
     monkeypatch.setattr(
         "src.application.core.pipeline.Specification", lambda x: mock_spec
     )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Pipeline", MagicMock()
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline.Organisation", MagicMock()
-    )
+    monkeypatch.setattr("src.application.core.pipeline.Pipeline", MagicMock())
+    monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
 
     def raise_exception(*args, **kwargs):
         raise Exception("Processing error")

From 05167cf937a5522a64f444f75a73c64646bac9e4 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Sun, 25 Jan 2026 14:47:24 +0000
Subject: [PATCH 04/12] integration assertions update and temp coverage change
 dev only

---
 request-processor/makerules/python.mk              | 2 +-
 request-processor/src/application/core/pipeline.py | 3 ++-
 request-processor/src/application/core/workflow.py | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/request-processor/makerules/python.mk b/request-processor/makerules/python.mk
index 75e64a15..a8ea532a 100644
--- a/request-processor/makerules/python.mk
+++ b/request-processor/makerules/python.mk
@@ -16,4 +16,4 @@ coverage-unit:
 	pytest --random-order --cov=src tests/unit/
 
 coverage-integration:
-	pytest --random-order --cov=src --cov-append --cov-fail-under=85 tests/integration/
+	pytest --random-order --cov=src --cov-append --cov-fail-under=80 tests/integration/
diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 5526c7c3..65cf8ad8 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -50,9 +50,10 @@ def fetch_response_data(
                 pipeline_dir=pipeline_dir,
                 specification=specification,
                 cache_dir=cache_dir,
+                endpoints=[],
             )
     except Exception as err:
-        logger.error("An exception occured during assign_entries process: ", str(err))
+        logger.error("An exception occured during assign_entries process: %s", str(err))
 
     # Create directories if they don't exist
     for directory in [
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index 3215346e..f9814989 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -31,9 +31,9 @@ def run_workflow(
     directories,
 ):
     additional_concats = None
+    response_data = {}
 
     try:
-        response_data = {}
         # pipeline directory structure & download
         pipeline_dir = os.path.join(directories.PIPELINE_DIR, dataset, request_id)
 

From 0eac83e62404faf24d0c8e71e44b663b00dc7dcb Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Mon, 26 Jan 2026 09:59:24 +0000
Subject: [PATCH 05/12] test workflow publish to use branch name

---
 .github/workflows/publish.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 3a105300..49947746 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -66,11 +66,11 @@ jobs:
 
       - run: docker build --build-arg DEPLOY_TIME="$(date +%Y-%m-%dT%H:%M:%S)" --build-arg GIT_COMMIT="${{ steps.vars.outputs.sha_short }}" -t $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} -f request-api/Dockerfile .
 
-      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:main
+      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:${GITHUB_REF_NAME}
 
       - run: docker push $DOCKER_REPO:${{ steps.vars.outputs.sha_short }}
 
-      - run: docker push $DOCKER_REPO:main
+      - run: docker push $DOCKER_REPO:${GITHUB_REF_NAME}
 
       - uses: aws-actions/configure-aws-credentials@v1
         with:
@@ -119,11 +119,11 @@ jobs:
 
       - run: docker build --build-arg DEPLOY_TIME="$(date +%Y-%m-%dT%H:%M:%S)" --build-arg GIT_COMMIT="${{ steps.vars.outputs.sha_short }}" -t $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} -f request-processor/Dockerfile .
 
-      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:main
+      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:${GITHUB_REF_NAME}
 
       - run: docker push $DOCKER_REPO:${{ steps.vars.outputs.sha_short }}
 
-      - run: docker push $DOCKER_REPO:main
+      - run: docker push $DOCKER_REPO:${GITHUB_REF_NAME}
 
       - uses: aws-actions/configure-aws-credentials@v1
         with:

From 0e0215bebf309b0a62bb0a19a77d0cd2f5d7cba2 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Mon, 26 Jan 2026 10:53:33 +0000
Subject: [PATCH 06/12] rename to main tag in publish

---
 .github/workflows/publish.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 49947746..3a105300 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -66,11 +66,11 @@ jobs:
 
       - run: docker build --build-arg DEPLOY_TIME="$(date +%Y-%m-%dT%H:%M:%S)" --build-arg GIT_COMMIT="${{ steps.vars.outputs.sha_short }}" -t $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} -f request-api/Dockerfile .
 
-      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:${GITHUB_REF_NAME}
+      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:main
 
       - run: docker push $DOCKER_REPO:${{ steps.vars.outputs.sha_short }}
 
-      - run: docker push $DOCKER_REPO:${GITHUB_REF_NAME}
+      - run: docker push $DOCKER_REPO:main
 
       - uses: aws-actions/configure-aws-credentials@v1
         with:
@@ -119,11 +119,11 @@ jobs:
 
       - run: docker build --build-arg DEPLOY_TIME="$(date +%Y-%m-%dT%H:%M:%S)" --build-arg GIT_COMMIT="${{ steps.vars.outputs.sha_short }}" -t $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} -f request-processor/Dockerfile .
 
-      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:${GITHUB_REF_NAME}
+      - run: docker tag $DOCKER_REPO:${{ steps.vars.outputs.sha_short }} $DOCKER_REPO:main
 
       - run: docker push $DOCKER_REPO:${{ steps.vars.outputs.sha_short }}
 
-      - run: docker push $DOCKER_REPO:${GITHUB_REF_NAME}
+      - run: docker push $DOCKER_REPO:main
 
       - uses: aws-actions/configure-aws-credentials@v1
         with:

From 7e9e48ac894776bb0c30b10c1be0c78bafba5cc1 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Mon, 26 Jan 2026 13:47:38 +0000
Subject: [PATCH 07/12] all pipeline files in use now

---
 .../src/application/core/pipeline.py          | 16 +++---
 .../src/application/core/workflow.py          | 55 +++++++++++++++----
 .../src/application/core/test_pipeline.py     |  4 ++
 .../src/application/core/test_workflow.py     | 43 ++++++++++-----
 4 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 65cf8ad8..6e46ca56 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -194,6 +194,7 @@ def fetch_add_data_response(
     dataset,
     organisation_provider,
     pipeline_dir,
+    collection_dir,
     input_dir,
     output_path,
     specification_dir,
@@ -209,8 +210,6 @@ def fetch_add_data_response(
         )
         api = API(specification=specification)
 
-        # TODO: Need to load config class for correct transform?
-        # TODO: Handling of column mapping?
         valid_category_values = api.get_valid_category_values(dataset, pipeline)
 
         files_in_resource = os.listdir(input_dir)
@@ -233,6 +232,7 @@ def fetch_add_data_response(
                     resource=resource_from_path(resource_file_path),
                     valid_category_values=valid_category_values,
                     disable_lookups=False,
+                    endpoints=[url],
                 )
 
                 existing_entities.extend(
@@ -280,11 +280,11 @@ def fetch_add_data_response(
 
         endpoint_summary = _validate_endpoint(
             url,
-            pipeline_dir,
+            collection_dir,
         )
         source_summary = _validate_source(
             documentation_url,
-            pipeline_dir,
+            collection_dir,
             collection,
             organisation_provider,
             dataset,
@@ -413,8 +413,8 @@ def _map_transformed_entities(transformed_csv_path, pipeline_dir):  # noqa: C901
     return mapped_entities
 
 
-def _validate_endpoint(url, pipeline_dir):
-    endpoint_csv_path = os.path.join(pipeline_dir, "endpoint.csv")
+def _validate_endpoint(url, config_dir):
+    endpoint_csv_path = os.path.join(config_dir, "endpoint.csv")
     if not url:
         logger.info("No endpoint URL provided")
         return {}
@@ -483,9 +483,9 @@ def _validate_endpoint(url, pipeline_dir):
 
 
 def _validate_source(
-    documentation_url, pipeline_dir, collection, organisation, dataset, endpoint_summary
+    documentation_url, config_dir, collection, organisation, dataset, endpoint_summary
 ):
-    source_csv_path = os.path.join(pipeline_dir, "source.csv")
+    source_csv_path = os.path.join(config_dir, "source.csv")
 
     endpoint_key = endpoint_summary.get("existing_endpoint_entry", {}).get(
         "endpoint"
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index f9814989..3f4a1ee2 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -433,18 +433,21 @@ def add_data_workflow(
     try:
         pipeline_dir = os.path.join(directories.PIPELINE_DIR, collection, request_id)
         input_dir = os.path.join(directories.COLLECTION_DIR, "resource", request_id)
+        collection_dir = os.path.join(directories.COLLECTION_DIR, request_id)
         output_path = os.path.join(directories.TRANSFORMED_DIR, request_id, file_name)
         if not os.path.exists(output_path):
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
 
-        # TODO: Can this use fetch_pipeline_csvs function instead?, do seem to need main config source (GitHub) for real time data
-        fetch_add_data_csvs(collection, pipeline_dir)
+        # Loads csvs for Pipeline and Config
+        fetch_add_data_pipeline_csvs(collection, pipeline_dir)
+        fetch_add_data_collection_csvs(collection, collection_dir)
 
         response_data = fetch_add_data_response(
             collection=collection,
             dataset=dataset,
             organisation_provider=organisation_provider,
             pipeline_dir=pipeline_dir,
+            collection_dir=collection_dir,
             input_dir=input_dir,
             output_path=output_path,
             specification_dir=directories.SPECIFICATION_DIR,
@@ -454,6 +457,9 @@ def add_data_workflow(
         )
         logger.info(f"add data response is for id {request_id} : {response_data}")
 
+        # TODO: Add additioanl data to response_data such as column mapping?
+        # TODO: Error summary? like in run_workflow map issue data to messages, if it exists create issue summary to bloc adding.
+
     except Exception as e:
         logger.exception(f"An error occurred in add_data_workflow")
         response_data = None
@@ -462,6 +468,7 @@ def add_data_workflow(
         clean_up(
             request_id,
             os.path.join(directories.COLLECTION_DIR, "resource", request_id),
+            os.path.join(directories.COLLECTION_DIR, request_id),
             directories.COLLECTION_DIR,
             os.path.join(directories.TRANSFORMED_DIR, request_id),
             directories.TRANSFORMED_DIR,
@@ -472,20 +479,44 @@ def add_data_workflow(
     return response_data
 
 
-def fetch_add_data_csvs(collection, pipeline_dir):
-    """Download add-data pipeline CSVs (lookup, endpoint, source) into pipeline_dir and ensure organisation"""
+def fetch_add_data_pipeline_csvs(collection, pipeline_dir):
+    """Download pipeline CSVs into pipeline_dir"""
     os.makedirs(pipeline_dir, exist_ok=True)
-    add_data_csvs = ["lookup.csv", "endpoint.csv", "source.csv"]
-    fetched_files = []
-    for csv_name in add_data_csvs:
+    pipeline_csvs = [
+        "column.csv",
+        "combine.csv",
+        "concat.csv",
+        "convert.csv",
+        "default-value.csv",
+        "default.csv",
+        "entity-organisation.csv",
+        "expect.csv",
+        "filter.csv",
+        "lookup.csv",
+        "old-entity.csv",
+        "patch.csv",
+        "skip.csv",
+        "transform.csv",
+    ]
+    for csv_name in pipeline_csvs:
         csv_path = os.path.join(pipeline_dir, csv_name)
-        if csv_name == "lookup.csv":
-            url = f"{CONFIG_URL}pipeline/{collection}/{csv_name}"
-        else:
-            url = f"{CONFIG_URL}collection/{collection}/{csv_name}"
+        url = f"{CONFIG_URL}pipeline/{collection}/{csv_name}"
+        try:
+            urllib.request.urlretrieve(url, csv_path)
+            logger.info(f"Downloaded {csv_name} from {url} to {csv_path}")
+        except HTTPError as e:
+            logger.warning(f"Failed to retrieve {csv_name}: {e}.")
+
+
+def fetch_add_data_collection_csvs(collection, config_dir):
+    """Download config CSVs (endpoint.csv, source.csv) into config_dir"""
+    os.makedirs(config_dir, exist_ok=True)
+    config_csvs = ["endpoint.csv", "source.csv"]
+    for csv_name in config_csvs:
+        csv_path = os.path.join(config_dir, csv_name)
+        url = f"{CONFIG_URL}collection/{collection}/{csv_name}"
         try:
             urllib.request.urlretrieve(url, csv_path)
             logger.info(f"Downloaded {csv_name} from {url} to {csv_path}")
         except HTTPError as e:
             logger.warning(f"Failed to retrieve {csv_name}: {e}.")
-    return fetched_files
diff --git a/request-processor/tests/unit/src/application/core/test_pipeline.py b/request-processor/tests/unit/src/application/core/test_pipeline.py
index b5b56106..e0dc078c 100644
--- a/request-processor/tests/unit/src/application/core/test_pipeline.py
+++ b/request-processor/tests/unit/src/application/core/test_pipeline.py
@@ -77,6 +77,7 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
         dataset=dataset,
         organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
+        collection_dir=str(tmp_path / "collection"),
         input_dir=str(input_path),
         output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
@@ -125,6 +126,7 @@ def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
         dataset=dataset,
         organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
+        collection_dir=str(tmp_path / "collection"),
         input_dir=str(input_path),
         output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
@@ -165,6 +167,7 @@ def test_fetch_add_data_response_file_not_found(monkeypatch, tmp_path):
             dataset=dataset,
             organisation_provider=organisation,
             pipeline_dir=str(pipeline_dir),
+            collection_dir=str(tmp_path / "collection"),
             input_dir=str(input_path),
             output_path=str(input_path / "output.csv"),
             specification_dir=str(specification_dir),
@@ -219,6 +222,7 @@ def raise_exception(*args, **kwargs):
         dataset=dataset,
         organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
+        collection_dir=str(tmp_path / "collection"),
         input_dir=str(input_path),
         output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
diff --git a/request-processor/tests/unit/src/application/core/test_workflow.py b/request-processor/tests/unit/src/application/core/test_workflow.py
index 63085964..4089b8d3 100644
--- a/request-processor/tests/unit/src/application/core/test_workflow.py
+++ b/request-processor/tests/unit/src/application/core/test_workflow.py
@@ -5,7 +5,8 @@
     csv_to_json,
     fetch_pipeline_csvs,
     add_data_workflow,
-    fetch_add_data_csvs,
+    fetch_add_data_pipeline_csvs,
+    fetch_add_data_collection_csvs,
 )
 import csv
 import os
@@ -333,8 +334,12 @@ class DummyDirectories:
         "src.application.core.workflow.resource_from_path", lambda path: "resource-hash"
     )
     monkeypatch.setattr(
-        "src.application.core.workflow.fetch_add_data_csvs",
-        lambda col, pdir: ["/tmp/pipeline/lookup.csv"],
+        "src.application.core.workflow.fetch_add_data_pipeline_csvs",
+        lambda col, pdir: None,
+    )
+    monkeypatch.setattr(
+        "src.application.core.workflow.fetch_add_data_collection_csvs",
+        lambda col, cdir: None,
     )
     monkeypatch.setattr(
         "src.application.core.workflow.fetch_add_data_response",
@@ -375,15 +380,18 @@ class DummyDirectories:
 
     called = {}
 
-    def fake_fetch_add_data_csvs(col, pdir):
-        called["fetch_add_data_csvs"] = (col, pdir)
-        return ["/tmp/pipeline/lookup.csv"]
+    def fake_fetch_add_data_pipeline_csvs(col, pdir):
+        called["fetch_add_data_pipeline_csvs"] = (col, pdir)
+
+    def fake_fetch_add_data_collection_csvs(col, cdir):
+        called["fetch_add_data_collection_csvs"] = (col, cdir)
 
     def fake_fetch_add_data_response(
         collection,
         dataset,
         organisation_provider,
         pipeline_dir,
+        collection_dir,
         input_dir,
         output_path,
         specification_dir,
@@ -396,6 +404,7 @@ def fake_fetch_add_data_response(
             "dataset": dataset,
             "organisation": organisation_provider,
             "pipeline_dir": pipeline_dir,
+            "collection_dir": collection_dir,
             "input_dir": input_dir,
             "output_path": output_path,
             "specification_dir": specification_dir,
@@ -406,7 +415,12 @@ def fake_fetch_add_data_response(
         return {"result": "ok"}
 
     monkeypatch.setattr(
-        "src.application.core.workflow.fetch_add_data_csvs", fake_fetch_add_data_csvs
+        "src.application.core.workflow.fetch_add_data_pipeline_csvs",
+        fake_fetch_add_data_pipeline_csvs,
+    )
+    monkeypatch.setattr(
+        "src.application.core.workflow.fetch_add_data_collection_csvs",
+        fake_fetch_add_data_collection_csvs,
     )
     monkeypatch.setattr(
         "src.application.core.workflow.fetch_add_data_response",
@@ -434,7 +448,9 @@ def fake_fetch_add_data_response(
         directories.TRANSFORMED_DIR, request_id, file_name
     )
 
-    assert called["fetch_add_data_csvs"] == (collection, expected_pipeline_dir)
+    expected_config_dir = os.path.join(directories.COLLECTION_DIR, request_id)
+    assert called["fetch_add_data_pipeline_csvs"] == (collection, expected_pipeline_dir)
+    assert called["fetch_add_data_collection_csvs"] == (collection, expected_config_dir)
     assert called["fetch_add_data_response"]["dataset"] == dataset
     assert called["fetch_add_data_response"]["organisation"] == organisation
     assert called["fetch_add_data_response"]["pipeline_dir"] == expected_pipeline_dir
@@ -445,11 +461,12 @@ def fake_fetch_add_data_response(
         == directories.SPECIFICATION_DIR
     )
     assert called["fetch_add_data_response"]["cache_dir"] == directories.CACHE_DIR
+    assert called["fetch_add_data_response"]["collection_dir"] == expected_config_dir
     assert called["fetch_add_data_response"]["url"] == url
     assert called["fetch_add_data_response"]["documentation_url"] == documentation_url
 
 
-def test_fetch_add_data_csvs_from_url(monkeypatch, tmp_path):
+def test_fetch_add_data_pipeline_csvs_from_url(monkeypatch, tmp_path):
     collection = "test-collection"
     pipeline_dir = tmp_path / "pipeline"
     pipeline_dir_str = str(pipeline_dir)
@@ -467,14 +484,13 @@ def fake_urlretrieve(url, path):
 
     monkeypatch.setattr("urllib.request.urlretrieve", fake_urlretrieve)
 
-    files = fetch_add_data_csvs(collection, pipeline_dir_str)
+    fetch_add_data_pipeline_csvs(collection, pipeline_dir_str)
 
     assert os.path.exists(pipeline_dir_str)
     assert any("lookup.csv" in path for url, path in downloads)
-    assert files == []
 
 
-def test_fetch_add_data_csvs_handles_http_error(monkeypatch, tmp_path):
+def test_fetch_add_data_pipeline_csvs_handles_http_error(monkeypatch, tmp_path):
     collection = "test-collection"
     pipeline_dir = tmp_path / "pipeline"
     pipeline_dir_str = str(pipeline_dir)
@@ -487,7 +503,6 @@ def raise_http_error(url, path):
 
     monkeypatch.setattr("urllib.request.urlretrieve", raise_http_error)
 
-    files = fetch_add_data_csvs(collection, pipeline_dir_str)
+    fetch_add_data_pipeline_csvs(collection, pipeline_dir_str)
 
     assert os.path.exists(pipeline_dir_str)
-    assert files == []

From 7307c5571adf61f84e0c49a762aba15b20c2dbf3 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Tue, 27 Jan 2026 12:13:01 +0000
Subject: [PATCH 08/12] use start-date, plugin and licence

---
 .../src/application/core/pipeline.py          | 32 +++++++++---
 .../src/application/core/utils.py             |  9 +++-
 .../src/application/core/workflow.py          |  6 +++
 request-processor/src/tasks.py                |  6 +++
 .../data/specification/dataset-field.csv      |  6 +--
 .../src/application/core/test_pipeline.py     | 52 +++++++++++--------
 .../src/application/core/test_workflow.py     |  6 +++
 7 files changed, 84 insertions(+), 33 deletions(-)

diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 6e46ca56..2054cdba 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -201,6 +201,9 @@ def fetch_add_data_response(
     cache_dir,
     url,
     documentation_url,
+    licence=None,
+    start_date=None,
+    plugin=None,
 ):
     try:
         specification = Specification(specification_dir)
@@ -278,9 +281,12 @@ def fetch_add_data_response(
             existing_entities
         )
 
+        # TODO: creation of endpoint and source summary should be in workflow.py
         endpoint_summary = _validate_endpoint(
             url,
             collection_dir,
+            plugin,
+            start_date=start_date,
         )
         source_summary = _validate_source(
             documentation_url,
@@ -289,6 +295,8 @@ def fetch_add_data_response(
             organisation_provider,
             dataset,
             endpoint_summary,
+            start_date=start_date,
+            licence=licence,
         )
 
         entity_summary = {
@@ -413,7 +421,7 @@ def _map_transformed_entities(transformed_csv_path, pipeline_dir):  # noqa: C901
     return mapped_entities
 
 
-def _validate_endpoint(url, config_dir):
+def _validate_endpoint(url, config_dir, plugin, start_date=None):
     endpoint_csv_path = os.path.join(config_dir, "endpoint.csv")
     if not url:
         logger.info("No endpoint URL provided")
@@ -464,15 +472,17 @@ def _validate_endpoint(url, config_dir):
         endpoint_summary["existing_endpoint_entry"] = existing_entry
 
     else:
-        current_date = datetime.now().strftime("%Y-%m-%d")
+        if not start_date:
+            start_date = datetime.now().strftime("%Y-%m-%d")
         entry_date = datetime.now().isoformat()
 
         endpoint_key, new_endpoint_row = append_endpoint(
             endpoint_csv_path=endpoint_csv_path,
             endpoint_url=url,
             entry_date=entry_date,
-            start_date=current_date,
+            start_date=start_date,
             end_date="",
+            plugin=plugin,
         )
 
         if new_endpoint_row:
@@ -483,7 +493,14 @@ def _validate_endpoint(url, config_dir):
 
 
 def _validate_source(
-    documentation_url, config_dir, collection, organisation, dataset, endpoint_summary
+    documentation_url,
+    config_dir,
+    collection,
+    organisation,
+    dataset,
+    endpoint_summary,
+    start_date=None,
+    licence=None,
 ):
     source_csv_path = os.path.join(config_dir, "source.csv")
 
@@ -497,7 +514,8 @@ def _validate_source(
     if not documentation_url:
         logger.warning("No documentation URL provided")
 
-    current_date = datetime.now().strftime("%Y-%m-%d")
+    if not start_date:
+        start_date = datetime.now().strftime("%Y-%m-%d")
     entry_date = datetime.now().isoformat()
 
     source_key_returned, new_source_row = append_source(
@@ -507,10 +525,10 @@ def _validate_source(
         endpoint_key=endpoint_key,
         attribution="",
         documentation_url=documentation_url or "",
-        licence="",
+        licence=licence or "",
         pipelines=dataset,
         entry_date=entry_date,
-        start_date=current_date,
+        start_date=start_date,
         end_date="",
     )
 
diff --git a/request-processor/src/application/core/utils.py b/request-processor/src/application/core/utils.py
index ba4d2042..4a26037c 100644
--- a/request-processor/src/application/core/utils.py
+++ b/request-processor/src/application/core/utils.py
@@ -126,7 +126,12 @@ def hash_md5(value):
 
 
 def append_endpoint(
-    endpoint_csv_path, endpoint_url, entry_date=None, start_date=None, end_date=None
+    endpoint_csv_path,
+    endpoint_url,
+    entry_date=None,
+    start_date=None,
+    end_date=None,
+    plugin=None,
 ):
     endpoint_key = hash_sha256(endpoint_url)
     exists = False
@@ -162,7 +167,7 @@ def append_endpoint(
                 "endpoint": endpoint_key,
                 "endpoint-url": endpoint_url,
                 "parameters": "",
-                "plugin": "",
+                "plugin": plugin or "",
                 "entry-date": _formatted_date(entry_date)
                 or datetime.now().date().isoformat(),
                 "start-date": _formatted_date(start_date),
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index 3f4a1ee2..658cf430 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -416,6 +416,9 @@ def add_data_workflow(
     url,
     documentation_url,
     directories,
+    licence=None,
+    start_date=None,
+    plugin=None,
 ):
     """
     Setup directories and download required CSVs to manage add-data pipeline, then invoke fetch_add_data_response, also clean up.
@@ -454,6 +457,9 @@ def add_data_workflow(
             cache_dir=directories.CACHE_DIR,
             url=url,
             documentation_url=documentation_url,
+            licence=licence,
+            start_date=start_date,
+            plugin=plugin,
         )
         logger.info(f"add data response is for id {request_id} : {response_data}")
 
diff --git a/request-processor/src/tasks.py b/request-processor/src/tasks.py
index d94ab592..46676997 100644
--- a/request-processor/src/tasks.py
+++ b/request-processor/src/tasks.py
@@ -262,6 +262,9 @@ def add_data_task(request: Dict, directories=None):
             directories.COLLECTION_DIR, "resource", request_schema.id
         )
         file_name, log = _fetch_resource(resource_dir, request_data.url)
+        # Auto detect plugin needs to update request_data.plugin for downstream processing
+        if "plugin" in log:
+            request_data.plugin = log["plugin"]
         logger.info(
             f"file name from fetch resource is : {file_name} and the log from fetch resource is {log}"
         )
@@ -275,6 +278,9 @@ def add_data_task(request: Dict, directories=None):
                 request_data.url,
                 request_data.documentation_url,
                 directories,
+                request_data.licence,
+                request_data.start_date,
+                request_data.plugin,
             )
             if "plugin" in log:
                 response["plugin"] = log["plugin"]
diff --git a/request-processor/tests/data/specification/dataset-field.csv b/request-processor/tests/data/specification/dataset-field.csv
index e054d909..1d194236 100644
--- a/request-processor/tests/data/specification/dataset-field.csv
+++ b/request-processor/tests/data/specification/dataset-field.csv
@@ -1,4 +1,4 @@
 dataset,field,field-dataset,guidance,hint
-tree,entry-date,,,
-tree,geometry,,,
-tree,reference,,,
+brownfield-land,entry-date,,,
+brownfield-land,geometry,,,
+brownfield-land,reference,,,
diff --git a/request-processor/tests/unit/src/application/core/test_pipeline.py b/request-processor/tests/unit/src/application/core/test_pipeline.py
index e0dc078c..1e6f2e09 100644
--- a/request-processor/tests/unit/src/application/core/test_pipeline.py
+++ b/request-processor/tests/unit/src/application/core/test_pipeline.py
@@ -65,7 +65,9 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
     monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
+        lambda url, dir, plugin, start_date=None: {
+            "endpoint_url_in_endpoint_csv": True
+        },
     )
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_source",
@@ -114,7 +116,9 @@ def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
     monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
+        lambda url, dir, plugin, start_date=None: {
+            "endpoint_url_in_endpoint_csv": True
+        },
     )
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_source",
@@ -210,7 +214,9 @@ def raise_exception(*args, **kwargs):
 
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir: {"endpoint_url_in_endpoint_csv": True},
+        lambda url, dir, plugin, start_date=None: {
+            "endpoint_url_in_endpoint_csv": True
+        },
     )
     monkeypatch.setattr(
         "src.application.core.pipeline._validate_source",
@@ -342,13 +348,13 @@ def test_validate_endpoint_creates_file(monkeypatch, tmp_path):
     endpoint_csv_path = pipeline_dir / "endpoint.csv"
 
     def fake_append_endpoint(
-        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date
+        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
     ):
         return "endpoint_hash", {
             "endpoint": "endpoint_hash",
             "endpoint-url": endpoint_url,
             "parameters": "",
-            "plugin": "",
+            "plugin": plugin or "",
             "entry-date": entry_date,
             "start-date": start_date,
             "end-date": end_date,
@@ -360,7 +366,7 @@ def fake_append_endpoint(
 
     assert not endpoint_csv_path.exists()
 
-    _validate_endpoint(url, str(pipeline_dir))
+    _validate_endpoint(url, str(pipeline_dir), plugin=None)
 
     assert endpoint_csv_path.exists()
 
@@ -412,13 +418,13 @@ def test_validate_endpoint_appends(monkeypatch, tmp_path):
         )
 
     def fake_append_endpoint(
-        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date
+        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
     ):
         return "new_endpoint_hash", {
             "endpoint": "new_endpoint_hash",
             "endpoint-url": endpoint_url,
             "parameters": "",
-            "plugin": "",
+            "plugin": plugin or "",
             "entry-date": entry_date,
             "start-date": start_date,
             "end-date": end_date,
@@ -428,7 +434,7 @@ def fake_append_endpoint(
         "src.application.core.pipeline.append_endpoint", fake_append_endpoint
     )
 
-    result = _validate_endpoint(url, str(pipeline_dir))
+    result = _validate_endpoint(url, str(pipeline_dir), plugin=None)
 
     assert result["endpoint_url_in_endpoint_csv"] is False
     assert "new_endpoint_entry" in result
@@ -472,7 +478,7 @@ def test_validate_endpoint_finds_existing(monkeypatch, tmp_path):
         lambda *a, **kw: (_ for _ in ()).throw(Exception("Should not be called")),
     )
 
-    result = _validate_endpoint(url, str(pipeline_dir))
+    result = _validate_endpoint(url, str(pipeline_dir), plugin=None)
     assert result["endpoint_url_in_endpoint_csv"] is True
     assert "existing_endpoint_entry" in result
     assert result["existing_endpoint_entry"]["endpoint-url"] == url
@@ -483,7 +489,7 @@ def test_validate_endpoint_empty_url(monkeypatch, tmp_path):
     pipeline_dir = tmp_path / "pipeline"
     pipeline_dir.mkdir()
 
-    result = _validate_endpoint("", str(pipeline_dir))
+    result = _validate_endpoint("", str(pipeline_dir), plugin=None)
 
     assert result == {}
 
@@ -498,26 +504,18 @@ def test_validate_endpoint_csv_read_error(monkeypatch, tmp_path):
     endpoint_csv_path.write_bytes(b"\x00\x00\x00")
 
     def fake_append_endpoint(
-        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date
+        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
     ):
         return "endpoint_hash", {
             "endpoint": "endpoint_hash",
             "endpoint-url": endpoint_url,
             "parameters": "",
-            "plugin": "",
+            "plugin": plugin or "",
             "entry-date": entry_date,
             "start-date": start_date,
             "end-date": end_date,
         }
 
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_endpoint", fake_append_endpoint
-    )
-
-    result = _validate_endpoint(url, str(pipeline_dir))
-
-    assert "new_endpoint_entry" in result or "endpoint_url_in_endpoint_csv" in result
-
 
 def test_validate_source_creates_new_source(monkeypatch, tmp_path):
     """Test _validate_source creates new source entry when it doesn't exist"""
@@ -573,6 +571,8 @@ def fake_append_source(
         organisation,
         dataset,
         endpoint_summary,
+        start_date=None,
+        licence=None,
     )
 
     assert result["documentation_url_in_source_csv"] is False
@@ -647,6 +647,8 @@ def fake_append_source(*args, **kwargs):
         organisation,
         dataset,
         endpoint_summary,
+        start_date=None,
+        licence=None,
     )
 
     assert result["documentation_url_in_source_csv"] is True
@@ -674,6 +676,8 @@ def test_validate_source_no_endpoint_key(tmp_path):
         organisation,
         dataset,
         endpoint_summary,
+        start_date=None,
+        licence=None,
     )
 
     assert result == {}
@@ -732,6 +736,8 @@ def fake_append_source(
         organisation,
         dataset,
         endpoint_summary,
+        start_date=None,
+        licence=None,
     )
 
     assert "documentation_url_in_source_csv" in result
@@ -795,6 +801,8 @@ def fake_append_source(
         organisation,
         dataset,
         endpoint_summary,
+        start_date=None,
+        licence=None,
     )
 
     assert captured_endpoint_key == "new_endpoint_hash_789"
@@ -833,6 +841,8 @@ def fake_append_source(*args, **kwargs):
         organisation,
         dataset,
         endpoint_summary,
+        start_date=None,
+        licence=None,
     )
 
     assert "documentation_url_in_source_csv" in result
diff --git a/request-processor/tests/unit/src/application/core/test_workflow.py b/request-processor/tests/unit/src/application/core/test_workflow.py
index 4089b8d3..f943f27e 100644
--- a/request-processor/tests/unit/src/application/core/test_workflow.py
+++ b/request-processor/tests/unit/src/application/core/test_workflow.py
@@ -398,6 +398,9 @@ def fake_fetch_add_data_response(
         cache_dir,
         url,
         documentation_url,
+        licence=None,
+        start_date=None,
+        plugin=None,
     ):
         called["fetch_add_data_response"] = {
             "collection": collection,
@@ -411,6 +414,9 @@ def fake_fetch_add_data_response(
             "cache_dir": cache_dir,
             "url": url,
             "documentation_url": documentation_url,
+            "licence": licence,
+            "start_date": start_date,
+            "plugin": plugin,
         }
         return {"result": "ok"}
 

From bd283b9e5cfc215f81ba50137c19a190f1844cfc Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Mon, 2 Feb 2026 16:37:07 +0000
Subject: [PATCH 09/12] tidying up pipline and workflow, rerun pipeline on new
 entity creation

---
 .../src/application/core/pipeline.py          | 199 ++----------------
 .../src/application/core/utils.py             | 144 +++++++++++++
 .../src/application/core/workflow.py          |  77 +++++--
 request-processor/src/tasks.py                |   8 +-
 4 files changed, 227 insertions(+), 201 deletions(-)

diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 2054cdba..fb4956e5 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -190,20 +190,14 @@ def assign_entries(
 
 
 def fetch_add_data_response(
-    collection,
     dataset,
     organisation_provider,
     pipeline_dir,
-    collection_dir,
     input_dir,
     output_path,
     specification_dir,
     cache_dir,
-    url,
-    documentation_url,
-    licence=None,
-    start_date=None,
-    plugin=None,
+    url
 ):
     try:
         specification = Specification(specification_dir)
@@ -212,7 +206,6 @@ def fetch_add_data_response(
             os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path)
         )
         api = API(specification=specification)
-
         valid_category_values = api.get_valid_category_values(dataset, pipeline)
 
         files_in_resource = os.listdir(input_dir)
@@ -267,11 +260,24 @@ def fetch_add_data_response(
                         f"Found {len(new_lookups)} unidentified lookups in {resource_file}"
                     )
                     new_entities.extend(new_lookups)
+
+                    # Reload pipeline to pick up newly saved lookups
+                    pipeline = Pipeline(pipeline_dir, dataset, specification=specification)
+
+                    # Now re-run transform to check and return issue log
+                    issues_log = pipeline.transform(
+                        input_path=resource_file_path,
+                        output_path=output_path,
+                        organisation=organisation,
+                        organisations=[organisation_provider],
+                        resource=resource_from_path(resource_file_path),
+                        valid_category_values=valid_category_values,
+                        disable_lookups=False,
+                        endpoints=[url],
+                    )
                 else:
                     logger.info(f"No unidentified lookups found in {resource_file}")
 
-                # TODO: Re-run to see if no unidentified remain, if so new add data error summary
-
             except Exception as err:
                 logger.error(f"Error processing {resource_file}: {err}")
                 logger.exception("Full traceback: ")
@@ -281,38 +287,15 @@ def fetch_add_data_response(
             existing_entities
         )
 
-        # TODO: creation of endpoint and source summary should be in workflow.py
-        endpoint_summary = _validate_endpoint(
-            url,
-            collection_dir,
-            plugin,
-            start_date=start_date,
-        )
-        source_summary = _validate_source(
-            documentation_url,
-            collection_dir,
-            collection,
-            organisation_provider,
-            dataset,
-            endpoint_summary,
-            start_date=start_date,
-            licence=licence,
-        )
-
-        entity_summary = {
+        pipeline_summary = {
             "new-in-resource": len(new_entities),
             "existing-in-resource": len(existing_entities),
             "new-entities": new_entities_breakdown,
             "existing-entities": existing_entities_breakdown,
+            "pipeline-issues": [dict(issue) for issue in issues_log.rows],
         }
 
-        response_data = {
-            "entity-summary": entity_summary,
-            "endpoint-summary": endpoint_summary,
-            "source-summary": source_summary,
-        }
-
-        return response_data
+        return pipeline_summary
 
     except FileNotFoundError as e:
         logger.exception(f"File not found: {e}")
@@ -418,146 +401,4 @@ def _map_transformed_entities(transformed_csv_path, pipeline_dir):  # noqa: C901
                 }
             )
 
-    return mapped_entities
-
-
-def _validate_endpoint(url, config_dir, plugin, start_date=None):
-    endpoint_csv_path = os.path.join(config_dir, "endpoint.csv")
-    if not url:
-        logger.info("No endpoint URL provided")
-        return {}
-
-    if not os.path.exists(endpoint_csv_path):
-        os.makedirs(os.path.dirname(endpoint_csv_path), exist_ok=True)
-        with open(endpoint_csv_path, "w", newline="", encoding="utf-8") as f:
-            writer = csv.writer(f)
-            writer.writerow(
-                [
-                    "endpoint",
-                    "endpoint-url",
-                    "parameters",
-                    "plugin",
-                    "entry-date",
-                    "start-date",
-                    "end-date",
-                ]
-            )
-
-    endpoint_exists = False
-    existing_entry = None
-
-    try:
-        with open(endpoint_csv_path, "r", encoding="utf-8") as f:
-            reader = csv.DictReader(f)
-            for row in reader:
-                if row.get("endpoint-url", "").strip() == url.strip():
-                    endpoint_exists = True
-                    existing_entry = {
-                        "endpoint": row.get("endpoint", ""),
-                        "endpoint-url": row.get("endpoint-url", ""),
-                        "parameters": row.get("parameters", ""),
-                        "plugin": row.get("plugin", ""),
-                        "entry-date": row.get("entry-date", ""),
-                        "start-date": row.get("start-date", ""),
-                        "end-date": row.get("end-date", ""),
-                    }
-                    logger.info("Endpoint URL found in endpoint.csv")
-                    break
-    except Exception as e:
-        logger.error(f"Error reading endpoint.csv: {e}")
-
-    endpoint_summary = {"endpoint_url_in_endpoint_csv": endpoint_exists}
-
-    if endpoint_exists and existing_entry:
-        endpoint_summary["existing_endpoint_entry"] = existing_entry
-
-    else:
-        if not start_date:
-            start_date = datetime.now().strftime("%Y-%m-%d")
-        entry_date = datetime.now().isoformat()
-
-        endpoint_key, new_endpoint_row = append_endpoint(
-            endpoint_csv_path=endpoint_csv_path,
-            endpoint_url=url,
-            entry_date=entry_date,
-            start_date=start_date,
-            end_date="",
-            plugin=plugin,
-        )
-
-        if new_endpoint_row:
-            logger.info(f"Appended new endpoint with hash: {endpoint_key}")
-            endpoint_summary["new_endpoint_entry"] = new_endpoint_row
-
-    return endpoint_summary
-
-
-def _validate_source(
-    documentation_url,
-    config_dir,
-    collection,
-    organisation,
-    dataset,
-    endpoint_summary,
-    start_date=None,
-    licence=None,
-):
-    source_csv_path = os.path.join(config_dir, "source.csv")
-
-    endpoint_key = endpoint_summary.get("existing_endpoint_entry", {}).get(
-        "endpoint"
-    ) or endpoint_summary.get("new_endpoint_entry", {}).get("endpoint")
-    if not endpoint_key:
-        logger.warning("No endpoint_key available from endpoint_summary")
-        return {}
-
-    if not documentation_url:
-        logger.warning("No documentation URL provided")
-
-    if not start_date:
-        start_date = datetime.now().strftime("%Y-%m-%d")
-    entry_date = datetime.now().isoformat()
-
-    source_key_returned, new_source_row = append_source(
-        source_csv_path=source_csv_path,
-        collection=collection,
-        organisation=organisation,
-        endpoint_key=endpoint_key,
-        attribution="",
-        documentation_url=documentation_url or "",
-        licence=licence or "",
-        pipelines=dataset,
-        entry_date=entry_date,
-        start_date=start_date,
-        end_date="",
-    )
-
-    if new_source_row:
-        return {
-            "documentation_url_in_source_csv": False,
-            "new_source_entry": new_source_row,
-        }
-
-    source_summary = {"documentation_url_in_source_csv": True}
-    try:
-        with open(source_csv_path, "r", encoding="utf-8") as f:
-            for row in csv.DictReader(f):
-                if row.get("source", "").strip() == source_key_returned:
-                    source_summary["existing_source_entry"] = {
-                        "source": row.get("source", ""),
-                        "attribution": row.get("attribution", ""),
-                        "collection": row.get("collection", ""),
-                        "documentation-url": row.get("documentation-url", ""),
-                        "endpoint": row.get("endpoint", ""),
-                        "licence": row.get("licence", ""),
-                        "organisation": row.get("organisation", ""),
-                        "pipelines": row.get("pipelines", ""),
-                        "entry-date": row.get("entry-date", ""),
-                        "start-date": row.get("start-date", ""),
-                        "end-date": row.get("end-date", ""),
-                    }
-                    break
-    except Exception as e:
-        logger.error(f"Error reading existing source: {e}")
-
-    return source_summary
+    return mapped_entities
\ No newline at end of file
diff --git a/request-processor/src/application/core/utils.py b/request-processor/src/application/core/utils.py
index 4a26037c..a54ae59a 100644
--- a/request-processor/src/application/core/utils.py
+++ b/request-processor/src/application/core/utils.py
@@ -286,3 +286,147 @@ def _formatted_date(date_value):
         except Exception:
             pass
     return str(date_value)
+
+
+def validate_endpoint(url, config_dir, plugin, start_date=None):
+    """Validate if endpoint URL exists in endpoint.csv and create entry if not."""
+    endpoint_csv_path = os.path.join(config_dir, "endpoint.csv")
+    if not url:
+        logger.info("No endpoint URL provided")
+        return {}
+
+    if not os.path.exists(endpoint_csv_path):
+        os.makedirs(os.path.dirname(endpoint_csv_path), exist_ok=True)
+        with open(endpoint_csv_path, "w", newline="", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(
+                [
+                    "endpoint",
+                    "endpoint-url",
+                    "parameters",
+                    "plugin",
+                    "entry-date",
+                    "start-date",
+                    "end-date",
+                ]
+            )
+
+    endpoint_exists = False
+    existing_entry = None
+
+    try:
+        with open(endpoint_csv_path, "r", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                if row.get("endpoint-url", "").strip() == url.strip():
+                    endpoint_exists = True
+                    existing_entry = {
+                        "endpoint": row.get("endpoint", ""),
+                        "endpoint-url": row.get("endpoint-url", ""),
+                        "parameters": row.get("parameters", ""),
+                        "plugin": row.get("plugin", ""),
+                        "entry-date": row.get("entry-date", ""),
+                        "start-date": row.get("start-date", ""),
+                        "end-date": row.get("end-date", ""),
+                    }
+                    logger.info("Endpoint URL found in endpoint.csv")
+                    break
+    except Exception as e:
+        logger.error(f"Error reading endpoint.csv: {e}")
+
+    endpoint_summary = {"endpoint_url_in_endpoint_csv": endpoint_exists}
+
+    if endpoint_exists and existing_entry:
+        endpoint_summary["existing_endpoint_entry"] = existing_entry
+
+    else:
+        if not start_date:
+            start_date = datetime.now().strftime("%Y-%m-%d")
+        entry_date = datetime.now().isoformat()
+
+        endpoint_key, new_endpoint_row = append_endpoint(
+            endpoint_csv_path=endpoint_csv_path,
+            endpoint_url=url,
+            entry_date=entry_date,
+            start_date=start_date,
+            end_date="",
+            plugin=plugin,
+        )
+
+        if new_endpoint_row:
+            logger.info(f"Appended new endpoint with hash: {endpoint_key}")
+            endpoint_summary["new_endpoint_entry"] = new_endpoint_row
+
+    return endpoint_summary
+
+
+def validate_source(
+    documentation_url,
+    config_dir,
+    collection,
+    organisation,
+    dataset,
+    endpoint_summary,
+    start_date=None,
+    licence=None,
+):
+    """Validate if source exists in source.csv and create entry if not."""
+    source_csv_path = os.path.join(config_dir, "source.csv")
+
+    endpoint_key = endpoint_summary.get("existing_endpoint_entry", {}).get(
+        "endpoint"
+    ) or endpoint_summary.get("new_endpoint_entry", {}).get("endpoint")
+    if not endpoint_key:
+        logger.warning("No endpoint_key available from endpoint_summary")
+        return {}
+
+    if not documentation_url:
+        logger.warning("No documentation URL provided")
+
+    if not start_date:
+        start_date = datetime.now().strftime("%Y-%m-%d")
+    entry_date = datetime.now().isoformat()
+
+    source_key_returned, new_source_row = append_source(
+        source_csv_path=source_csv_path,
+        collection=collection,
+        organisation=organisation,
+        endpoint_key=endpoint_key,
+        attribution="",
+        documentation_url=documentation_url or "",
+        licence=licence or "",
+        pipelines=dataset,
+        entry_date=entry_date,
+        start_date=start_date,
+        end_date="",
+    )
+
+    if new_source_row:
+        return {
+            "documentation_url_in_source_csv": False,
+            "new_source_entry": new_source_row,
+        }
+
+    source_summary = {"documentation_url_in_source_csv": True}
+    try:
+        with open(source_csv_path, "r", encoding="utf-8") as f:
+            for row in csv.DictReader(f):
+                if row.get("source", "").strip() == source_key_returned:
+                    source_summary["existing_source_entry"] = {
+                        "source": row.get("source", ""),
+                        "attribution": row.get("attribution", ""),
+                        "collection": row.get("collection", ""),
+                        "documentation-url": row.get("documentation-url", ""),
+                        "endpoint": row.get("endpoint", ""),
+                        "licence": row.get("licence", ""),
+                        "organisation": row.get("organisation", ""),
+                        "pipelines": row.get("pipelines", ""),
+                        "entry-date": row.get("entry-date", ""),
+                        "start-date": row.get("start-date", ""),
+                        "end-date": row.get("end-date", ""),
+                    }
+                    break
+    except Exception as e:
+        logger.error(f"Error reading existing source: {e}")
+
+    return source_summary
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index 658cf430..c17659a2 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -5,7 +5,12 @@
 import urllib
 import yaml
 from urllib.error import HTTPError
-from application.core.utils import detect_encoding, extract_dataset_field_rows
+from application.core.utils import (
+    detect_encoding,
+    extract_dataset_field_rows,
+    validate_endpoint,
+    validate_source,
+)
 from application.logging.logger import get_logger
 from application.core.pipeline import (
     fetch_response_data,
@@ -421,7 +426,10 @@ def add_data_workflow(
     plugin=None,
 ):
     """
-    Setup directories and download required CSVs to manage add-data pipeline, then invoke fetch_add_data_response, also clean up.
+    Setup directories and download required CSVs to manage add-data pipeline
+    Invoke fetch_add_data_response
+    Create source csv and endpoint csv summaries
+    Clean up directories
 
     Args:
         file_name (str): Collection resource file name
@@ -433,6 +441,8 @@ def add_data_workflow(
         documentation_url (str): Documentation URL for the dataset
         directories (Directories): Directories object with required paths
     """
+    response_data = {}
+
     try:
         pipeline_dir = os.path.join(directories.PIPELINE_DIR, collection, request_id)
         input_dir = os.path.join(directories.COLLECTION_DIR, "resource", request_id)
@@ -442,33 +452,54 @@ def add_data_workflow(
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
 
         # Loads csvs for Pipeline and Config
-        fetch_add_data_pipeline_csvs(collection, pipeline_dir)
-        fetch_add_data_collection_csvs(collection, collection_dir)
-
-        response_data = fetch_add_data_response(
-            collection=collection,
+        if not fetch_add_data_pipeline_csvs(collection, pipeline_dir):
+            response_data['message'] = f"Unable to find lookups for collection '{collection}', dataset '{dataset}'"
+            return response_data
+        if not fetch_add_data_collection_csvs(collection, collection_dir):
+            response_data['message'] = f"Unable to find lookups for collection '{collection}', dataset '{dataset}'"
+            return response_data
+
+        # All processes arount transforming the data and generating pipeline summary
+        pipeline_summary = fetch_add_data_response(
             dataset=dataset,
             organisation_provider=organisation_provider,
             pipeline_dir=pipeline_dir,
-            collection_dir=collection_dir,
             input_dir=input_dir,
             output_path=output_path,
             specification_dir=directories.SPECIFICATION_DIR,
             cache_dir=directories.CACHE_DIR,
-            url=url,
-            documentation_url=documentation_url,
-            licence=licence,
+            url=url
+        )
+
+        # Create endpoint and source summaries in workflow
+        endpoint_summary = validate_endpoint(
+            url,
+            collection_dir,
+            plugin,
             start_date=start_date,
-            plugin=plugin,
         )
-        logger.info(f"add data response is for id {request_id} : {response_data}")
+        source_summary = validate_source(
+            documentation_url,
+            collection_dir,
+            collection,
+            organisation_provider,
+            dataset,
+            endpoint_summary,
+            start_date=start_date,
+            licence=licence,
+        )
 
-        # TODO: Add additioanl data to response_data such as column mapping?
-        # TODO: Error summary? like in run_workflow map issue data to messages, if it exists create issue summary to bloc adding.
+        response_data = {
+            "pipeline-summary": pipeline_summary,
+            "endpoint-summary": endpoint_summary,
+            "source-summary": source_summary,
+        }
+
+        logger.info(f"add data response is for id {request_id} : {response_data}")
 
     except Exception as e:
-        logger.exception(f"An error occurred in add_data_workflow")
-        response_data = None
+        logger.warning(f"An error occurred in add_data_workflow: {e} for request id {request_id}")
+        response_data['message'] = f"An error occurred in add_data_workflow: {e}"
 
     finally:
         clean_up(
@@ -486,7 +517,7 @@ def add_data_workflow(
 
 
 def fetch_add_data_pipeline_csvs(collection, pipeline_dir):
-    """Download pipeline CSVs into pipeline_dir"""
+    """Download pipeline CSVs into pipeline_dir. Returns False if any errors occur."""
     os.makedirs(pipeline_dir, exist_ok=True)
     pipeline_csvs = [
         "column.csv",
@@ -511,11 +542,13 @@ def fetch_add_data_pipeline_csvs(collection, pipeline_dir):
             urllib.request.urlretrieve(url, csv_path)
             logger.info(f"Downloaded {csv_name} from {url} to {csv_path}")
         except HTTPError as e:
-            logger.warning(f"Failed to retrieve {csv_name}: {e}.")
+            logger.warning(f"Failed to retrieve {csv_name}: {e}")
+            return False
+    return True
 
 
 def fetch_add_data_collection_csvs(collection, config_dir):
-    """Download config CSVs (endpoint.csv, source.csv) into config_dir"""
+    """Download config CSVs (endpoint.csv, source.csv) into config_dir. Returns False if any errors occur."""
     os.makedirs(config_dir, exist_ok=True)
     config_csvs = ["endpoint.csv", "source.csv"]
     for csv_name in config_csvs:
@@ -525,4 +558,6 @@ def fetch_add_data_collection_csvs(collection, config_dir):
             urllib.request.urlretrieve(url, csv_path)
             logger.info(f"Downloaded {csv_name} from {url} to {csv_path}")
         except HTTPError as e:
-            logger.warning(f"Failed to retrieve {csv_name}: {e}.")
+            logger.warning(f"Failed to retrieve {csv_name}: {e}")
+            return False
+    return True
diff --git a/request-processor/src/tasks.py b/request-processor/src/tasks.py
index 46676997..054af9e6 100644
--- a/request-processor/src/tasks.py
+++ b/request-processor/src/tasks.py
@@ -391,6 +391,12 @@ def _get_response(request_id):
 
 
 def save_response_to_db(request_id, response_data):
+    """Currently handles three types of response_data:
+    1. Full check data workflow response with 'converted-csv', 'issue-log', etc.
+    2. Full add data workflow pipeline summary response with 'pipeline-summary'.
+    3. Error log with 'message'.
+    Saves appropriately to Response and ResponseDetails tables.
+    """
     logger.info(f"save_response_to_db started for request_id: {request_id}")
     db_session = database.session_maker()
     with db_session() as session:
@@ -454,7 +460,7 @@ def save_response_to_db(request_id, response_data):
                     # Commit the changes to the database
                     session.commit()
 
-                elif "entity-summary" in response_data:
+                elif "pipeline-summary" in response_data:
                     new_response = models.Response(
                         request_id=request_id, data=response_data
                     )

From 8a3a62444079a573076317291349c4bc4b796d34 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Mon, 2 Feb 2026 16:37:46 +0000
Subject: [PATCH 10/12] Updatae debug scripts

---
 .vscode/launch.json                                     | 2 +-
 scripts/{debug_trigger.py => debug_trigger_checkurl.py} | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename scripts/{debug_trigger.py => debug_trigger_checkurl.py} (94%)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 120f4424..c1fde5d9 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -7,7 +7,7 @@
             "type": "python",
             "request": "launch",
             "python": "${workspaceFolder}/request-processor/.venv/bin/python",
-            "program": "${workspaceFolder}/scripts/debug_trigger.py",
+            "program": "${workspaceFolder}/scripts/debug_trigger_checkurl.py",
             "console": "integratedTerminal",
             "justMyCode": false,
             "env": {
diff --git a/scripts/debug_trigger.py b/scripts/debug_trigger_checkurl.py
similarity index 94%
rename from scripts/debug_trigger.py
rename to scripts/debug_trigger_checkurl.py
index 64f5edbd..07af1b0a 100644
--- a/scripts/debug_trigger.py
+++ b/scripts/debug_trigger_checkurl.py
@@ -42,7 +42,7 @@ def ensure_request_exists(request_payload: dict) -> None:
     database_url = os.environ.get("DATABASE_URL")
     if not database_url:
         raise RuntimeError(
-            "DATABASE_URL is not set. Start the stack (or export DATABASE_URL) before running debug_trigger."
+            "DATABASE_URL is not set. Start the stack (or export DATABASE_URL) before running debug_trigger_checkurl."
         )
 
     request_id = request_payload["id"]
@@ -96,11 +96,11 @@ def ensure_request_exists(request_payload: dict) -> None:
     "params": {
         "type": "check_url",
         "organisationName": "London Borough of Hackney",
-        "dataset": "article-4-direction",
+        "dataset": "article-4-direction-area",
         "collection": "article-4-direction",
         "column_mapping": None,
         "geom_type": None,
-        "url": "https://map2.hackney.gov.uk/geoserver/ows?service=WFS&version=2.0&request=GetFeature&outputFormat=application/json&SrsName=EPSG:4326&typeName=planning:lldc_hackney_conservation_area"
+        "url": "https://raw.githubusercontent.com/digital-land/PublishExamples/refs/heads/main/Article4Direction/Files/Article4DirectionArea/articel4directionareas-(wrongColName-NewRefs)"
     }
 }
 

From 62ec1c9bb4ec85e8ee30cdc597cbec43b5ae0245 Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Tue, 3 Feb 2026 11:40:51 +0000
Subject: [PATCH 11/12] lint

---
 .../src/application/core/pipeline.py             | 12 ++++++------
 .../src/application/core/workflow.py             | 16 +++++++++++-----
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index fb4956e5..5e142d2a 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -7,8 +7,6 @@
 
 from digital_land.pipeline import Pipeline, Lookups
 from digital_land.commands import get_resource_unidentified_lookups
-from application.core.utils import append_endpoint, append_source
-from datetime import datetime
 from pathlib import Path
 
 logger = get_logger(__name__)
@@ -42,7 +40,7 @@ def fetch_response_data(
     try:
         for file_name in files_in_resource:
             file_path = os.path.join(input_path, file_name)
-            # retrieve unnassigned entities and assign
+            # retrieve unnassigned entities and assign, TODO: Is this necessary here?
             assign_entries(
                 resource_path=file_path,
                 dataset=dataset,
@@ -197,7 +195,7 @@ def fetch_add_data_response(
     output_path,
     specification_dir,
     cache_dir,
-    url
+    url,
 ):
     try:
         specification = Specification(specification_dir)
@@ -262,7 +260,9 @@ def fetch_add_data_response(
                     new_entities.extend(new_lookups)
 
                     # Reload pipeline to pick up newly saved lookups
-                    pipeline = Pipeline(pipeline_dir, dataset, specification=specification)
+                    pipeline = Pipeline(
+                        pipeline_dir, dataset, specification=specification
+                    )
 
                     # Now re-run transform to check and return issue log
                     issues_log = pipeline.transform(
@@ -401,4 +401,4 @@ def _map_transformed_entities(transformed_csv_path, pipeline_dir):  # noqa: C901
                 }
             )
 
-    return mapped_entities
\ No newline at end of file
+    return mapped_entities
diff --git a/request-processor/src/application/core/workflow.py b/request-processor/src/application/core/workflow.py
index c17659a2..ac8aa074 100644
--- a/request-processor/src/application/core/workflow.py
+++ b/request-processor/src/application/core/workflow.py
@@ -453,10 +453,14 @@ def add_data_workflow(
 
         # Loads csvs for Pipeline and Config
         if not fetch_add_data_pipeline_csvs(collection, pipeline_dir):
-            response_data['message'] = f"Unable to find lookups for collection '{collection}', dataset '{dataset}'"
+            response_data[
+                "message"
+            ] = f"Unable to find lookups for collection '{collection}', dataset '{dataset}'"
             return response_data
         if not fetch_add_data_collection_csvs(collection, collection_dir):
-            response_data['message'] = f"Unable to find lookups for collection '{collection}', dataset '{dataset}'"
+            response_data[
+                "message"
+            ] = f"Unable to find lookups for collection '{collection}', dataset '{dataset}'"
             return response_data
 
         # All processes arount transforming the data and generating pipeline summary
@@ -468,7 +472,7 @@ def add_data_workflow(
             output_path=output_path,
             specification_dir=directories.SPECIFICATION_DIR,
             cache_dir=directories.CACHE_DIR,
-            url=url
+            url=url,
         )
 
         # Create endpoint and source summaries in workflow
@@ -498,8 +502,10 @@ def add_data_workflow(
         logger.info(f"add data response is for id {request_id} : {response_data}")
 
     except Exception as e:
-        logger.warning(f"An error occurred in add_data_workflow: {e} for request id {request_id}")
-        response_data['message'] = f"An error occurred in add_data_workflow: {e}"
+        logger.warning(
+            f"An error occurred in add_data_workflow: {e} for request id {request_id}"
+        )
+        response_data["message"] = f"An error occurred in add_data_workflow: {e}"
 
     finally:
         clean_up(

From 195aad1d3b464f2cfa4cfee1fe416429383d5c4d Mon Sep 17 00:00:00 2001
From: Matt Poole <matt.a.poole@outlook.com>
Date: Tue, 3 Feb 2026 12:12:18 +0000
Subject: [PATCH 12/12] move tests around to show new workflow

---
 .../src/application/core/pipeline.py          |   5 +-
 .../src/application/core/test_pipeline.py     | 576 +-----------------
 .../unit/src/application/core/test_utils.py   | 505 ++++++++++++++-
 .../src/application/core/test_workflow.py     |  37 +-
 4 files changed, 533 insertions(+), 590 deletions(-)

diff --git a/request-processor/src/application/core/pipeline.py b/request-processor/src/application/core/pipeline.py
index 5e142d2a..206b36e8 100644
--- a/request-processor/src/application/core/pipeline.py
+++ b/request-processor/src/application/core/pipeline.py
@@ -210,6 +210,7 @@ def fetch_add_data_response(
 
         existing_entities = []
         new_entities = []
+        issues_log = None
 
         for idx, resource_file in enumerate(files_in_resource):
             resource_file_path = os.path.join(input_dir, resource_file)
@@ -292,7 +293,9 @@ def fetch_add_data_response(
             "existing-in-resource": len(existing_entities),
             "new-entities": new_entities_breakdown,
             "existing-entities": existing_entities_breakdown,
-            "pipeline-issues": [dict(issue) for issue in issues_log.rows],
+            "pipeline-issues": [dict(issue) for issue in issues_log.rows]
+            if issues_log
+            else [],
         }
 
         return pipeline_summary
diff --git a/request-processor/tests/unit/src/application/core/test_pipeline.py b/request-processor/tests/unit/src/application/core/test_pipeline.py
index 1e6f2e09..fea694eb 100644
--- a/request-processor/tests/unit/src/application/core/test_pipeline.py
+++ b/request-processor/tests/unit/src/application/core/test_pipeline.py
@@ -6,8 +6,6 @@
     fetch_add_data_response,
     _get_entities_breakdown,
     _get_existing_entities_breakdown,
-    _validate_endpoint,
-    _validate_source,
 )
 
 
@@ -15,13 +13,11 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
     """Test successful execution of fetch_add_data_response"""
     dataset = "test-dataset"
     organisation = "test-org"
-    collection = "test-collection"
     pipeline_dir = tmp_path / "pipeline"
     input_path = tmp_path / "resource"
     specification_dir = tmp_path / "specification"
     cache_dir = tmp_path / "cache"
     url = "http://example.com/endpoint"
-    documentation_url = "http://example.com/doc"
 
     input_path.mkdir(parents=True)
     pipeline_dir.mkdir(parents=True)
@@ -63,47 +59,31 @@ def test_fetch_add_data_response_success(monkeypatch, tmp_path):
     )
     monkeypatch.setattr("src.application.core.pipeline.Pipeline", MagicMock())
     monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir, plugin, start_date=None: {
-            "endpoint_url_in_endpoint_csv": True
-        },
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_source",
-        lambda *a, **k: {"documentation_url_in_source_csv": True},
-    )
 
     result = fetch_add_data_response(
-        collection=collection,
         dataset=dataset,
         organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
-        collection_dir=str(tmp_path / "collection"),
         input_dir=str(input_path),
         output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
         cache_dir=str(cache_dir),
         url=url,
-        documentation_url=documentation_url,
     )
 
-    assert "entity-summary" in result
-    assert "new-in-resource" in result["entity-summary"]
-    assert "existing-in-resource" in result["entity-summary"]
+    assert "new-in-resource" in result
+    assert "existing-in-resource" in result
 
 
 def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
     """Test when input directory has no files"""
     dataset = "test-dataset"
     organisation = "test-org"
-    collection = "test-collection"
     pipeline_dir = tmp_path / "pipeline"
     input_path = tmp_path / "resource"
     specification_dir = tmp_path / "specification"
     cache_dir = tmp_path / "cache"
     url = "http://example.com/endpoint"
-    documentation_url = "http://example.com/doc"
 
     input_path.mkdir(parents=True)
     pipeline_dir.mkdir(parents=True)
@@ -114,47 +94,31 @@ def test_fetch_add_data_response_no_files(monkeypatch, tmp_path):
     )
     monkeypatch.setattr("src.application.core.pipeline.Pipeline", MagicMock())
     monkeypatch.setattr("src.application.core.pipeline.Organisation", MagicMock())
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir, plugin, start_date=None: {
-            "endpoint_url_in_endpoint_csv": True
-        },
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_source",
-        lambda *a, **k: {"documentation_url_in_source_csv": True},
-    )
 
     result = fetch_add_data_response(
-        collection=collection,
         dataset=dataset,
         organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
-        collection_dir=str(tmp_path / "collection"),
         input_dir=str(input_path),
         output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
         cache_dir=str(cache_dir),
         url=url,
-        documentation_url=documentation_url,
     )
 
-    assert "entity-summary" in result
-    assert result["entity-summary"]["new-in-resource"] == 0
+    assert "new-in-resource" in result
+    assert result["new-in-resource"] == 0
 
 
 def test_fetch_add_data_response_file_not_found(monkeypatch, tmp_path):
     """Test when input path does not exist"""
     dataset = "test-dataset"
     organisation = "test-org"
-    collection = "test-collection"
-    request_id = "req-003"
     pipeline_dir = tmp_path / "pipeline"
     input_path = tmp_path / "nonexistent"
     specification_dir = tmp_path / "specification"
     cache_dir = tmp_path / "cache"
     url = "http://example.com/endpoint"
-    documentation_url = "http://example.com/doc"
 
     pipeline_dir.mkdir(parents=True)
 
@@ -167,17 +131,14 @@ def test_fetch_add_data_response_file_not_found(monkeypatch, tmp_path):
 
     with pytest.raises(FileNotFoundError):
         fetch_add_data_response(
-            collection=collection,
             dataset=dataset,
             organisation_provider=organisation,
             pipeline_dir=str(pipeline_dir),
-            collection_dir=str(tmp_path / "collection"),
             input_dir=str(input_path),
             output_path=str(input_path / "output.csv"),
             specification_dir=str(specification_dir),
             cache_dir=str(cache_dir),
             url=url,
-            documentation_url=documentation_url,
         )
 
 
@@ -185,14 +146,11 @@ def test_fetch_add_data_response_handles_processing_error(monkeypatch, tmp_path)
     """Test handling of errors during file processing"""
     dataset = "test-dataset"
     organisation = "test-org"
-    collection = "test-collection"
-    request_id = "req-006"
     pipeline_dir = tmp_path / "pipeline"
     input_path = tmp_path / "resource"
     specification_dir = tmp_path / "specification"
     cache_dir = tmp_path / "cache"
     url = "http://example.com/endpoint"
-    documentation_url = "http://example.com/doc"
 
     input_path.mkdir(parents=True)
     pipeline_dir.mkdir(parents=True)
@@ -212,33 +170,19 @@ def test_fetch_add_data_response_handles_processing_error(monkeypatch, tmp_path)
     def raise_exception(*args, **kwargs):
         raise Exception("Processing error")
 
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_endpoint",
-        lambda url, dir, plugin, start_date=None: {
-            "endpoint_url_in_endpoint_csv": True
-        },
-    )
-    monkeypatch.setattr(
-        "src.application.core.pipeline._validate_source",
-        lambda *a, **k: {"documentation_url_in_source_csv": True},
-    )
-
     result = fetch_add_data_response(
-        collection=collection,
         dataset=dataset,
         organisation_provider=organisation,
         pipeline_dir=str(pipeline_dir),
-        collection_dir=str(tmp_path / "collection"),
         input_dir=str(input_path),
         output_path=str(input_path / "output.csv"),
         specification_dir=str(specification_dir),
         cache_dir=str(cache_dir),
         url=url,
-        documentation_url=documentation_url,
     )
 
-    assert "entity-summary" in result
-    assert result["entity-summary"]["new-in-resource"] == 0
+    assert "new-in-resource" in result
+    assert result["new-in-resource"] == 0
 
 
 def test_get_entities_breakdown_success():
@@ -338,511 +282,3 @@ def test_get_existing_entities_breakdown_filters_empty_values():
     assert len(result) == 2
     assert result[0]["entity"] == "1000001"
     assert result[1]["entity"] == "1000004"
-
-
-def test_validate_endpoint_creates_file(monkeypatch, tmp_path):
-    """Test that _validate_endpoint creates endpoint.csv if it doesn't exist"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    url = "http://example.com/endpoint"
-    endpoint_csv_path = pipeline_dir / "endpoint.csv"
-
-    def fake_append_endpoint(
-        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
-    ):
-        return "endpoint_hash", {
-            "endpoint": "endpoint_hash",
-            "endpoint-url": endpoint_url,
-            "parameters": "",
-            "plugin": plugin or "",
-            "entry-date": entry_date,
-            "start-date": start_date,
-            "end-date": end_date,
-        }
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_endpoint", fake_append_endpoint
-    )
-
-    assert not endpoint_csv_path.exists()
-
-    _validate_endpoint(url, str(pipeline_dir), plugin=None)
-
-    assert endpoint_csv_path.exists()
-
-    with open(endpoint_csv_path, "r", encoding="utf-8") as f:
-        reader = csv.reader(f)
-        headers = next(reader)
-        assert headers == [
-            "endpoint",
-            "endpoint-url",
-            "parameters",
-            "plugin",
-            "entry-date",
-            "start-date",
-            "end-date",
-        ]
-
-
-def test_validate_endpoint_appends(monkeypatch, tmp_path):
-    """Test that _validate_endpoint appends new endpoint when URL not found"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    url = "http://example.com/new-endpoint"
-    endpoint_csv_path = pipeline_dir / "endpoint.csv"
-
-    with open(endpoint_csv_path, "w", newline="", encoding="utf-8") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "endpoint",
-                "endpoint-url",
-                "parameters",
-                "plugin",
-                "entry-date",
-                "start-date",
-                "end-date",
-            ],
-        )
-        writer.writeheader()
-        writer.writerow(
-            {
-                "endpoint": "existing_hash",
-                "endpoint-url": "http://example.com/existing",
-                "parameters": "",
-                "plugin": "",
-                "entry-date": "2024-01-01T00:00:00",
-                "start-date": "2024-01-01",
-                "end-date": "",
-            }
-        )
-
-    def fake_append_endpoint(
-        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
-    ):
-        return "new_endpoint_hash", {
-            "endpoint": "new_endpoint_hash",
-            "endpoint-url": endpoint_url,
-            "parameters": "",
-            "plugin": plugin or "",
-            "entry-date": entry_date,
-            "start-date": start_date,
-            "end-date": end_date,
-        }
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_endpoint", fake_append_endpoint
-    )
-
-    result = _validate_endpoint(url, str(pipeline_dir), plugin=None)
-
-    assert result["endpoint_url_in_endpoint_csv"] is False
-    assert "new_endpoint_entry" in result
-    assert result["new_endpoint_entry"]["endpoint"] == "new_endpoint_hash"
-    assert result["new_endpoint_entry"]["endpoint-url"] == url
-
-
-def test_validate_endpoint_finds_existing(monkeypatch, tmp_path):
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    url = "http://example.com/endpoint"
-    endpoint_csv_path = pipeline_dir / "endpoint.csv"
-    with open(endpoint_csv_path, "w", newline="", encoding="utf-8") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "endpoint",
-                "endpoint-url",
-                "parameters",
-                "plugin",
-                "entry-date",
-                "start-date",
-                "end-date",
-            ],
-        )
-        writer.writeheader()
-        writer.writerow(
-            {
-                "endpoint": "endpoint_hash",
-                "endpoint-url": url,
-                "parameters": "",
-                "plugin": "",
-                "entry-date": "2024-01-01",
-                "start-date": "2024-01-01",
-                "end-date": "",
-            }
-        )
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_endpoint",
-        lambda *a, **kw: (_ for _ in ()).throw(Exception("Should not be called")),
-    )
-
-    result = _validate_endpoint(url, str(pipeline_dir), plugin=None)
-    assert result["endpoint_url_in_endpoint_csv"] is True
-    assert "existing_endpoint_entry" in result
-    assert result["existing_endpoint_entry"]["endpoint-url"] == url
-
-
-def test_validate_endpoint_empty_url(monkeypatch, tmp_path):
-    """Test _validate_endpoint with empty URL"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-
-    result = _validate_endpoint("", str(pipeline_dir), plugin=None)
-
-    assert result == {}
-
-
-def test_validate_endpoint_csv_read_error(monkeypatch, tmp_path):
-    """Test _validate_endpoint when reading CSV fails"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    url = "http://example.com/endpoint"
-
-    endpoint_csv_path = pipeline_dir / "endpoint.csv"
-    endpoint_csv_path.write_bytes(b"\x00\x00\x00")
-
-    def fake_append_endpoint(
-        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
-    ):
-        return "endpoint_hash", {
-            "endpoint": "endpoint_hash",
-            "endpoint-url": endpoint_url,
-            "parameters": "",
-            "plugin": plugin or "",
-            "entry-date": entry_date,
-            "start-date": start_date,
-            "end-date": end_date,
-        }
-
-
-def test_validate_source_creates_new_source(monkeypatch, tmp_path):
-    """Test _validate_source creates new source entry when it doesn't exist"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    source_csv_path = pipeline_dir / "source.csv"
-
-    documentation_url = "http://example.com/doc"
-    collection = "test-collection"
-    organisation = "test-org"
-    dataset = "test-dataset"
-
-    endpoint_summary = {
-        "endpoint_url_in_endpoint_csv": True,
-        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
-    }
-
-    def fake_append_source(
-        source_csv_path,
-        collection,
-        organisation,
-        endpoint_key,
-        attribution,
-        documentation_url,
-        licence,
-        pipelines,
-        entry_date,
-        start_date,
-        end_date,
-    ):
-        return "source_hash_456", {
-            "source": "source_hash_456",
-            "attribution": attribution,
-            "collection": collection,
-            "documentation-url": documentation_url,
-            "endpoint": endpoint_key,
-            "licence": licence,
-            "organisation": organisation,
-            "pipelines": pipelines,
-            "entry-date": entry_date,
-            "start-date": start_date,
-            "end-date": end_date,
-        }
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_source", fake_append_source
-    )
-
-    result = _validate_source(
-        documentation_url,
-        str(pipeline_dir),
-        collection,
-        organisation,
-        dataset,
-        endpoint_summary,
-        start_date=None,
-        licence=None,
-    )
-
-    assert result["documentation_url_in_source_csv"] is False
-    assert "new_source_entry" in result
-    assert result["new_source_entry"]["source"] == "source_hash_456"
-    assert result["new_source_entry"]["collection"] == collection
-    assert result["new_source_entry"]["organisation"] == organisation
-    assert result["new_source_entry"]["pipelines"] == dataset
-
-
-def test_validate_source_finds_existing_source(monkeypatch, tmp_path):
-    """Test _validate_source finds existing source entry"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    source_csv_path = pipeline_dir / "source.csv"
-
-    documentation_url = "http://example.com/doc"
-    collection = "test-collection"
-    organisation = "test-org"
-    dataset = "test-dataset"
-
-    endpoint_summary = {
-        "endpoint_url_in_endpoint_csv": True,
-        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
-    }
-
-    with open(source_csv_path, "w", newline="", encoding="utf-8") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "source",
-                "attribution",
-                "collection",
-                "documentation-url",
-                "endpoint",
-                "licence",
-                "organisation",
-                "pipelines",
-                "entry-date",
-                "start-date",
-                "end-date",
-            ],
-        )
-        writer.writeheader()
-        writer.writerow(
-            {
-                "source": "existing_source_hash",
-                "attribution": "",
-                "collection": collection,
-                "documentation-url": documentation_url,
-                "endpoint": "endpoint_hash_123",
-                "licence": "",
-                "organisation": organisation,
-                "pipelines": dataset,
-                "entry-date": "2024-01-01T00:00:00",
-                "start-date": "2024-01-01",
-                "end-date": "",
-            }
-        )
-
-    def fake_append_source(*args, **kwargs):
-        return "existing_source_hash", None
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_source", fake_append_source
-    )
-
-    result = _validate_source(
-        documentation_url,
-        str(pipeline_dir),
-        collection,
-        organisation,
-        dataset,
-        endpoint_summary,
-        start_date=None,
-        licence=None,
-    )
-
-    assert result["documentation_url_in_source_csv"] is True
-    assert "existing_source_entry" in result
-    assert result["existing_source_entry"]["source"] == "existing_source_hash"
-    assert result["existing_source_entry"]["collection"] == collection
-
-
-def test_validate_source_no_endpoint_key(tmp_path):
-    """Test _validate_source returns empty dict when no endpoint key available"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-
-    documentation_url = "http://example.com/doc"
-    collection = "test-collection"
-    organisation = "test-org"
-    dataset = "test-dataset"
-
-    endpoint_summary = {}
-
-    result = _validate_source(
-        documentation_url,
-        str(pipeline_dir),
-        collection,
-        organisation,
-        dataset,
-        endpoint_summary,
-        start_date=None,
-        licence=None,
-    )
-
-    assert result == {}
-
-
-def test_validate_source_empty_documentation_url(monkeypatch, tmp_path):
-    """Test _validate_source handles empty documentation URL"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-
-    documentation_url = ""
-    collection = "test-collection"
-    organisation = "test-org"
-    dataset = "test-dataset"
-
-    endpoint_summary = {
-        "endpoint_url_in_endpoint_csv": True,
-        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
-    }
-
-    def fake_append_source(
-        source_csv_path,
-        collection,
-        organisation,
-        endpoint_key,
-        attribution,
-        documentation_url,
-        licence,
-        pipelines,
-        entry_date,
-        start_date,
-        end_date,
-    ):
-        return "source_hash_456", {
-            "source": "source_hash_456",
-            "attribution": attribution,
-            "collection": collection,
-            "documentation-url": documentation_url,
-            "endpoint": endpoint_key,
-            "licence": licence,
-            "organisation": organisation,
-            "pipelines": pipelines,
-            "entry-date": entry_date,
-            "start-date": start_date,
-            "end-date": end_date,
-        }
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_source", fake_append_source
-    )
-
-    result = _validate_source(
-        documentation_url,
-        str(pipeline_dir),
-        collection,
-        organisation,
-        dataset,
-        endpoint_summary,
-        start_date=None,
-        licence=None,
-    )
-
-    assert "documentation_url_in_source_csv" in result
-    assert result["new_source_entry"]["documentation-url"] == ""
-
-
-def test_validate_source_uses_new_endpoint_entry(monkeypatch, tmp_path):
-    """Test _validate_source uses endpoint from new_endpoint_entry when available"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-
-    documentation_url = "http://example.com/doc"
-    collection = "test-collection"
-    organisation = "test-org"
-    dataset = "test-dataset"
-
-    endpoint_summary = {
-        "endpoint_url_in_endpoint_csv": False,
-        "new_endpoint_entry": {"endpoint": "new_endpoint_hash_789"},
-    }
-
-    captured_endpoint_key = None
-
-    def fake_append_source(
-        source_csv_path,
-        collection,
-        organisation,
-        endpoint_key,
-        attribution,
-        documentation_url,
-        licence,
-        pipelines,
-        entry_date,
-        start_date,
-        end_date,
-    ):
-        nonlocal captured_endpoint_key
-        captured_endpoint_key = endpoint_key
-        return "source_hash_456", {
-            "source": "source_hash_456",
-            "attribution": attribution,
-            "collection": collection,
-            "documentation-url": documentation_url,
-            "endpoint": endpoint_key,
-            "licence": licence,
-            "organisation": organisation,
-            "pipelines": pipelines,
-            "entry-date": entry_date,
-            "start-date": start_date,
-            "end-date": end_date,
-        }
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_source", fake_append_source
-    )
-
-    result = _validate_source(
-        documentation_url,
-        str(pipeline_dir),
-        collection,
-        organisation,
-        dataset,
-        endpoint_summary,
-        start_date=None,
-        licence=None,
-    )
-
-    assert captured_endpoint_key == "new_endpoint_hash_789"
-    assert result["new_source_entry"]["endpoint"] == "new_endpoint_hash_789"
-
-
-def test_validate_source_handles_csv_read_error(monkeypatch, tmp_path):
-    """Test _validate_source handles CSV read errors gracefully"""
-    pipeline_dir = tmp_path / "pipeline"
-    pipeline_dir.mkdir()
-    source_csv_path = pipeline_dir / "source.csv"
-
-    documentation_url = "http://example.com/doc"
-    collection = "test-collection"
-    organisation = "test-org"
-    dataset = "test-dataset"
-
-    endpoint_summary = {
-        "endpoint_url_in_endpoint_csv": True,
-        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
-    }
-
-    source_csv_path.write_bytes(b"\x00\x00\x00")
-
-    def fake_append_source(*args, **kwargs):
-        return "existing_source_hash", None
-
-    monkeypatch.setattr(
-        "src.application.core.pipeline.append_source", fake_append_source
-    )
-
-    result = _validate_source(
-        documentation_url,
-        str(pipeline_dir),
-        collection,
-        organisation,
-        dataset,
-        endpoint_summary,
-        start_date=None,
-        licence=None,
-    )
-
-    assert "documentation_url_in_source_csv" in result
diff --git a/request-processor/tests/unit/src/application/core/test_utils.py b/request-processor/tests/unit/src/application/core/test_utils.py
index 2b38619b..5be3a0b3 100644
--- a/request-processor/tests/unit/src/application/core/test_utils.py
+++ b/request-processor/tests/unit/src/application/core/test_utils.py
@@ -1,4 +1,9 @@
-from src.application.core.utils import get_request, check_content
+from src.application.core.utils import (
+    get_request,
+    check_content,
+    validate_endpoint,
+    validate_source,
+)
 import requests_mock
 import os
 import csv
@@ -192,3 +197,501 @@ def test_append_source_existing(tmp_path):
         reader = list(csv.DictReader(f))
         assert len(reader) == 1
         assert reader[0]["source"] == source_key
+
+
+def test_validate_endpoint_creates_file(monkeypatch, tmp_path):
+    """Test that validate_endpoint creates endpoint.csv if it doesn't exist"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    url = "http://example.com/endpoint"
+    endpoint_csv_path = pipeline_dir / "endpoint.csv"
+
+    def fake_append_endpoint(
+        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
+    ):
+        return "endpoint_hash", {
+            "endpoint": "endpoint_hash",
+            "endpoint-url": endpoint_url,
+            "parameters": "",
+            "plugin": plugin or "",
+            "entry-date": entry_date,
+            "start-date": start_date,
+            "end-date": end_date,
+        }
+
+    monkeypatch.setattr(
+        "src.application.core.utils.append_endpoint", fake_append_endpoint
+    )
+
+    assert not endpoint_csv_path.exists()
+
+    validate_endpoint(url, str(pipeline_dir), plugin=None)
+
+    assert endpoint_csv_path.exists()
+
+    with open(endpoint_csv_path, "r", encoding="utf-8") as f:
+        reader = csv.reader(f)
+        headers = next(reader)
+        assert headers == [
+            "endpoint",
+            "endpoint-url",
+            "parameters",
+            "plugin",
+            "entry-date",
+            "start-date",
+            "end-date",
+        ]
+
+
+def test_validate_endpoint_appends(monkeypatch, tmp_path):
+    """Test that validate_endpoint appends new endpoint when URL not found"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    url = "http://example.com/new-endpoint"
+    endpoint_csv_path = pipeline_dir / "endpoint.csv"
+
+    with open(endpoint_csv_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(
+            f,
+            fieldnames=[
+                "endpoint",
+                "endpoint-url",
+                "parameters",
+                "plugin",
+                "entry-date",
+                "start-date",
+                "end-date",
+            ],
+        )
+        writer.writeheader()
+        writer.writerow(
+            {
+                "endpoint": "existing_hash",
+                "endpoint-url": "http://example.com/existing",
+                "parameters": "",
+                "plugin": "",
+                "entry-date": "2024-01-01T00:00:00",
+                "start-date": "2024-01-01",
+                "end-date": "",
+            }
+        )
+
+    def fake_append_endpoint(
+        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
+    ):
+        return "new_endpoint_hash", {
+            "endpoint": "new_endpoint_hash",
+            "endpoint-url": endpoint_url,
+            "parameters": "",
+            "plugin": plugin or "",
+            "entry-date": entry_date,
+            "start-date": start_date,
+            "end-date": end_date,
+        }
+
+    monkeypatch.setattr(
+        "src.application.core.utils.append_endpoint", fake_append_endpoint
+    )
+
+    result = validate_endpoint(url, str(pipeline_dir), plugin=None)
+
+    assert result["endpoint_url_in_endpoint_csv"] is False
+    assert "new_endpoint_entry" in result
+    assert result["new_endpoint_entry"]["endpoint"] == "new_endpoint_hash"
+    assert result["new_endpoint_entry"]["endpoint-url"] == url
+
+
+def test_validate_endpoint_finds_existing(monkeypatch, tmp_path):
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    url = "http://example.com/endpoint"
+    endpoint_csv_path = pipeline_dir / "endpoint.csv"
+    with open(endpoint_csv_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(
+            f,
+            fieldnames=[
+                "endpoint",
+                "endpoint-url",
+                "parameters",
+                "plugin",
+                "entry-date",
+                "start-date",
+                "end-date",
+            ],
+        )
+        writer.writeheader()
+        writer.writerow(
+            {
+                "endpoint": "endpoint_hash",
+                "endpoint-url": url,
+                "parameters": "",
+                "plugin": "",
+                "entry-date": "2024-01-01",
+                "start-date": "2024-01-01",
+                "end-date": "",
+            }
+        )
+
+    monkeypatch.setattr(
+        "src.application.core.utils.append_endpoint",
+        lambda *a, **kw: (_ for _ in ()).throw(Exception("Should not be called")),
+    )
+
+    result = validate_endpoint(url, str(pipeline_dir), plugin=None)
+    assert result["endpoint_url_in_endpoint_csv"] is True
+    assert "existing_endpoint_entry" in result
+    assert result["existing_endpoint_entry"]["endpoint-url"] == url
+
+
+def test_validate_endpoint_empty_url(monkeypatch, tmp_path):
+    """Test validate_endpoint with empty URL"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+
+    result = validate_endpoint("", str(pipeline_dir), plugin=None)
+
+    assert result == {}
+
+
+def test_validate_endpoint_csv_read_error(monkeypatch, tmp_path):
+    """Test validate_endpoint when reading CSV fails"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    url = "http://example.com/endpoint"
+
+    endpoint_csv_path = pipeline_dir / "endpoint.csv"
+    endpoint_csv_path.write_bytes(b"\x00\x00\x00")
+
+    def fake_append_endpoint(
+        endpoint_csv_path, endpoint_url, entry_date, start_date, end_date, plugin=None
+    ):
+        return "endpoint_hash", {
+            "endpoint": "endpoint_hash",
+            "endpoint-url": endpoint_url,
+            "parameters": "",
+            "plugin": plugin or "",
+            "entry-date": entry_date,
+            "start-date": start_date,
+            "end-date": end_date,
+        }
+
+
+def test_validate_source_creates_new_source(monkeypatch, tmp_path):
+    """Test validate_source creates new source entry when it doesn't exist"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    source_csv_path = pipeline_dir / "source.csv"
+
+    documentation_url = "http://example.com/doc"
+    collection = "test-collection"
+    organisation = "test-org"
+    dataset = "test-dataset"
+
+    endpoint_summary = {
+        "endpoint_url_in_endpoint_csv": True,
+        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
+    }
+
+    def fake_append_source(
+        source_csv_path,
+        collection,
+        organisation,
+        endpoint_key,
+        attribution,
+        documentation_url,
+        licence,
+        pipelines,
+        entry_date,
+        start_date,
+        end_date,
+    ):
+        return "source_hash_456", {
+            "source": "source_hash_456",
+            "attribution": attribution,
+            "collection": collection,
+            "documentation-url": documentation_url,
+            "endpoint": endpoint_key,
+            "licence": licence,
+            "organisation": organisation,
+            "pipelines": pipelines,
+            "entry-date": entry_date,
+            "start-date": start_date,
+            "end-date": end_date,
+        }
+
+    monkeypatch.setattr("src.application.core.utils.append_source", fake_append_source)
+
+    result = validate_source(
+        documentation_url,
+        str(pipeline_dir),
+        collection,
+        organisation,
+        dataset,
+        endpoint_summary,
+        start_date=None,
+        licence=None,
+    )
+
+    assert result["documentation_url_in_source_csv"] is False
+    assert "new_source_entry" in result
+    assert result["new_source_entry"]["source"] == "source_hash_456"
+    assert result["new_source_entry"]["collection"] == collection
+    assert result["new_source_entry"]["organisation"] == organisation
+    assert result["new_source_entry"]["pipelines"] == dataset
+
+
+def test_validate_source_finds_existing_source(monkeypatch, tmp_path):
+    """Test validate_source finds existing source entry"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    source_csv_path = pipeline_dir / "source.csv"
+
+    documentation_url = "http://example.com/doc"
+    collection = "test-collection"
+    organisation = "test-org"
+    dataset = "test-dataset"
+
+    endpoint_summary = {
+        "endpoint_url_in_endpoint_csv": True,
+        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
+    }
+
+    with open(source_csv_path, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(
+            f,
+            fieldnames=[
+                "source",
+                "attribution",
+                "collection",
+                "documentation-url",
+                "endpoint",
+                "licence",
+                "organisation",
+                "pipelines",
+                "entry-date",
+                "start-date",
+                "end-date",
+            ],
+        )
+        writer.writeheader()
+        writer.writerow(
+            {
+                "source": "existing_source_hash",
+                "attribution": "",
+                "collection": collection,
+                "documentation-url": documentation_url,
+                "endpoint": "endpoint_hash_123",
+                "licence": "",
+                "organisation": organisation,
+                "pipelines": dataset,
+                "entry-date": "2024-01-01T00:00:00",
+                "start-date": "2024-01-01",
+                "end-date": "",
+            }
+        )
+
+    def fake_append_source(*args, **kwargs):
+        return "existing_source_hash", None
+
+    monkeypatch.setattr("src.application.core.utils.append_source", fake_append_source)
+
+    result = validate_source(
+        documentation_url,
+        str(pipeline_dir),
+        collection,
+        organisation,
+        dataset,
+        endpoint_summary,
+        start_date=None,
+        licence=None,
+    )
+
+    assert result["documentation_url_in_source_csv"] is True
+    assert "existing_source_entry" in result
+    assert result["existing_source_entry"]["source"] == "existing_source_hash"
+    assert result["existing_source_entry"]["collection"] == collection
+
+
+def test_validate_source_no_endpoint_key(tmp_path):
+    """Test validate_source returns empty dict when no endpoint key available"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+
+    documentation_url = "http://example.com/doc"
+    collection = "test-collection"
+    organisation = "test-org"
+    dataset = "test-dataset"
+
+    endpoint_summary = {}
+
+    result = validate_source(
+        documentation_url,
+        str(pipeline_dir),
+        collection,
+        organisation,
+        dataset,
+        endpoint_summary,
+        start_date=None,
+        licence=None,
+    )
+
+    assert result == {}
+
+
+def test_validate_source_empty_documentation_url(monkeypatch, tmp_path):
+    """Test validate_source handles empty documentation URL"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+
+    documentation_url = ""
+    collection = "test-collection"
+    organisation = "test-org"
+    dataset = "test-dataset"
+
+    endpoint_summary = {
+        "endpoint_url_in_endpoint_csv": True,
+        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
+    }
+
+    def fake_append_source(
+        source_csv_path,
+        collection,
+        organisation,
+        endpoint_key,
+        attribution,
+        documentation_url,
+        licence,
+        pipelines,
+        entry_date,
+        start_date,
+        end_date,
+    ):
+        return "source_hash_456", {
+            "source": "source_hash_456",
+            "attribution": attribution,
+            "collection": collection,
+            "documentation-url": documentation_url,
+            "endpoint": endpoint_key,
+            "licence": licence,
+            "organisation": organisation,
+            "pipelines": pipelines,
+            "entry-date": entry_date,
+            "start-date": start_date,
+            "end-date": end_date,
+        }
+
+    monkeypatch.setattr("src.application.core.utils.append_source", fake_append_source)
+
+    result = validate_source(
+        documentation_url,
+        str(pipeline_dir),
+        collection,
+        organisation,
+        dataset,
+        endpoint_summary,
+        start_date=None,
+        licence=None,
+    )
+
+    assert "documentation_url_in_source_csv" in result
+    assert result["new_source_entry"]["documentation-url"] == ""
+
+
+def test_validate_source_uses_new_endpoint_entry(monkeypatch, tmp_path):
+    """Test validate_source uses endpoint from new_endpoint_entry when available"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+
+    documentation_url = "http://example.com/doc"
+    collection = "test-collection"
+    organisation = "test-org"
+    dataset = "test-dataset"
+
+    endpoint_summary = {
+        "endpoint_url_in_endpoint_csv": False,
+        "new_endpoint_entry": {"endpoint": "new_endpoint_hash_789"},
+    }
+
+    captured_endpoint_key = None
+
+    def fake_append_source(
+        source_csv_path,
+        collection,
+        organisation,
+        endpoint_key,
+        attribution,
+        documentation_url,
+        licence,
+        pipelines,
+        entry_date,
+        start_date,
+        end_date,
+    ):
+        nonlocal captured_endpoint_key
+        captured_endpoint_key = endpoint_key
+        return "source_hash_456", {
+            "source": "source_hash_456",
+            "attribution": attribution,
+            "collection": collection,
+            "documentation-url": documentation_url,
+            "endpoint": endpoint_key,
+            "licence": licence,
+            "organisation": organisation,
+            "pipelines": pipelines,
+            "entry-date": entry_date,
+            "start-date": start_date,
+            "end-date": end_date,
+        }
+
+    monkeypatch.setattr("src.application.core.utils.append_source", fake_append_source)
+
+    result = validate_source(
+        documentation_url,
+        str(pipeline_dir),
+        collection,
+        organisation,
+        dataset,
+        endpoint_summary,
+        start_date=None,
+        licence=None,
+    )
+
+    assert captured_endpoint_key == "new_endpoint_hash_789"
+    assert result["new_source_entry"]["endpoint"] == "new_endpoint_hash_789"
+
+
+def test_validate_source_handles_csv_read_error(monkeypatch, tmp_path):
+    """Test validate_source handles CSV read errors gracefully"""
+    pipeline_dir = tmp_path / "pipeline"
+    pipeline_dir.mkdir()
+    source_csv_path = pipeline_dir / "source.csv"
+
+    documentation_url = "http://example.com/doc"
+    collection = "test-collection"
+    organisation = "test-org"
+    dataset = "test-dataset"
+
+    endpoint_summary = {
+        "endpoint_url_in_endpoint_csv": True,
+        "existing_endpoint_entry": {"endpoint": "endpoint_hash_123"},
+    }
+
+    source_csv_path.write_bytes(b"\x00\x00\x00")
+
+    def fake_append_source(*args, **kwargs):
+        return "existing_source_hash", None
+
+    monkeypatch.setattr("src.application.core.utils.append_source", fake_append_source)
+
+    result = validate_source(
+        documentation_url,
+        str(pipeline_dir),
+        collection,
+        organisation,
+        dataset,
+        endpoint_summary,
+        start_date=None,
+        licence=None,
+    )
+
+    assert "documentation_url_in_source_csv" in result
diff --git a/request-processor/tests/unit/src/application/core/test_workflow.py b/request-processor/tests/unit/src/application/core/test_workflow.py
index f943f27e..124ae6f0 100644
--- a/request-processor/tests/unit/src/application/core/test_workflow.py
+++ b/request-processor/tests/unit/src/application/core/test_workflow.py
@@ -328,22 +328,35 @@ class DummyDirectories:
 
     directories = DummyDirectories()
 
-    expected_response = {"status": "success", "data": "test"}
+    pipeline_response = {"status": "success", "data": "test"}
+    expected_response = {
+        "pipeline-summary": pipeline_response,
+        "endpoint-summary": {"endpoint_summary": "mocked"},
+        "source-summary": {"source_summary": "mocked"},
+    }
 
     monkeypatch.setattr(
         "src.application.core.workflow.resource_from_path", lambda path: "resource-hash"
     )
     monkeypatch.setattr(
         "src.application.core.workflow.fetch_add_data_pipeline_csvs",
-        lambda col, pdir: None,
+        lambda col, pdir: True,
     )
     monkeypatch.setattr(
         "src.application.core.workflow.fetch_add_data_collection_csvs",
-        lambda col, cdir: None,
+        lambda col, cdir: True,
     )
     monkeypatch.setattr(
         "src.application.core.workflow.fetch_add_data_response",
-        lambda *args, **kwargs: expected_response,
+        lambda *args, **kwargs: pipeline_response,
+    )
+    monkeypatch.setattr(
+        "src.application.core.workflow.validate_endpoint",
+        lambda *args, **kwargs: {"endpoint_summary": "mocked"},
+    )
+    monkeypatch.setattr(
+        "src.application.core.workflow.validate_source",
+        lambda *args, **kwargs: {"source_summary": "mocked"},
     )
 
     result = add_data_workflow(
@@ -382,41 +395,31 @@ class DummyDirectories:
 
     def fake_fetch_add_data_pipeline_csvs(col, pdir):
         called["fetch_add_data_pipeline_csvs"] = (col, pdir)
+        return True
 
     def fake_fetch_add_data_collection_csvs(col, cdir):
         called["fetch_add_data_collection_csvs"] = (col, cdir)
+        return True
 
     def fake_fetch_add_data_response(
-        collection,
         dataset,
         organisation_provider,
         pipeline_dir,
-        collection_dir,
         input_dir,
         output_path,
         specification_dir,
         cache_dir,
         url,
-        documentation_url,
-        licence=None,
-        start_date=None,
-        plugin=None,
     ):
         called["fetch_add_data_response"] = {
-            "collection": collection,
             "dataset": dataset,
             "organisation": organisation_provider,
             "pipeline_dir": pipeline_dir,
-            "collection_dir": collection_dir,
             "input_dir": input_dir,
             "output_path": output_path,
             "specification_dir": specification_dir,
             "cache_dir": cache_dir,
             "url": url,
-            "documentation_url": documentation_url,
-            "licence": licence,
-            "start_date": start_date,
-            "plugin": plugin,
         }
         return {"result": "ok"}
 
@@ -467,9 +470,7 @@ def fake_fetch_add_data_response(
         == directories.SPECIFICATION_DIR
     )
     assert called["fetch_add_data_response"]["cache_dir"] == directories.CACHE_DIR
-    assert called["fetch_add_data_response"]["collection_dir"] == expected_config_dir
     assert called["fetch_add_data_response"]["url"] == url
-    assert called["fetch_add_data_response"]["documentation_url"] == documentation_url
 
 
 def test_fetch_add_data_pipeline_csvs_from_url(monkeypatch, tmp_path):