Skip to content

Commit da143ec

Browse files
authored
edit how lookups are created and used (#457)
* edit how lookups are created and used to remove complication from the phases * remove unnecessary step from lookup phase * lock black version due to breaking change * add init files so tests are imported correctly * fix transform tests by correcting lookups * remove old test location * fix async_request_processor tests
1 parent b7df224 commit da143ec

File tree

11 files changed

+112
-76
lines changed

11 files changed

+112
-76
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
repos:
22
- repo: https://github.com/psf/black
3-
rev: 24.1.1
3+
rev: 24.8.0
44
hooks:
55
- id: black
66
- repo: https://github.com/pre-commit/pre-commit-hooks

digital_land/phase/harmonise.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def process(self, stream):
162162
logger.error(
163163
f"Exception occurred while fetching geoX, geoY coordinates: {e}"
164164
)
165-
165+
# TODO need to identify why below exists and possibly remove
166166
# ensure typology fields are a CURIE
167167
for typology in ["organisation", "geography", "document"]:
168168
value = o.get(typology, "")

digital_land/phase/lookup.py

Lines changed: 37 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -84,17 +84,8 @@ def get_entity(self, block):
8484
organisation=organisation,
8585
reference=reference,
8686
)
87+
or self.lookup(prefix=prefix, reference=reference)
8788
)
88-
if not entity:
89-
# TBD this needs to specifically not match unless the organisation and other columns
90-
# are empty in the lookups.csv probably isn't a change here.
91-
# or by the CURIE
92-
entity = self.lookup(prefix=prefix, reference=reference)
93-
94-
# When obtaining an entity number using only the prefix and reference, check if the
95-
# lookup includes an associated organisation. If it does, do not use the entity number,
96-
# as it is organisation specific.
97-
entity = self.check_associated_organisation(entity)
9889

9990
if entity and self.entity_range:
10091
if (
@@ -195,43 +186,44 @@ def process(self, stream):
195186
reference = row.get("reference", "")
196187
entity_number = row.get("entity", "")
197188

198-
if not (prefix and reference and entity_number in self.reverse_lookups):
189+
if not (prefix and reference and entity_number):
199190
yield block
200191
continue
201-
value = self.reverse_lookups[entity_number]
202-
203-
if value:
204-
organisation = value[-1].split(",")[-1]
205-
find_entity = self.lookup(
206-
prefix=prefix,
207-
organisation=organisation,
208-
reference=reference,
209-
)
210-
if not find_entity:
211-
# TBD this needs to specifically not match unless the organisation and other columns
212-
# are empty in the lookups.csv probably isn't a change here.
213-
# or by the CURIE
214-
find_entity = self.lookup(prefix=prefix, reference=reference)
215-
216-
# When obtaining an entity number using only the prefix and reference, check if the
217-
# lookup includes an associated organisation. If it does, do not use the entity number,
218-
# as it is organisation specific.
219-
find_entity = self.check_associated_organisation(find_entity)
220-
221-
if not find_entity or (
222-
str(find_entity) in self.redirect_lookups
223-
and int(self.redirect_lookups[str(find_entity)].get("status", 0))
224-
== 410
225-
):
226-
if self.odp_collections and prefix in self.odp_collections:
227-
self.issues.log_issue(
228-
prefix,
229-
"missing associated entity",
230-
reference,
231-
line_number=line_number,
232-
)
233-
else:
234-
row[self.entity_field] = find_entity
192+
193+
# Get organisation from block metadata (set by OrganisationPhase)
194+
organisation = block.get("organisation", "").replace(
195+
"local-authority-eng", "local-authority"
196+
)
197+
198+
find_entity = self.lookup(
199+
prefix=prefix,
200+
organisation=organisation,
201+
reference=reference,
202+
)
203+
if not find_entity:
204+
# TBD this needs to specifically not match unless the organisation and other columns
205+
# are empty in the lookups.csv probably isn't a change here.
206+
# or by the CURIE
207+
find_entity = self.lookup(prefix=prefix, reference=reference)
208+
209+
# When obtaining an entity number using only the prefix and reference, check if the
210+
# lookup includes an associated organisation. If it does, do not use the entity number,
211+
# as it is organisation specific.
212+
find_entity = self.check_associated_organisation(find_entity)
213+
214+
if not find_entity or (
215+
str(find_entity) in self.redirect_lookups
216+
and int(self.redirect_lookups[str(find_entity)].get("status", 0)) == 410
217+
):
218+
if self.odp_collections and prefix in self.odp_collections:
219+
self.issues.log_issue(
220+
prefix,
221+
"missing associated entity",
222+
reference,
223+
line_number=line_number,
224+
)
225+
else:
226+
row[self.entity_field] = find_entity
235227
yield block
236228

237229

digital_land/phase/organisation.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,8 @@ def process(self, stream):
2424
self.issues.log_issue(
2525
"organisation", "invalid organisation", organisation_value
2626
)
27+
28+
# Store at block level for post-pivot lookups
29+
block["organisation"] = row["organisation"]
30+
2731
yield block

digital_land/pipeline/main.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -256,24 +256,17 @@ def load_lookup(self):
256256
or row.get("pipeline", "")
257257
)
258258
reference = row.get("reference", "") or row.get("value", "")
259-
260-
# composite key, ordered by specificity
261-
resource_lookup = self.lookup.setdefault(row.get("resource", ""), {})
262-
resource_lookup[
263-
lookup_key(
264-
entry_number=entry_number,
265-
prefix=prefix,
266-
reference=reference,
267-
)
268-
] = row["entity"]
269-
270259
organisation = row.get("organisation", "")
271-
# replace local-authority-eng while we migrate
260+
# TODO remove this replacement for local-authority-eng now that migration is complete
272261
organisation = organisation.replace(
273262
"local-authority-eng", "local-authority"
274263
)
264+
265+
# composite key, ordered by specificity
266+
resource_lookup = self.lookup.setdefault(row.get("resource", ""), {})
275267
resource_lookup[
276268
lookup_key(
269+
entry_number=entry_number,
277270
prefix=prefix,
278271
reference=reference,
279272
organisation=organisation,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ test = [
6969
"sphinx",
7070
"sphinx-autobuild",
7171
"sphinx_rtd_theme",
72-
"black",
72+
"black==24.8.0",
7373
]
7474

7575
notebook = [

tests/acceptance/test_async_backend_pipeline_run.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
def run_pipeline_for_test(test_dirs, dataset, resource, request_id, input_path):
3737
endpoints = ["d779ad1c91c5a46e2d4ace4d5446d7d7f81df1ed058f882121070574697a5412"]
3838
pipeline_dir = test_dirs["pipeline_dir"]
39-
organisation = "test-org"
39+
organisation = "test-org:test"
4040
request_id = request_id
4141
collection_dir = test_dirs["collection_dir"]
4242
converted_dir = test_dirs["converted_resource_dir"]
@@ -78,7 +78,7 @@ def run_pipeline_for_test(test_dirs, dataset, resource, request_id, input_path):
7878
dictwriter.writerow(row2)
7979

8080
# Create organisation.csv with data
81-
row = {"organisation": "test-org", "name": "Test Org"}
81+
row = {"organisation": "test-org:test", "name": "Test Org"}
8282
fieldnames = row.keys()
8383
with open(organisation_path, "w") as f:
8484
dictwriter = csv.DictWriter(f, fieldnames=fieldnames)
@@ -115,7 +115,7 @@ def run_pipeline_for_test(test_dirs, dataset, resource, request_id, input_path):
115115

116116
# Load organisations
117117
organisation = Organisation(organisation_path, Path(pipeline.path))
118-
default_values["organisation"] = organisation
118+
default_values["organisation"] = "test-org:test"
119119
try:
120120
run_pipeline(
121121
ConvertPhase(
@@ -216,12 +216,12 @@ def test_async_pipeline_run(test_dirs):
216216
{
217217
"reference": "ABC_0001",
218218
"entry-date": "2024-01-01",
219-
"organisation": "test-org",
219+
"organisation": "test-org:test",
220220
},
221221
{
222222
"reference": "ABC_0002",
223223
"entry-date": "2024-01-02",
224-
"organisation": "test-org",
224+
"organisation": "test-org:test",
225225
},
226226
]
227227

@@ -282,12 +282,12 @@ def test_pipeline_output_is_complete(test_dirs):
282282
{
283283
"reference": "ABC_0001",
284284
"entry-date": "2024-01-01",
285-
"organisation": "test-org",
285+
"organisation": "test-org:test",
286286
},
287287
{
288288
"reference": "ABC_0002",
289289
"entry-date": "2024-01-02",
290-
"organisation": "test-org",
290+
"organisation": "test-org:test",
291291
},
292292
]
293293

@@ -433,7 +433,7 @@ def test_pipeline_lookup_phase(test_dirs):
433433
{
434434
"reference": "ABC_0001",
435435
"entry-date": "2025-01-01",
436-
"organisation": "test-org",
436+
"organisation": "test-org:test",
437437
"article-4-direction": "a4d1",
438438
}
439439
]
@@ -481,7 +481,7 @@ def test_pipeline_lookup_phase_assign_reference_entity(test_dirs):
481481
{
482482
"reference": "ABC_0001",
483483
"entry-date": "2025-01-01",
484-
"organisation": "test-org",
484+
"organisation": "test-org:test",
485485
"article-4-direction": "a4d2",
486486
}
487487
]

tests/integration/configuration/__init__.py

Whitespace-only changes.

tests/integration/pipeline/__init__.py

Whitespace-only changes.
Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@
33
import csv
44
import urllib.request
55
import pandas as pd
6+
import logging
67
from urllib.error import URLError
78
from digital_land.pipeline import Pipeline
89
from digital_land.pipeline import Lookups
910
from digital_land.specification import Specification
1011
from digital_land.organisation import Organisation
1112

13+
logger = logging.getLogger(__name__)
14+
1215

1316
def write_as_csv(dir, filename, data):
1417
with open(os.path.join(dir, filename), "w") as f:
@@ -551,6 +554,53 @@ def test_load_concat_no_prepend_append(empty_pipeline_dir):
551554
}
552555

553556

557+
def test_load_lookup_creates_single_lookup_per_row(tmp_path):
558+
"""
559+
Test that load_lookup creates a single lookup entry per row in lookup.csv.
560+
Each row should create one lookup key based on all provided fields.
561+
Keys are normalized (lowercased, special chars removed).
562+
"""
563+
# -- Arrange --
564+
pipeline_dir = tmp_path / "pipeline"
565+
pipeline_dir.mkdir()
566+
567+
test_pipeline = "test-pipeline"
568+
569+
# Create lookup.csv with various combinations of fields
570+
lookup_data = {
571+
"resource": ["", "", "res123"],
572+
"entry-number": ["", "5", ""],
573+
"prefix": ["ancient-woodland", "conservation-area", "listed-building"],
574+
"reference": ["AW001", "CA002", "LB003"],
575+
"organisation": ["local-authority:ABC", "", "local-authority:XYZ"],
576+
"entity": ["1000001", "1000002", "1000003"],
577+
}
578+
pd.DataFrame(lookup_data).to_csv(f"{pipeline_dir}/lookup.csv", index=False)
579+
580+
# -- Act --
581+
pipeline = Pipeline(str(pipeline_dir), test_pipeline)
582+
583+
# -- Assert --
584+
# Check that lookups are created correctly
585+
# Row 1: prefix + reference + organisation (no resource)
586+
assert "" in pipeline.lookup, "Should have empty resource key for general lookups"
587+
general_lookups = pipeline.lookup[""]
588+
589+
# Row 1: normalized key (lowercased, colons removed)
590+
assert ",ancient-woodland,aw001,local-authorityabc" in general_lookups
591+
assert general_lookups[",ancient-woodland,aw001,local-authorityabc"] == "1000001"
592+
593+
# Row 2: entry-number + prefix + reference (no organisation)
594+
assert "5,conservation-area,ca002," in general_lookups
595+
assert general_lookups["5,conservation-area,ca002,"] == "1000002"
596+
597+
# Row 3: resource-scoped lookup
598+
assert "res123" in pipeline.lookup, "Should have resource-specific key"
599+
resource_lookups = pipeline.lookup["res123"]
600+
assert ",listed-building,lb003,local-authorityxyz" in resource_lookups
601+
assert resource_lookups[",listed-building,lb003,local-authorityxyz"] == "1000003"
602+
603+
554604
@pytest.fixture(scope="session")
555605
def specification_dir(tmp_path_factory):
556606
"""Download specification files from GitHub for testing"""
@@ -665,7 +715,7 @@ def get_test_lookup_config():
665715
"reference": ["0", "1"],
666716
"entity": ["2200001", "2200002"],
667717
"start-date": ["", ""],
668-
"organisation": ["101", "101"],
718+
"organisation": ["local-authority:LBH", "local-authority:LBH"],
669719
"end-date": ["", ""],
670720
"entry-date": ["", ""],
671721
"endpoint": ["", ""],
@@ -713,11 +763,11 @@ def test_pipeline_transform_basic(
713763
pd.DataFrame(get_test_lookup_config()).to_csv(
714764
f"{pipeline_dir}/lookup.csv", index=False
715765
)
716-
717766
# Initialize pipeline components
718767
spec = Specification(specification_dir)
719768
org = Organisation(organisation_path=organisation_path)
720769
pipeline = Pipeline(str(pipeline_dir), dataset_name, specification=spec)
770+
logger.info(f"Pipeline Lookups: {pipeline.lookup}")
721771

722772
output_path = tmp_path / "output" / "transformed.csv"
723773
output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -893,7 +943,3 @@ def test_pipeline_transform_with_unmapped_reference_lookup_disabled(
893943
# Verify reference field for unmapped data exists in output
894944
reference_values = output_df[output_df["field"] == "reference"]["value"].tolist()
895945
assert "99" in reference_values, "Unmapped reference 99 should be in output"
896-
897-
898-
if __name__ == "__main__":
899-
pytest.main()

0 commit comments

Comments
 (0)