Skip to content

Commit aa13e93

Browse files
authored
update state calculation to include resources (#515)
* update state calculation to include resources * lets remove resources that have been retired from the state counts * update test
1 parent a649914 commit aa13e93

3 files changed

Lines changed: 157 additions & 6 deletions

File tree

digital_land/state.py

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ def build(
5959
current_config_hash=pipeline_hash,
6060
current_specification_hash=specification_hash,
6161
),
62+
"transform_resources": State.get_transform_resources_by_dataset(
63+
collection,
64+
dataset_resource_dir=dataset_resource_dir,
65+
current_code_version=__version__,
66+
current_config_hash=pipeline_hash,
67+
current_specification_hash=specification_hash,
68+
),
6269
}
6370
)
6471

@@ -109,6 +116,21 @@ def get_code_hash():
109116
commit = repo.revparse_single("HEAD")
110117
return str(commit.id)
111118

119+
def _active_dataset_resources(collection: Collection):
120+
"""Return dataset_resource_map with retired resources (no replacement) removed.
121+
122+
Applies the same redirect filtering as makerules so that counts and
123+
resource lists only include resources that will actually be processed.
124+
"""
125+
redirect = {
126+
entry["old-resource"]: entry["resource"]
127+
for entry in collection.old_resource.entries
128+
}
129+
return {
130+
dataset: {r for r in resources if redirect.get(r, r)}
131+
for dataset, resources in collection.dataset_resource_map().items()
132+
}
133+
112134
def get_transform_count(
113135
collection: Collection,
114136
dataset_resource_dir=None,
@@ -118,11 +140,12 @@ def get_transform_count(
118140
):
119141
"""Calculate the number of transformations that need to be completed.
120142
121-
When dataset_resource_dir is provided, only resources whose existing log
143+
Retired resources with no replacement are excluded. When
144+
dataset_resource_dir is provided, only resources whose existing log
122145
differs from the current code version, config hash, or specification hash
123-
are counted. If None, all resources are counted.
146+
are counted. If None, all active resources are counted.
124147
"""
125-
dataset_resource = collection.dataset_resource_map()
148+
dataset_resource = State._active_dataset_resources(collection)
126149

127150
if dataset_resource_dir is None:
128151
return sum(len(resources) for resources in dataset_resource.values())
@@ -150,11 +173,12 @@ def get_transform_count_by_dataset(
150173
):
151174
"""Calculate the number of transformations needed per dataset.
152175
153-
When dataset_resource_dir is provided, only resources whose existing log
176+
Retired resources with no replacement are excluded. When
177+
dataset_resource_dir is provided, only resources whose existing log
154178
differs from the current code version, config hash, or specification hash
155-
are counted. If None, all resources are counted.
179+
are counted. If None, all active resources are counted.
156180
"""
157-
dataset_resource = collection.dataset_resource_map()
181+
dataset_resource = State._active_dataset_resources(collection)
158182
transform_count_by_dataset = {}
159183
for dataset, resources in dataset_resource.items():
160184
if dataset_resource_dir is None:
@@ -174,6 +198,43 @@ def get_transform_count_by_dataset(
174198
)
175199
return transform_count_by_dataset
176200

201+
def get_transform_resources_by_dataset(
202+
collection: Collection,
203+
dataset_resource_dir=None,
204+
current_code_version=None,
205+
current_config_hash=None,
206+
current_specification_hash=None,
207+
):
208+
"""Get the resource hashes that need transformation per dataset.
209+
210+
Retired resources with no replacement are excluded. When
211+
dataset_resource_dir is provided, only resources whose existing log
212+
differs from the current code version, config hash, or specification hash
213+
are included. If None, all active resources are included.
214+
215+
Returns a dict mapping dataset name to a sorted list of resource hashes.
216+
"""
217+
dataset_resource = State._active_dataset_resources(collection)
218+
219+
resources_by_dataset = {}
220+
for dataset, resources in dataset_resource.items():
221+
if dataset_resource_dir is None:
222+
resources_by_dataset[dataset] = sorted(resources)
223+
else:
224+
resources_by_dataset[dataset] = sorted(
225+
resource
226+
for resource in resources
227+
if resource_needs_processing(
228+
dataset_resource_dir,
229+
dataset,
230+
resource,
231+
current_code_version,
232+
current_config_hash,
233+
current_specification_hash,
234+
)
235+
)
236+
return resources_by_dataset
237+
177238

178239
def compare_state(
179240
specification_dir,
@@ -207,6 +268,9 @@ def compare_state(
207268
# transform count by dataset should not be compared as it changes
208269
current.pop("transform_count_by_dataset", None)
209270
compare.pop("transform_count_by_dataset", None)
271+
# transform resources should not be compared as it changes
272+
current.pop("transform_resources", None)
273+
compare.pop("transform_resources", None)
210274

211275
if current == compare:
212276
return None

tests/acceptance/test_state.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def test_state(tmp_path):
4646
"last_updated_date",
4747
"transform_count",
4848
"transform_count_by_dataset",
49+
"transform_resources",
4950
]
5051
assert state_data["code"] == get_code_hash()
5152
assert state_data["specification"] == specification_hash

tests/integration/test_state.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,22 @@ def collection_with_transforms(tmp_path):
102102
return collection_dir, 4 # Return directory and expected count
103103

104104

105+
@pytest.fixture
106+
def collection_with_retired_resource(collection_with_transforms):
107+
"""Extends collection_with_transforms with a retired resource (no replacement).
108+
109+
resource1 is marked as retired with no replacement in old-resource.csv,
110+
so it should be excluded from transform_resources even though it still
111+
appears in dataset_resource_map().
112+
"""
113+
collection_dir, _ = collection_with_transforms
114+
old_resource_file = collection_dir / "old-resource.csv"
115+
with open(old_resource_file, "w", newline="") as f:
116+
f.write("old-resource,resource\n")
117+
f.write("resource1,\n") # retired, no replacement
118+
return collection_dir
119+
120+
105121
def test_get_code_hash():
106122
proc = subprocess.run(
107123
"git log -n 1".split(), cwd=os.path.dirname(__file__), stdout=subprocess.PIPE
@@ -226,3 +242,73 @@ def test_state_build_has_transform_count(collection_with_transforms, tmp_path):
226242
assert (
227243
test_state["transform_count"] == expected_count
228244
), f"Expected {expected_count} transforms, got {test_state['transform_count']}"
245+
246+
247+
class TestState:
248+
def test_get_transform_resources_by_dataset_returns_all_resources(
249+
self, collection_with_transforms
250+
):
251+
collection_dir, _ = collection_with_transforms
252+
collection = Collection(directory=str(collection_dir))
253+
collection.load()
254+
255+
result = State.get_transform_resources_by_dataset(collection)
256+
257+
assert set(result["dataset1"]) == {"resource1", "resource2"}
258+
assert set(result["dataset2"]) == {"resource2", "resource3"}
259+
260+
def test_get_transform_resources_by_dataset_returns_sorted_lists(
261+
self, collection_with_transforms
262+
):
263+
collection_dir, _ = collection_with_transforms
264+
collection = Collection(directory=str(collection_dir))
265+
collection.load()
266+
267+
result = State.get_transform_resources_by_dataset(collection)
268+
269+
for resources in result.values():
270+
assert resources == sorted(resources)
271+
272+
def test_get_transform_resources_by_dataset_excludes_retired_resources(
273+
self, collection_with_retired_resource
274+
):
275+
collection_dir = collection_with_retired_resource
276+
collection = Collection(directory=str(collection_dir))
277+
collection.load()
278+
279+
result = State.get_transform_resources_by_dataset(collection)
280+
281+
# resource1 is retired with no replacement — must not appear
282+
assert "resource1" not in result["dataset1"]
283+
# resource2 is still active
284+
assert "resource2" in result["dataset1"]
285+
286+
def test_build_includes_transform_resources(
287+
self, collection_with_transforms, tmp_path
288+
):
289+
collection_dir, _ = collection_with_transforms
290+
specification_dir = tmp_path / "specification"
291+
specification_dir.mkdir()
292+
pipeline_dir = tmp_path / "pipeline"
293+
pipeline_dir.mkdir()
294+
resource_dir = tmp_path / "resource"
295+
resource_dir.mkdir()
296+
297+
test_state = State.build(
298+
str(specification_dir),
299+
str(collection_dir),
300+
str(pipeline_dir),
301+
str(resource_dir),
302+
True,
303+
)
304+
305+
assert "transform_resources" in test_state
306+
assert isinstance(test_state["transform_resources"], dict)
307+
assert set(test_state["transform_resources"]["dataset1"]) == {
308+
"resource1",
309+
"resource2",
310+
}
311+
assert set(test_state["transform_resources"]["dataset2"]) == {
312+
"resource2",
313+
"resource3",
314+
}

0 commit comments

Comments
 (0)