Skip to content

Commit 2ef082f

Browse files
authored
Feat/count by dataset (#452)
* move collection position and calculate count by dataset * correct imports * remove state from makerules to stop circular dependency * remove makerules tests that are no longer relevant * fix test after argument change * fix compare state tests
1 parent e1fb74d commit 2ef082f

File tree

5 files changed

+209
-191
lines changed

5 files changed

+209
-191
lines changed

digital_land/makerules.py

Lines changed: 57 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88

99
from enum import Enum
1010

11-
from digital_land.state import compare_state
12-
1311

1412
class ProcessingOption(Enum):
1513
PROCESS_ALL = "all"
@@ -25,46 +23,47 @@ def dataset_path(dataset):
2523
return "$(DATASET_DIR)" + dataset + ".csv"
2624

2725

28-
def get_processing_option(
29-
collection,
30-
specification_dir,
31-
pipeline_dir,
32-
resource_dir,
33-
incremental_loading_override,
34-
state_path,
35-
):
36-
# If there's no previous state, process everything
37-
if not state_path:
38-
return ProcessingOption.PROCESS_ALL
39-
40-
# Compare current state with the previous state
41-
diffs = compare_state(
42-
specification_dir,
43-
collection.dir,
44-
pipeline_dir,
45-
resource_dir,
46-
incremental_loading_override,
47-
state_path,
48-
)
49-
50-
diffs = diffs or [] # handle if diffs is None
51-
52-
# If incremental loading is overridden or critical configs changed, process everything
53-
critical_changes = {"code", "pipeline", "collection", "specification"}
54-
if incremental_loading_override or critical_changes & set(diffs):
55-
return ProcessingOption.PROCESS_ALL
56-
57-
# New resources downloaded
58-
if "resource" in diffs:
59-
return (
60-
ProcessingOption.PROCESS_PARTIAL
61-
) # To be changed to partial in the future
62-
63-
if not diffs:
64-
return ProcessingOption.PROCESS_NONE
65-
66-
# If there are diffs we don't recognise then play safe and reprocess everything
67-
return ProcessingOption.PROCESS_ALL
26+
# TODO remove function when confirmed not needed
27+
# def get_processing_option(
28+
# collection,
29+
# specification_dir,
30+
# pipeline_dir,
31+
# resource_dir,
32+
# incremental_loading_override,
33+
# state_path,
34+
# ):
35+
# # If there's no previous state, process everything
36+
# if not state_path:
37+
# return ProcessingOption.PROCESS_ALL
38+
39+
# # Compare current state with the previous state
40+
# diffs = compare_state(
41+
# specification_dir,
42+
# collection.dir,
43+
# pipeline_dir,
44+
# resource_dir,
45+
# incremental_loading_override,
46+
# state_path,
47+
# )
48+
49+
# diffs = diffs or [] # handle if diffs is None
50+
51+
# # If incremental loading is overridden or critical configs changed, process everything
52+
# critical_changes = {"code", "pipeline", "collection", "specification"}
53+
# if incremental_loading_override or critical_changes & set(diffs):
54+
# return ProcessingOption.PROCESS_ALL
55+
56+
# # New resources downloaded
57+
# if "resource" in diffs:
58+
# return (
59+
# ProcessingOption.PROCESS_PARTIAL
60+
# ) # To be changed to partial in the future
61+
62+
# if not diffs:
63+
# return ProcessingOption.PROCESS_NONE
64+
65+
# # If there are diffs we don't recognise then play safe and reprocess everything
66+
# return ProcessingOption.PROCESS_ALL
6867

6968

7069
def pipeline_makerules(
@@ -76,14 +75,14 @@ def pipeline_makerules(
7675
state_path=None,
7776
):
7877
dataset_resource = collection.dataset_resource_map()
79-
process = get_processing_option(
80-
collection,
81-
specification_dir,
82-
pipeline_dir,
83-
resource_dir,
84-
incremental_loading_override,
85-
state_path,
86-
)
78+
# process = get_processing_option(
79+
# collection,
80+
# specification_dir,
81+
# pipeline_dir,
82+
# resource_dir,
83+
# incremental_loading_override,
84+
# state_path,
85+
# )
8786

8887
redirect = {}
8988
for entry in collection.old_resource.entries:
@@ -104,14 +103,14 @@ def pipeline_makerules(
104103
print("\\\n %s" % (transformed_path(resource, dataset)), end="")
105104
print()
106105

107-
if process == ProcessingOption.PROCESS_NONE:
108-
print("\n$(%s)::" % dataset_var)
109-
print('\techo "No state change and no new resources to transform"')
110-
print("\ntransformed::")
111-
print('\techo "No state change and no new resources to transform"')
112-
print("\ndataset::")
113-
print('\techo "No state change so no resources have been transformed"')
114-
continue
106+
# if process == ProcessingOption.PROCESS_NONE:
107+
# print("\n$(%s)::" % dataset_var)
108+
# print('\techo "No state change and no new resources to transform"')
109+
# print("\ntransformed::")
110+
# print('\techo "No state change and no new resources to transform"')
111+
# print("\ndataset::")
112+
# print('\techo "No state change so no resources have been transformed"')
113+
# continue
115114

116115
for resource in sorted(dataset_resource[dataset]):
117116
old_resource = resource

digital_land/state.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
import hashlib
55
from datetime import date
6+
from digital_land.collection import Collection
67

78
# Read the file in 32MB chunks
89
_chunk_size = 32 * 1024 * 1024
@@ -21,6 +22,11 @@ def build(
2122
incremental_loading_override,
2223
):
2324
"""Build a state object from the current configuration and code"""
25+
26+
# build collection to get counts
27+
collection = Collection(directory=collection_dir)
28+
collection.load(directory=collection_dir)
29+
2430
return State(
2531
{
2632
"code": State.get_code_hash(),
@@ -32,7 +38,10 @@ def build(
3238
"pipeline": State.get_dir_hash(pipeline_dir),
3339
"incremental_loading_override": incremental_loading_override,
3440
"last_updated_date": date.today().isoformat(), # date in YYYY-MM-DD format
35-
"transform_count": State.get_transform_count(collection_dir),
41+
"transform_count": State.get_transform_count(collection),
42+
"transform_count_by_dataset": State.get_transform_count_by_dataset(
43+
collection
44+
),
3645
}
3746
)
3847

@@ -83,17 +92,24 @@ def get_code_hash():
8392
commit = repo.revparse_single("HEAD")
8493
return str(commit.id)
8594

86-
def get_transform_count(collection_dir):
95+
def get_transform_count(collection: Collection):
8796
"""Calculate the number of transformations that need to be completed"""
88-
from digital_land.collection import Collection
89-
90-
collection = Collection(directory=collection_dir)
91-
collection.load(directory=collection_dir)
9297
dataset_resource = collection.dataset_resource_map()
9398

9499
# Count total number of transformations (resources across all datasets)
95100
return sum(len(resources) for resources in dataset_resource.values())
96101

102+
def get_transform_count_by_dataset(collection: Collection):
103+
"""Calculate the number of transformations that need to be completed"""
104+
105+
dataset_resource = collection.dataset_resource_map()
106+
tranform_count_by_dataset = {}
107+
for dataset, resources in dataset_resource.items():
108+
tranform_count_by_dataset[dataset] = len(resources)
109+
110+
# Count total number of transformations (resources across all datasets)
111+
return tranform_count_by_dataset
112+
97113

98114
def compare_state(
99115
specification_dir,
@@ -124,6 +140,9 @@ def compare_state(
124140
# transform count should not be compared as it changes
125141
current.pop("transform_count", None)
126142
compare.pop("transform_count", None)
143+
# transform count by dataset should not be compared as it changes
144+
current.pop("transform_count_by_dataset", None)
145+
compare.pop("transform_count_by_dataset", None)
127146

128147
if current == compare:
129148
return None

tests/e2e/test_state.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def test_state(tmp_path):
4545
"incremental_loading_override",
4646
"last_updated_date",
4747
"transform_count",
48+
"transform_count_by_dataset",
4849
]
4950
assert state_data["code"] == get_code_hash()
5051
assert state_data["specification"] == specification_hash

tests/integration/test_state.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44
from datetime import date
55

6+
from digital_land.collection import Collection
67
from digital_land.state import State
78

89
test_hash = "ed4c5979268ad880f7edbdc2047cfcfa6b9ee3b4"
@@ -184,9 +185,10 @@ def test_state_build_has_last_updated_date(tmp_path):
184185

185186
def test_get_transform_count(collection_with_transforms):
186187
collection_dir, expected_count = collection_with_transforms
187-
188+
collection = Collection(directory=str(collection_dir))
189+
collection.load()
188190
# Test the get_transform_count method independently
189-
count = State.get_transform_count(str(collection_dir))
191+
count = State.get_transform_count(collection)
190192

191193
# Should return an integer
192194
assert isinstance(count, int)

0 commit comments

Comments
 (0)