@@ -59,6 +59,13 @@ def build(
5959 current_config_hash = pipeline_hash ,
6060 current_specification_hash = specification_hash ,
6161 ),
62+ "transform_resources" : State .get_transform_resources_by_dataset (
63+ collection ,
64+ dataset_resource_dir = dataset_resource_dir ,
65+ current_code_version = __version__ ,
66+ current_config_hash = pipeline_hash ,
67+ current_specification_hash = specification_hash ,
68+ ),
6269 }
6370 )
6471
@@ -109,6 +116,21 @@ def get_code_hash():
109116 commit = repo .revparse_single ("HEAD" )
110117 return str (commit .id )
111118
def _active_dataset_resources(collection: Collection):
    """Build the dataset -> resources mapping without unreplaced retired resources.

    The collection's old-resource entries are read as a lookup from
    ``old-resource`` to ``resource`` (the replacement). A resource is dropped
    when that lookup yields a falsy replacement; a resource that has a
    replacement, or that is not listed at all, is kept under its own hash.

    Per the original author's note, this is intended to mirror the redirect
    filtering done by makerules, so that counts and resource lists only cover
    resources that will actually be processed.

    Returns a dict mapping each dataset to a set of its remaining resources.
    """
    # Later entries for the same old-resource overwrite earlier ones.
    replacement_for = {}
    for row in collection.old_resource.entries:
        replacement_for[row["old-resource"]] = row["resource"]

    active = {}
    for name, hashes in collection.dataset_resource_map().items():
        kept = set()
        for resource_hash in hashes:
            # Unlisted resources fall back to their own hash, which keeps them.
            if replacement_for.get(resource_hash, resource_hash):
                kept.add(resource_hash)
        active[name] = kept
    return active
133+
112134 def get_transform_count (
113135 collection : Collection ,
114136 dataset_resource_dir = None ,
@@ -118,11 +140,12 @@ def get_transform_count(
118140 ):
119141 """Calculate the number of transformations that need to be completed.
120142
121- When dataset_resource_dir is provided, only resources whose existing log
143+ Retired resources with no replacement are excluded. When
144+ dataset_resource_dir is provided, only resources whose existing log
122145 differs from the current code version, config hash, or specification hash
123- are counted. If None, all resources are counted.
146+ are counted. If None, all active resources are counted.
124147 """
125- dataset_resource = collection . dataset_resource_map ( )
148+ dataset_resource = State . _active_dataset_resources ( collection )
126149
127150 if dataset_resource_dir is None :
128151 return sum (len (resources ) for resources in dataset_resource .values ())
@@ -150,11 +173,12 @@ def get_transform_count_by_dataset(
150173 ):
151174 """Calculate the number of transformations needed per dataset.
152175
153- When dataset_resource_dir is provided, only resources whose existing log
176+ Retired resources with no replacement are excluded. When
177+ dataset_resource_dir is provided, only resources whose existing log
154178 differs from the current code version, config hash, or specification hash
155- are counted. If None, all resources are counted.
179+ are counted. If None, all active resources are counted.
156180 """
157- dataset_resource = collection . dataset_resource_map ( )
181+ dataset_resource = State . _active_dataset_resources ( collection )
158182 transform_count_by_dataset = {}
159183 for dataset , resources in dataset_resource .items ():
160184 if dataset_resource_dir is None :
@@ -174,6 +198,43 @@ def get_transform_count_by_dataset(
174198 )
175199 return transform_count_by_dataset
176200
def get_transform_resources_by_dataset(
    collection: Collection,
    dataset_resource_dir=None,
    current_code_version=None,
    current_config_hash=None,
    current_specification_hash=None,
):
    """List, per dataset, the resource hashes that still need transforming.

    The starting point is ``State._active_dataset_resources(collection)``, so
    retired resources without a replacement never appear in the result.

    If ``dataset_resource_dir`` is None, every active resource is listed.
    Otherwise a resource is listed only when ``resource_needs_processing``
    reports it as needing work for the given code version, config hash and
    specification hash.

    Returns a dict mapping dataset name to a sorted list of resource hashes.
    """
    active = State._active_dataset_resources(collection)

    # No log directory to compare against: everything active is pending.
    if dataset_resource_dir is None:
        return {name: sorted(hashes) for name, hashes in active.items()}

    def needs_work(name, resource_hash):
        return resource_needs_processing(
            dataset_resource_dir,
            name,
            resource_hash,
            current_code_version,
            current_config_hash,
            current_specification_hash,
        )

    return {
        name: sorted(
            resource_hash
            for resource_hash in hashes
            if needs_work(name, resource_hash)
        )
        for name, hashes in active.items()
    }
237+
177238
178239def compare_state (
179240 specification_dir ,
@@ -207,6 +268,9 @@ def compare_state(
207268 # transform count by dataset should not be compared as it changes
208269 current .pop ("transform_count_by_dataset" , None )
209270 compare .pop ("transform_count_by_dataset" , None )
271+ # transform resources should not be compared as it changes
272+ current .pop ("transform_resources" , None )
273+ compare .pop ("transform_resources" , None )
210274
211275 if current == compare :
212276 return None
0 commit comments