@@ -218,7 +218,7 @@ def pipeline_run(
218218 pipeline ,
219219 specification ,
220220 input_path ,
221- output_path ,
221+ output_path : Path ,
222222 collection_dir , # TBD: remove, replaced by endpoints, organisations and entry_date
223223 null_path = None , # TBD: remove this
224224 issue_dir = None ,
@@ -398,9 +398,9 @@ def pipeline_run(
398398 column_field_log .save (os .path .join (column_field_dir , resource + ".csv" ))
399399 dataset_resource_log .save (os .path .join (dataset_resource_dir , resource + ".csv" ))
400400 converted_resource_log .save (os .path .join (converted_resource_dir , resource + ".csv" ))
401- # create converted parquet in the var directory
402- cache_dir = Path ( organisation_path ). parent
403- transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset
401+ # create converted parquet in the var directory
402+ # TODO test without the output_path conversion above to make sure we have a test that would have failed
403+ transformed_parquet_dir = output_path . parent
404404 transformed_parquet_dir .mkdir (exist_ok = True , parents = True )
405405 convert_tranformed_csv_to_pq (
406406 input_path = output_path ,
@@ -412,7 +412,7 @@ def pipeline_run(
412412# build dataset from processed resources
413413#
414414def dataset_create (
415- input_paths ,
415+ input_dir ,
416416 output_path ,
417417 organisation_path ,
418418 pipeline ,
@@ -424,19 +424,39 @@ def dataset_create(
424424 cache_dir = "var/cache" ,
425425 resource_path = "collection/resource.csv" ,
426426):
427+ """
428+ Create a dataset package from transformed parquet files.
429+
430+ Builds both SQLite and Parquet dataset packages from transformed resources,
431+ loading facts, entities, issues, and provenance information.
432+
433+ Args:
434+ input_dir: Directory containing transformed parquet files
435+ output_path: Path for the output SQLite database
436+ organisation_path: Path to organisation.csv file
437+ pipeline: Pipeline object containing configuration
438+ dataset: Name of the dataset to create
439+ specification: Specification object defining the dataset schema
440+ issue_dir: Directory containing issue logs (default: "issue")
441+ column_field_dir: Directory for column-field mappings (default: "var/column-field")
442+ dataset_resource_dir: Directory for dataset-resource mappings (default: "var/dataset-resource")
443+ cache_dir: Directory for caching intermediate files (default: "var/cache")
444+ resource_path: Path to resource.csv file (default: "collection/resource.csv")
445+ """
427446 # set level for logging to see what's going on
428447 logger .setLevel (logging .INFO )
429448 logging .getLogger ("digital_land.package.dataset_parquet" ).setLevel (logging .INFO )
430449
431- # chek all paths are paths
450+ # check all paths are paths
432451 issue_dir = Path (issue_dir )
433452 column_field_dir = Path (column_field_dir )
434453 dataset_resource_dir = Path (dataset_resource_dir )
435454 cache_dir = Path (cache_dir )
436455 resource_path = Path (resource_path )
456+ input_dir = Path (input_dir )
437457
438458 # get the transformed files from the cache directory this is assumed right now but we may want to be stricter in the future
439- transformed_parquet_dir = cache_dir / "transformed_parquet" / dataset
459+ # transformed parquet files are now read from the caller-supplied input_dir
440460
441461 # create directory for dataset_parquet_package, will create a general provenance one for now
442462 dataset_parquet_path = cache_dir / "provenance"
@@ -460,13 +480,12 @@ def dataset_create(
460480 # don't use create as we don't want to create the indexes
461481 package .create_database ()
462482 package .disconnect ()
463- for path in input_paths :
464- path_obj = Path (path )
483+ for path in input_dir .glob ("*.parquet" ):
465484 logging .info (f"loading column field log into { output_path } " )
466- package .load_column_fields (column_field_dir / dataset / f"{ path_obj .stem } .csv" )
485+ package .load_column_fields (column_field_dir / dataset / f"{ path .stem } .csv" )
467486 logging .info (f"loading dataset resource log into { output_path } " )
468487 package .load_dataset_resource (
469- dataset_resource_dir / dataset / f"{ path_obj .stem } .csv"
488+ dataset_resource_dir / dataset / f"{ path .stem } .csv"
470489 )
471490 logger .info (f"loading old entities into { output_path } " )
472491 old_entity_path = Path (pipeline .path ) / "old-entity.csv"
@@ -476,8 +495,8 @@ def dataset_create(
476495 logger .info (f"loading issues into { output_path } " )
477496 issue_paths = issue_dir / dataset
478497 if issue_paths .exists ():
479- for issue_path in os . listdir ( issue_paths ):
480- package .load_issues (os . path . join ( issue_paths , issue_path ) )
498+ for issue_path in issue_paths . glob ( "*.csv" ):
499+ package .load_issues (issue_path )
481500 else :
482501 logger .warning ("No directory for this dataset in the provided issue_directory" )
483502
@@ -491,7 +510,7 @@ def dataset_create(
491510 path = dataset_parquet_path ,
492511 specification_dir = None , # TBD: package should use this specification object
493512 duckdb_path = cache_dir / "overflow.duckdb" ,
494- transformed_parquet_dir = transformed_parquet_dir ,
513+ transformed_parquet_dir = input_dir ,
495514 )
496515 # To find facts we have a complex SQL window function that can cause memory issues. To aid the allocation of memory
497516 # we decide on a parquet strategy, based on how many parquet files we have, the overall size of these
@@ -503,10 +522,10 @@ def dataset_create(
503522
504523 # Group parquet files into approx 256MB batches (if needed)
505524 if pqpackage .strategy != "direct" :
506- pqpackage .group_parquet_files (transformed_parquet_dir , target_mb = 256 )
507- pqpackage .load_facts (transformed_parquet_dir )
508- pqpackage .load_fact_resource (transformed_parquet_dir )
509- pqpackage .load_entities (transformed_parquet_dir , resource_path , organisation_path )
525+ pqpackage .group_parquet_files (input_dir , target_mb = 256 )
526+ pqpackage .load_facts (input_dir )
527+ pqpackage .load_fact_resource (input_dir )
528+ pqpackage .load_entities (input_dir , resource_path , organisation_path )
510529
511530 logger .info ("loading fact,fact_resource and entity into {output_path}" )
512531 pqpackage .load_to_sqlite (output_path )
0 commit comments