From 5838bf2cbdf28d895d6d4eae5af1bc9c7b523447 Mon Sep 17 00:00:00 2001
From: Eugenio Lorente <46348947+eugenioLR@users.noreply.github.com>
Date: Tue, 9 Sep 2025 13:44:06 +0200
Subject: [PATCH 1/4] Reworked the data acquisition script and fixed bugs with reanalysis data (#5)

* added command line arguments to data acquisition script

* renamed directory and updated parameters and example script

* added argparse import

* removed unnecessary files

* solved bug in reanalysis data

* fixed path bug

* added reanalysis to copy_files.py

* added check for reanalysis data and removed unused data.

---
 README.md                                     |   4 +-
 .../SLURM_data_acquisition.sh                 |   2 +-
 .../copy_files.py                             |  27 ++-
 .../copy_files.sh                             |   0
 .../data_acq_freva_search_ECROPS.py           | 170 ++++--------------
 src/climate_data_acq/data_acquisition_main.py | 101 +++++++++++
 src/cmip6_data_acq/0_data_acq_main_ECROPS.py  | 148 ---------------
 7 files changed, 162 insertions(+), 290 deletions(-)
 rename src/{cmip6_data_acq => climate_data_acq}/SLURM_data_acquisition.sh (78%)
 rename src/{cmip6_data_acq => climate_data_acq}/copy_files.py (86%)
 rename src/{cmip6_data_acq => climate_data_acq}/copy_files.sh (100%)
 rename src/{cmip6_data_acq => climate_data_acq}/data_acq_freva_search_ECROPS.py (52%)
 create mode 100644 src/climate_data_acq/data_acquisition_main.py
 delete mode 100755 src/cmip6_data_acq/0_data_acq_main_ECROPS.py

diff --git a/README.md b/README.md
index 71e2130..5c17445 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ for notes on deploying the project on a live system.
 
 ### Prerequisites
 
-- Need to have acces to Levante.
+- Need to have access to Levante.
 - Need your own [conda environment](https://docs.dkrz.de/doc/levante/code-development/python.html#set-up-conda-for-individual-environments).
 
 ### Installing
@@ -35,6 +35,8 @@ of conduct, and the process for submitting pull requests to us.
 
 - **Cosmin M. 
Marina** - *Provided Initial Scripts* -
   [cosminmarina](https://github.com/cosminmarina)
+- **Eugenio Lorente-Ramos** - *Enhanced data acquisition scripts* -
+  [eugenioLR](https://github.com/eugenioLR)
 
 See also the list of
 [contributors](https://github.com/cosminmarina/dkrz_utils/contributors)
diff --git a/src/cmip6_data_acq/SLURM_data_acquisition.sh b/src/climate_data_acq/SLURM_data_acquisition.sh
similarity index 78%
rename from src/cmip6_data_acq/SLURM_data_acquisition.sh
rename to src/climate_data_acq/SLURM_data_acquisition.sh
index b16c2af..48a22dd 100755
--- a/src/cmip6_data_acq/SLURM_data_acquisition.sh
+++ b/src/climate_data_acq/SLURM_data_acquisition.sh
@@ -19,6 +19,6 @@ module load python3/2022.01-gcc-11.2.0
 module load clint
 module load xces
 
-python 0_data_acq_main_ECROPS.py
+python data_acquisition_main.py -p reanalysis --era5_vars_hour "10u,10v,msl,tp,q,2t" -f hour --exp_reanalysis ERA5 --dir ./data_acq
 
diff --git a/src/cmip6_data_acq/copy_files.py b/src/climate_data_acq/copy_files.py
similarity index 86%
rename from src/cmip6_data_acq/copy_files.py
rename to src/climate_data_acq/copy_files.py
index 15cd56e..b262cdf 100755
--- a/src/cmip6_data_acq/copy_files.py
+++ b/src/climate_data_acq/copy_files.py
@@ -105,18 +105,27 @@ def main():
 
         # Extract variable and experiment from filename
         filename = os.path.basename(csv_file_path)
-        parts = filename.split('__cmip6_')[-1].split('_[')[0].split('_')
-
-        # Determine experiment and variable
-        if parts[0] == 'past2k':
-            experiment = 'past2k'
-            variable = parts[1]
-        elif parts[0].startswith('ssp'):
-            experiment = parts[0]
-            variable = parts[1]
+        if "cmip6" in filename:
+            parts = filename.split('__cmip6_')[-1].split('_[')[0].split('_')
+
+            # Determine experiment and variable
+            match parts[0]:
+                case 'past2k':
+                    experiment = 'past2k'
+                    variable = parts[1]
+                case 'historical':
+                    experiment = 'historical'
+                    variable = parts[0]
+                # a sequence pattern like ['ssp', *_] never matches a string,
+                # so the ssp experiments need a guarded wildcard instead
+                case _ if parts[0].startswith('ssp'):
+                    experiment = parts[0]
+                    variable = parts[1]
+        elif "reanalysis" in filename:
+            # reanalysis CSVs are named "<experiment>__reanalysis_<freq>_<var>.csv"
+            experiment = filename.split('__')[0]
+            variable = os.path.splitext(filename)[0].split('_')[-1]
         else:
-            experiment = 'historical'
-            variable = parts[0]
+            print(f"File {csv_file_path} could not be processed.")
+            continue
 
         # Copy files with structured paths
         copy_files_from_csv(csv_file_path, destination_folder, variable, experiment)
diff --git a/src/cmip6_data_acq/copy_files.sh b/src/climate_data_acq/copy_files.sh
similarity index 100%
rename from src/cmip6_data_acq/copy_files.sh
rename to src/climate_data_acq/copy_files.sh
diff --git a/src/cmip6_data_acq/data_acq_freva_search_ECROPS.py b/src/climate_data_acq/data_acq_freva_search_ECROPS.py
similarity index 52%
rename from src/cmip6_data_acq/data_acq_freva_search_ECROPS.py
rename to src/climate_data_acq/data_acq_freva_search_ECROPS.py
index a5dac12..ab4c423 100755
--- a/src/cmip6_data_acq/data_acq_freva_search_ECROPS.py
+++ b/src/climate_data_acq/data_acq_freva_search_ECROPS.py
@@ -11,12 +11,8 @@
 import logging
 import os
 
-# homevardir = os.path.join(os.sep, "home", "b", "b381971", 'ECROPS', 'ERA_CSVS')
-# homevardir = os.path.join(os.sep, "home", "b", "b392996", 'ECROPS', 'ERA_CSVS')
-homevardir = "/work/bb1478/b382610/wildfires/data/find_vars_cmip6/data_acq/"
 
-def freva_search_ssp(project, model, var, freq, experiment):
+def freva_search_ssp(project, model, var, freq, experiment, homevardir):
     """
     Get all the ssp files from FREVA for the inputs and write them to a csv, e.g.
     "mpi-esm1-2-hr__cmip6_ssp585_rsds_day.csv". 
@@ -41,75 +37,37 @@ def freva_search_ssp(project, model, var, freq, experiment): ## iteratable freva generator object ssp_files can either be tranformed to a list or parsed, ## not both, it lives through one iteration it seems - ssp_files_list = list( - ssp_files - ) # make the freva generator object ssp_files a list for list functions e.g. len() - ssp_files_array = np.sort(np.array(ssp_files_list)) + ssp_files_list = list(ssp_files) # make the freva generator object ssp_files a list for list functions e.g. len() + ssp_files_array = np.sort(ssp_files_list) ## 2. Get all the unique ensemble ids to be used in matching with all other ssp files all_ensembles = [] for ssp_file in ssp_files_array: res = freva.facet_search(file=ssp_file, facet="ensemble") - all_ensembles.append( - res.get("ensemble")[0] - ) # get the first (only) value of the dictionary - unique_ensembles = np.unique( - np.array(all_ensembles) - ) # then filter out only the unique ensemble values - logging.info( - str(experiment) - + " for " - + str(var) - + " unique ensemble ids = " - + str(unique_ensembles) - ) + all_ensembles.append(res.get("ensemble")[0]) # get the first (only) value of the dictionary + + unique_ensembles = np.unique(all_ensembles) # then filter out only the unique ensemble values + + logging.info(f"{experiment} for {var} unique ensemble ids = {unique_ensembles}") # Get the number of ssp files per unique ensemble id: Function is called only for logging the number of files - get_files_from_unique_ensembles( - project, model, var, freq, experiment, unique_ensembles - ) + get_files_from_unique_ensembles(project, model, var, freq, experiment, unique_ensembles) ## 3. Get all the historical datasets we need by the ensemble id in unique_ensembles - historical_files_array = get_files_from_unique_ensembles( - project, model, var, freq, "historical", unique_ensembles - ) + historical_files_array = get_files_from_unique_ensembles(project, model, var, freq, "historical", unique_ensembles) - np_historical_files_array = np.sort(np.array(historical_files_array)) + np_historical_files_array = np.sort(historical_files_array) ### logging.info(str(var) + " total HISTORICAL num of files = " + str(np_historical_files_array.size)) ## Write everything to csv files - ssp_csv_filename = ( - str(model) - + "__" - + project - + "_" - + str(experiment) - + "_" - + str(var) - + "_" - + str(freq) - + ".csv" - ) - ssp_files_array.tofile(os.path.join(os.sep, homevardir, ssp_csv_filename), sep="\n") - historical_csv_filename = ( - str(model) - + "__" - + project - + "_" - + str(experiment) - + "_" - + str(var) - + "_" - + str(freq) - + "_historical" - + ".csv" - ) - np_historical_files_array.tofile( - os.path.join(os.sep, homevardir, historical_csv_filename), sep="\n" - ) + ssp_csv_filename = f"{model}__{project}_{experiment}_{var}_{freq}.csv" + ssp_files_array.tofile(os.path.join(homevardir, ssp_csv_filename), sep="\n") + historical_csv_filename = f"{model}__{project}_{experiment}_{var}_{freq}_historical.csv" + + np_historical_files_array.tofile(os.path.join(homevardir, historical_csv_filename), sep="\n") -def freva_search_historical(project, model, var, freq): +def freva_search_historical(project, model, var, freq, homevardir): """ Retreives all the historical files from FREVA and writes them to csv, e.g. 
"mpi-esm1-2-hr__cmip6_rsds_day_allhistorical.csv" @@ -131,7 +89,7 @@ def freva_search_historical(project, model, var, freq): ## iteratable freva generator object ssp_files can either be tranformed to a list or parsed, ## not both, it lives through one iteration it seems historical_files_list = list(historical_files) - historical_files_array = np.sort(np.array(historical_files_list)) + historical_files_array = np.sort(historical_files_list) ### logging.info(str(experiment) + " for " + str(var) + " total num of files = " + str(ssp_files_array.size)) @@ -139,39 +97,21 @@ def freva_search_historical(project, model, var, freq): all_ensembles = [] for historical_file in historical_files_array: res = freva.facet_search(file=historical_file, facet="ensemble") - all_ensembles.append( - res.get("ensemble")[0] - ) # get the first and only value of the dictionary - unique_ensembles = np.unique( - np.array(all_ensembles) - ) # then filter out only the unique ensemble values - logging.info( - "Historical for " + str(var) + " unique ensemble ids = " + str(unique_ensembles) - ) + all_ensembles.append(res.get("ensemble")[0]) # get the first and only value of the dictionary + + unique_ensembles = np.unique(all_ensembles) # then filter out only the unique ensemble values + logging.info(f"Historical for {var} unique ensemble ids = {unique_ensembles}") # Get the number of historical files per unique ensemble id: Function is calles only for logging the number of files - get_files_from_unique_ensembles( - project, model, var, freq, "historical", unique_ensembles - ) + get_files_from_unique_ensembles(project, model, var, freq, "historical", unique_ensembles) ## Write everything to csv files - all_historical_csv = ( - str(model) - + "__" - + project - + "_" - + str(var) - + "_" - + str(freq) - + "_allhistorical" - + ".csv" - ) - historical_files_array.tofile( - os.path.join(os.sep, homevardir, all_historical_csv), sep="\n" - ) + all_historical_csv = f"{model}__{project}_{var}_{freq}_allhistorical.csv" + + historical_files_array.tofile(os.path.join(homevardir, all_historical_csv), sep="\n") -def freva_search_reanalysis(project, experiment, var, freq): # , geopoten_value): +def freva_search_reanalysis(project, experiment, var, freq, homevardir): # , geopoten_value): """ Retreive from FREVA all reanalysis files such as ERA5 and write the list to csv, e.g. "era5__reanalysis_day_tas.csv" @@ -183,50 +123,27 @@ def freva_search_reanalysis(project, experiment, var, freq): # , geopoten_value :return: """ ## 1. Get all the reanalysis files with a variable - reanalysis_files = freva.databrowser( - project=project, time_frequency=freq, variable=var, experiment=experiment - ) + reanalysis_files = freva.databrowser(project=project, time_frequency=freq, variable=var, experiment=experiment) reanalysis_files_list = list(reanalysis_files) - #### FOR SOME REASON THE BELOW DOES NOT WORK, TO BE DELETED, HAS BEEN SUBSTITUTED IN data_prepr_timerange_targetvar_zg - # ## 2. Get the geopotential height files we need, in case the var has this attribute (not 999999) - # if geopoten_value != 999999: - # for f in reanalysis_files_list: - # if str(geopoten_value) not in f: - # reanalysis_files_list.remove(f) - - reanalysis_files_array = np.sort(np.array(reanalysis_files_list)) + reanalysis_files_array = np.sort(reanalysis_files_list) ## 3. 
Get all the unique ensemble ids for each var
     all_ensembles = []
     for reanalysis_file in reanalysis_files_array:
         res = freva.facet_search(file=reanalysis_file, facet="ensemble")
-        all_ensembles.append(
-            res.get("ensemble")[0]
-        )  # get the first(and only) value of the dictionary
-    unique_ensembles = np.unique(
-        np.array(all_ensembles)
-    )  # then filter out only the unique ensemble values
-    logging.info(
-        str(experiment)
-        + " reanalysis for "
-        + str(var)
-        + " unique ensemble ids = "
-        + str(unique_ensembles)
-    )
+        all_ensembles.append(res.get("ensemble")[0])  # get the first(and only) value of the dictionary
+
+    unique_ensembles = np.unique(all_ensembles)  # then filter out only the unique ensemble values
+    logging.info(f"{experiment} reanalysis for {var} unique ensemble ids = {unique_ensembles}")
 
     ## Write everything to csv files
-    all_reanalysis_csv_filename = (
-        str(experiment) + "__" + project + "_" + str(freq) + "_" + str(var) + ".csv"
-    )
-    reanalysis_files_array.tofile(
-        os.path.join(os.sep, homevardir, all_reanalysis_csv_filename), sep="\n"
-    )
+    all_reanalysis_csv_filename = f"{experiment}__{project}_{freq}_{var}.csv"
+    reanalysis_files_array.tofile(os.path.join(homevardir, all_reanalysis_csv_filename), sep="\n")
 
 
-def get_files_from_unique_ensembles(
-    project, model, var, freq, experiment, unique_ensemble_list
-):
+
+def get_files_from_unique_ensembles(project, model, var, freq, experiment, unique_ensemble_list):
     """
     The inputs to this function are internal, although dictated by the data_acq_main.py .
     This function is called internally in order to retrieve from FREVA items using their ensemble id, used for corresponding ssp and historical
@@ -249,18 +166,9 @@
         time_frequency=freq,
         experiment=experiment,
    )
     n = 0
     for file in files:
         n = n + 1
         files_array.append(file)
-    logging.info(
-        str(experiment)
-        + " "
-        + str(var)
-        + " files for ensemble "
-        + str(unique_ens)
-        + " = "
-        + str(n)
-    )
+
+    # n counts the files freva returned for this ensemble; the freva result is
+    # a one-shot generator, so len() cannot be used on it directly
+    logging.info(f"{experiment} {var} files for ensemble {unique_ens} = {n}")
 
     return files_array
diff --git a/src/climate_data_acq/data_acquisition_main.py b/src/climate_data_acq/data_acquisition_main.py
new file mode 100644
index 0000000..43a5a5f
--- /dev/null
+++ b/src/climate_data_acq/data_acquisition_main.py
@@ -0,0 +1,101 @@
+#################################################################################
+# Title: Main class routine for searching and logging available FREVA datasets
+# module load order: python3, clint, xces, then run script
+# Author: Odysseas Vlachopoulos, Cosmin M. 
Marina, Eugenio Lorente-Ramos
+# Project: testing
+##################################################################################
+
+import logging
+import sys
+
+import data_acq_freva_search_ECROPS
+import os
+import argparse
+
+
+def copy_data(projects, models, variables_cmip, variables_era5_daily_monthly, variables_era5_hourly, frequency, exp_cmip, exp_reanalysis, homevardir):
+    # First initialize a logger instance
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+        force=True,
+        handlers=[
+            logging.FileHandler("LOG_Data_Acquisition_FREVA_output.log"),
+            logging.StreamHandler(sys.stdout),
+        ],
+    )
+    logging.info("Started Freva files main programme \n")
+    freq_longname_map = {"mon": "monthly", "day": "daily", "hour": "hourly"}
+
+    for project in projects:
+        project = project.lower()
+        match project:
+            case "cmip6":
+                for model in models:
+                    model = model.lower()
+                    for exp in exp_cmip6:
+                        exp = exp.lower()
+                        for var in variables_cmip:
+                            var = var.lower()
+                            logging.info(f"\n \nMODEL: {model}, EXPERIMENT: {exp}, VARIABLE: {var}, FREQUENCY: {freq}\n")
+
+                            if exp == "historical":
+                                data_acq_freva_search_ECROPS.freva_search_historical(project, model, var, frequency, homevardir)
+                                logging.info("\n\n **** Finished with Historical files **** \n\n")
+                            else:
+                                data_acq_freva_search_ECROPS.freva_search_ssp(project, model, var, frequency, exp, homevardir)
+                                logging.info("\n\n **** Finished with SSP files **** \n \n")
+
+            case "reanalysis":
+                for freq in frequency:
+                    for exp_reanalysis_i in exp_reanalysis:
+                        freq = freq.lower()
+                        freq_longname = freq_longname_map[freq]
+
+                        var_set = None
+                        match freq:
+                            case "mon" | "day":
+                                var_set = variables_era5_daily_monthly
+                            case "hour":
+                                var_set = variables_era5_hourly
+                            case _:
+                                raise ValueError("Incorrect frequency, try 'mon', 'day' or 'hour'.")
+
+                        for var in var_set:
+                            var = var.lower()
+                            logging.info(f"\n \nPROJECT: {project}, EXPERIMENT: {exp_reanalysis_i}, VARIABLE: {var}, FREQUENCY: {freq}\n")
+                            data_acq_freva_search_ECROPS.freva_search_reanalysis(project, exp_reanalysis_i, var, freq, homevardir)
+                            logging.info(f"\n\n **** Finished with ERA5 {freq_longname} data files **** \n \n")
+
+            case _:
+                raise ValueError(f"Project {project} not recognized, try 'cmip6' or 'reanalysis'")
+
+
+def main():
+    parser = argparse.ArgumentParser(prog="DKRZ Data path downloader.")
+    parser.add_argument("-p", "--projects", default="reanalysis")
+    parser.add_argument("-m", "--models", default="")
+    parser.add_argument("--cmip6_vars", default="")
+    parser.add_argument("--era5_vars_month", default="")
+    parser.add_argument("--era5_vars_hour", default="")
+    parser.add_argument("--exp_cmip", default="")
+    parser.add_argument("--exp_reanalysis", default="era5")
+    parser.add_argument("-f", "--frequency", default="")
+    parser.add_argument("-d", "--dir", default="./data_acq")
+    args = parser.parse_args()
+
+    copy_data(
+        args.projects.split(","),
+        args.models.split(","),
+        args.cmip6_vars.split(","),
+        args.era5_vars_month.split(","),
+        args.era5_vars_hour.split(","),
+        args.frequency.split(","),
+        args.exp_cmip.split(","),
+        args.exp_reanalysis.split(","),
+        args.dir,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/cmip6_data_acq/0_data_acq_main_ECROPS.py b/src/cmip6_data_acq/0_data_acq_main_ECROPS.py
deleted file mode 100755
index f191cc1..0000000
--- a/src/cmip6_data_acq/0_data_acq_main_ECROPS.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#################################################################################
-# 
Title: Main class routine for searching and logging available FREVA datasets -# module load order: python3, clint, xces, then run script -# Author: Odysseas Vlachopoulos -# Project: testing -################################################################################## - -import logging -import sys - -# from FREVA import freva_search -import data_acq_freva_search_ECROPS -import os - -# projects = ['cmip6', 'reanalysis'] -projects = ["cmip6"] -# models = ['cesm2', -# 'cnrm-cm6-1-HR', -# 'gfdl-esm4', -# 'ec-earth3', -# 'mpi-esm1-2-hr', -# 'noresm2-mm', -# 'hadgem3-gc31-mm'] -models = ["mpi-esm1-2-lr"] - -# models = [] # DO NOT DO ANYTHING FOR CMIP6 -variables_cmip = ["tdps", "ua", "va", "tasmax", "lai"] - -# variables_era5_daily_monthly = ['tasmax', 'tasmin', 'tas', 'pr', 'rsds', 'tdps', 'sfcwind', 'hurs'] -variables_era5_daily_monthly = ["tdps", "ua", "va", "tasmax", "lai"] -# variables_era5_hourly = ['uas', 'vas'] -variables_era5_hourly: list[str] = [] - -# variables_era5_hourly = ['uas', 'vas', 'rsds', 'tdps'] -# 10m wind speed vas and uas are calculated with ECROPS function in wofost_util/util.py wind10to2(wind10) function - - -geopotential_height = 50000 # 500hPa -vorticity_height = 20000 # 200hPa - -# frequency = ['hour', 'day', 'mon'] -frequency = ["day"] -# frequency = ['mon'] -# exp_cmip6 = ['ssp370', 'ssp585', 'historical'] -exp_cmip6 = ["historical", "past2k"] -exp_reanalysis = "era5" - - -def main(): - # First initialize a logger instance - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - force=True, - handlers=[ - logging.FileHandler("LOG_Data_Acquisition_FREVA_output.log"), - logging.StreamHandler(sys.stdout), - ], - ) - logging.info("Started Freva files main programme \n") - - for project in projects: - if project == "cmip6": - for i in range(len(models)): - for exp in exp_cmip6: - for var in variables_cmip: - logging.info( - "\n \n" - + "MODEL: " - + str(models[i]) - + ", EXPERIMENT: " - + str(exp) - + ", VARIABLE: " - + str(var) - + ", FREQUENCY: " - + str(frequency) - + "\n" - ) - if not exp == "historical": - data_acq_freva_search_ECROPS.freva_search_ssp( - project, models[i], var, frequency, exp - ) - logging.info( - "\n\n **** Finished with SSP files **** \n \n" - ) - if exp == "historical": - data_acq_freva_search_ECROPS.freva_search_historical( - project, models[i], var, frequency - ) - logging.info( - "\n\n **** Finished with Historical files **** \n\n" - ) - - if project == "reanalysis": - for var in variables_era5_daily_monthly: - logging.info( - "\n \n" - + "PROJECT: " - + str(project) - + ", EXPERIMENT: " - + str(exp_reanalysis) - + ", VARIABLE: " - + str(var) - + ", FREQUENCY: " - + str(frequency[2]) - + "\n" - ) - data_acq_freva_search_ECROPS.freva_search_reanalysis( - project, exp_reanalysis, var, frequency[2] - ) - - for var in variables_era5_daily_monthly: - logging.info( - "\n \n" - + "PROJECT: " - + str(project) - + ", EXPERIMENT: " - + str(exp_reanalysis) - + ", VARIABLE: " - + str(var) - + ", FREQUENCY: " - + str(frequency[1]) - + "\n" - ) - data_acq_freva_search_ECROPS.freva_search_reanalysis( - project, exp_reanalysis, var, frequency[1] - ) - - for var in variables_era5_hourly: - logging.info( - "\n \n" - + "PROJECT: " - + str(project) - + ", EXPERIMENT: " - + str(exp_reanalysis) - + ", VARIABLE: " - + str(var) - + ", FREQUENCY: " - + str(frequency[0]) - + "\n" - ) - data_acq_freva_search_ECROPS.freva_search_reanalysis( - project, exp_reanalysis, var, frequency[0] - ) - - -if __name__ == "__main__": - 
main()

From 5e8d59ab8c2112542659d2ffd5ff66553d9b7efe Mon Sep 17 00:00:00 2001
From: eugenioLR
Date: Tue, 9 Sep 2025 14:01:30 +0200
Subject: [PATCH 2/4] fixed undefined variable bugs

---
 src/climate_data_acq/data_acquisition_main.py | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/climate_data_acq/data_acquisition_main.py b/src/climate_data_acq/data_acquisition_main.py
index 43a5a5f..6da69cd 100644
--- a/src/climate_data_acq/data_acquisition_main.py
+++ b/src/climate_data_acq/data_acquisition_main.py
@@ -13,7 +13,7 @@
 import argparse
 
 
-def copy_data(projects, models, variables_cmip, variables_era5_daily_monthly, variables_era5_hourly, frequency, exp_cmip, exp_reanalysis, homevardir):
+def copy_data(projects, models, variables_cmip, variables_era5_daily_monthly, variables_era5_hourly, frequency, exp_cmip6, exp_reanalysis, homevardir):
     # First initialize a logger instance
     logging.basicConfig(
         level=logging.INFO,
@@ -33,18 +33,20 @@ def copy_data(projects, models, variables_cmip, variables_era5_daily_monthly, va
             case "cmip6":
                 for model in models:
                     model = model.lower()
-                    for exp in exp_cmip6:
-                        exp = exp.lower()
-                        for var in variables_cmip:
-                            var = var.lower()
-                            logging.info(f"\n \nMODEL: {model}, EXPERIMENT: {exp}, VARIABLE: {var}, FREQUENCY: {freq}\n")
+                    for freq in frequency:
+                        freq = freq.lower()
+                        for exp in exp_cmip6:
+                            exp = exp.lower()
+                            for var in variables_cmip:
+                                var = var.lower()
+                                logging.info(f"\n \nMODEL: {model}, EXPERIMENT: {exp}, VARIABLE: {var}, FREQUENCY: {freq}\n")
 
-                            if exp == "historical":
-                                data_acq_freva_search_ECROPS.freva_search_historical(project, model, var, frequency, homevardir)
-                                logging.info("\n\n **** Finished with Historical files **** \n\n")
-                            else:
-                                data_acq_freva_search_ECROPS.freva_search_ssp(project, model, var, frequency, exp, homevardir)
-                                logging.info("\n\n **** Finished with SSP files **** \n \n")
+                                if exp == "historical":
+                                    data_acq_freva_search_ECROPS.freva_search_historical(project, model, var, freq, homevardir)
+                                    logging.info("\n\n **** Finished with Historical files **** \n\n")
+                                else:
+                                    data_acq_freva_search_ECROPS.freva_search_ssp(project, model, var, freq, exp, homevardir)
+                                    logging.info("\n\n **** Finished with SSP files **** \n \n")
 
             case "reanalysis":
                 for freq in frequency:

From f5906b196aaecd311ae9899571040f9b449b8749 Mon Sep 17 00:00:00 2001
From: eugenioLR
Date: Tue, 9 Sep 2025 14:08:56 +0200
Subject: [PATCH 3/4] Fixed undefined variable bug and refactored

---
 src/climate_data_acq/copy_files.py            | 78 +++++++++++--------
 src/climate_data_acq/data_acquisition_main.py |  5 +-
 2 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/src/climate_data_acq/copy_files.py b/src/climate_data_acq/copy_files.py
index b262cdf..1025b07 100755
--- a/src/climate_data_acq/copy_files.py
+++ b/src/climate_data_acq/copy_files.py
@@ -5,6 +5,7 @@
 import sys
 import argparse
 
+
 def copy_files_from_csv(csv_file_path, destination_folder, variable, experiment):
     """
     Copies files listed in a CSV file to a structured destination folder.
@@ -15,14 +16,14 @@ def copy_files_from_csv(csv_file_path, destination_folder, variable, experiment)
     :param experiment: Experiment name (e.g., 'historical', 'past2k'). 
""" # Open the CSV file and read the file paths - with open(csv_file_path, mode='r') as csv_file: + with open(csv_file_path, mode="r") as csv_file: csv_reader = csv.reader(csv_file) - + for row in csv_reader: original_file_path = row[0].strip() - + # Extract ensemble name from the file path - path_components = original_file_path.split('/') + path_components = original_file_path.split("/") try: # Find the position of the experiment in the path exp_index = path_components.index(experiment) @@ -30,20 +31,20 @@ def copy_files_from_csv(csv_file_path, destination_folder, variable, experiment) except (ValueError, IndexError): print(f"Could not extract ensemble from: {original_file_path}") continue - + # Build destination path based on experiment type - if experiment.startswith('ssp'): + if experiment.startswith("ssp"): # Projections: destination_folder/projections//// - dest_dir = os.path.join(destination_folder, variable, 'projections', experiment, ensemble) + dest_dir = os.path.join(destination_folder, variable, "projections", experiment, ensemble) else: # Historical/Past2K: destination_folder//// dest_dir = os.path.join(destination_folder, variable, experiment, ensemble) os.makedirs(dest_dir, exist_ok=True) - + # Copy file to destination file_name = os.path.basename(original_file_path) dest_file_path = os.path.join(dest_dir, file_name) - + try: shutil.copy2(original_file_path, dest_file_path) print(f"Copied: {original_file_path} -> {dest_file_path}") @@ -55,23 +56,31 @@ def copy_files_from_csv(csv_file_path, destination_folder, variable, experiment) print(f"Error copying {original_file_path}: {e}") sys.stdout.flush() + def main(): # Set up command line arguments - parser = argparse.ArgumentParser( - description='Copy CMIP6 files to structured directories based on CSV lists.' 
+    parser = argparse.ArgumentParser(description="Copy CMIP6 files to structured directories based on CSV lists.")
+    parser.add_argument(
+        "-s",
+        "--source",
+        default="./data_acq/",
+        help="Folder containing CSV files (default: ./data_acq/)",
+    )
+    parser.add_argument(
+        "-d",
+        "--dest",
+        default="./data_raw/",
+        help="Destination base folder (default: ./data_raw/)",
+    )
+    parser.add_argument(
+        "-p",
+        "--pattern",
+        default="*.csv",
+        help="Glob pattern to select specific CSV files (default: *.csv)",
     )
-    parser.add_argument('-s', '--source',
-                        default='./data_acq/',
-                        help='Folder containing CSV files (default: ./data_acq/)')
-    parser.add_argument('-d', '--dest',
-                        default='./data_raw/',
-                        help='Destination base folder (default: ./data_raw/)')
-    parser.add_argument('-p', '--pattern',
-                        default='*.csv',
-                        help='Glob pattern to select specific CSV files (default: *.csv)')
-    
+
     args = parser.parse_args()
-    
+
     # Use the paths from arguments (or defaults if not provided)
     data_acq_folder = args.source
     destination_folder = args.dest
@@ -82,11 +91,11 @@ def main():
         data_acq_folder += os.path.sep
     if not destination_folder.endswith(os.path.sep):
         destination_folder += os.path.sep
-    
+
     # Find matching CSV files using pattern
     search_pattern = os.path.join(data_acq_folder, file_pattern)
     csv_files = sorted(glob.glob(search_pattern))
-    
+
     print(f"Source folder: {data_acq_folder}")
     print(f"Destination folder: {destination_folder}")
     print(f"Search pattern: {file_pattern}")
@@ -102,33 +111,34 @@ def main():
     for csv_file_path in csv_files:
         print(f"Processing CSV: {csv_file_path}")
         sys.stdout.flush()
-    
+
         # Extract variable and experiment from filename
         filename = os.path.basename(csv_file_path)
         if "cmip6" in filename:
-            parts = filename.split('__cmip6_')[-1].split('_[')[0].split('_')
+            parts = filename.split("__cmip6_")[-1].split("_[")[0].split("_")
 
             # Determine experiment and variable
             match parts[0]:
-                case 'past2k':
-                    experiment = 'past2k'
+                case "past2k":
+                    experiment = "past2k"
                     variable = parts[1]
-                case 'historical':
-                    experiment = 'historical'
+                case "historical":
+                    experiment = "historical"
                     variable = parts[0]
                 # a sequence pattern like ['ssp', *_] never matches a string,
                 # so the ssp experiments need a guarded wildcard instead
-                case _ if parts[0].startswith('ssp'):
+                case _ if parts[0].startswith("ssp"):
                     experiment = parts[0]
                     variable = parts[1]
         elif "reanalysis" in filename:
             # reanalysis CSVs are named "<experiment>__reanalysis_<freq>_<var>.csv"
-            experiment = filename.split('__')[0]
-            variable = os.path.splitext(filename)[0].split('_')[-1]
+            experiment = filename.split("__")[0]
+            variable = os.path.splitext(filename)[0].split("_")[-1]
         else:
             print(f"File {csv_file_path} could not be processed.")
             continue
-    
+
         # Copy files with structured paths
         copy_files_from_csv(csv_file_path, destination_folder, variable, experiment)
 
+
 if __name__ == "__main__":
     main()
diff --git a/src/climate_data_acq/data_acquisition_main.py b/src/climate_data_acq/data_acquisition_main.py
index 6da69cd..b45f108 100644
--- a/src/climate_data_acq/data_acquisition_main.py
+++ b/src/climate_data_acq/data_acquisition_main.py
@@ -9,11 +9,12 @@
 import sys
 
 import data_acq_freva_search_ECROPS
-import os
 import argparse
 
 
-def copy_data(projects, models, variables_cmip, variables_era5_daily_monthly, variables_era5_hourly, frequency, exp_cmip6, exp_reanalysis, homevardir):
+def copy_data(
+    projects, models, variables_cmip, variables_era5_daily_monthly, variables_era5_hourly, frequency, exp_cmip6, exp_reanalysis, homevardir
+):
     # First initialize a logger instance
     logging.basicConfig(
         level=logging.INFO,

From c484f786812491c4b30f2d9b79037c0ccce8d702 Mon Sep 17 00:00:00 2001
From: eugenioLR
Date: Tue, 9 Sep 2025 16:32:49 +0200
Subject: [PATCH 4/4] Refactored with black

---
src/climate_data_acq/copy_files.py            | 16 +++-
 .../data_acq_freva_search_ECROPS.py           | 78 ++++++++++++-----
 src/climate_data_acq/data_acquisition_main.py | 50 +++++++---
 3 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/src/climate_data_acq/copy_files.py b/src/climate_data_acq/copy_files.py
index 1025b07..7015d64 100755
--- a/src/climate_data_acq/copy_files.py
+++ b/src/climate_data_acq/copy_files.py
@@ -35,10 +35,14 @@ def copy_files_from_csv(csv_file_path, destination_folder, variable, experiment)
             # Build destination path based on experiment type
             if experiment.startswith("ssp"):
                 # Projections: destination_folder/<variable>/projections/<experiment>/<ensemble>
-                dest_dir = os.path.join(destination_folder, variable, "projections", experiment, ensemble)
+                dest_dir = os.path.join(
+                    destination_folder, variable, "projections", experiment, ensemble
+                )
             else:
                 # Historical/Past2K: destination_folder/<variable>/<experiment>/<ensemble>
-                dest_dir = os.path.join(destination_folder, variable, experiment, ensemble)
+                dest_dir = os.path.join(
+                    destination_folder, variable, experiment, ensemble
+                )
 
             os.makedirs(dest_dir, exist_ok=True)
 
@@ -59,7 +63,9 @@ def copy_files_from_csv(csv_file_path, destination_folder, variable, experiment)
 
 def main():
     # Set up command line arguments
-    parser = argparse.ArgumentParser(description="Copy CMIP6 files to structured directories based on CSV lists.")
+    parser = argparse.ArgumentParser(
+        description="Copy CMIP6 files to structured directories based on CSV lists."
+    )
     parser.add_argument(
         "-s",
         "--source",
@@ -103,7 +109,9 @@ def main():
         sys.stdout.flush()
 
     if not csv_files:
-        print(f"No CSV files found matching pattern: '{file_pattern}' in {data_acq_folder}")
+        print(
+            f"No CSV files found matching pattern: '{file_pattern}' in {data_acq_folder}"
+        )
         sys.stdout.flush()
         return
 
diff --git a/src/climate_data_acq/data_acq_freva_search_ECROPS.py b/src/climate_data_acq/data_acq_freva_search_ECROPS.py
index ab4c423..fcf5bc7 100755
--- a/src/climate_data_acq/data_acq_freva_search_ECROPS.py
+++ b/src/climate_data_acq/data_acq_freva_search_ECROPS.py
@@ -37,24 +37,34 @@ def freva_search_ssp(project, model, var, freq, experiment, homevardir):
 
     ## iteratable freva generator object ssp_files can either be tranformed to a list or parsed,
     ## not both, it lives through one iteration it seems
-    ssp_files_list = list(ssp_files)  # make the freva generator object ssp_files a list for list functions e.g. len()
+    ssp_files_list = list(
+        ssp_files
+    )  # make the freva generator object ssp_files a list for list functions e.g. len()
     ssp_files_array = np.sort(ssp_files_list)
 
     ## 2. 
Get all the unique ensemble ids to be used in matching with all other ssp files all_ensembles = [] for ssp_file in ssp_files_array: res = freva.facet_search(file=ssp_file, facet="ensemble") - all_ensembles.append(res.get("ensemble")[0]) # get the first (only) value of the dictionary + all_ensembles.append( + res.get("ensemble")[0] + ) # get the first (only) value of the dictionary - unique_ensembles = np.unique(all_ensembles) # then filter out only the unique ensemble values + unique_ensembles = np.unique( + all_ensembles + ) # then filter out only the unique ensemble values logging.info(f"{experiment} for {var} unique ensemble ids = {unique_ensembles}") # Get the number of ssp files per unique ensemble id: Function is called only for logging the number of files - get_files_from_unique_ensembles(project, model, var, freq, experiment, unique_ensembles) + get_files_from_unique_ensembles( + project, model, var, freq, experiment, unique_ensembles + ) ## 3. Get all the historical datasets we need by the ensemble id in unique_ensembles - historical_files_array = get_files_from_unique_ensembles(project, model, var, freq, "historical", unique_ensembles) + historical_files_array = get_files_from_unique_ensembles( + project, model, var, freq, "historical", unique_ensembles + ) np_historical_files_array = np.sort(historical_files_array) ### logging.info(str(var) + " total HISTORICAL num of files = " + str(np_historical_files_array.size)) @@ -62,9 +72,13 @@ def freva_search_ssp(project, model, var, freq, experiment, homevardir): ## Write everything to csv files ssp_csv_filename = f"{model}__{project}_{experiment}_{var}_{freq}.csv" ssp_files_array.tofile(os.path.join(homevardir, ssp_csv_filename), sep="\n") - historical_csv_filename = f"{model}__{project}_{experiment}_{var}_{freq}_historical.csv" + historical_csv_filename = ( + f"{model}__{project}_{experiment}_{var}_{freq}_historical.csv" + ) - np_historical_files_array.tofile(os.path.join(homevardir, historical_csv_filename), sep="\n") + np_historical_files_array.tofile( + os.path.join(homevardir, historical_csv_filename), sep="\n" + ) def freva_search_historical(project, model, var, freq, homevardir): @@ -97,21 +111,31 @@ def freva_search_historical(project, model, var, freq, homevardir): all_ensembles = [] for historical_file in historical_files_array: res = freva.facet_search(file=historical_file, facet="ensemble") - all_ensembles.append(res.get("ensemble")[0]) # get the first and only value of the dictionary + all_ensembles.append( + res.get("ensemble")[0] + ) # get the first and only value of the dictionary - unique_ensembles = np.unique(all_ensembles) # then filter out only the unique ensemble values + unique_ensembles = np.unique( + all_ensembles + ) # then filter out only the unique ensemble values logging.info(f"Historical for {var} unique ensemble ids = {unique_ensembles}") # Get the number of historical files per unique ensemble id: Function is calles only for logging the number of files - get_files_from_unique_ensembles(project, model, var, freq, "historical", unique_ensembles) + get_files_from_unique_ensembles( + project, model, var, freq, "historical", unique_ensembles + ) ## Write everything to csv files all_historical_csv = f"{model}__{project}_{var}_{freq}_allhistorical.csv" - historical_files_array.tofile(os.path.join(homevardir, all_historical_csv), sep="\n") + historical_files_array.tofile( + os.path.join(homevardir, all_historical_csv), sep="\n" + ) -def freva_search_reanalysis(project, experiment, var, freq, homevardir): # , 
geopoten_value): +def freva_search_reanalysis( + project, experiment, var, freq, homevardir +): # , geopoten_value): """ Retreive from FREVA all reanalysis files such as ERA5 and write the list to csv, e.g. "era5__reanalysis_day_tas.csv" @@ -123,7 +147,9 @@ def freva_search_reanalysis(project, experiment, var, freq, homevardir): # , ge :return: """ ## 1. Get all the reanalysis files with a variable - reanalysis_files = freva.databrowser(project=project, time_frequency=freq, variable=var, experiment=experiment) + reanalysis_files = freva.databrowser( + project=project, time_frequency=freq, variable=var, experiment=experiment + ) reanalysis_files_list = list(reanalysis_files) reanalysis_files_array = np.sort(reanalysis_files_list) @@ -132,18 +158,28 @@ def freva_search_reanalysis(project, experiment, var, freq, homevardir): # , ge all_ensembles = [] for reanalysis_file in reanalysis_files_array: res = freva.facet_search(file=reanalysis_file, facet="ensemble") - all_ensembles.append(res.get("ensemble")[0]) # get the first(and only) value of the dictionary - - unique_ensembles = np.unique(all_ensembles) # then filter out only the unique ensemble values - logging.info(f"{experiment} reanalysis for {var} unique ensemble ids = {unique_ensembles}") + all_ensembles.append( + res.get("ensemble")[0] + ) # get the first(and only) value of the dictionary + + unique_ensembles = np.unique( + all_ensembles + ) # then filter out only the unique ensemble values + logging.info( + f"{experiment} reanalysis for {var} unique ensemble ids = {unique_ensembles}" + ) ## Write everything to csv files all_reanalysis_csv_filename = f"{experiment}__{project}_{freq}_{var}.csv" - reanalysis_files_array.tofile(os.path.join(homevardir, all_reanalysis_csv_filename), sep="\n") + reanalysis_files_array.tofile( + os.path.join(homevardir, all_reanalysis_csv_filename), sep="\n" + ) -def get_files_from_unique_ensembles(project, model, var, freq, experiment, unique_ensemble_list): +def get_files_from_unique_ensembles( + project, model, var, freq, experiment, unique_ensemble_list +): """ The inputs to this function are internal, although dictated by the data_acq_main.py . 
This function is called internally in order to retrieve from FREVA items using their ensemble id, used for corresponding ssp and historical
@@ -169,6 +205,8 @@ def get_files_from_unique_ensembles(project, model, var, freq, experiment, uniqu
     n = 0
     for file in files:
         n = n + 1
         files_array.append(file)
 
-    logging.info(f"{experiment} {var} files for ensemble {unique_ens} = {n}")
+    logging.info(
+        f"{experiment} {var} files for ensemble {unique_ens} = {n}"
+    )
 
     return files_array
diff --git a/src/climate_data_acq/data_acquisition_main.py b/src/climate_data_acq/data_acquisition_main.py
index b45f108..e980e49 100644
--- a/src/climate_data_acq/data_acquisition_main.py
+++ b/src/climate_data_acq/data_acquisition_main.py
@@ -13,7 +13,15 @@
 
 
 def copy_data(
-    projects, models, variables_cmip, variables_era5_daily_monthly, variables_era5_hourly, frequency, exp_cmip6, exp_reanalysis, homevardir
+    projects,
+    models,
+    variables_cmip,
+    variables_era5_daily_monthly,
+    variables_era5_hourly,
+    frequency,
+    exp_cmip6,
+    exp_reanalysis,
+    homevardir,
 ):
     # First initialize a logger instance
     logging.basicConfig(
@@ -40,14 +48,24 @@ def copy_data(
                             exp = exp.lower()
                             for var in variables_cmip:
                                 var = var.lower()
-                                logging.info(f"\n \nMODEL: {model}, EXPERIMENT: {exp}, VARIABLE: {var}, FREQUENCY: {freq}\n")
+                                logging.info(
+                                    f"\n \nMODEL: {model}, EXPERIMENT: {exp}, VARIABLE: {var}, FREQUENCY: {freq}\n"
+                                )
 
                                 if exp == "historical":
-                                    data_acq_freva_search_ECROPS.freva_search_historical(project, model, var, freq, homevardir)
-                                    logging.info("\n\n **** Finished with Historical files **** \n\n")
+                                    data_acq_freva_search_ECROPS.freva_search_historical(
+                                        project, model, var, freq, homevardir
+                                    )
+                                    logging.info(
+                                        "\n\n **** Finished with Historical files **** \n\n"
+                                    )
                                 else:
-                                    data_acq_freva_search_ECROPS.freva_search_ssp(project, model, var, freq, exp, homevardir)
-                                    logging.info("\n\n **** Finished with SSP files **** \n \n")
+                                    data_acq_freva_search_ECROPS.freva_search_ssp(
+                                        project, model, var, freq, exp, homevardir
+                                    )
+                                    logging.info(
+                                        "\n\n **** Finished with SSP files **** \n \n"
+                                    )
 
             case "reanalysis":
                 for freq in frequency:
@@ -62,16 +80,26 @@ def copy_data(
                             case "hour":
                                 var_set = variables_era5_hourly
                             case _:
-                                raise ValueError("Incorrect frequency, try 'mon', 'day' or 'hour'.")
+                                raise ValueError(
+                                    "Incorrect frequency, try 'mon', 'day' or 'hour'."
+                                )
 
                         for var in var_set:
                             var = var.lower()
-                            logging.info(f"\n \nPROJECT: {project}, EXPERIMENT: {exp_reanalysis_i}, VARIABLE: {var}, FREQUENCY: {freq}\n")
-                            data_acq_freva_search_ECROPS.freva_search_reanalysis(project, exp_reanalysis_i, var, freq, homevardir)
-                            logging.info(f"\n\n **** Finished with ERA5 {freq_longname} data files **** \n \n")
+                            logging.info(
+                                f"\n \nPROJECT: {project}, EXPERIMENT: {exp_reanalysis_i}, VARIABLE: {var}, FREQUENCY: {freq}\n"
+                            )
+                            data_acq_freva_search_ECROPS.freva_search_reanalysis(
+                                project, exp_reanalysis_i, var, freq, homevardir
+                            )
+                            logging.info(
+                                f"\n\n **** Finished with ERA5 {freq_longname} data files **** \n \n"
+                            )
 
             case _:
-                raise ValueError(f"Project {project} not recognized, try 'cmip6' or 'reanalysis'")
+                raise ValueError(
+                    f"Project {project} not recognized, try 'cmip6' or 'reanalysis'"
+                )
 
 
 def main():