diff --git a/src/analogues/encode_05red.sh b/src/analogues/encode_05red.sh index 6617c2e..72c8b83 100755 --- a/src/analogues/encode_05red.sh +++ b/src/analogues/encode_05red.sh @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --job-name=encode_05red -#SBATCH --partition=gpu +#SBATCH --partition=gpu #SBATCH --nodes=2 #SBATCH --exclusive #SBATCH --mem=0 @@ -16,7 +16,7 @@ export XLA_FLAGS=--xla_gpu_cuda_data_dir=/sw/spack-levante/nvhpc-24.7-py26uc/Lin export TF_FORCE_GPU_ALLOW_GROWTH=true module load texlive/live2021-gcc-11.2.0 source activate -conda activate ${ENV2} +conda activate "${ENV2}" #python encode_search.py -m "encode" -f "config/identify_hw_france2003-mv-05-red.json" > out_enc_05red.log #python encode_search.py -m "encode" -f "config/train/train_france2003_era5.json" > out_enc_05red.log #python encode_search.py -m "search-ecmwf" -f "config/search/train_france2003_ecmwf.json" > out_enc_05red.log diff --git a/src/analogues/encode_search.py b/src/analogues/encode_search.py index 288d277..2681d3c 100755 --- a/src/analogues/encode_search.py +++ b/src/analogues/encode_search.py @@ -15,9 +15,11 @@ import sys from tqdm import tqdm import warnings -warnings.simplefilter(action='ignore', category=FutureWarning) + +warnings.simplefilter(action="ignore", category=FutureWarning) import glob + def reinitialize(model): for l in model.layers: if isinstance(l, tf.keras.Model): @@ -28,34 +30,71 @@ def reinitialize(model): if hasattr(l, "bias_initializer"): l.bias.assign(l.bias_initializer(tf.shape(l.bias))) if hasattr(l, "recurrent_initializer"): - l.recurrent_kernel.assign(l.recurrent_initializer(tf.shape(l.recurrent_kernel))) - -def train_base_model(model_enc, model_dec, year_start_str, year_end_str, X_train, idx=0, hw="params.json", model_type=""): - hw_name = hw.split('.')[0].split('/')[-1] + l.recurrent_kernel.assign( + l.recurrent_initializer(tf.shape(l.recurrent_kernel)) + ) + + +def train_base_model( + model_enc, + model_dec, + year_start_str, + year_end_str, + X_train, + idx=0, + hw="params.json", + model_type="", +): + hw_name = hw.split(".")[0].split("/")[-1] x_input = keras.Input(shape=X_train.shape[1:]) - print(f'\nx_input: {x_input}\n') - print(f'\nx_input shape: {X_train.shape[1:]}\n') - print(f'\nx_input shape: {X_train.shape}\n') + print(f"\nx_input: {x_input}\n") + print(f"\nx_input shape: {X_train.shape[1:]}\n") + print(f"\nx_input shape: {X_train.shape}\n") x = model_dec(model_enc(x_input)) autoencoder = keras.Model(x_input, x) autoencoder.compile(optimizer=keras.optimizers.Adam(1e-4), loss="mse") autoencoder.fit(X_train, X_train, epochs=500, validation_split=0.15) Path(f"./models/{hw_name}").mkdir(parents=True, exist_ok=True) - model_enc.save(f"./models/{hw_name}/MvAE_{hw_name}_{model_type}_{year_start_str}-{year_end_str}.h5") - model_dec.save(f"./models/{hw_name}/MvAE_{hw_name}_{model_type}_{year_start_str}-{year_end_str}_dec_.h5") + model_enc.save( + f"./models/{hw_name}/MvAE_{hw_name}_{model_type}_{year_start_str}-{year_end_str}.h5" + ) + model_dec.save( + f"./models/{hw_name}/MvAE_{hw_name}_{model_type}_{year_start_str}-{year_end_str}_dec_.h5" + ) return autoencoder -def load_data_past2k(year_start, year_end, year_part, year_part_end, idx=0, hw="params.json", out_preprocess=["x_train_pre_pred", "x_test_pre_pred"]): + +def load_data_past2k( + year_start, + year_end, + year_part, + year_part_end, + idx=0, + hw="params.json", + out_preprocess=["x_train_pre_pred", "x_test_pre_pred"], +): with open(hw) as f: params = json.load(f) params["name"] = params["name"].replace("#", str(idx)) 
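Aside (illustrative sketch, not part of the committed diff): train_base_model above chains a separate encoder and decoder into one trainable Keras model and fits it to reconstruct its own input. The minimal version below shows the same wiring in isolation; the toy dense models, data and epoch count are purely illustrative, while the real code uses the convolutional encoder/decoder pair from va_am's AutoEncoders.AE_conv.

import numpy as np
from tensorflow import keras

def fit_autoencoder(model_enc, model_dec, X_train, epochs=10):
    # Compose decoder(encoder(x)) into a single model, as train_base_model does,
    # then train it to reconstruct its input with an MSE loss.
    x_input = keras.Input(shape=X_train.shape[1:])
    autoencoder = keras.Model(x_input, model_dec(model_enc(x_input)))
    autoencoder.compile(optimizer=keras.optimizers.Adam(1e-4), loss="mse")
    autoencoder.fit(X_train, X_train, epochs=epochs, validation_split=0.15, verbose=0)
    return autoencoder

# Toy usage with dense stand-ins for the convolutional encoder/decoder:
enc = keras.Sequential([keras.layers.Dense(8, activation="relu")])
dec = keras.Sequential([keras.layers.Dense(32)])
fit_autoencoder(enc, dec, np.random.rand(64, 32).astype("float32"))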
params["pre_init"] = params["pre_init"].replace("7000", str(year_part)) params["pre_end"] = params["pre_end"].replace("7099", str(year_part_end)) - params["target_dataset"] = params["target_dataset"].replace("7000", str(year_start)).replace("7099", str(year_end)) - params["pred_dataset"] = params["pred_dataset"].replace("7000", str(year_start)).replace("7099", str(year_end)) - params["ident_dataset"] = params["ident_dataset"].replace("7000", str(year_start)).replace("7099", str(year_end)) + params["target_dataset"] = ( + params["target_dataset"] + .replace("7000", str(year_start)) + .replace("7099", str(year_end)) + ) + params["pred_dataset"] = ( + params["pred_dataset"] + .replace("7000", str(year_start)) + .replace("7099", str(year_end)) + ) + params["ident_dataset"] = ( + params["ident_dataset"] + .replace("7000", str(year_start)) + .replace("7099", str(year_end)) + ) params["out_preprocess"] = out_preprocess params["data_of_interest_init"] = pd.Timestamp(params["data_of_interest_init"]) @@ -63,7 +102,14 @@ def load_data_past2k(year_start, year_end, year_part, year_part_end, idx=0, hw=" return va_am.perform_preprocess(params) -def load_data_ecmwf(year_start, year_end, hw="params.json", out_preprocess=["time_indust_pred", "indust_target", "img_size"], params=None): + +def load_data_ecmwf( + year_start, + year_end, + hw="params.json", + out_preprocess=["time_indust_pred", "indust_target", "img_size"], + params=None, +): if params is None: with open(hw) as f: params = json.load(f) @@ -71,187 +117,333 @@ def load_data_ecmwf(year_start, year_end, hw="params.json", out_preprocess=["tim params["name"] = params["name"].replace("#", "0") params["out_preprocess"] = out_preprocess[1:] - params["period"] = "post" - + params["period"] = "post" + indust_target, img_size = va_am.perform_preprocess(params) - - params['post_init'] = pd.Timestamp(str(year_start) + params['post_init'][4:] +' 00:00:00') - params['post_end'] = pd.Timestamp(str(year_end) + params['post_end'][4:] +' 06:00:00') + + params["post_init"] = pd.Timestamp( + str(year_start) + params["post_init"][4:] + " 00:00:00" + ) + params["post_end"] = pd.Timestamp( + str(year_end) + params["post_end"][4:] + " 06:00:00" + ) if type(params["data_of_interest_init"]) != pd.Timestamp: - params["data_of_interest_init"] = pd.Timestamp(params["data_of_interest_init"] +' 00:00:00') + params["data_of_interest_init"] = pd.Timestamp( + params["data_of_interest_init"] + " 00:00:00" + ) if type(params["data_of_interest_end"]) != pd.Timestamp: - params["data_of_interest_end"] = pd.Timestamp(params["data_of_interest_end"] +' 06:00:00') - + params["data_of_interest_end"] = pd.Timestamp( + params["data_of_interest_end"] + " 06:00:00" + ) + time_indust_pred = va_am.perform_preprocess(params) return time_indust_pred, indust_target, img_size - - -def load_data_era5(year_start, year_end, hw="params.json", out_preprocess=["indust_pred", "data_of_interest_pred", "time_indust_pred", "indust_target", "img_size"], params=None): + +def load_data_era5( + year_start, + year_end, + hw="params.json", + out_preprocess=[ + "indust_pred", + "data_of_interest_pred", + "time_indust_pred", + "indust_target", + "img_size", + ], + params=None, +): if params is None: with open(hw) as f: params = json.load(f) params["name"] = params["name"].replace("#", "0") - params['post_init'] = str(year_start) + params['post_init'][4:] - params['post_end'] = str(year_end) + params['post_end'][4:] + params["post_init"] = str(year_start) + params["post_init"][4:] + params["post_end"] = str(year_end) 
+ params["post_end"][4:] params["out_preprocess"] = out_preprocess - params["period"] = "post" + params["period"] = "post" if type(params["data_of_interest_init"]) != pd.Timestamp: - params["data_of_interest_init"] = pd.Timestamp(params["data_of_interest_init"] +' 00:00:00') + params["data_of_interest_init"] = pd.Timestamp( + params["data_of_interest_init"] + " 00:00:00" + ) if type(params["data_of_interest_end"]) != pd.Timestamp: - params["data_of_interest_end"] = pd.Timestamp(params["data_of_interest_end"] +' 06:00:00') - + params["data_of_interest_end"] = pd.Timestamp( + params["data_of_interest_end"] + " 06:00:00" + ) + return va_am.perform_preprocess(params) -def train_all_past2k(model_enc, model_dec, year_range=(0, 1850), hw="params.json", model_type=""): + +def train_all_past2k( + model_enc, model_dec, year_range=(0, 1850), hw="params.json", model_type="" +): for i, year_start in enumerate(np.arange(7000, 8850, 100)): idx = i + 1 year_end = year_start + 99 if year_start == 8800: year_end = 8850 - print(f'idx: {idx}', flush=True) - print(f'year_start: {year_start}', flush=True) - print(f'year_end: {year_end}', flush=True) + print(f"idx: {idx}", flush=True) + print(f"year_start: {year_start}", flush=True) + print(f"year_end: {year_end}", flush=True) for j, year_part in enumerate(np.arange(year_start, year_end, 25)): year_part_end = year_part + 24 if year_part == 8825: year_part_end = 8850 - print(f'j: {j}', flush=True) - print(f'year_part: {year_part}', flush=True) - print(f'year_part_end: {year_part_end}', flush=True) - X_train, _ = load_data_past2k(year_start, year_end, year_part, year_part_end, idx, hw) + print(f"j: {j}", flush=True) + print(f"year_part: {year_part}", flush=True) + print(f"year_part_end: {year_part_end}", flush=True) + X_train, _ = load_data_past2k( + year_start, year_end, year_part, year_part_end, idx, hw + ) X_train = np.nan_to_num(X_train) - train_base_model(model_enc, model_dec, year_part - 7000, year_part_end - 7000, X_train, idx, hw, model_type) + train_base_model( + model_enc, + model_dec, + year_part - 7000, + year_part_end - 7000, + X_train, + idx, + hw, + model_type, + ) print(f"trained on past2k up to year {year_end}.", flush=True) return model_enc, model_dec -def train_era5(model_enc, model_dec, year_range=(1940, 2022), hw="params.json", model_type=""): - X_train = load_data_era5(year_range[0], year_range[1], hw, out_preprocess=["indust_pred"])[0] - print(f'\nX_train shape: {np.shape(X_train)}\n') + +def train_era5( + model_enc, model_dec, year_range=(1940, 2022), hw="params.json", model_type="" +): + X_train = load_data_era5( + year_range[0], year_range[1], hw, out_preprocess=["indust_pred"] + )[0] + print(f"\nX_train shape: {np.shape(X_train)}\n") X_train = np.nan_to_num(X_train) - train_base_model(model_enc, model_dec, year_range[0], year_range[1], X_train, 0, hw, model_type) + train_base_model( + model_enc, model_dec, year_range[0], year_range[1], X_train, 0, hw, model_type + ) print(f"trained on era5 up to year {year_range[1]}.", flush=True) return model_enc, model_dec - def encode_past2k(hw, model_type="", year_range=(0, 1850)): - hw_name = hw.split('.')[0].split('/')[-1] + hw_name = hw.split(".")[0].split("/")[-1] for i, year_start in enumerate(np.arange(year_range[0], year_range[1], 100)): idx = i + 1 year_end = year_start + 99 if year_start == 1800: year_end = 1850 - print(f'idx: {idx}', flush=True) - print(f'year_start: {year_start}', flush=True) - print(f'year_end: {year_end}', flush=True) + print(f"idx: {idx}", flush=True) + 
print(f"year_start: {year_start}", flush=True) + print(f"year_end: {year_end}", flush=True) for j, year_part in enumerate(np.arange(year_start, year_end, 25)): year_part_end = year_part + 24 if year_part == 1825: year_part_end = 1850 - print(f'j: {j}', flush=True) - print(f'year_part: {year_part}', flush=True) - print(f'year_part_end: {year_part_end}', flush=True) - AE_ind = keras.models.load_model(f"./models/{hw_name}/MvAE_{hw_name}_{model_type}_{str(year_part)}-{str(year_part_end)}.h5", custom_objects={'keras': keras,'AutoEncoders': AutoEncoders}) - encode_era5(AE_ind, year_part_range=(year_part, year_part_end), hw=hw, model_type="past2k", idx=4*i+j) + print(f"j: {j}", flush=True) + print(f"year_part: {year_part}", flush=True) + print(f"year_part_end: {year_part_end}", flush=True) + AE_ind = keras.models.load_model( + f"./models/{hw_name}/MvAE_{hw_name}_{model_type}_{str(year_part)}-{str(year_part_end)}.h5", + custom_objects={"keras": keras, "AutoEncoders": AutoEncoders}, + ) + encode_era5( + AE_ind, + year_part_range=(year_part, year_part_end), + hw=hw, + model_type="past2k", + idx=4 * i + j, + ) return + def encode_ecmwf(AE_ind, year_range=(1993, 2016), hw="params.json"): - hw_name = hw.split('.')[0].split('/')[-1] + hw_name = hw.split(".")[0].split("/")[-1] with open(hw) as f: params = json.load(f) - for i in range(1,26): + for i in range(1, 26): dataset_name = params["target_dataset"].replace("_01", f"_{i:02d}") params_i = params.copy() params_i["target_dataset"] = dataset_name params_i["pred_dataset"] = dataset_name - indust_pred = load_data_era5(year_range[0], year_range[1], hw, ["indust_pred"], params=params_i) + indust_pred = load_data_era5( + year_range[0], year_range[1], hw, ["indust_pred"], params=params_i + ) indust_pred_encoded = AE_ind.predict(indust_pred) print(f"encoded on ecmwf for ens {i:02d}.", flush=True) del indust_pred - np.save(f"./data/encoded/indust_pred_encoded_ens{i:02d}_{year_range[0]}-{year_range[1]}_{hw_name}.npy", indust_pred_encoded) + np.save( + f"./data/encoded/indust_pred_encoded_ens{i:02d}_{year_range[0]}-{year_range[1]}_{hw_name}.npy", + indust_pred_encoded, + ) return - -def encode_era5(AE_ind, year_range=(1940, 2022), year_part_range=(1940, 2022), hw="params.json", model_type="", idx=0): - hw_name = hw.split('.')[0].split('/')[-1] - if model_type=="past2k": - indust_pred = load_data_past2k(year_range[0], year_range[1], year_part_range[0], year_part_range[1], idx=idx, hw=hw, out_preprocess=["indust_pred"]) + +def encode_era5( + AE_ind, + year_range=(1940, 2022), + year_part_range=(1940, 2022), + hw="params.json", + model_type="", + idx=0, +): + hw_name = hw.split(".")[0].split("/")[-1] + if model_type == "past2k": + indust_pred = load_data_past2k( + year_range[0], + year_range[1], + year_part_range[0], + year_part_range[1], + idx=idx, + hw=hw, + out_preprocess=["indust_pred"], + ) else: indust_pred = load_data_era5(year_range[0], year_range[1], hw, ["indust_pred"]) indust_pred_encoded = AE_ind.predict(indust_pred) print(f"encoded on era5 up to year {year_range[1]}.", flush=True) del indust_pred - np.save(f"./data/encoded/indust_pred_encoded_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy", indust_pred_encoded) + np.save( + f"./data/encoded/indust_pred_encoded_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy", + indust_pred_encoded, + ) del indust_pred_encoded - if model_type=="past2k": - data_of_interest_pred = load_data_past2k(year_range[0], year_range[1], year_part_range[0], year_part_range[1], idx=idx, hw=hw, 
out_preprocess=["data_of_interest_pred"]) + if model_type == "past2k": + data_of_interest_pred = load_data_past2k( + year_range[0], + year_range[1], + year_part_range[0], + year_part_range[1], + idx=idx, + hw=hw, + out_preprocess=["data_of_interest_pred"], + ) else: - data_of_interest_pred = load_data_era5(year_range[0], year_range[1], hw, ["data_of_interest_pred"]) + data_of_interest_pred = load_data_era5( + year_range[0], year_range[1], hw, ["data_of_interest_pred"] + ) data_of_interest_pred_encoded_ind = AE_ind.predict(data_of_interest_pred) print(f"encoded on event {hw_name}.", flush=True) del data_of_interest_pred - np.save(f"./data/encoded/data_of_interest_pred_encoded_ind_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy", data_of_interest_pred_encoded_ind) + np.save( + f"./data/encoded/data_of_interest_pred_encoded_ind_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy", + data_of_interest_pred_encoded_ind, + ) del data_of_interest_pred_encoded_ind return -def search_ecmwf(year_range=(1993, 2016), year_part_range=(1993, 2016), hw="params.json"): - hw_name = hw.split('.')[0].split('/')[-1] +def search_ecmwf( + year_range=(1993, 2016), year_part_range=(1993, 2016), hw="params.json" +): + hw_name = hw.split(".")[0].split("/")[-1] with open(hw) as f: params = json.load(f) params_multiple = params.copy() - + current = datetime.datetime.now() int_reg = params["interest_region"] if params["interest_region_type"] == "coord": - int_reg = calculate_interest_region(params["interest_region"], [params["latitude_min"], params["latitude_max"], params["longitude_min"], params["longitude_max"]], params["resolution"], False, "file") + int_reg = calculate_interest_region( + params["interest_region"], + [ + params["latitude_min"], + params["latitude_max"], + params["longitude_min"], + params["longitude_max"], + ], + params["resolution"], + False, + "file", + ) - heatwave_period = np.arange(datetime.datetime.strptime(params_multiple["data_of_interest_init"], '%Y-%m-%d'), datetime.datetime.strptime(params_multiple["data_of_interest_end"], '%Y-%m-%d')+datetime.timedelta(days=1), datetime.timedelta(days=1)) + heatwave_period = np.arange( + datetime.datetime.strptime( + params_multiple["data_of_interest_init"], "%Y-%m-%d" + ), + datetime.datetime.strptime(params_multiple["data_of_interest_end"], "%Y-%m-%d") + + datetime.timedelta(days=1), + datetime.timedelta(days=1), + ) heatwave_period = np.array(list(map(pd.Timestamp, heatwave_period))) params_multiple["data_of_interest_init"] = heatwave_period params_multiple["data_of_interest_end"] = heatwave_period - print(f'Heatwave period: {heatwave_period}\n {np.shape(heatwave_period)}\n', flush=True) - - #indust_pred, data_of_interest_pred, time_indust_pred, indust_target, img_size = load_data_era5(year_range[0], year_range[1], hw, params=params) - _, _, time_indust_pred_gen, indust_target_gen, img_size = load_data_era5(year_range[0], year_range[1], hw, params=params) + print( + f"Heatwave period: {heatwave_period}\n {np.shape(heatwave_period)}\n", + flush=True, + ) + + # indust_pred, data_of_interest_pred, time_indust_pred, indust_target, img_size = load_data_era5(year_range[0], year_range[1], hw, params=params) + _, _, time_indust_pred_gen, indust_target_gen, img_size = load_data_era5( + year_range[0], year_range[1], hw, params=params + ) for idx, init in enumerate(tqdm(params_multiple["data_of_interest_init"])): - print(f'\nProcessing date: {init}', flush=True) + print(f"\nProcessing date: {init}", flush=True) params["data_of_interest_init"] = init 
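Aside (a minimal sketch under the same assumptions, not part of the diff): the heatwave_period built above in search_ecmwf is an inclusive daily range of pandas Timestamps between the two 'YYYY-MM-DD' strings taken from the config. On its own:

import datetime
import numpy as np
import pandas as pd

def daily_period(init: str, end: str) -> np.ndarray:
    # Inclusive daily range, as constructed above; pd.date_range(init, end) would
    # give the equivalent DatetimeIndex in a single call.
    days = np.arange(
        datetime.datetime.strptime(init, "%Y-%m-%d"),
        datetime.datetime.strptime(end, "%Y-%m-%d") + datetime.timedelta(days=1),
        datetime.timedelta(days=1),
    )
    return np.array([pd.Timestamp(d) for d in days])

# daily_period("2003-08-01", "2003-08-15") -> 15 Timestamps, one per event day.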
params["data_of_interest_end"] = params_multiple["data_of_interest_end"][idx] - min_idx_time = np.max([0, idx-45]) - max_idx_time = np.min([idx+45, len(heatwave_period)-1]) + min_idx_time = np.max([0, idx - 45]) + max_idx_time = np.min([idx + 45, len(heatwave_period) - 1]) min_days_time = idx - min_idx_time max_days_time = max_idx_time - idx - + # Filtrar los datos en función de min_tim y max_time, calculando los indices dentro del .npy haciendo uso de un slice en time_indust_pred - min_time = params["data_of_interest_init"] - datetime.timedelta(days=int(min_days_time)) - max_time = params["data_of_interest_init"].replace(hour=6) + datetime.timedelta(days=int(max_days_time)) - + min_time = params["data_of_interest_init"] - datetime.timedelta( + days=int(min_days_time) + ) + max_time = params["data_of_interest_init"].replace(hour=6) + datetime.timedelta( + days=int(max_days_time) + ) + # Create boolean mask for the time range - time_mask = (time_indust_pred_gen.time >= min_time) & (time_indust_pred_gen.time <= max_time) + time_mask = (time_indust_pred_gen.time >= min_time) & ( + time_indust_pred_gen.time <= max_time + ) indices = np.where(time_mask.values)[0] start_time_idx = indices[0] # Index of first occurrence >= min_time - end_time_idx = indices[-1] # Index of last occurrence <= max_time - time_indust_pred = time_indust_pred_gen.sel(time=slice(min_time,max_time)) - indust_target = indust_target_gen.sel(time=slice(min_time,max_time)) + end_time_idx = indices[-1] # Index of last occurrence <= max_time + time_indust_pred = time_indust_pred_gen.sel(time=slice(min_time, max_time)) + indust_target = indust_target_gen.sel(time=slice(min_time, max_time)) - for i in tqdm(range(1,26), desc="Processing ens"): + for i in tqdm(range(1, 26), desc="Processing ens"): # AE Post - file_time_name = f'./comparison-csv/analogues-ae-am-post-{params["season"]}{params["name"]}_ens{i:02d}_x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.npy'.replace(" ","").replace("'", "").replace(",","") + file_time_name = ( + f'./comparison-csv/analogues-ae-am-post-{params["season"]}{params["name"]}_ens{i:02d}_x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.npy'.replace( + " ", "" + ) + .replace("'", "") + .replace(",", "") + ) ## Load data - indust_pred_encoded = np.load(f"./data/encoded/indust_pred_encoded_ens{i:02d}_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy") + indust_pred_encoded = np.load( + f"./data/encoded/indust_pred_encoded_ens{i:02d}_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy" + ) indust_pred_encoded = indust_pred_encoded[start_time_idx:end_time_idx] print(f"shape indust_pred_encoded: {np.shape(indust_pred_encoded)}") - data_of_interest_pred_encoded_ind = np.load(f"./data/encoded/indust_pred_encoded_{year_part_range[0]}-{year_part_range[1]}_{hw_name[:-5]}era5.npy") + data_of_interest_pred_encoded_ind = np.load( + f"./data/encoded/indust_pred_encoded_{year_part_range[0]}-{year_part_range[1]}_{hw_name[:-5]}era5.npy" + ) data_of_interest_pred_encoded_ind = 
data_of_interest_pred_encoded_ind[idx] - print(f"shape data_of_interest_pred_encoded_ind: {np.shape(data_of_interest_pred_encoded_ind)}") - latent_analog_ind = analogSearch(params["p"], params["k"], indust_pred_encoded, data_of_interest_pred_encoded_ind, time_indust_pred, indust_target, False, threshold=0, img_size=img_size, iter=params["iter"], replace_choice=params["replace_choice"], target_var_name=params["target_var_name"], file_time_name=file_time_name) - del indust_pred_encoded#, time_indust_pred indust_target - #del data_of_interest_pred_encoded_ind - + print( + f"shape data_of_interest_pred_encoded_ind: {np.shape(data_of_interest_pred_encoded_ind)}" + ) + latent_analog_ind = analogSearch( + params["p"], + params["k"], + indust_pred_encoded, + data_of_interest_pred_encoded_ind, + time_indust_pred, + indust_target, + False, + threshold=0, + img_size=img_size, + iter=params["iter"], + replace_choice=params["replace_choice"], + target_var_name=params["target_var_name"], + file_time_name=file_time_name, + ) + del indust_pred_encoded # , time_indust_pred indust_target + # del data_of_interest_pred_encoded_ind + dict_stats = {} # print(f'type latent_analog_ind 0 : {type(latent_analog_ind[0])}', flush=True) # print(f'type latent_analog_ind 1 : {type(latent_analog_ind[1])}', flush=True) @@ -260,49 +452,85 @@ def search_ecmwf(year_range=(1993, 2016), year_part_range=(1993, 2016), hw="para # print(f'sh latent_analog_ind 1 : {np.shape(latent_analog_ind[1])}', flush=True) # print(f'latent_analog_ind 2 : {latent_analog_ind[2]}', flush=True) for j in range(params["iter"]): - #dict_stats[f'Analog{j}'] = [f"{(np.sqrt(np.nansum((data_of_interest_pred_encoded_ind - latent_analog_ind[0][j])**2))/img_size):.10f}", str(latent_analog_ind[2][j].data)[:10], str(latent_analog_ind[2][j].data)[11:13]] - dict_stats[f'Analog{j}'] = [f"{latent_analog_ind[3][j]:.10f}", str(latent_analog_ind[2][j].data)[:10], str(latent_analog_ind[2][j].data)[11:13]] + # dict_stats[f'Analog{j}'] = [f"{(np.sqrt(np.nansum((data_of_interest_pred_encoded_ind - latent_analog_ind[0][j])**2))/img_size):.10f}", str(latent_analog_ind[2][j].data)[:10], str(latent_analog_ind[2][j].data)[11:13]] + dict_stats[f"Analog{j}"] = [ + f"{latent_analog_ind[3][j]:.10f}", + str(latent_analog_ind[2][j].data)[:10], + str(latent_analog_ind[2][j].data)[11:13], + ] del data_of_interest_pred_encoded_ind - df_stats = pd.DataFrame.from_dict(dict_stats,orient='index',columns=['pred-diff','time','leadtime']) + df_stats = pd.DataFrame.from_dict( + dict_stats, orient="index", columns=["pred-diff", "time", "leadtime"] + ) Path("./comparison-csv").mkdir(parents=True, exist_ok=True) - df_stats.to_csv(f'./comparison-csv/{params["season"]}{params["name"]}_ens{i:02d}_x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}-analog-comparision-stats_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.csv'.replace(" ","").replace("'", "").replace(",","")) + df_stats.to_csv( + f'./comparison-csv/{params["season"]}{params["name"]}_ens{i:02d}_x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else 
"noVAE"}-analog-comparision-stats_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.csv'.replace( + " ", "" + ) + .replace("'", "") + .replace(",", "") + ) return - - + def search_past2k(year_range=(1940, 2022), year_part_range=(0, 1885), hw="params.json"): - hw_name = hw.split('.')[0].split('/')[-1] - for i, year_start in enumerate(np.arange(year_part_range[0], year_part_range[1], 100)): + hw_name = hw.split(".")[0].split("/")[-1] + for i, year_start in enumerate( + np.arange(year_part_range[0], year_part_range[1], 100) + ): idx = i + 1 year_end = year_start + 99 if year_start == 1800: year_end = 1850 - print(f'idx: {idx}', flush=True) - print(f'year_start: {year_start}', flush=True) - print(f'year_end: {year_end}', flush=True) + print(f"idx: {idx}", flush=True) + print(f"year_start: {year_start}", flush=True) + print(f"year_end: {year_end}", flush=True) for j, year_part in enumerate(np.arange(year_start, year_end, 25)): year_part_end = year_part + 24 if year_part == 1825: year_part_end = 1850 - print(f'j: {j}', flush=True) - print(f'year_part: {year_part}', flush=True) - print(f'year_part_end: {year_part_end}', flush=True) - search_era5(year_range=year_range, year_part_range=(year_part,year_part_end),hw=hw) + print(f"j: {j}", flush=True) + print(f"year_part: {year_part}", flush=True) + print(f"year_part_end: {year_part_end}", flush=True) + search_era5( + year_range=year_range, year_part_range=(year_part, year_part_end), hw=hw + ) return -def search_era5(year_range=(1940, 2022), year_part_range=(1940, 2022), hw="params.json"): - hw_name = hw.split('.')[0].split('/')[-1] + +def search_era5( + year_range=(1940, 2022), year_part_range=(1940, 2022), hw="params.json" +): + hw_name = hw.split(".")[0].split("/")[-1] with open(hw) as f: params = json.load(f) params_multiple = params.copy() - + current = datetime.datetime.now() int_reg = params["interest_region"] if params["interest_region_type"] == "coord": - int_reg = calculate_interest_region(params["interest_region"], [params["latitude_min"], params["latitude_max"], params["longitude_min"], params["longitude_max"]], params["resolution"], False, "file") + int_reg = calculate_interest_region( + params["interest_region"], + [ + params["latitude_min"], + params["latitude_max"], + params["longitude_min"], + params["longitude_max"], + ], + params["resolution"], + False, + "file", + ) - heatwave_period = np.arange(datetime.datetime.strptime(params_multiple["data_of_interest_init"], '%Y-%m-%d'), datetime.datetime.strptime(params_multiple["data_of_interest_end"], '%Y-%m-%d')+datetime.timedelta(days=1), datetime.timedelta(days=1)) + heatwave_period = np.arange( + datetime.datetime.strptime( + params_multiple["data_of_interest_init"], "%Y-%m-%d" + ), + datetime.datetime.strptime(params_multiple["data_of_interest_end"], "%Y-%m-%d") + + datetime.timedelta(days=1), + datetime.timedelta(days=1), + ) heatwave_period = np.array(list(map(pd.Timestamp, heatwave_period))) params_multiple["data_of_interest_init"] = heatwave_period params_multiple["data_of_interest_end"] = heatwave_period @@ -310,57 +538,167 @@ def search_era5(year_range=(1940, 2022), year_part_range=(1940, 2022), hw="param for idx, init in enumerate(params_multiple["data_of_interest_init"]): params["data_of_interest_init"] = init params["data_of_interest_end"] = params_multiple["data_of_interest_end"][idx] - + # Analog Post - file_time_name = 
f'./comparison-csv/analogues-am-post-{params["season"]}{params["name"]}x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.npy'.replace(" ","").replace("'", "").replace(",","") + file_time_name = ( + f'./comparison-csv/analogues-am-post-{params["season"]}{params["name"]}x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.npy'.replace( + " ", "" + ) + .replace("'", "") + .replace(",", "") + ) ## Load data - indust_pred, data_of_interest_pred, time_indust_pred, indust_target, img_size = load_data_era5(year_range[0], year_range[1], hw, params=params) - print(f'\nShape in encode_seach: {np.shape(data_of_interest_pred)}\n', flush=True) + ( + indust_pred, + data_of_interest_pred, + time_indust_pred, + indust_target, + img_size, + ) = load_data_era5(year_range[0], year_range[1], hw, params=params) + print( + f"\nShape in encode_seach: {np.shape(data_of_interest_pred)}\n", flush=True + ) ## Search - analog_ind = analogSearch(params["p"], params["k"], indust_pred, data_of_interest_pred, time_indust_pred, indust_target, False, threshold=0, img_size=img_size, iter=params["iter"], replace_choice=params["replace_choice"], target_var_name=params["target_var_name"], file_time_name=file_time_name) + analog_ind = analogSearch( + params["p"], + params["k"], + indust_pred, + data_of_interest_pred, + time_indust_pred, + indust_target, + False, + threshold=0, + img_size=img_size, + iter=params["iter"], + replace_choice=params["replace_choice"], + target_var_name=params["target_var_name"], + file_time_name=file_time_name, + ) ## Del data del indust_pred, data_of_interest_pred - + # AE Post - file_time_name = f'./comparison-csv/analogues-ae-am-post-{params["season"]}{params["name"]}x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.npy'.replace(" ","").replace("'", "").replace(",","") + file_time_name = ( + f'./comparison-csv/analogues-ae-am-post-{params["season"]}{params["name"]}x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.npy'.replace( + " ", "" + ) + .replace("'", "") + .replace(",", "") + ) ## Load data - indust_pred_encoded = np.load(f"./data/encoded/indust_pred_encoded_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy") - data_of_interest_pred_encoded_ind = np.load(f"./data/encoded/data_of_interest_pred_encoded_ind_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy") + indust_pred_encoded = np.load( + f"./data/encoded/indust_pred_encoded_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy" + ) + data_of_interest_pred_encoded_ind 
= np.load( + f"./data/encoded/data_of_interest_pred_encoded_ind_{year_part_range[0]}-{year_part_range[1]}_{hw_name}.npy" + ) data_of_interest_pred_encoded_ind = data_of_interest_pred_encoded_ind[idx] - latent_analog_ind = analogSearch(params["p"], params["k"], indust_pred_encoded, data_of_interest_pred_encoded_ind, time_indust_pred, indust_target, False, threshold=0, img_size=img_size, iter=params["iter"], replace_choice=params["replace_choice"], target_var_name=params["target_var_name"], file_time_name=file_time_name) + latent_analog_ind = analogSearch( + params["p"], + params["k"], + indust_pred_encoded, + data_of_interest_pred_encoded_ind, + time_indust_pred, + indust_target, + False, + threshold=0, + img_size=img_size, + iter=params["iter"], + replace_choice=params["replace_choice"], + target_var_name=params["target_var_name"], + file_time_name=file_time_name, + ) del indust_pred_encoded, time_indust_pred, indust_target - #del data_of_interest_pred_encoded_ind - + # del data_of_interest_pred_encoded_ind + dict_stats = {} - data_of_interest_pred, data_of_interest_target = load_data_era5(year_range[0], year_range[1], hw, ["data_of_interest_pred", "data_of_interest_target"], params=params) + data_of_interest_pred, data_of_interest_target = load_data_era5( + year_range[0], + year_range[1], + hw, + ["data_of_interest_pred", "data_of_interest_target"], + params=params, + ) for i in range(params["iter"]): - dict_stats[f'WithoutAE-Pre{i}'] = [np.nan, np.nan, np.nan] - - dict_stats[f'WithoutAE-Post{i}'] = [np.nansum(np.abs(data_of_interest_pred - analog_ind[0][i]))/img_size, - np.nanmean(np.abs(data_of_interest_target[params["interest_var_name"]].data - analog_ind[1][i])[:,int_reg[0]:int_reg[1],int_reg[2]:int_reg[3]]), - np.nanmean(analog_ind[1][i][int_reg[0]:int_reg[1],int_reg[2]:int_reg[3]]), - str(analog_ind[2][i].data)[:10]] - - dict_stats[f'WithAE-Pre-Pre{i}'] = [np.nan, np.nan, np.nan] - - dict_stats[f'WithAE-Post-Post{i}'] = [np.nansum(np.abs(data_of_interest_pred_encoded_ind - latent_analog_ind[0][i]))/img_size, - np.nanmean(np.abs(data_of_interest_target[params["interest_var_name"]].data - latent_analog_ind[1][i])[:,int_reg[0]:int_reg[1],int_reg[2]:int_reg[3]]), - np.nanmean(latent_analog_ind[1][i][int_reg[0]:int_reg[1],int_reg[2]:int_reg[3]]), - str(latent_analog_ind[2][i].data)[:10]] - - dict_stats[f'Original{i}']=[0,0,np.nanmean(((data_of_interest_target[params["interest_var_name"]].data)[:,int_reg[0]:int_reg[1],int_reg[2]:int_reg[3]]))] - - - del data_of_interest_pred, data_of_interest_target, data_of_interest_pred_encoded_ind - df_stats = pd.DataFrame.from_dict(dict_stats,orient='index',columns=['pred-diff','target-diff','target','time']) + dict_stats[f"WithoutAE-Pre{i}"] = [np.nan, np.nan, np.nan] + + dict_stats[f"WithoutAE-Post{i}"] = [ + np.nansum(np.abs(data_of_interest_pred - analog_ind[0][i])) / img_size, + np.nanmean( + np.abs( + data_of_interest_target[params["interest_var_name"]].data + - analog_ind[1][i] + )[:, int_reg[0] : int_reg[1], int_reg[2] : int_reg[3]] + ), + np.nanmean( + analog_ind[1][i][int_reg[0] : int_reg[1], int_reg[2] : int_reg[3]] + ), + str(analog_ind[2][i].data)[:10], + ] + + dict_stats[f"WithAE-Pre-Pre{i}"] = [np.nan, np.nan, np.nan] + + dict_stats[f"WithAE-Post-Post{i}"] = [ + np.nansum( + np.abs(data_of_interest_pred_encoded_ind - latent_analog_ind[0][i]) + ) + / img_size, + np.nanmean( + np.abs( + data_of_interest_target[params["interest_var_name"]].data + - latent_analog_ind[1][i] + )[:, int_reg[0] : int_reg[1], int_reg[2] : int_reg[3]] + ), + 
np.nanmean( + latent_analog_ind[1][i][ + int_reg[0] : int_reg[1], int_reg[2] : int_reg[3] + ] + ), + str(latent_analog_ind[2][i].data)[:10], + ] + + dict_stats[f"Original{i}"] = [ + 0, + 0, + np.nanmean( + ( + (data_of_interest_target[params["interest_var_name"]].data)[ + :, int_reg[0] : int_reg[1], int_reg[2] : int_reg[3] + ] + ) + ), + ] + + del ( + data_of_interest_pred, + data_of_interest_target, + data_of_interest_pred_encoded_ind, + ) + df_stats = pd.DataFrame.from_dict( + dict_stats, + orient="index", + columns=["pred-diff", "target-diff", "target", "time"], + ) Path("./comparison-csv").mkdir(parents=True, exist_ok=True) - df_stats.to_csv(f'./comparison-csv/{params["season"]}{params["name"]}x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}-analog-comparision-stats_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.csv'.replace(" ","").replace("'", "").replace(",","")) + df_stats.to_csv( + f'./comparison-csv/{params["season"]}{params["name"]}x{params["iter"]}-{params["data_of_interest_init"]}-epoch{params["n_epochs"]}-latent{params["latent_dim"]}-k{params["k"]}-arch{params["arch"]}-{"VAE" if params["use_VAE"] else "noVAE"}-analog-comparision-stats_{year_part_range[0]}-{year_part_range[1]}_{current.year}-{current.month}-{current.day}-{current.hour}-{current.minute}-{current.second}.csv'.replace( + " ", "" + ) + .replace("'", "") + .replace(",", "") + ) return + def post_process_anal(hw="params.json"): with open(hw) as f: params = json.load(f) - heatwave_period = np.arange(datetime.datetime.strptime(params["data_of_interest_init"], '%Y-%m-%d'), datetime.datetime.strptime(params["data_of_interest_end"], '%Y-%m-%d')+datetime.timedelta(days=1), datetime.timedelta(days=1)) + heatwave_period = np.arange( + datetime.datetime.strptime(params["data_of_interest_init"], "%Y-%m-%d"), + datetime.datetime.strptime(params["data_of_interest_end"], "%Y-%m-%d") + + datetime.timedelta(days=1), + datetime.timedelta(days=1), + ) print(f"heatwave_period: {heatwave_period}\n", flush=True) df_all_days = {} for idx, day_timestamp in enumerate(tqdm(heatwave_period)): @@ -369,9 +707,9 @@ def post_process_anal(hw="params.json"): files_interest = sorted(files_interest) list_interest = [pd.read_csv(df) for df in files_interest] print(f"list_interest: {list_interest}\n", flush=True) - anal_name = list_interest[0]["Unnamed: 0"] + anal_name = list_interest[0]["Unnamed: 0"] daily_df = list_interest[0] - daily_df["ens"] = f"ens_{1:02d}" + daily_df["ens"] = f"ens_{1:02d}" for idx_df, df_i in enumerate(tqdm(list_interest[1:])): df_i["ens"] = f"ens_{idx_df + 2 :02d}" daily_df = pd.concat([daily_df, df_i], ignore_index=True) @@ -380,35 +718,56 @@ def post_process_anal(hw="params.json"): daily_df["leadtime"] = daily_df["leadtime"].map("{:02d}".format) daily_df.insert(loc=0, column="rank", value=anal_name) daily_df["emo-day"] = day - if idx==0: + if idx == 0: df_all_days = daily_df else: df_all_days = pd.concat([df_all_days, daily_df], ignore_index=True) - df_all_days.to_csv(f"./anal_dict/post_processed_analogues_{str(heatwave_period[0])[:4]}-{str(heatwave_period[-1])[:4]}.csv") + df_all_days.to_csv( + f"./anal_dict/post_processed_analogues_{str(heatwave_period[0])[:4]}-{str(heatwave_period[-1])[:4]}.csv" + ) def main(): # Parser initialization parser = argparse.ArgumentParser() - parser.add_argument("-m", 
"--method", dest='method', help="Specify an method to execute between: \n 'encode' (default), 'search' or 'train'") - parser.add_argument("-f", "--configfile", dest='conf', help="JSON file with configuration of parameters. If not specified and 'method' require the file, it will be searched at 'params.json'") + parser.add_argument( + "-m", + "--method", + dest="method", + help="Specify an method to execute between: \n 'encode' (default), 'search' or 'train'", + ) + parser.add_argument( + "-f", + "--configfile", + dest="conf", + help="JSON file with configuration of parameters. If not specified and 'method' require the file, it will be searched at 'params.json'", + ) args = parser.parse_args() - method = 'encode' - hw = 'params.json' + method = "encode" + hw = "params.json" if args.method is not None: method = args.method if args.conf is not None: hw = args.conf with open(hw) as f: params = json.load(f) - year_range=(int(params["post_init"].split('-')[0]), int(params["post_end"].split('-')[0])) + year_range = ( + int(params["post_init"].split("-")[0]), + int(params["post_end"].split("-")[0]), + ) if method in ["train", "train-past2k"]: in_shape = ( - (params["latitude_max"] - params["latitude_min"]) // params["resolution"] + 1, - (params["longitude_max"] - params["longitude_min"]) // params["resolution"] + 1, + (params["latitude_max"] - params["latitude_min"]) // params["resolution"] + + 1, + (params["longitude_max"] - params["longitude_min"]) // params["resolution"] + + 1, + ) + print( + f"Input shape for {hw.split('.')[0].split('/')[-1]}: {in_shape}", flush=True + ) + ae_object = va_am.utils.AutoEncoders.AE_conv( + in_shape, 400, arch=10, in_channels=1, out_channels=1 ) - print(f"Input shape for {hw.split('.')[0].split('/')[-1]}: {in_shape}", flush=True) - ae_object = va_am.utils.AutoEncoders.AE_conv(in_shape, 400, arch=10, in_channels=1, out_channels=1) base_model_enc = ae_object.encoder base_model_dec = ae_object.decoder @@ -417,32 +776,53 @@ def main(): if method == "train-past2k": print("Train on Past2k data", flush=True) - #base_model_enc, base_model_dec = train_all_past2k(base_model_enc, base_model_dec, year_range=(0, 1885), hw=hw, model_type="past2k") - train_all_past2k(base_model_enc, base_model_dec, year_range=(0, 1885), hw=hw, model_type="past2k") + # base_model_enc, base_model_dec = train_all_past2k(base_model_enc, base_model_dec, year_range=(0, 1885), hw=hw, model_type="past2k") + train_all_past2k( + base_model_enc, + base_model_dec, + year_range=(0, 1885), + hw=hw, + model_type="past2k", + ) else: - train_era5(base_model_enc, base_model_dec, year_range=(1940, 2022), hw=hw, model_type="era5") - - elif method == 'encode-past2k': + train_era5( + base_model_enc, + base_model_dec, + year_range=(1940, 2022), + hw=hw, + model_type="era5", + ) + + elif method == "encode-past2k": encode_past2k(hw=hw, model_type="past2k") - elif method == 'encode': - AE_ind = keras.models.load_model(params["file_AE_post"], custom_objects={'keras': keras,'AutoEncoders': AutoEncoders}) + elif method == "encode": + AE_ind = keras.models.load_model( + params["file_AE_post"], + custom_objects={"keras": keras, "AutoEncoders": AutoEncoders}, + ) encode_era5(AE_ind, year_range=year_range, hw=hw) - #encode_era5(AE_ind, year_range=(1993, 2016), year_part_range=(1993, 2016), hw=hw) - elif method == 'encode-ecmwf': - AE_ind = keras.models.load_model(params["file_AE_post"], custom_objects={'keras': keras,'AutoEncoders': AutoEncoders}) + # encode_era5(AE_ind, year_range=(1993, 2016), year_part_range=(1993, 2016), 
hw=hw) + elif method == "encode-ecmwf": + AE_ind = keras.models.load_model( + params["file_AE_post"], + custom_objects={"keras": keras, "AutoEncoders": AutoEncoders}, + ) encode_ecmwf(AE_ind, year_range=year_range, hw=hw) - elif method == 'search-past2k': + elif method == "search-past2k": search_past2k(year_range=year_range, year_part_range=(1600, 1885), hw=hw) - elif method == 'search': + elif method == "search": search_era5(year_range=year_range, year_part_range=year_range, hw=hw) - elif method == 'search-ecmwf': + elif method == "search-ecmwf": search_ecmwf(year_range=(1993, 2016), year_part_range=(1993, 2016), hw=hw) - elif method == 'post-process-anal': + elif method == "post-process-anal": post_process_anal(hw=hw) else: - message = ValueError(f"Not recognized {method} method. The available methods are 'encode' (default), 'search' or 'train'") + message = ValueError( + f"Not recognized {method} method. The available methods are 'encode' (default), 'search' or 'train'" + ) raise message return + if __name__ == "__main__": main() diff --git a/src/analogues/make_analog_dataset.py b/src/analogues/make_analog_dataset.py index 2b3914b..7724cbe 100755 --- a/src/analogues/make_analog_dataset.py +++ b/src/analogues/make_analog_dataset.py @@ -5,63 +5,96 @@ from tqdm import tqdm import numpy as np + def parse_arguments(): - parser = argparse.ArgumentParser(description='Build Xarray from CSV index and ensemble NetCDF files.') - parser.add_argument('--csv_path', type=str, required=True, help='Path to the CSV file') - parser.add_argument('--variable', type=str, required=True, help='Variable name (e.g., gz500, msl)') - parser.add_argument('--input_dir', type=str, required=True, help='Input directory for ensemble data (e.g., ./data/ecmwf/)') - parser.add_argument('--output_path', type=str, required=True, help='Output path for the resulting NetCDF file') + parser = argparse.ArgumentParser( + description="Build Xarray from CSV index and ensemble NetCDF files." 
+ ) + parser.add_argument( + "--csv_path", type=str, required=True, help="Path to the CSV file" + ) + parser.add_argument( + "--variable", type=str, required=True, help="Variable name (e.g., gz500, msl)" + ) + parser.add_argument( + "--input_dir", + type=str, + required=True, + help="Input directory for ensemble data (e.g., ./data/ecmwf/)", + ) + parser.add_argument( + "--output_path", + type=str, + required=True, + help="Output path for the resulting NetCDF file", + ) return parser.parse_args() + def main(): args = parse_arguments() - + # Leer el CSV y ordenar por 'emo-day' y 'rank' - df = pd.read_csv(args.csv_path, parse_dates=['time', 'emo-day']) - df = df.sort_values(['emo-day', 'rank']).reset_index(drop=True) - + df = pd.read_csv(args.csv_path, parse_dates=["time", "emo-day"]) + df = df.sort_values(["emo-day", "rank"]).reset_index(drop=True) + # Obtener dimensiones y coordenadas base del primer archivo (asumiendo que todas las variables tienen la misma estructura) - example_ens = 'ens_01' # Ejemplo para obtener coordenadas - example_path = os.path.join(args.input_dir, example_ens, args.variable, f'ecmwf_{example_ens.split("_")[1]}_{args.variable}_1993-2016_.nc') + example_ens = "ens_01" # Ejemplo para obtener coordenadas + example_path = os.path.join( + args.input_dir, + example_ens, + args.variable, + f'ecmwf_{example_ens.split("_")[1]}_{args.variable}_1993-2016_.nc', + ) with xr.open_dataset(example_path) as ds: lats = ds.lat.values lons = ds.lon.values - time_coords = df['emo-day'].unique() - + time_coords = df["emo-day"].unique() + # Inicializar Dataset vacío con Dask para manejo de memoria - data_shape = (len(time_coords), 25, len(lats), len(lons)) # (time, analog, lat, lon) + data_shape = ( + len(time_coords), + 25, + len(lats), + len(lons), + ) # (time, analog, lat, lon) data = xr.DataArray( np.full(data_shape, np.nan, dtype=np.float32), - dims=['time', 'analog', 'lat', 'lon'], + dims=["time", "analog", "lat", "lon"], coords={ - 'time': pd.to_datetime(time_coords), - 'analog': np.arange(25), - 'lat': lats, - 'lon': lons + "time": pd.to_datetime(time_coords), + "analog": np.arange(25), + "lat": lats, + "lon": lons, }, - name=args.variable + name=args.variable, ) ds_out = xr.Dataset({args.variable: data}) - + # Procesar cada fila del CSV - for idx, row in tqdm(df.iterrows(), total=len(df), desc='Processing'): + for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing"): # Construir ruta al archivo NetCDF - ens = row['ens'] - nc_path = os.path.join(args.input_dir, ens, args.variable, f'ecmwf_{ens.split("_")[1]}_{args.variable}_1993-2016_.nc') - + ens = row["ens"] + nc_path = os.path.join( + args.input_dir, + ens, + args.variable, + f'ecmwf_{ens.split("_")[1]}_{args.variable}_1993-2016_.nc', + ) + # Calcular el tiempo objetivo (time + leadtime hours) - target_time = row['time'] + pd.to_timedelta(row['leadtime'], unit='h') - + target_time = row["time"] + pd.to_timedelta(row["leadtime"], unit="h") + try: # Cargar el dato específico with xr.open_dataset(nc_path) as ds: # Seleccionar el tiempo más cercano (suponiendo que el leadtime está en horas) - ds_target = ds.sel(time=target_time, method='nearest') + ds_target = ds.sel(time=target_time, method="nearest") value = ds_target[args.variable].load() - + # Asignar al Dataset de salida - time_idx = np.where(ds_out.time == row['emo-day'])[0][0] - analog_idx = int(row['rank'].replace("Analog", "")) + time_idx = np.where(ds_out.time == row["emo-day"])[0][0] + analog_idx = int(row["rank"].replace("Analog", "")) 
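Aside (hypothetical helper, not part of the diff): the container that main() in make_analog_dataset.py fills is an all-NaN DataArray with dimensions (time, analog, lat, lon), one field per emo-day and analogue rank. A self-contained version of its construction, with coordinate values purely illustrative:

import numpy as np
import pandas as pd
import xarray as xr

def empty_analog_array(times, lats, lons, n_analogs=25, name="gz500"):
    # NaN-filled container that analogue fields are later written into in place.
    data = np.full((len(times), n_analogs, len(lats), len(lons)), np.nan, dtype=np.float32)
    return xr.DataArray(
        data,
        dims=["time", "analog", "lat", "lon"],
        coords={
            "time": pd.to_datetime(times),
            "analog": np.arange(n_analogs),
            "lat": np.asarray(lats),
            "lon": np.asarray(lons),
        },
        name=name,
    )

# da = empty_analog_array(["2003-08-05"], np.linspace(30, 70, 81), np.linspace(-30, 30, 121))
# da[0, 3, :, :] = 0.0  # write the field for analogue rank 3 of the first day, as the loop does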
ds_out[args.variable][time_idx, analog_idx, :, :] = value except FileNotFoundError: print(f"Archivo no encontrado: {nc_path}", flush=True) @@ -69,11 +102,14 @@ def main(): except KeyError: print(f"Variable {args.variable} no encontrada en {nc_path}", flush=True) continue - + # Guardar el Dataset resultante - out_path = os.path.join(args.output_path, args.variable, f'ecmwf_{args.variable}_1993-2016_.nc') + out_path = os.path.join( + args.output_path, args.variable, f"ecmwf_{args.variable}_1993-2016_.nc" + ) ds_out.to_netcdf(out_path) print(f"Dataset guardado en {out_path}") -if __name__ == '__main__': - main() \ No newline at end of file + +if __name__ == "__main__": + main() diff --git a/src/analogues/make_analog_dataset.sh b/src/analogues/make_analog_dataset.sh index 44c0c32..c83f3ae 100755 --- a/src/analogues/make_analog_dataset.sh +++ b/src/analogues/make_analog_dataset.sh @@ -11,5 +11,5 @@ module load python3 source activate -conda activate ${ENV2} +conda activate "${ENV2}" python make_analog_dataset.py --csv_path "anal_dict/post_processed_analogues_1993-2016.csv" --variable gz300 --input_dir "./data/ecmwf/" --output_path "./data/analogues/" diff --git a/src/cmip6_data_acq/0_data_acq_main_ECROPS.py b/src/cmip6_data_acq/0_data_acq_main_ECROPS.py index f1c9757..f191cc1 100755 --- a/src/cmip6_data_acq/0_data_acq_main_ECROPS.py +++ b/src/cmip6_data_acq/0_data_acq_main_ECROPS.py @@ -7,12 +7,13 @@ import logging import sys + # from FREVA import freva_search import data_acq_freva_search_ECROPS import os # projects = ['cmip6', 'reanalysis'] -projects = ['cmip6'] +projects = ["cmip6"] # models = ['cesm2', # 'cnrm-cm6-1-HR', # 'gfdl-esm4', @@ -20,15 +21,15 @@ # 'mpi-esm1-2-hr', # 'noresm2-mm', # 'hadgem3-gc31-mm'] -models = ['mpi-esm1-2-lr'] - +models = ["mpi-esm1-2-lr"] + # models = [] # DO NOT DO ANYTHING FOR CMIP6 -variables_cmip = ['tdps', 'ua', 'va', 'tasmax', 'lai'] +variables_cmip = ["tdps", "ua", "va", "tasmax", "lai"] # variables_era5_daily_monthly = ['tasmax', 'tasmin', 'tas', 'pr', 'rsds', 'tdps', 'sfcwind', 'hurs'] -variables_era5_daily_monthly = ['tdps', 'ua', 'va', 'tasmax', 'lai'] +variables_era5_daily_monthly = ["tdps", "ua", "va", "tasmax", "lai"] # variables_era5_hourly = ['uas', 'vas'] -variables_era5_hourly = [] +variables_era5_hourly: list[str] = [] # variables_era5_hourly = ['uas', 'vas', 'rsds', 'tdps'] # 10m wind speed vas and uas are calculated with ECROPS function in wofost_util/util.py wind10to2(wind10) function @@ -38,14 +39,14 @@ vorticity_height = 20000 # 200hPa # frequency = ['hour', 'day', 'mon'] -frequency = ['day'] +frequency = ["day"] # frequency = ['mon'] # exp_cmip6 = ['ssp370', 'ssp585', 'historical'] -exp_cmip6 = ['historical', 'past2k'] -exp_reanalysis = 'era5' +exp_cmip6 = ["historical", "past2k"] +exp_reanalysis = "era5" -def main(): +def main(): # First initialize a logger instance logging.basicConfig( level=logging.INFO, @@ -53,49 +54,95 @@ def main(): force=True, handlers=[ logging.FileHandler("LOG_Data_Acquisition_FREVA_output.log"), - logging.StreamHandler(sys.stdout) - ] + logging.StreamHandler(sys.stdout), + ], ) - logging.info('Started Freva files main programme \n') + logging.info("Started Freva files main programme \n") for project in projects: - if project == 'cmip6': + if project == "cmip6": for i in range(len(models)): for exp in exp_cmip6: for var in variables_cmip: - logging.info("\n \n" + "MODEL: " + str(models[i]) + - ", EXPERIMENT: " + str(exp) + - ", VARIABLE: " + str(var) + - ", FREQUENCY: " + str(frequency) + "\n") - if not exp == 
'historical': - data_acq_freva_search_ECROPS.freva_search_ssp(project, models[i], var, frequency, exp) - logging.info('\n\n **** Finished with SSP files **** \n \n') - if exp == 'historical': - data_acq_freva_search_ECROPS.freva_search_historical(project, models[i], var, frequency) - logging.info('\n\n **** Finished with Historical files **** \n\n') - - if project == 'reanalysis': + logging.info( + "\n \n" + + "MODEL: " + + str(models[i]) + + ", EXPERIMENT: " + + str(exp) + + ", VARIABLE: " + + str(var) + + ", FREQUENCY: " + + str(frequency) + + "\n" + ) + if not exp == "historical": + data_acq_freva_search_ECROPS.freva_search_ssp( + project, models[i], var, frequency, exp + ) + logging.info( + "\n\n **** Finished with SSP files **** \n \n" + ) + if exp == "historical": + data_acq_freva_search_ECROPS.freva_search_historical( + project, models[i], var, frequency + ) + logging.info( + "\n\n **** Finished with Historical files **** \n\n" + ) + + if project == "reanalysis": for var in variables_era5_daily_monthly: - logging.info("\n \n" + "PROJECT: " + str(project) + - ", EXPERIMENT: " + str(exp_reanalysis) + - ", VARIABLE: " + str(var) + - ", FREQUENCY: " + str(frequency[2]) + "\n") - data_acq_freva_search_ECROPS.freva_search_reanalysis(project, exp_reanalysis, var, frequency[2]) - + logging.info( + "\n \n" + + "PROJECT: " + + str(project) + + ", EXPERIMENT: " + + str(exp_reanalysis) + + ", VARIABLE: " + + str(var) + + ", FREQUENCY: " + + str(frequency[2]) + + "\n" + ) + data_acq_freva_search_ECROPS.freva_search_reanalysis( + project, exp_reanalysis, var, frequency[2] + ) + for var in variables_era5_daily_monthly: - logging.info("\n \n" + "PROJECT: " + str(project) + - ", EXPERIMENT: " + str(exp_reanalysis) + - ", VARIABLE: " + str(var) + - ", FREQUENCY: " + str(frequency[1]) + "\n") - data_acq_freva_search_ECROPS.freva_search_reanalysis(project, exp_reanalysis, var, frequency[1]) - + logging.info( + "\n \n" + + "PROJECT: " + + str(project) + + ", EXPERIMENT: " + + str(exp_reanalysis) + + ", VARIABLE: " + + str(var) + + ", FREQUENCY: " + + str(frequency[1]) + + "\n" + ) + data_acq_freva_search_ECROPS.freva_search_reanalysis( + project, exp_reanalysis, var, frequency[1] + ) + for var in variables_era5_hourly: - logging.info("\n \n" + "PROJECT: " + str(project) + - ", EXPERIMENT: " + str(exp_reanalysis) + - ", VARIABLE: " + str(var) + - ", FREQUENCY: " + str(frequency[0]) + "\n") - data_acq_freva_search_ECROPS.freva_search_reanalysis(project, exp_reanalysis, var, frequency[0]) - - -if __name__ == '__main__': + logging.info( + "\n \n" + + "PROJECT: " + + str(project) + + ", EXPERIMENT: " + + str(exp_reanalysis) + + ", VARIABLE: " + + str(var) + + ", FREQUENCY: " + + str(frequency[0]) + + "\n" + ) + data_acq_freva_search_ECROPS.freva_search_reanalysis( + project, exp_reanalysis, var, frequency[0] + ) + + +if __name__ == "__main__": main() diff --git a/src/cmip6_data_acq/copy_files.py b/src/cmip6_data_acq/copy_files.py index dcfbb71..673f53b 100755 --- a/src/cmip6_data_acq/copy_files.py +++ b/src/cmip6_data_acq/copy_files.py @@ -4,6 +4,7 @@ import glob import sys + def copy_files_from_csv(csv_file_path, destination_folder): """ Copies files listed in a CSV file to a specified destination folder. 
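Note on the logging style in 0_data_acq_main_ECROPS.py above (an optional alternative, not applied in this diff): the long string concatenations passed to logging.info can instead use logging's lazy %-formatting, which defers building the message until a handler actually emits the record. A short sketch with illustrative values:

import logging

logging.basicConfig(level=logging.INFO)

model, exp, var, freq = "mpi-esm1-2-lr", "historical", "tasmax", "day"
# Equivalent message to the concatenated form above, formatted only when emitted.
logging.info("MODEL: %s, EXPERIMENT: %s, VARIABLE: %s, FREQUENCY: %s", model, exp, var, freq)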
@@ -15,19 +16,21 @@ def copy_files_from_csv(csv_file_path, destination_folder): os.makedirs(destination_folder, exist_ok=True) # Open the CSV file and read the file paths - with open(csv_file_path, mode='r') as csv_file: + with open(csv_file_path, mode="r") as csv_file: csv_reader = csv.reader(csv_file) - + for row in csv_reader: # Assuming each row has only one column (the file path) - original_file_path = row[0].strip() # Remove any leading/trailing whitespace - + original_file_path = row[ + 0 + ].strip() # Remove any leading/trailing whitespace + # Get the file name from the original path file_name = os.path.basename(original_file_path) - + # Define the destination file path destination_file_path = os.path.join(destination_folder, file_name) - + try: # Copy the file to the destination folder shutil.copy2(original_file_path, destination_file_path) @@ -40,10 +43,11 @@ def copy_files_from_csv(csv_file_path, destination_folder): print(f"Error copying {original_file_path}: {e}") sys.stdout.flush() + def main(): # Define the paths - data_acq_folder = "./data_acq/" - destination_folder = './data_raw/' + data_acq_folder = "./data_acq/" + destination_folder = "./data_raw/" # Use glob to find all CSV files in the 'data_acq' folder csv_files = sorted(glob.glob(os.path.join(data_acq_folder, "*.csv"))) print(csv_files) @@ -59,7 +63,13 @@ def main(): for csv_file_path in csv_files: print(f"Processing CSV file: {csv_file_path}") sys.stdout.flush() - copy_files_from_csv(csv_file_path, destination_folder + csv_file_path.split('__cmip6_')[1].split('_[')[0] + '/') + copy_files_from_csv( + csv_file_path, + destination_folder + + csv_file_path.split("__cmip6_")[1].split("_[")[0] + + "/", + ) + if __name__ == "__main__": main() diff --git a/src/cmip6_data_acq/data_acq_freva_search_ECROPS.py b/src/cmip6_data_acq/data_acq_freva_search_ECROPS.py index 11f2fa7..a5dac12 100755 --- a/src/cmip6_data_acq/data_acq_freva_search_ECROPS.py +++ b/src/cmip6_data_acq/data_acq_freva_search_ECROPS.py @@ -16,7 +16,6 @@ homevardir = "/work/bb1478/b382610/wildfires/data/find_vars_cmip6/data_acq/" - def freva_search_ssp(project, model, var, freq, experiment): """ Get all the ssp files from FREVA for the inputs and write them to a csv, @@ -32,39 +31,82 @@ def freva_search_ssp(project, model, var, freq, experiment): :return: nothing, writes csv files """ ## 1. Get all the ssp files - ssp_files = freva.databrowser(project=project, - model=model, - variable=var, - time_frequency=freq, - experiment=experiment) + ssp_files = freva.databrowser( + project=project, + model=model, + variable=var, + time_frequency=freq, + experiment=experiment, + ) ## iteratable freva generator object ssp_files can either be tranformed to a list or parsed, ## not both, it lives through one iteration it seems - ssp_files_list = list(ssp_files) # make the freva generator object ssp_files a list for list functions e.g. len() + ssp_files_list = list( + ssp_files + ) # make the freva generator object ssp_files a list for list functions e.g. len() ssp_files_array = np.sort(np.array(ssp_files_list)) ## 2. 
Get all the unique ensemble ids to be used in matching with all other ssp files all_ensembles = [] for ssp_file in ssp_files_array: - res = freva.facet_search(file=ssp_file, facet='ensemble') - all_ensembles.append(res.get("ensemble")[0]) # get the first (only) value of the dictionary - unique_ensembles = np.unique(np.array(all_ensembles)) # then filter out only the unique ensemble values - logging.info(str(experiment) + " for " + str(var) + " unique ensemble ids = " + str(unique_ensembles)) + res = freva.facet_search(file=ssp_file, facet="ensemble") + all_ensembles.append( + res.get("ensemble")[0] + ) # get the first (only) value of the dictionary + unique_ensembles = np.unique( + np.array(all_ensembles) + ) # then filter out only the unique ensemble values + logging.info( + str(experiment) + + " for " + + str(var) + + " unique ensemble ids = " + + str(unique_ensembles) + ) # Get the number of ssp files per unique ensemble id: Function is called only for logging the number of files - get_files_from_unique_ensembles(project, model, var, freq, experiment, unique_ensembles) + get_files_from_unique_ensembles( + project, model, var, freq, experiment, unique_ensembles + ) ## 3. Get all the historical datasets we need by the ensemble id in unique_ensembles - historical_files_array = get_files_from_unique_ensembles(project, model, var, freq, 'historical', unique_ensembles) + historical_files_array = get_files_from_unique_ensembles( + project, model, var, freq, "historical", unique_ensembles + ) np_historical_files_array = np.sort(np.array(historical_files_array)) ### logging.info(str(var) + " total HISTORICAL num of files = " + str(np_historical_files_array.size)) ## Write everything to csv files - ssp_csv_filename = str(model) + "__" + project + "_" + str(experiment) + "_" + str(var) + "_" + str(freq) + ".csv" - ssp_files_array.tofile(os.path.join(os.sep, homevardir, ssp_csv_filename), sep='\n') - historical_csv_filename = str(model) + "__" + project + "_" + str(experiment) + "_" + str(var) + "_" + str(freq) + "_historical" + ".csv" - np_historical_files_array.tofile(os.path.join(os.sep, homevardir, historical_csv_filename), sep='\n') + ssp_csv_filename = ( + str(model) + + "__" + + project + + "_" + + str(experiment) + + "_" + + str(var) + + "_" + + str(freq) + + ".csv" + ) + ssp_files_array.tofile(os.path.join(os.sep, homevardir, ssp_csv_filename), sep="\n") + historical_csv_filename = ( + str(model) + + "__" + + project + + "_" + + str(experiment) + + "_" + + str(var) + + "_" + + str(freq) + + "_historical" + + ".csv" + ) + np_historical_files_array.tofile( + os.path.join(os.sep, homevardir, historical_csv_filename), sep="\n" + ) def freva_search_historical(project, model, var, freq): @@ -78,11 +120,13 @@ def freva_search_historical(project, model, var, freq): :return: nothing, writes csv files """ ## 1. Get all the historical files - historical_files = freva.databrowser(project=project, - model=model, - variable=var, - time_frequency=freq, - experiment='historical') + historical_files = freva.databrowser( + project=project, + model=model, + variable=var, + time_frequency=freq, + experiment="historical", + ) ## iteratable freva generator object ssp_files can either be tranformed to a list or parsed, ## not both, it lives through one iteration it seems @@ -94,20 +138,40 @@ def freva_search_historical(project, model, var, freq): ## 2. 
Get all the unique ensemble ids all_ensembles = [] for historical_file in historical_files_array: - res = freva.facet_search(file=historical_file, facet='ensemble') - all_ensembles.append(res.get("ensemble")[0]) # get the first and only value of the dictionary - unique_ensembles = np.unique(np.array(all_ensembles)) # then filter out only the unique ensemble values - logging.info("Historical for " + str(var) + " unique ensemble ids = " + str(unique_ensembles)) + res = freva.facet_search(file=historical_file, facet="ensemble") + all_ensembles.append( + res.get("ensemble")[0] + ) # get the first and only value of the dictionary + unique_ensembles = np.unique( + np.array(all_ensembles) + ) # then filter out only the unique ensemble values + logging.info( + "Historical for " + str(var) + " unique ensemble ids = " + str(unique_ensembles) + ) # Get the number of historical files per unique ensemble id: Function is calles only for logging the number of files - get_files_from_unique_ensembles(project, model, var, freq, 'historical', unique_ensembles) + get_files_from_unique_ensembles( + project, model, var, freq, "historical", unique_ensembles + ) ## Write everything to csv files - all_historical_csv = str(model) + "__" + project + "_" + str(var) + "_" + str(freq) + "_allhistorical" + ".csv" - historical_files_array.tofile(os.path.join(os.sep, homevardir, all_historical_csv), sep='\n') + all_historical_csv = ( + str(model) + + "__" + + project + + "_" + + str(var) + + "_" + + str(freq) + + "_allhistorical" + + ".csv" + ) + historical_files_array.tofile( + os.path.join(os.sep, homevardir, all_historical_csv), sep="\n" + ) -def freva_search_reanalysis(project, experiment, var, freq):#, geopoten_value): +def freva_search_reanalysis(project, experiment, var, freq): # , geopoten_value): """ Retreive from FREVA all reanalysis files such as ERA5 and write the list to csv, e.g. "era5__reanalysis_day_tas.csv" @@ -119,10 +183,9 @@ def freva_search_reanalysis(project, experiment, var, freq):#, geopoten_value): :return: """ ## 1. Get all the reanalysis files with a variable - reanalysis_files = freva.databrowser(project=project, - time_frequency=freq, - variable=var, - experiment=experiment) + reanalysis_files = freva.databrowser( + project=project, time_frequency=freq, variable=var, experiment=experiment + ) reanalysis_files_list = list(reanalysis_files) #### FOR SOME REASON THE BELOW DOES NOT WORK, TO BE DELETED, HAS BEEN SUBSTITUTED IN data_prepr_timerange_targetvar_zg @@ -137,17 +200,33 @@ def freva_search_reanalysis(project, experiment, var, freq):#, geopoten_value): ## 3. 
Get all the unique ensemble ids for each var all_ensembles = [] for reanalysis_file in reanalysis_files_array: - res = freva.facet_search(file=reanalysis_file, facet='ensemble') - all_ensembles.append(res.get("ensemble")[0]) # get the first(and only) value of the dictionary - unique_ensembles = np.unique(np.array(all_ensembles)) # then filter out only the unique ensemble values - logging.info(str(experiment) + " reanalysis for " + str(var) + " unique ensemble ids = " + str(unique_ensembles)) + res = freva.facet_search(file=reanalysis_file, facet="ensemble") + all_ensembles.append( + res.get("ensemble")[0] + ) # get the first(and only) value of the dictionary + unique_ensembles = np.unique( + np.array(all_ensembles) + ) # then filter out only the unique ensemble values + logging.info( + str(experiment) + + " reanalysis for " + + str(var) + + " unique ensemble ids = " + + str(unique_ensembles) + ) ## Write everything to csv files - all_reanalysis_csv_filename = str(experiment) + "__" + project + "_" + str(freq) + "_" + str(var) + ".csv" - reanalysis_files_array.tofile(os.path.join(os.sep, homevardir, all_reanalysis_csv_filename), sep='\n') + all_reanalysis_csv_filename = ( + str(experiment) + "__" + project + "_" + str(freq) + "_" + str(var) + ".csv" + ) + reanalysis_files_array.tofile( + os.path.join(os.sep, homevardir, all_reanalysis_csv_filename), sep="\n" + ) -def get_files_from_unique_ensembles(project, model, var, freq, experiment, unique_ensemble_list): +def get_files_from_unique_ensembles( + project, model, var, freq, experiment, unique_ensemble_list +): """ The inputs to this function are internal, although dictated by the data_acq_main.py . This function is called internally in order to retrieve from FREVA items using their ensemble id, used for corresponding ssp and historical @@ -162,16 +241,26 @@ def get_files_from_unique_ensembles(project, model, var, freq, experiment, uniqu """ files_array = [] for unique_ens in unique_ensemble_list: - files = freva.databrowser(project=project, - model=model, - ensemble=unique_ens, - variable=var, - time_frequency=freq, - experiment=experiment) + files = freva.databrowser( + project=project, + model=model, + ensemble=unique_ens, + variable=var, + time_frequency=freq, + experiment=experiment, + ) n = 0 for file in files: n = n + 1 files_array.append(file) - logging.info(str(experiment) + " " + str(var) + " files for ensemble " + str(unique_ens) + " = " + str(n)) + logging.info( + str(experiment) + + " " + + str(var) + + " files for ensemble " + + str(unique_ens) + + " = " + + str(n) + ) return files_array diff --git a/src/preprocess_data/merge_data.py b/src/preprocess_data/merge_data.py index 1418a56..4446836 100755 --- a/src/preprocess_data/merge_data.py +++ b/src/preprocess_data/merge_data.py @@ -3,16 +3,24 @@ import numpy as np from tqdm import tqdm import warnings + warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning) import glob import argparse from typing import Union import pandas as pd -def merge_var(list_var: list[str], short_name: list[str], local_path: str, out_path: str, area: Union[bool, str]): + +def merge_var( + list_var: list[str], + short_name: list[str], + local_path: str, + out_path: str, + area: Union[bool, str], +): """ merge_var - + Merge multiple NetCDF files for each variable in `list_var` into a single NetCDF file. 
This function processes a list of variables (`list_var`), finds all corresponding NetCDF files @@ -34,7 +42,7 @@ def merge_var(list_var: list[str], short_name: list[str], local_path: str, out_p Path to the directory where the output NetCDF files will be saved. area : bool or str Default False. If specified an area, that whould be the region selected on the Xarray. Sould - follow the order: North, West, South, East. + follow the order: North, West, South, East. Returns ------- @@ -54,42 +62,57 @@ def merge_var(list_var: list[str], short_name: list[str], local_path: str, out_p # output files: './features/data_glob_1D_t2m.nc' and './features/data_glob_1D_dp.nc'. """ for idx, var in enumerate(tqdm(list_var, desc="Processing variables")): - files = np.array(sorted(glob.glob(f'{local_path}*{var}*.nc'))) - print(f'\nfiles:\n {files}') - data = xr.open_dataset(files[0], engine='netcdf4') - #data = data.drop_vars('depth_bnds') + files = np.array(sorted(glob.glob(f"{local_path}*{var}*.nc"))) + print(f"\nfiles:\n {files}") + data = xr.open_dataset(files[0], engine="netcdf4") + # data = data.drop_vars('depth_bnds') data = data.sortby(data.lat) data = data.sortby(data.lon) - new_times = [t.replace(hour=int(files[0][-5:-3]), minute=0, second=0) for t in pd.to_datetime(data.time.values)] + new_times = [ + t.replace(hour=int(files[0][-5:-3]), minute=0, second=0) + for t in pd.to_datetime(data.time.values) + ] data = data.assign_coords(time=new_times) - if area: + if isinstance(area, str): # North, West, South, East. - area_list = np.array(area.split(',')).astype(int) - data = data.sel(lat=slice(area_list[2],area_list[0]), lon=slice(area_list[1],area_list[3])) + area_list = np.array(area.split(",")).astype(int) + data = data.sel( + lat=slice(area_list[2], area_list[0]), + lon=slice(area_list[1], area_list[3]), + ) for file in tqdm(files[1:], desc=f"Merging files for {var}", leave=False): - #print(f'Loading: {file}') + # print(f'Loading: {file}') try: - d_i = xr.open_dataset(file, engine='netcdf4') + d_i = xr.open_dataset(file, engine="netcdf4") except Exception as ex: print(f"Exception loading file {file} with:\n{ex}") - #d_i = d_i.drop_vars('depth_bnds') + # d_i = d_i.drop_vars('depth_bnds') else: d_i = d_i.sortby(d_i.lat) d_i = d_i.sortby(d_i.lon) - new_times = [t.replace(hour=int(file[-5:-3]), minute=0, second=0) for t in pd.to_datetime(d_i.time.values)] + new_times = [ + t.replace(hour=int(file[-5:-3]), minute=0, second=0) + for t in pd.to_datetime(d_i.time.values) + ] d_i = d_i.assign_coords(time=new_times) - if area: - area_list = np.array(area.split(',')).astype(int) - d_i = d_i.sel(lat=slice(area_list[2],area_list[0]), lon=slice(area_list[1],area_list[3])) - data = xr.concat([data, d_i], dim='time') + if isinstance(area, str): + area_list = np.array(area.split(",")).astype(int) + d_i = d_i.sel( + lat=slice(area_list[2], area_list[0]), + lon=slice(area_list[1], area_list[3]), + ) + data = xr.concat([data, d_i], dim="time") d_i.close() - #data.to_netcdf(f'{out_path}data_{area if area else "glob"}_1D_{short_name[idx]}.nc'.replace(",", "").replace(" ", "")) + # data.to_netcdf(f'{out_path}data_{area if area else "glob"}_1D_{short_name[idx]}.nc'.replace(",", "").replace(" ", "")) data = data.sortby(data.time) - data.to_netcdf(f'{out_path}{short_name[idx]}_1993-2016_.nc'.replace(",", "").replace(" ", "")) + data.to_netcdf( + f"{out_path}{short_name[idx]}_1993-2016_.nc".replace(",", "").replace( + " ", "" + ) + ) return - # data = xr.open_dataset(files_pred[0])#.drop_dims('plev') # data = 
data.squeeze(dim="plev") # for idx, f in tqdm(enumerate(files_pred[1:]), total=len(files_pred[1:]), desc="Processing vars"): @@ -105,15 +128,40 @@ def merge_var(list_var: list[str], short_name: list[str], local_path: str, out_p def main(): # Parser initialization parser = argparse.ArgumentParser() - parser.add_argument("-v", "--var", dest='var', help="Specify which variable to merge. Could be a single value or a comma-separated list.") - parser.add_argument("-sn", "--short_name", dest='short_name', help="Specify which is the short name to save the variable. Could be a single value or a comma-separated list.") - parser.add_argument("-p", "--path", dest='local_path', help="Local path to the directory where the variable is located.") - parser.add_argument("-o", "--outpath", dest='out_path', help="Path to the output directory where to save the result.") - parser.add_argument("-a", "--area", dest='area', help='Sub-region of interest. A str list in order: North, West, South, East.') + parser.add_argument( + "-v", + "--var", + dest="var", + help="Specify which variable to merge. Could be a single value or a comma-separated list.", + ) + parser.add_argument( + "-sn", + "--short_name", + dest="short_name", + help="Specify which is the short name to save the variable. Could be a single value or a comma-separated list.", + ) + parser.add_argument( + "-p", + "--path", + dest="local_path", + help="Local path to the directory where the variable is located.", + ) + parser.add_argument( + "-o", + "--outpath", + dest="out_path", + help="Path to the output directory where to save the result.", + ) + parser.add_argument( + "-a", + "--area", + dest="area", + help="Sub-region of interest. A str list in order: North, West, South, East.", + ) args = parser.parse_args() - + # Default values - var = '168' + var = "168" short_name = ["dp"] local_path = "./raw/" out_path = "./features/" @@ -121,30 +169,35 @@ def main(): # Override defaults if arguments are provided if args.var is not None: - var = args.var.split(',') if ',' in args.var else [args.var] # Handle single value or list + var = ( + args.var.split(",") if "," in args.var else [args.var] + ) # Handle single value or list if args.short_name is not None: - short_name = args.short_name.split(',') if ',' in args.short_name else [args.short_name] # Handle single value or list + short_name = ( + args.short_name.split(",") if "," in args.short_name else [args.short_name] + ) # Handle single value or list if args.local_path is not None: local_path = args.local_path if args.out_path is not None: out_path = args.out_path if args.area is not None: area = args.area - for i in range(1,26): - print(f'\nRunning for ens {i:02d}\n') + for i in range(1, 26): + print(f"\nRunning for ens {i:02d}\n") short_name_i = [str(short_name[0]).replace("_01", f"_{i:02d}")] local_path_i = local_path.replace("_01", f"_{i:02d}") out_path_i = out_path.replace("_01", f"_{i:02d}") - print(f'\nShort name {short_name}') - print(f'\nLocal path {local_path}') - print(f'\nOut path {out_path}\n') - print(f'\nShort name i {short_name_i}') - print(f'\nLocal path i {local_path_i}') - print(f'\nOut path i {out_path_i}\n') + print(f"\nShort name {short_name}") + print(f"\nLocal path {local_path}") + print(f"\nOut path {out_path}\n") + print(f"\nShort name i {short_name_i}") + print(f"\nLocal path i {local_path_i}") + print(f"\nOut path i {out_path_i}\n") merge_var(var, short_name_i, local_path_i, out_path_i, area) # If no need for repet ir for several experiments, then you could # remode the for-loop and 
use instead: # merge_var(var, short_name, local_path, out_path, area) + if __name__ == "__main__": main() diff --git a/src/preprocess_data/merge_data.sh b/src/preprocess_data/merge_data.sh index 6fca90f..fbdab83 100755 --- a/src/preprocess_data/merge_data.sh +++ b/src/preprocess_data/merge_data.sh @@ -11,5 +11,5 @@ module load python3 source activate -conda activate ${ENV2} +conda activate "${ENV2}" python merge_data.py -v "gz300/" -sn "ecmwf_01_gz300" -p "../../datasets/ecmwf/ens_01/" -o "data/ecmwf/ens_01/gz300/" diff --git a/src/preprocess_data/preprocess_data_unique.sh b/src/preprocess_data/preprocess_data_unique.sh index f59be88..dcac6af 100755 --- a/src/preprocess_data/preprocess_data_unique.sh +++ b/src/preprocess_data/preprocess_data_unique.sh @@ -9,7 +9,7 @@ show_help() { echo "Usage: $0 [-r ]" echo "" echo " You need an out.grid to run this code. That is, the" -$arg2${f:32:end} echo " grid configuration you what as output." +echo " grid configuration you want as output." echo "" echo "Arguments:" echo " path-to-folder Path to the folder where the original data is" @@ -68,8 +68,8 @@ arg2=${positional_args[1]} substring="${arg1: -4:3}" rex1=".*${substring}_g0.25.nc" -FILES=`find "$arg1" -type f -regextype posix-extended -regex "$reg_arg"` -FILES2=`find "$arg2" -type f -regextype posix-extended -regex "$rex1"` +FILES=$(find "$arg1" -type f -regextype posix-extended -regex "$reg_arg") +FILES2=$(find "$arg2" -type f -regextype posix-extended -regex "$rex1") echo "FILES:" printf '%s\n' "${FILES[@]}" @@ -122,7 +122,7 @@ for f in "${filtered_files[@]}" do echo "start loop" printf '%s\n' "$f" - cdo -P 8 remapcon,n512 -setgridtype,regular $f "$arg2${f:32:-4}_gg.grb" + cdo -P 8 remapcon,n512 -setgridtype,regular "$f" "$arg2${f:32:-4}_gg.grb" cdo remapbil,out.grid "$arg2${f:32:-4}_gg.grb" "$arg2${f:32:-4}_g0.25.grb" rm "$arg2${f:32:-4}_gg.grb" cdo -f nc copy "$arg2${f:32:-4}_g0.25.grb" "$arg2${f:32:-4}_g0.25.nc" diff --git a/src/preprocess_data/preprocess_data_z500.sh b/src/preprocess_data/preprocess_data_z500.sh index 6fe0b03..f3d908d 100755 --- a/src/preprocess_data/preprocess_data_z500.sh +++ b/src/preprocess_data/preprocess_data_z500.sh @@ -9,7 +9,7 @@ show_help() { echo "Usage: $0 [-r ]" echo "" echo " You need an out.grid to run this code. That is, the" -$arg2${f:32:end} echo " grid configuration you what as output." +echo " grid configuration you want as output."
echo "" echo "Arguments:" echo " path-to-folder Path to the folder where the original data is" @@ -65,11 +65,11 @@ fi arg1=${positional_args[0]} arg2=${positional_args[1]} -FILES=`find $arg1 -type f -regextype posix-extended -regex $reg_arg` +FILES=$(find "$arg1" -type f -regextype posix-extended -regex "$reg_arg") for f in $FILES do - cdo -sellevel,50000 $f "$arg2${f:32}" + cdo -sellevel,50000 "$f" "$arg2${f:32}" cdo -P 8 remapcon,n512 -setgridtype,regular "$arg2${f:32}" "$arg2${f:32:-4}_gg.grb" rm "$arg2${f:32}" cdo remapbil,out.grid "$arg2${f:32:-4}_gg.grb" "$arg2${f:32:-4}_g0.25.grb" diff --git a/src/preprocess_data/remap_era5.sh b/src/preprocess_data/remap_era5.sh index 564dec3..473b17c 100755 --- a/src/preprocess_data/remap_era5.sh +++ b/src/preprocess_data/remap_era5.sh @@ -9,6 +9,6 @@ #SBATCH --output=out_sh_remap.log module load cdo -cdo remapbil,grid_1x1_integer.grid ../../datasets/ERA5/ERA5_level\=500_var\=geopotential_daymean_invertlat.nc data/era5/ERA5_z500_daymean_invertlat_g1_aux.nc +cdo remapbil,grid_1x1_integer.grid "../../datasets/ERA5/ERA5_level=500_var=geopotential_daymean_invertlat.nc" data/era5/ERA5_z500_daymean_invertlat_g1_aux.nc cdo sellonlatbox,-180,180,-90,90 data/era5/ERA5_z500_daymean_invertlat_g1_aux.nc data/era5/ERA5_z500_daymean_invertlat_g1.nc rm data/era5/ERA5_z500_daymean_invertlat_g1_aux.nc diff --git a/src/sh_basic_tools/cd_work.sh b/src/sh_basic_tools/cd_work.sh index 10dfef7..75e3dad 100755 --- a/src/sh_basic_tools/cd_work.sh +++ b/src/sh_basic_tools/cd_work.sh @@ -31,13 +31,13 @@ else # Change directory based on the argument if [ "$arg1" == "${NAMEPROJ1}" ]; then - cd "/work/${PATHPROJ1}/${USER}/" + cd "/work/${PATHPROJ1}/${USER}/" || exit elif [ "$arg1" == "${NAMEPROJ2}" ]; then - cd "/work/${PATHPROJ2}/${USER}/" + cd "/work/${PATHPROJ2}/${USER}/" || exit elif [ "$arg1" == "${NAMEPROJ3}" ]; then - cd "/work/${PATHPROJ3}/${USER}/" + cd "/work/${PATHPROJ3}/${USER}/" || exit elif [ "$arg1" == "${NAMEPROJ4}" ]; then - cd "/work/${PATHPROJ4}/" + cd "/work/${PATHPROJ4}/" || exit else echo "Invalid argument: $arg1" show_help diff --git a/src/sh_basic_tools/env_init.sh b/src/sh_basic_tools/env_init.sh index 4f24c97..8d809a4 100755 --- a/src/sh_basic_tools/env_init.sh +++ b/src/sh_basic_tools/env_init.sh @@ -8,5 +8,6 @@ export XLA_FLAGS=--xla_gpu_cuda_data_dir=/sw/spack-levante/nvhpc-24.7-py26uc/Lin export TF_FORCE_GPU_ALLOW_GROWTH=true module load texlive/live2021-gcc-11.2.0 source activate -conda activate ${ENV1} -cd /work/${PATHPROJ}/${USER}/ + +conda activate "${ENV1}" +cd "/work/${PATHPROJ}/${USER}/" || exit diff --git a/src/sh_basic_tools/jupyter_init.sh b/src/sh_basic_tools/jupyter_init.sh index 1eee279..d22cad3 100755 --- a/src/sh_basic_tools/jupyter_init.sh +++ b/src/sh_basic_tools/jupyter_init.sh @@ -57,12 +57,12 @@ else echo "Activating conda..." source activate conda activate - conda activate ${ENV1} + conda activate "${ENV1}" echo "Changing path to $CD_PATH..." cd "$CD_PATH" || { echo "Failed to change directory to $CD_PATH"; } echo "And there it goes the Jupyter..." - jupyter-lab --no-browser --port=$PORT + jupyter-lab --no-browser --port="$PORT" fi fi