diff --git a/benchmarks/python/nrel_selection.py b/benchmarks/python/nrel_selection.py index b1df1d4..f865c94 100644 --- a/benchmarks/python/nrel_selection.py +++ b/benchmarks/python/nrel_selection.py @@ -9,22 +9,49 @@ import h5pyd import numpy as np import config +import argparse -def read_col(dset): + +def read_col(dset, filepath): # choose a random column index extent = dset.shape[1] index = random.randint(0, extent-1) logging.info(f"using column index: {index}") - arr = dset[:, index] - print(f"{dset.name}[{index},:] - min: {arr.min()} max: {arr.max()} mean: {arr.mean():.2f}") + if n > 1: + # divide row into n selections + region_size = dset.shape[0] // n + selections = [np.s_[region_size * i:min(region_size * (i+1), dset.shape[0]), index] for i in range(n-1)] + # add the last selection + selections.append(np.s_[region_size * (n-1):dset.shape[0], index]) + print(f"reading {n} selections in parallel") + mm = get_multimanager([dset] * n, filepath) + arrs = mm[selections] + arr = np.concatenate(arrs) + else: + arr = dset[:, index] + + print(f"{dset.name}[:, {index}] - min: {arr.min()} max: {arr.max()} mean: {arr.mean():.2f}") + -def read_row(dset): +def read_row(dset, filepath): # choose a random row index extent = dset.shape[0] index = random.randint(0, extent-1) logging.info(f"using row index: {index}") - arr = dset[index, :] - print(f"{dset.name}[:, {index}] - min: {arr.min()} max: {arr.max()} mean: {arr.mean():.2f}") + if n > 1: + # divide col into n selections + region_size = dset.shape[1] // n + selections = [np.s_[index, region_size * i:min(region_size * (i+1), dset.shape[1])] for i in range(n-1)] + # add the last selection + selections.append(np.s_[index, region_size * (n-1):dset.shape[1]]) + print(f"reading {n} selections in parallel") + mm = get_multimanager([dset] * n, filepath) + arrs = mm[selections] + arr = np.concatenate(arrs, axis=0) + else: + arr = dset[index, :] + + print(f"{dset.name}[{index}, :] - min: {arr.min()} max: {arr.max()} mean: {arr.mean():.2f}") def get_loglevel(): @@ -47,6 +74,7 @@ def get_loglevel(): # generic file open -> return h5py(filename) or h5pyd(filename) # based on a "hdf5://" prefix or not + def h5File(filepath, mode='r', page_buf_size=None): kwargs = {'mode': mode} if page_buf_size is not None: @@ -72,13 +100,26 @@ def h5File(filepath, mode='r', page_buf_size=None): return f +# Get MultiManager based on file type +def get_multimanager(dsets, filepath): + if filepath.startswith("hdf5://"): + return h5pyd.MultiManager(dsets) + elif filepath.startswith("s3://"): + return h5py.MultiManager(dsets) + else: + raise ValueError("MultiManager requires h5py or h5pyd file") + + # # main # -if len(sys.argv) > 1: - run_number = int(sys.argv[1]) -else: - run_number = 1 +parser = argparse.ArgumentParser() +parser.add_argument("--run_number", help="Run number for subsequent runs", type=int, default=1) +parser.add_argument("--n", help="Number of parallel selections to read simultaneously", type=int, default=1) + +args = parser.parse_args() +run_number = args.run_number +n = args.n # setup logging logfname = config.get("log_file") @@ -107,11 +148,14 @@ def h5File(filepath, mode='r', page_buf_size=None): start_time = time.time() # start the clock! +if not nrel_filepath.startswith("hdf5://") and n > 1: + raise ValueError("Only h5pyd MultiManager supports reading multiple views on the same dataset") + with h5File(nrel_filepath) as f: print(nrel_h5path) dset = f[nrel_h5path] - read_col(dset) - read_row(dset) + read_col(dset, nrel_filepath) + read_row(dset, nrel_filepath) stop_time = time.time() dt = datetime.fromtimestamp(start_time) @@ -122,12 +166,7 @@ def h5File(filepath, mode='r', page_buf_size=None): machine = config.get("machine") # print result for inclusion in benchmark csv csv_str = f"{run_number}, {start_str}, {stop_str}, {elapsed:5.1f}, python, {machine}, " -csv_str += f"{nrel_foldername}, , {nrel_filename}, , , , , , , " +csv_str += f"{nrel_foldername}, , {nrel_filename}, {n} , , , , , , " if page_buf_size_exp > 0: csv_str += f"page_buf_size_exp: {page_buf_size_exp}" print(csv_str) - - - - - diff --git a/benchmarks/select_time.csv b/benchmarks/select_time.csv index f7f9fbc..c884d6e 100644 --- a/benchmarks/select_time.csv +++ b/benchmarks/select_time.csv @@ -64,3 +64,7 @@ run_number, start, finish, benchmark, elapsed (sec), machine, input_folder, 1, 2024-05-08 16:56:26, 2024-05-08 16:56:27, 1.0, m5.4xlarge, ../../data/, ./, ATL03_20181017222812_02950102_005_01.h5, , , , , , , use_multi 1, 2024-05-09 14:56:30, 2024-05-09 14:57:06, 36.2, m5.4xlarge, s3://hdf5.sample/data/NASA/ICESat2/, ./, ATL03_20181017222812_02950102_005_01.h5, , , , , , , 1, 2024-05-09 14:57:18, 2024-05-09 14:57:44, 25.8, m5.4xlarge, s3://hdf5.sample/data/NASA/ICESat2/, ./, ATL03_20181017222812_02950102_005_01.h5, , , , , , , use_multi +1, 2024-05-16 21:22:07, 2024-05-16 21:22:42, 35.0, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, , , , , , , +1, 2024-05-16 21:42:23, 2024-05-16 21:42:56, 33.0, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, 2 , , , , , , +1, 2024-05-16 21:58:32, 2024-05-16 21:59:19, 46.9, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, 4 , , , , , , +1, 2024-05-16 21:59:28, 2024-05-16 22:00:39, 71.1, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, 8 , , , , , , \ No newline at end of file