Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 57 additions & 18 deletions benchmarks/python/nrel_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,49 @@
import h5pyd
import numpy as np
import config
import argparse

def read_col(dset, filepath, num_selections=None):
    """Read one randomly chosen column of ``dset`` and print its min/max/mean.

    Args:
        dset: Two-dimensional dataset object exposing ``shape``, ``name`` and
            NumPy-style indexing.
        filepath: Path/URI the dataset was opened from; forwarded to
            ``get_multimanager`` when the column is read as several selections.
        num_selections: Number of row ranges the column is split into and read
            through a MultiManager.  Defaults to the module-level ``n``.
    """
    if num_selections is None:
        num_selections = n  # module-level value taken from the --n option

    # choose a random column index
    num_rows = dset.shape[0]
    index = random.randint(0, dset.shape[1] - 1)
    logging.info("using column index: %s", index)

    # never request more selections than there are rows, otherwise
    # region_size would be 0 and the leading selections would be empty
    parts = min(num_selections, num_rows)
    if parts > 1:
        # divide the column into `parts` contiguous row ranges; the last
        # range absorbs the remainder of the integer division
        region_size = num_rows // parts
        bounds = [region_size * i for i in range(parts)] + [num_rows]
        selections = [np.s_[start:stop, index] for start, stop in zip(bounds, bounds[1:])]
        print(f"reading {parts} selections in parallel")
        mm = get_multimanager([dset] * parts, filepath)
        arrs = mm[selections]
        arr = np.concatenate(arrs)
    else:
        arr = dset[:, index]

    print(f"{dset.name}[:, {index}] - min: {arr.min()} max: {arr.max()} mean: {arr.mean():.2f}")


def read_row(dset, filepath, num_selections=None):
    """Read one randomly chosen row of ``dset`` and print its min/max/mean.

    Args:
        dset: Two-dimensional dataset object exposing ``shape``, ``name`` and
            NumPy-style indexing.
        filepath: Path/URI the dataset was opened from; forwarded to
            ``get_multimanager`` when the row is read as several selections.
        num_selections: Number of column ranges the row is split into and read
            through a MultiManager.  Defaults to the module-level ``n``.
    """
    if num_selections is None:
        num_selections = n  # module-level value taken from the --n option

    # choose a random row index
    num_cols = dset.shape[1]
    index = random.randint(0, dset.shape[0] - 1)
    logging.info("using row index: %s", index)

    # never request more selections than there are columns, otherwise
    # region_size would be 0 and the leading selections would be empty
    parts = min(num_selections, num_cols)
    if parts > 1:
        # divide the row into `parts` contiguous column ranges; the last
        # range absorbs the remainder of the integer division
        region_size = num_cols // parts
        bounds = [region_size * i for i in range(parts)] + [num_cols]
        selections = [np.s_[index, start:stop] for start, stop in zip(bounds, bounds[1:])]
        print(f"reading {parts} selections in parallel")
        mm = get_multimanager([dset] * parts, filepath)
        arrs = mm[selections]
        arr = np.concatenate(arrs, axis=0)
    else:
        arr = dset[index, :]

    print(f"{dset.name}[{index}, :] - min: {arr.min()} max: {arr.max()} mean: {arr.mean():.2f}")


def get_loglevel():
Expand All @@ -47,6 +74,7 @@ def get_loglevel():
# generic file open -> return h5py(filename) or h5pyd(filename)
# based on a "hdf5://" prefix or not


def h5File(filepath, mode='r', page_buf_size=None):
kwargs = {'mode': mode}
if page_buf_size is not None:
Expand All @@ -72,13 +100,26 @@ def h5File(filepath, mode='r', page_buf_size=None):
return f


# Get MultiManager based on file type
def get_multimanager(dsets, filepath):
    """Return a MultiManager over ``dsets`` from the library matching ``filepath``.

    Args:
        dsets: List of dataset objects to be read together.
        filepath: Path/URI the datasets were opened from.  An ``hdf5://``
            prefix selects ``h5pyd.MultiManager``; an ``s3://`` prefix selects
            ``h5py.MultiManager``.

    Raises:
        ValueError: If ``filepath`` starts with neither prefix.
    """
    if filepath.startswith("hdf5://"):
        return h5pyd.MultiManager(dsets)
    if filepath.startswith("s3://"):
        return h5py.MultiManager(dsets)
    # name the accepted prefixes and the rejected path so the failure is actionable
    raise ValueError(
        f"MultiManager requires an 'hdf5://' (h5pyd) or 's3://' (h5py) path, got: {filepath!r}"
    )


#
# main
#
# parse command-line options
parser = argparse.ArgumentParser()
parser.add_argument("--run_number", help="Run number for subsequent runs", type=int, default=1)
parser.add_argument("--n", help="Number of parallel selections to read simultaneously", type=int, default=1)

args = parser.parse_args()
if args.n < 1:
    # a selection count below 1 is meaningless and would otherwise be
    # written verbatim into the csv result line
    parser.error("--n must be at least 1")
run_number = args.run_number
n = args.n

# setup logging
logfname = config.get("log_file")
Expand Down Expand Up @@ -107,11 +148,14 @@ def h5File(filepath, mode='r', page_buf_size=None):

start_time = time.time()  # start the clock!

# fail before any reads: several selections on one dataset are only
# requested for files addressed with an "hdf5://" path
if n > 1 and not nrel_filepath.startswith("hdf5://"):
    raise ValueError("Only h5pyd MultiManager supports reading multiple views on the same dataset")

with h5File(nrel_filepath) as f:
    print(nrel_h5path)
    dset = f[nrel_h5path]
    # the file path is passed along so the readers can pick a MultiManager
    read_col(dset, nrel_filepath)
    read_row(dset, nrel_filepath)

stop_time = time.time()
dt = datetime.fromtimestamp(start_time)
Expand All @@ -122,12 +166,7 @@ def h5File(filepath, mode='r', page_buf_size=None):
machine = config.get("machine")
# print result for inclusion in benchmark csv
csv_str = f"{run_number}, {start_str}, {stop_str}, {elapsed:5.1f}, python, {machine}, "
# the column after the file name carries the number of parallel selections
csv_str += f"{nrel_foldername}, , {nrel_filename}, {n} , , , , , , "
if page_buf_size_exp > 0:
    csv_str += f"page_buf_size_exp: {page_buf_size_exp}"
print(csv_str)





4 changes: 4 additions & 0 deletions benchmarks/select_time.csv
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,7 @@ run_number, start, finish, benchmark, elapsed (sec), machine, input_folder,
1, 2024-05-08 16:56:26, 2024-05-08 16:56:27, 1.0, m5.4xlarge, ../../data/, ./, ATL03_20181017222812_02950102_005_01.h5, , , , , , , use_multi
1, 2024-05-09 14:56:30, 2024-05-09 14:57:06, 36.2, m5.4xlarge, s3://hdf5.sample/data/NASA/ICESat2/, ./, ATL03_20181017222812_02950102_005_01.h5, , , , , , ,
1, 2024-05-09 14:57:18, 2024-05-09 14:57:44, 25.8, m5.4xlarge, s3://hdf5.sample/data/NASA/ICESat2/, ./, ATL03_20181017222812_02950102_005_01.h5, , , , , , , use_multi
1, 2024-05-16 21:22:07, 2024-05-16 21:22:42, 35.0, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, , , , , , ,
1, 2024-05-16 21:42:23, 2024-05-16 21:42:56, 33.0, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, 2 , , , , , ,
1, 2024-05-16 21:58:32, 2024-05-16 21:59:19, 46.9, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, 4 , , , , , ,
1, 2024-05-16 21:59:28, 2024-05-16 22:00:39, 71.1, python, m5.4xlarge, hdf5://home/test_user1/nrel/, , nsrdb_2000_windspeed.h5, 8 , , , , , ,